From 99c0ce977481f2270e17ec722c11d93169c9ee38 Mon Sep 17 00:00:00 2001 From: Stephen Oost Date: Wed, 4 Sep 2024 10:47:34 -0700 Subject: [PATCH 001/393] prov/tcp: initialize addr_size when duplicating an av This addresses https://scan4.scan.coverity.com/#/project-view/61734/10344?selectedIssue=441178 Signed-off-by: Stephen Oost --- prov/tcp/src/xnet_rdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tcp/src/xnet_rdm.c b/prov/tcp/src/xnet_rdm.c index 420f606f658..68f320cef4e 100644 --- a/prov/tcp/src/xnet_rdm.c +++ b/prov/tcp/src/xnet_rdm.c @@ -700,7 +700,7 @@ static int xnet_mplex_av_dup(struct util_ep *ep, struct xnet_mplex_av *mplex_av, { int ret, i; struct util_av *subav; - size_t addr_size; + size_t addr_size = sizeof(struct sockaddr_in6); char addr[sizeof(struct sockaddr_in6)]; struct fi_av_attr av_attr = { .type = ep->domain->av_type, From ffeb3a83f837bd2c68ea5bf4b466556795a32ee0 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 30 Aug 2024 12:49:24 -0700 Subject: [PATCH 002/393] prov/efa: Check p2p support to use rdma read p2p is required for rdma read because the user buffer has to be registered to efa device. Add p2p check for subprotocols that use RDMA read. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_msg.c | 3 ++- prov/efa/src/rdm/efa_rdm_rma.c | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index a19040c9b93..2e95a4a721f 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -88,7 +88,8 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx readbase_rtm = efa_rdm_peer_select_readbase_rtm(txe->peer, efa_rdm_ep, txe); - if (txe->total_len >= hmem_info[iface].min_read_msg_size && + if (use_p2p && + txe->total_len >= hmem_info[iface].min_read_msg_size && efa_rdm_interop_rdma_read(efa_rdm_ep, txe->peer) && (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(efa_rdm_ep)))) return readbase_rtm; diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 5ebecc3f97c..15769f5bc56 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -117,7 +117,8 @@ ssize_t efa_rdm_rma_post_efa_emulated_read(struct efa_rdm_ep *ep, struct efa_rdm ssize_t efa_rdm_rma_post_read(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) { bool use_device_read = false; - ssize_t ret; + int use_p2p; + ssize_t ret, err; /* * A handshake is required to choose the correct protocol (whether to use device read). @@ -127,7 +128,14 @@ ssize_t efa_rdm_rma_post_read(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) if (!(txe->peer->is_self) && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); - if (efa_rdm_interop_rdma_read(ep, txe->peer)) { + /* Check p2p support. Cannot use device read when p2p is not available. 
*/ + err = efa_rdm_ep_use_p2p(ep, txe->desc[0]); + if (err < 0) + return err; + + use_p2p = err; + + if (use_p2p && efa_rdm_interop_rdma_read(ep, txe->peer)) { /* RDMA read interoperability check also checks domain.use_device_rdma, * so we do not check it here */ @@ -137,11 +145,6 @@ ssize_t efa_rdm_rma_post_read(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) return -FI_EOPNOTSUPP; } - /* - * Not going to check efa_ep->hmem_p2p_opt here, if the remote side - * gave us a valid MR we should just honor the request even if p2p is - * disabled. - */ if (use_device_read) { ret = efa_rdm_ope_prepare_to_post_read(txe); if (ret) @@ -351,7 +354,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) { ssize_t err; bool delivery_complete_requested; - int ctrl_type, iface; + int ctrl_type, iface, use_p2p; size_t max_eager_rtw_data_size; /* @@ -388,9 +391,16 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) max_eager_rtw_data_size = efa_rdm_txe_max_req_data_capacity(ep, txe, EFA_RDM_EAGER_RTW_PKT); } + err = efa_rdm_ep_use_p2p(ep, txe->desc[0]); + if (err < 0) + return err; + + use_p2p = err; + iface = txe->desc[0] ? 
((struct efa_mr*) txe->desc[0])->peer.iface : FI_HMEM_SYSTEM; - if (txe->total_len >= efa_rdm_ep_domain(ep)->hmem_info[iface].min_read_write_size && + if (use_p2p && + txe->total_len >= efa_rdm_ep_domain(ep)->hmem_info[iface].min_read_write_size && efa_rdm_interop_rdma_read(ep, txe->peer) && (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(ep)))) { err = efa_rdm_ope_post_send(txe, EFA_RDM_LONGREAD_RTW_PKT); From 036e2dd556a9bf18dfcc30f6a405c0515a27deb8 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Fri, 6 Sep 2024 11:27:39 -0700 Subject: [PATCH 003/393] prov/shm: fix incorrect capability set SMR_DOMAIN_CAPS was recently added to include FI_PEER, FI_AV_USER_ID but a typo caused an smr_info data corruption Signed-off-by: Alexia Ingerson --- prov/shm/src/smr_attr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prov/shm/src/smr_attr.c b/prov/shm/src/smr_attr.c index 7b300f118bb..c1e987ec50d 100644 --- a/prov/shm/src/smr_attr.c +++ b/prov/shm/src/smr_attr.c @@ -149,8 +149,7 @@ struct fi_info smr_hmem_info = { }; struct fi_info smr_info = { - .caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_MULTI_RECV | FI_LOCAL_COMM, - SMR_DOMAIN_CAPS, + .caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_MULTI_RECV | SMR_DOMAIN_CAPS, .addr_format = FI_ADDR_STR, .tx_attr = &smr_tx_attr, .rx_attr = &smr_rx_attr, From 0a4bd475a6e6bb8baac4c74e7f9a91c1abdcce9a Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 10 Sep 2024 20:33:55 +0000 Subject: [PATCH 004/393] prov/efa: Adjust log level for shm disabling. Disabling shm via fi_setopt shouldn't be a warning. Move the logging to info level. 
Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 43241795ad2..c706d784a5d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1437,7 +1437,7 @@ static int efa_rdm_ep_set_cuda_api_permitted(struct efa_rdm_ep *ep, bool cuda_ap static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool shm_permitted) { if (!shm_permitted) { - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "FI_OPT_SHARED_MEMORY_PERMITTED set to false\n"); ep->shm_permitted = false; return FI_SUCCESS; From 32220d7dc4b2ee4fb7075c59c99b3ad877306828 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 6 Sep 2024 13:10:19 -0700 Subject: [PATCH 005/393] prov/efa: Sender switch to emulated long CTS write if p2p unavailable The emulated long-read write subprotocol uses RDMA read to emulate a write operation. When p2p is not available, sender should fall back to emulated long CTS write protocol. Also change the log level to info. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_ope.c | 4 ++-- prov/efa/src/rdm/efa_rdm_pke_nonreq.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 5002c938f45..a44f2debacc 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1781,7 +1781,7 @@ ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, switch (pkt_type) { case EFA_RDM_LONGREAD_MSGRTM_PKT: case EFA_RDM_RUNTREAD_MSGRTM_PKT: - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS untagged " "protocol because memory registration limit " "was reached on the sender\n"); @@ -1789,7 +1789,7 @@ ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, ope, EFA_RDM_LONGCTS_MSGRTM_PKT); case EFA_RDM_LONGREAD_TAGRTM_PKT: case EFA_RDM_RUNTREAD_TAGRTM_PKT: - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS tagged protocol " "because memory registration limit was " "reached on the sender\n"); diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index c990eacedc9..3c384743c77 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -700,14 +700,19 @@ void efa_rdm_pke_handle_read_nack_recv(struct efa_rdm_pke *pkt_entry) efa_rdm_pke_release_rx(pkt_entry); txe->internal_flags |= EFA_RDM_OPE_READ_NACK; - if (txe->op == ofi_op_tagged) { - EFA_WARN(FI_LOG_EP_CTRL, + if (txe->op == ofi_op_write) { + EFA_INFO(FI_LOG_EP_CTRL, + "Sender fallback to emulated long CTS write " + "protocol because p2p is not available\n"); + efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_RTW_PKT); + } else if (txe->op == ofi_op_tagged) { + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS tagged " "protocol because memory registration limit " "was reached on the receiver\n"); efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_TAGRTM_PKT); } else { - 
EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS untagged " "protocol because memory registration limit " "was reached on the receiver\n"); From deaa841d887a2a5231b20228c01a69450706c19c Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 6 Sep 2024 13:05:23 -0700 Subject: [PATCH 006/393] prov/efa: Receiver send NACK if p2p is unavailable RDMA read requires p2p. Use READ NACK protocol when p2p is not available. Also change the log level to info. Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_pke_rtm.c | 43 +++-------------- prov/efa/src/rdm/efa_rdm_pke_rtw.c | 4 +- prov/efa/src/rdm/efa_rdm_pke_utils.h | 72 ++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 39 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.c b/prov/efa/src/rdm/efa_rdm_pke_rtm.c index 4d8dc735e4b..a96494f02a4 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.c @@ -857,14 +857,12 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) struct efa_rdm_ep *ep; struct efa_rdm_ope *rxe; struct efa_rdm_pke *cur, *nxt; - struct efa_rdm_peer *peer; int pkt_type; ssize_t ret, err; uint64_t msg_id; ep = pkt_entry->ep; rxe = pkt_entry->ope; - peer = rxe->peer; pkt_type = efa_rdm_pke_get_base_hdr(pkt_entry)->type; ret = 0; @@ -883,20 +881,9 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) efa_rdm_tracepoint(runtread_read_posted, rxe->msg_id, (size_t) rxe->cq_entry.op_context, rxe->total_len); - err = efa_rdm_ope_post_remote_read_or_queue(rxe); - if (err) { - if (err == -FI_ENOMR) { - if (efa_rdm_peer_support_read_nack(peer)) - /* Only set the flag here. 
The NACK - * packet is sent after all runting read - * RTM packets have been received */ - rxe->internal_flags |= EFA_RDM_OPE_READ_NACK; - else - ret = -FI_EAGAIN; - } else { - return err; - } - } + err = efa_rdm_pke_post_remote_read_or_nack(ep, pkt_entry, rxe); + if (err) + return err; } } @@ -912,7 +899,7 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) if (efa_rdm_ope_mulreq_total_data_size(rxe, pkt_type) == rxe->bytes_received_via_mulreq) { if (rxe->internal_flags & EFA_RDM_OPE_READ_NACK) { - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Receiver sending long read NACK " "packet because memory registration " "limit was reached on the receiver\n"); @@ -1198,12 +1185,10 @@ ssize_t efa_rdm_pke_proc_matched_longread_rtm(struct efa_rdm_pke *pkt_entry) struct efa_rdm_longread_rtm_base_hdr *rtm_hdr; struct fi_rma_iov *read_iov; struct efa_rdm_ep *ep; - struct efa_rdm_peer *peer; int err; rxe = pkt_entry->ope; ep = rxe->ep; - peer = rxe->peer; rtm_hdr = efa_rdm_pke_get_longread_rtm_base_hdr(pkt_entry); read_iov = (struct fi_rma_iov *)(pkt_entry->wiredata + efa_rdm_pke_get_req_hdr_size(pkt_entry)); @@ -1216,24 +1201,8 @@ ssize_t efa_rdm_pke_proc_matched_longread_rtm(struct efa_rdm_pke *pkt_entry) efa_rdm_tracepoint(longread_read_posted, rxe->msg_id, (size_t) rxe->cq_entry.op_context, rxe->total_len); - err = efa_rdm_ope_post_remote_read_or_queue(rxe); - if (err == -FI_ENOMR) { - if (efa_rdm_peer_support_read_nack(peer)) { - EFA_WARN(FI_LOG_EP_CTRL, "Receiver sending long read " - "NACK packet because memory " - "registration limit was " - "reached on the receiver\n"); - efa_rdm_rxe_map_insert(&ep->rxe_map, pkt_entry, rxe); - rxe->internal_flags |= EFA_RDM_OPE_READ_NACK; - err = efa_rdm_ope_post_send_or_queue( - rxe, EFA_RDM_READ_NACK_PKT); - } else { - /* Peer does not support the READ_NACK packet. 
So we - * return EAGAIN and hope that the app runs progress - * again which will free some MR registrations */ - err = -FI_EAGAIN; - } - } + err = efa_rdm_pke_post_remote_read_or_nack(ep, pkt_entry, rxe); + efa_rdm_pke_release_rx(pkt_entry); return err; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index c7dc43f2490..2a1b6366d40 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -557,14 +557,14 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) memcpy(rxe->rma_iov, read_iov, rxe->rma_iov_count * sizeof(struct fi_rma_iov)); + err = efa_rdm_pke_post_remote_read_or_nack(rxe->ep, pkt_entry, rxe); + efa_rdm_pke_release_rx(pkt_entry); - err = efa_rdm_ope_post_remote_read_or_queue(rxe); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RDMA post read or queue failed.\n"); efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST); efa_rdm_rxe_release(rxe); - efa_rdm_pke_release_rx(pkt_entry); } } diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.h b/prov/efa/src/rdm/efa_rdm_pke_utils.h index 529749d9258..c7363965dc1 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.h +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.h @@ -94,6 +94,78 @@ efa_rdm_pke_copy_from_hmem_iov(struct efa_mr *iov_mr, struct efa_rdm_pke *pke, return copied; } +/** + * @brief This function either posts RDMA read, or sends a NACK packet when p2p + * is not available or memory registration limit was reached on the receiver. + * + * @param[in] ep endpoint + * @param[in] pkt_entry packet entry + * @param[in] rxe RX entry + * + * @return 0 on success, or a negative error code. 
+ */ +static inline int +efa_rdm_pke_post_remote_read_or_nack(struct efa_rdm_ep *ep, + struct efa_rdm_pke *pkt_entry, + struct efa_rdm_ope *rxe) +{ + int err = 0; + int pkt_type; + int p2p_avail; + + pkt_type = efa_rdm_pke_get_base_hdr(pkt_entry)->type; + err = efa_rdm_ep_use_p2p(ep, rxe->desc[0]); + if (err < 0) + return err; + + p2p_avail = err; + if (p2p_avail) { + err = efa_rdm_ope_post_remote_read_or_queue(rxe); + } else if (efa_rdm_peer_support_read_nack(rxe->peer)) { + EFA_INFO(FI_LOG_EP_CTRL, + "Receiver sending long read " + "NACK packet because P2P is not available, " + "unable to post RDMA read.\n"); + goto send_nack; + } else { + EFA_INFO(FI_LOG_EP_CTRL, "P2P is not available, " + "unable to post RDMA read.\n"); + return -FI_EOPNOTSUPP; + } + + if (err == -FI_ENOMR) { + if (efa_rdm_peer_support_read_nack(rxe->peer)) { + EFA_INFO(FI_LOG_EP_CTRL, "Receiver sending long read " + "NACK packet because memory " + "registration limit was " + "reached on the receiver.\n"); + goto send_nack; + } else { + /* Peer does not support the READ_NACK packet. So we + * return EAGAIN and hope that the app runs progress + * again which will free some MR registrations */ + return -FI_EAGAIN; + } + } + + return err; + +send_nack: + rxe->internal_flags |= EFA_RDM_OPE_READ_NACK; + /* Only set the flag for runting read. 
The NACK + * packet is sent after all runting read + * RTM packets have been received */ + if (efa_rdm_pkt_type_is_runtread(pkt_type)) { + return 0; + } + + if (efa_rdm_pkt_type_is_rtm(pkt_type)) { + efa_rdm_rxe_map_insert(&ep->rxe_map, pkt_entry, rxe); + } + + return efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_READ_NACK_PKT); +} + size_t efa_rdm_pke_get_payload_offset(struct efa_rdm_pke *pkt_entry); ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, From b791d2f7fa98e5adceb04e8a52f6be2c4edf0eda Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 9 Sep 2024 11:42:03 -0700 Subject: [PATCH 007/393] prov/efa: Update read nack protocol docs Signed-off-by: Jessie Yang --- prov/efa/docs/efa_rdm_protocol_v4.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 9f0b457a1bf..968087ca89b 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -1505,8 +1505,9 @@ in order to support CQ entry generation in case the sender uses ### 4.7 Long read and runting read nack protocol Long read and runting read protocols in Libfabric 1.20 and above use a nack protocol -when the receiver is unable to register a memory region for the RDMA read operation. -Failure to register the memory region is typically because of a hardware limitation. +when the receiver is unable to register a memory region for the RDMA read operation +or P2P support is unavailable for the RDMA read operation, typically because of a +hardware limitation. 
Table: 4.2 Format of the READ_NACK packet @@ -1521,12 +1522,14 @@ Table: 4.2 Format of the READ_NACK packet The nack protocols work as follows * Sender has decided to use the long read or runting read protocol -* The receiver receives the RTM packet(s) +* The receiver receives the RTM packet(s) or RTW packet - One LONGREAD_RTM packet in case of long read protocol - Multiple RUNTREAD_RTM packets in case of runting read protocol -* The receiver attempts to register a memory region for the RDMA operation but fails -* After all RTM packets have been processed, the receiver sends a READ_NACK packet to the sender -* The sender then switches to the long CTS protocol and sends a LONGCTS_RTM packet + - One LONGREAD_RTW packet in case of emulated long-read write protocol +* The receiver attempts to register a memory region for the RDMA operation but fails, +or P2P is unavailable for the RDMA operation +* After all RTM/RTW packets have been processed, the receiver sends a READ_NACK packet to the sender +* The sender then switches to the long CTS protocol and sends a LONGCTS_RTM/LONGCTS_RTW packet * The receiver sends a CTS packet and the data transfer continues as in the long CTS protocol The LONGCTS_RTM packet sent in the nack protocol does not contain any application data. 
From f58b7b3aaeea38730a3a7a68a5a645bdb4be113f Mon Sep 17 00:00:00 2001 From: Stephen Oost Date: Tue, 10 Sep 2024 09:57:25 -0700 Subject: [PATCH 008/393] prov/tcp: fix incorrect usage of av insert apis when multiplexing When multiplexing av inserts to sub avs, correctly track and check the subav fi_addrs Signed-off-by: Stephen Oost --- prov/tcp/src/xnet_av.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/prov/tcp/src/xnet_av.c b/prov/tcp/src/xnet_av.c index 7cf77604a58..14b82ccdafd 100644 --- a/prov/tcp/src/xnet_av.c +++ b/prov/tcp/src/xnet_av.c @@ -69,23 +69,27 @@ static int xnet_mplex_av_insert(struct fid_av *av_fid, const void *addr, size_t int ret; struct fid_list_entry *item; struct fid_av *subav_fid; - fi_addr_t sub_fi_addr; + fi_addr_t *sub_fi_addr; struct xnet_mplex_av *av = container_of(av_fid, struct xnet_mplex_av, - util_av.av_fid.fid); - + util_av.av_fid); + sub_fi_addr = calloc(count, sizeof(fi_addr_t)); + if (!sub_fi_addr) + return -FI_ENOMEM; ofi_genlock_lock(&av->lock); ret = ofi_ip_av_insert(&av->util_av.av_fid, addr, count, fi_addr, flags, context); if (ret < count) goto out; - dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { + dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { subav_fid = container_of(item->fid, struct fid_av, fid); - ret = fi_av_insert(subav_fid, addr, count, &sub_fi_addr, flags, context); + ret = fi_av_insert(subav_fid, addr, count, sub_fi_addr, flags, context); if (ret < count) break; - assert(*fi_addr == sub_fi_addr); + assert(!fi_addr || memcmp(fi_addr, sub_fi_addr, + sizeof(fi_addr_t) * count) == 0); } out: ofi_genlock_unlock(&av->lock); + free(sub_fi_addr); return ret; } @@ -97,26 +101,29 @@ static int xnet_mplex_av_insertsym(struct fid_av *av_fid, const char *node, int ret; struct fid_list_entry *item; struct fid_av *subav_fid; - fi_addr_t sub_fi_addr; + fi_addr_t *sub_fi_addr; struct xnet_mplex_av *av = 
container_of(av_fid, struct xnet_mplex_av, util_av.av_fid.fid); - + sub_fi_addr = calloc(nodecnt * svccnt, sizeof(fi_addr_t)); + if (!sub_fi_addr) + return -FI_ENOMEM; ofi_genlock_lock(&av->lock); ret = ofi_ip_av_insertsym(&av->util_av.av_fid, node, nodecnt, service, svccnt, fi_addr, flags, context); - if (ret) + if (ret < nodecnt * svccnt) goto out; dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { subav_fid = container_of(item->fid, struct fid_av, fid); ret = fi_av_insertsym(subav_fid, node, nodecnt, service, svccnt, - &sub_fi_addr, flags, context); - if (ret) + sub_fi_addr, flags, context); + if (ret <= nodecnt * svccnt) break; - assert(*fi_addr == sub_fi_addr); + assert(!fi_addr || memcmp(fi_addr, sub_fi_addr, + sizeof(fi_addr_t) * nodecnt * svccnt) == 0); } out: ofi_genlock_unlock(&av->lock); - + free(sub_fi_addr); return ret; } @@ -130,19 +137,18 @@ static int xnet_mplex_av_insertsvc(struct fid_av *av_fid, const char *node, fi_addr_t sub_fi_addr; struct xnet_mplex_av *av = container_of(av_fid, struct xnet_mplex_av, util_av.av_fid.fid); - ofi_genlock_lock(&av->lock); ret = ofi_ip_av_insertsvc(&av->util_av.av_fid, node, service, fi_addr, flags, context); - if (ret) + if (ret <= 0) goto out; dlist_foreach_container(&av->subav_list, struct fid_list_entry, item, entry) { subav_fid = container_of(item->fid, struct fid_av, fid); ret = fi_av_insertsvc(subav_fid, node, service, &sub_fi_addr, flags, context); - if (ret) + if (ret <= 0) break; - assert(*fi_addr == sub_fi_addr); + assert(!fi_addr || *fi_addr == sub_fi_addr); } out: ofi_genlock_unlock(&av->lock); From 01bb36cd5632c44e13a9faa2142c9ad68c7405b0 Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Mon, 29 Jul 2024 14:05:03 -0500 Subject: [PATCH 009/393] prov/opx: Remove function table entries for reliability types other than ONLOAD Signed-off-by: Ben Lynam --- prov/opx/configure.m4 | 2 +- .../include/rdma/opx/fi_opx_cq_ops_table.h | 4 +- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 31 
+- prov/opx/src/fi_opx_cq_ops_table_locking.c | 259 +------------- .../src/fi_opx_cq_ops_table_locking_8192.c | 223 ++---------- .../src/fi_opx_cq_ops_table_locking_runtime.c | 204 ++--------- .../opx/src/fi_opx_cq_ops_table_non_locking.c | 260 +------------- .../fi_opx_cq_ops_table_non_locking_8192.c | 204 ++--------- .../fi_opx_cq_ops_table_non_locking_runtime.c | 200 +---------- prov/opx/src/fi_opx_ep.c | 71 +--- prov/opx/src/fi_opx_msg.c | 245 +++---------- prov/opx/src/fi_opx_tagged.c | 322 +++++------------- 12 files changed, 285 insertions(+), 1740 deletions(-) diff --git a/prov/opx/configure.m4 b/prov/opx/configure.m4 index 952c7553420..b8cb174a1b6 100644 --- a/prov/opx/configure.m4 +++ b/prov/opx/configure.m4 @@ -90,7 +90,7 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ AS_CASE([x$OPX_RELIABILITY], [xnone], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_NONE], [xoffload], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_OFFLOAD], - dnl [xruntime], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_RUNTIME], + dnl [xruntime], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_ONLOAD], [OPX_RELIABILITY=OFI_RELIABILITY_KIND_ONLOAD]) AC_SUBST(opx_reliability, [$OPX_RELIABILITY]) diff --git a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h index ca55f96fb02..c070c60d7f4 100644 --- a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h +++ b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -46,7 +46,7 @@ /* Number of types in enum fi_cq_format */ #define FI_CQ_FORMAT_COUNT 5 -typedef struct fi_ops_cq op_matrix_t[FI_CQ_FORMAT_COUNT][OFI_RELIABILITY_KIND_COUNT][FI_OPX_COMMS_COUNT]; +typedef struct fi_ops_cq op_matrix_t[FI_CQ_FORMAT_COUNT][1 /* OFI_RELIABILITY_KIND_ONLOAD */][FI_OPX_COMMS_COUNT]; static ssize_t fi_opx_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, uint64_t flags) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index df436159f77..9cd3d7d0158 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -3105,39 +3105,22 @@ void fi_opx_ep_rx_poll (struct fid_ep *ep, { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const enum ofi_reliability_kind kind = opx_ep->reliability->state.kind; const uint64_t rx_caps = (caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) ? caps : opx_ep->rx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); - if ( OFI_LIKELY((reliability == OPX_RELIABILITY) && (hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME)) ) { /* constant compile-time expression */ + if (OFI_LIKELY(hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME)) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); - } else if ( (reliability == OFI_RELIABILITY_KIND_RUNTIME) && (hdrq_mask == FI_OPX_HDRQ_MASK_2048) ) { /* constant compile-time expression */ - if (kind == OFI_RELIABILITY_KIND_ONLOAD) { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048); - } else { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048); - } - } else if ( (reliability == OFI_RELIABILITY_KIND_RUNTIME) && (hdrq_mask == FI_OPX_HDRQ_MASK_8192) ) { /* constant compile-time expression */ - if (kind == 
OFI_RELIABILITY_KIND_ONLOAD) { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192); - } else { - FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192); - } - } else if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME); + } else if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - reliability, FI_OPX_HDRQ_MASK_2048); - } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { /* constant compile-time expression */ + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048); + } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - reliability, FI_OPX_HDRQ_MASK_8192); + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192); } else { FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - reliability, hdrq_mask); + OFI_RELIABILITY_KIND_ONLOAD, hdrq_mask); } fi_opx_ep_do_pending_work(opx_ep); diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking.c b/prov/opx/src/fi_opx_cq_ops_table_locking.c index 4efedeff078..f7b59b0f54f 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,291 +35,68 @@ /* HDRQ_MASK = 2k value (2047 * 0x20) */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, 
FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) static struct fi_ops_cq fi_opx_cq_locking_2048_ops_table[] = { // Format: FI_CQ_FORMAT_UNSPEC - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - // Format: FI_CQ_FORMAT_CONTEXT - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - // Format: FI_CQ_FORMAT_MSG - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - //Format: FI_CQ_FORMAT_DATA - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - // Format: FI_CQ_FORMAT_TAGGED - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - }; static op_matrix_t *fi_opx_cq_locking_2048_ops = (op_matrix_t *)&fi_opx_cq_locking_2048_ops_table; @@ -328,5 +105,5 @@ struct fi_ops_cq * fi_opx_cq_select_locking_2048_ops(const enum fi_cq_format for const enum ofi_reliability_kind reliability, const uint64_t comm_caps) { - 
return &(*fi_opx_cq_locking_2048_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_locking_2048_ops)[format][0][comm_caps]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c b/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c index 88b94f433a0..198f93a8616 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,254 +35,71 @@ /* HDRQ_MASK = 8k value (8191 * 0x20) */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_NONE) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, 
OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) static struct fi_ops_cq fi_opx_cq_locking_8192_ops_table[] = { - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - }; static op_matrix_t 
*fi_opx_cq_locking_8192_ops = (op_matrix_t *)&fi_opx_cq_locking_8192_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_8192_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps) { - return &(*fi_opx_cq_locking_8192_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_locking_8192_ops)[format][0][comm_caps]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c b/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c index 94bcd635c3f..16d2b67fb2d 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -34,234 +34,70 @@ /* HDRQ_MASK = runtime value (not 2047 or 8191, won't be optimal) */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, 
OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, 
OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) static struct fi_ops_cq fi_opx_cq_locking_runtime_ops_table[] = { - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - }; static op_matrix_t *fi_opx_cq_locking_runtime_ops = (op_matrix_t *)&fi_opx_cq_locking_runtime_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_runtime_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps) { - return &(*fi_opx_cq_locking_runtime_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_locking_runtime_ops)[format][0][comm_caps]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking.c index 6fcd46a0057..50b4c6f03e5 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -34,293 +34,69 @@ /* HDRQ_MASK = 2k value (2047 * 0x20) */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) 
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, 
FI_OPX_COMMS_LOCAL_REMOTE) static struct fi_ops_cq fi_opx_cq_non_locking_2048_ops_table[] = { // Format: FI_CQ_FORMAT_UNSPEC - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - // Format: FI_CQ_FORMAT_CONTEXT - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - // Format: FI_CQ_FORMAT_MSG - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - //Format: FI_CQ_FORMAT_DATA - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - // Format: FI_CQ_FORMAT_TAGGED - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - }; @@ -330,5 +106,5 @@ struct fi_ops_cq * fi_opx_cq_select_non_locking_2048_ops(const enum fi_cq_format const enum ofi_reliability_kind reliability, const uint64_t comm_caps) { - return &(*fi_opx_cq_non_locking_2048_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_non_locking_2048_ops)[format][0][comm_caps]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c index 8b04675c2c7..7b1ad22c6ac 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -34,233 +34,69 @@ /* HDRQ_MASK = 8k value (8191 * 0x20) */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) 
- FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_LOCAL_REMOTE) static struct fi_ops_cq fi_opx_cq_non_locking_8192_ops_table[] = { - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_non_locking_8192_ops = (op_matrix_t *)&fi_opx_cq_non_locking_8192_ops_table; struct fi_ops_cq * fi_opx_cq_select_non_locking_8192_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps) { - return &(*fi_opx_cq_non_locking_8192_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_non_locking_8192_ops)[format][0][comm_caps]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c index 681ee867953..5ca74f424b9 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,227 +35,63 @@ /* HDRQ_MASK = runtime value (not 2047 or 8191, won't be optimal) */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - -/* ---- 
OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, 
OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ -/* ---- OFI_RELIABILITY_KIND_NONE */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ----- OFI_RELIABILITY_KIND_OFFLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - -/* ---- OFI_RELIABILITY_KIND_RUNTIME */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) static struct fi_ops_cq fi_opx_cq_non_locking_runtime_ops_table[] = { - - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_NONE, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_OFFLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_RUNTIME, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), }; ssize_t @@ -289,5 +125,5 @@ struct fi_ops_cq * fi_opx_cq_select_non_locking_runtime_ops(const enum fi_cq_for const enum ofi_reliability_kind reliability, const uint64_t comm_caps) { - return &(*fi_opx_cq_non_locking_runtime_ops)[format][reliability][comm_caps]; + return &(*fi_opx_cq_non_locking_runtime_ops)[format][0][comm_caps]; } diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 08c5895ac60..d2890e1b42a 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -2698,66 +2698,31 @@ void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, const uint8_t opcode = hdr->stl.bth.opcode; - struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const enum ofi_reliability_kind reliability_kind = opx_ep->reliability->state.kind; - /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ const size_t payload_bytes = total_bytes - sizeof(union fi_opx_hfi1_packet_hdr); if (OFI_LIKELY(opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { - - if (reliability_kind == OFI_RELIABILITY_KIND_OFFLOAD) { - - fi_opx_ep_rx_process_header(ep, 
hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_TAGGED, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_TAGGED, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_ONLOAD); - } + fi_opx_ep_rx_process_header(ep, hdr, + (const union fi_opx_hfi1_packet_payload * const) payload, + payload_bytes, + FI_TAGGED, + opcode, + origin_rs, + OPX_INTRANODE_FALSE, + FI_OPX_LOCK_NOT_REQUIRED, + OFI_RELIABILITY_KIND_ONLOAD); } else { - - if (reliability_kind == OFI_RELIABILITY_KIND_OFFLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_MSG, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD) { - - fi_opx_ep_rx_process_header(ep, hdr, - (const union fi_opx_hfi1_packet_payload * const) payload, - payload_bytes, - FI_MSG, - opcode, - origin_rs, - OPX_INTRANODE_FALSE, - FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_ONLOAD); - } + fi_opx_ep_rx_process_header(ep, hdr, + (const union fi_opx_hfi1_packet_payload * const) payload, + payload_bytes, + FI_MSG, + opcode, + origin_rs, + OPX_INTRANODE_FALSE, + FI_OPX_LOCK_NOT_REQUIRED, + OFI_RELIABILITY_KIND_ONLOAD); } } diff --git a/prov/opx/src/fi_opx_msg.c b/prov/opx/src/fi_opx_msg.c index 3674366c32a..eafba58397b 100644 --- a/prov/opx/src/fi_opx_msg.c +++ b/prov/opx/src/fi_opx_msg.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -122,66 +122,29 @@ ssize_t fi_opx_senddata(struct fid_ep *ep, const void *buf, size_t len, void *de } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 
0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) 
-FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) /* 
FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) @@ -207,66 +170,28 @@ static struct fi_ops_msg \ } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); 
-FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); 
-FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); 
-FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); @@ -336,108 +261,50 @@ int fi_opx_enable_msg_ops(struct fid_ep *ep) const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); const enum ofi_reliability_kind reliability = opx_ep->reliability->state.kind; + if (OFI_UNLIKELY(reliability != OFI_RELIABILITY_KIND_ONLOAD)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Invalid reliability kind %u\n", reliability); + return -FI_EINVAL; + } if (!lock_required) { if (opx_ep->av->type == FI_AV_TABLE) { if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = 
&FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } - - } else if (opx_ep->av->type == FI_AV_MAP) { - + } else if (opx_ep->av->type == FI_AV_MAP) { if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else if (comm_caps == FI_REMOTE_COMM) { - 
- if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } - } else { + } else { /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { if (opx_ep->av->type == FI_AV_TABLE) { - if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } } else if (opx_ep->av->type == FI_AV_MAP) { - if (comm_caps == FI_LOCAL_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - 
opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else if (comm_caps == FI_REMOTE_COMM) { - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); } - } else { + } else { /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } diff --git a/prov/opx/src/fi_opx_tagged.c b/prov/opx/src/fi_opx_tagged.c index cb8a4b95c18..8159bacfbbf 100644 --- a/prov/opx/src/fi_opx_tagged.c +++ b/prov/opx/src/fi_opx_tagged.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -294,74 +294,29 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, } +/* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) 
-FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 
0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE) - 
-FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD) - -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - +/* FI_LOCAL_COMM = 0x0008000000000000ull */ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) 
+FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +/* FI_REMOTE_COMM = 0x0010000000000000ull */ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) @@ -371,89 +326,44 @@ FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x001000000000 #define FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY) \ fi_opx_ops_tagged_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY -#define FI_OPX_TAGGED_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY) \ +#define FI_OPX_TAGGED_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY) \ static struct fi_ops_tagged \ FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) __attribute__ ((unused)) = { \ .size = sizeof(struct fi_ops_tagged), \ - .recv = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, LOCK, AV, CAPS, RELIABILITY), \ + .recv = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, LOCK, AV, CAPS, RELIABILITY), \ .recvv = fi_no_tagged_recvv, \ .recvmsg = fi_opx_trecvmsg, \ - .send = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, LOCK, AV, CAPS, RELIABILITY), \ + .send = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, LOCK, AV, CAPS, RELIABILITY), \ .sendv = fi_no_tagged_sendv, \ .sendmsg = fi_opx_tsendmsg, \ 
.inject = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, LOCK, AV, CAPS, RELIABILITY), \ .senddata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, LOCK, AV, CAPS, RELIABILITY), \ - .injectdata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, LOCK, AV, CAPS, RELIABILITY), \ + .injectdata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, LOCK, AV, CAPS, RELIABILITY),\ } -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_NONE); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); 
-FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_NONE); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); 
-FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_NONE); - -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_OFFLOAD); - 
-FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED,FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - - - +/* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); + +/* FI_LOCAL_COMM = 0x0008000000000000ull */ +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); 
+FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); + +/* FI_REMOTE_COMM = 0x0010000000000000ull */ +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); ssize_t fi_opx_tsearch(struct fid_ep *ep, uint64_t *tag, @@ -499,8 +409,8 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if (!opx_ep || !opx_ep->domain) - goto err; + if (!opx_ep || !opx_ep->domain) + goto err; if (!(opx_ep->tx->caps & FI_TAGGED) || !(opx_ep->rx->caps & FI_TAGGED)) { /* Tagged ops not enabled on this endpoint */ @@ -517,11 +427,17 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) const enum fi_threading threading = opx_ep->domain->threading; if (OFI_UNLIKELY(fi_opx_threading_unknown(threading))) { - opx_ep->ep_fid.tagged = &fi_opx_no_tagged_ops; - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Unknown thread mode, tagged ops not enabled on EP\n"); + opx_ep->ep_fid.tagged = &fi_opx_no_tagged_ops; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Unknown thread mode, tagged ops not enabled on EP\n"); return 0; - } + } + + if (OFI_UNLIKELY(opx_ep->reliability->state.kind != OFI_RELIABILITY_KIND_ONLOAD)) { + 
FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Invalid reliability kind %u\n", opx_ep->reliability->state.kind); + return -FI_EINVAL; + } uint64_t comm_caps = opx_ep->rx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); if (comm_caps == 0) @@ -531,105 +447,41 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) if (!lock_required) { if (opx_ep->av->type == FI_AV_TABLE) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } 
else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); } - } else if (opx_ep->av->type == FI_AV_MAP) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = 
&FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); } - } else { + } else { /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { if (opx_ep->av->type == FI_AV_TABLE) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = 
&FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); } - } else if (opx_ep->av->type == FI_AV_MAP) { - - if (comm_caps == 0x0008000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else if (comm_caps == 0x0010000000000000ull) { - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - - } else { /* 0x0018000000000000ull */ - - if (opx_ep->reliability->state.kind == OFI_RELIABILITY_KIND_ONLOAD) - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - else - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_OFFLOAD); - + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + 
} else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); } } else { /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ From f27a33ae0ba7ba8ac15e2e665a7a6c10154337ba Mon Sep 17 00:00:00 2001 From: Mike Wilkins Date: Wed, 31 Jul 2024 08:36:56 -0500 Subject: [PATCH 010/393] prov/opx: Limit the number of reliability pings on credit-constrained flows Signed-off-by: Mike Wilkins --- .../opx/include/rdma/opx/fi_opx_reliability.h | 18 ++- prov/opx/src/fi_opx_init.c | 2 + prov/opx/src/fi_opx_reliability.c | 144 ++++++++++++++---- 3 files changed, 136 insertions(+), 28 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_reliability.h b/prov/opx/include/rdma/opx/fi_opx_reliability.h index 065e9dc28ce..d6bd81b65ef 100644 --- a/prov/opx/include/rdma/opx/fi_opx_reliability.h +++ b/prov/opx/include/rdma/opx/fi_opx_reliability.h @@ -118,6 +118,12 @@ union fi_opx_reliability_deferred_work { struct fi_opx_reliability_tx_pio_replay_params pio_replay; }; +#define OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN (1) +#define OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX (65535) +#define OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT (128) +#define OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN (1) +#define OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX (65535) +#define OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT (4) struct fi_opx_reliability_service { struct fi_opx_atomic_fifo fifo; /* 27 qws = 216 bytes */ @@ -134,7 +140,10 @@ struct fi_opx_reliability_service { /* == CACHE LINE == */ RbtHandle flow; /* 1 qw = 8 bytes */ uint64_t ping_start_key; - uint64_t unused; + uint16_t max_uncongested_pings; + uint16_t max_congested_pings; + uint8_t congested_flag; + uint8_t unused_padding2[3]; struct { uint64_t unused_cacheline_1; @@ -360,6 +369,13 @@ RbtIterator 
fi_opx_rbt_begin(RbtHandle h) { return i != &rbt->sentinel ? i : NULL; } +__OPX_FORCE_INLINE__ +void fi_opx_rbt_key(RbtIterator it, uint64_t *key) { + NodeType *i = it; + + *key = (uint64_t) i->key; +} + __OPX_FORCE_INLINE__ void fi_opx_rbt_key_value(RbtHandle h, RbtIterator it, void **key, void **val) { NodeType *i = it; diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 8e005c83a4b..21b27428442 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -707,6 +707,8 @@ OPX_INI OPX_DEFAULT_JOB_KEY_STR); fi_param_define(&fi_opx_provider, "force_cpuaffinity", FI_PARAM_BOOL, "Causes the thread to bind itself to the cpu core it is running on. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "reliability_service_usec_max", FI_PARAM_INT, "The number of microseconds between pings for un-acknowledged packets. Defaults to 500 usec."); + fi_param_define(&fi_opx_provider, "reliability_max_uncongested_pings", FI_PARAM_INT, "The maximum number of reliability pings sent in a single timer iteration when the network link is uncongested. Value must be between %d and %d. Defaults to %d.", OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT); + fi_param_define(&fi_opx_provider, "reliability_max_congested_pings", FI_PARAM_INT, "The maximum number of reliability pings sent in a single timer iteration when the network link is congested. Value must be between %d and %d. Defaults to %d.", OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT); fi_param_define(&fi_opx_provider, "reliability_service_pre_ack_rate", FI_PARAM_INT, "The number of packets to receive from a particular sender before preemptively acknowledging them without waiting for a ping. Valid values are powers of 2 in the range of 0-32,768, where 0 indicates no preemptive acking. 
Defaults to 64."); fi_param_define(&fi_opx_provider, "selinux", FI_PARAM_BOOL, "Set to true if you're running a security-enhanced Linux. This enables updating the Jkey used based on system settings. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "hfi_select", FI_PARAM_STRING, "Overrides the normal algorithm used to choose which HFI a process will use. See the documentation for more information."); diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index cf3e538d3ca..3009876ce7b 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -1843,10 +1843,16 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RELI_RX_NACK"); } +enum opx_reliability_ping_result { + OPX_RELIABILITY_PING_NO_REPLAYS = -1, + OPX_RELIABILITY_PING_NO_CREDITS, //NO_CREDITS = 0 to make the if statements in ping_remote clean + OPX_RELIABILITY_PING_SENT +}; + __OPX_FORCE_INLINE__ -uint64_t fi_opx_reliability_send_ping(struct fid_ep *ep, +ssize_t fi_opx_reliability_send_ping(struct fid_ep *ep, struct fi_opx_reliability_service * service, - RbtIterator itr) + RbtIterator itr, uint64_t key_value) { OPX_TRACER_TRACE_RELI(OPX_TRACER_BEGIN, "RELI_SEND_PING"); struct fi_opx_reliability_tx_replay ** value_ptr = @@ -1856,16 +1862,9 @@ uint64_t fi_opx_reliability_send_ping(struct fid_ep *ep, if (OFI_UNLIKELY(head == NULL)) { OPX_TRACER_TRACE_RELI(OPX_TRACER_END_ERROR, "RELI_SEND_PING"); - return 0; + return OPX_RELIABILITY_PING_NO_REPLAYS; } - const union fi_opx_reliability_service_flow_key key = { - .slid = (uint32_t)head->scb.hdr.stl.lrh.slid, - .tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(&head->scb.hdr), - .dlid = (uint32_t)head->scb.hdr.stl.lrh.dlid, - .rx = (uint32_t)head->scb.hdr.stl.bth.rx, - }; - const uint64_t dlid = (uint64_t)head->scb.hdr.stl.lrh.dlid; const uint64_t rx = (uint64_t)head->target_reliability_rx; @@ -1880,26 +1879,34 @@ uint64_t 
fi_opx_reliability_send_ping(struct fid_ep *ep, // Send one ping to cover the entire replay range. ssize_t rc = fi_opx_hfi1_tx_reliability_inject(ep, - key.value, dlid, rx, + key_value, dlid, rx, psn_start, psn_count, FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING); - INC_PING_STAT_COND(rc == FI_SUCCESS, PINGS_SENT, key.value, psn_start, psn_count); + INC_PING_STAT_COND(rc == FI_SUCCESS, PINGS_SENT, key_value, psn_start, psn_count); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_SUCCESS, "RELI_SEND_PING"); - return (rc == FI_SUCCESS) ? 0 : key.value; + + if(rc){ + return OPX_RELIABILITY_PING_NO_CREDITS; + } + + return OPX_RELIABILITY_PING_SENT; } void fi_reliability_service_ping_remote (struct fid_ep *ep, struct fi_opx_reliability_service * service) { - /* for each flow in the rbtree ... */ RbtIterator start_key_itr; RbtIterator itr; - uint64_t fail_key = 0; + uint64_t key_value = 0; + ssize_t rc = OPX_RELIABILITY_PING_SENT; + uint16_t num_pings = 0; + uint16_t max_pings = service->tx.congested_flag ? service->tx.max_congested_pings : service->tx.max_uncongested_pings; + uint64_t start_key = service->tx.ping_start_key; if (start_key) { itr = fi_opx_rbt_find(service->tx.flow, (void*)start_key); @@ -1910,41 +1917,82 @@ void fi_reliability_service_ping_remote (struct fid_ep *ep, } /* Loop until we hit the end of the tree, or we fail on a particular ping */ - while (itr && !fail_key) { - - fail_key = fi_opx_reliability_send_ping(ep, service, itr); + while (itr && rc && num_pings < max_pings) { + fi_opx_rbt_key(itr, &key_value); + rc = fi_opx_reliability_send_ping(ep, service, itr, key_value); + /* advance to the next dlid */ - itr = rbtNext(service->tx.flow, itr); + itr = rbtNext(service->tx.flow, itr); + + if(rc == OPX_RELIABILITY_PING_SENT) { + ++num_pings; + } } - /* We failed on a particular ping. Store the failing key to be the first to try next time, and stop */ - if (fail_key) { - service->tx.ping_start_key = fail_key; + /* We ran out of credits on a particular ping. 
+ * Store the failing key to be the first to try next time, + * set the congested flag to limit future pings, and stop */ + if (!rc) { + service->tx.congested_flag = 1; + service->tx.ping_start_key = key_value; return; } + // We sent the max number of pings this round, save the next key and stop + if (num_pings == max_pings) { + if (itr) { + fi_opx_rbt_key(itr, &key_value); + service->tx.ping_start_key = key_value; + return; + } + service->tx.ping_start_key = 0; + return; + } + /* We hit the end of the tree. If there was no starting key, we've iterated through the whole tree and we're done. */ if (!start_key) { + // Unset the congested flag + service->tx.congested_flag = 0; return; } /* Wrap back around from the beginning of the tree and iterate until we've hit the starting key */ itr = rbtBegin(service->tx.flow); - while (itr && itr != start_key_itr && !fail_key) { + while (itr && itr != start_key_itr && rc && num_pings < max_pings) { + fi_opx_rbt_key(itr, &key_value); - fail_key = fi_opx_reliability_send_ping(ep, service, itr); + rc = fi_opx_reliability_send_ping(ep, service, itr, key_value); /* advance to the next dlid */ itr = rbtNext(service->tx.flow, itr); + + if(rc == OPX_RELIABILITY_PING_SENT) { + ++num_pings; + } } - if (fail_key) { - service->tx.ping_start_key = fail_key; - } else { + if (!rc) { + service->tx.congested_flag = 1; + service->tx.ping_start_key = key_value; + return; + } + + if (num_pings == max_pings) { + if(itr){ + fi_opx_rbt_key(itr, &key_value); + service->tx.ping_start_key = key_value; + return; + } service->tx.ping_start_key = 0; + return; } + + service->tx.ping_start_key = 0; + + // We iterated through the whole tree, unset the congested flag + service->tx.congested_flag = 0; } void fi_opx_reliability_service_process_pending (struct fi_opx_reliability_service * service) @@ -2354,6 +2402,48 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser service->usec_next = 
fi_opx_timer_next_event_usec(&service->tx.timer, &service->tx.timestamp, service->usec_max); + /* + * Initialize send ping flag(s) + * + * ONLOAD only + */ + service->tx.congested_flag = 0; + + /* + * Maximum number of reliability pings per timer in congested/uncongested scenarios + * + * OFFLOAD and ONLOAD + */ + int max_uncongested_pings; + if(fi_param_get_int(fi_opx_global.prov, "reliability_max_uncongested_pings", &max_uncongested_pings) == FI_SUCCESS) { + if (max_uncongested_pings < OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN || max_uncongested_pings > OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX) { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS has value %d which is outside the valid range of %d-%d. Using default rate of %d\n", max_uncongested_pings, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT); + max_uncongested_pings = OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT; + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Using environment-specified FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS of %d\n", max_uncongested_pings); + } + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS not specified, using default value of %d\n", OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT); + max_uncongested_pings = OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT; + } + service->tx.max_uncongested_pings = max_uncongested_pings; + + int max_congested_pings; + if(fi_param_get_int(fi_opx_global.prov, "reliability_max_congested_pings", &max_congested_pings) == FI_SUCCESS) { + if (max_congested_pings < OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN || max_congested_pings > OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX) { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS has value %d which is outside the valid range of %d-%d. 
Using default rate of %d\n", max_congested_pings, OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN, OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX, OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT); + max_congested_pings = OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT; + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Using environment-specified FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS of %d\n", max_congested_pings); + } + } else { + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS not specified, using default value of %d\n", OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT); + max_congested_pings = OPX_RELIABILITY_MAX_CONGESTED_PINGS_DEFAULT; + } + service->tx.max_congested_pings = max_congested_pings; + /* * Maximum number of commands to process from atomic fifo before * stopping to do something else From d870b8a7a4f06b9c567b1e12a97f201b54699a0a Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Thu, 1 Aug 2024 10:41:27 -0500 Subject: [PATCH 011/393] prov/opx: Don't try to get HMEM iface for NULL pointers When doing an inject, ensure the user buffer is not NULL and the buffer length is not 0 before attempting to get the HMEM iface for the buffer. 
Signed-off-by: Ben Lynam --- .../include/rdma/opx/fi_opx_hfi1_transport.h | 42 ++++++++++--------- prov/opx/include/rdma/opx/fi_opx_hmem.h | 4 +- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 6dd6aea096f..46d607322f7 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -691,16 +691,18 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, if (!hdr) return rc; #ifdef OPX_HMEM - uint64_t hmem_device; - enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); - - if (iface != FI_HMEM_SYSTEM) { - opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); - buf = opx_ep->hmem_copy_buf; - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode - .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.inject); + if (buf && len) { + uint64_t hmem_device; + enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); + + if (iface != FI_HMEM_SYSTEM) { + opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.inject); + } } #endif hdr->qw[0] = opx_ep->tx->inject.hdr.qw[0] | lrh_dlid; @@ -759,16 +761,18 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } #ifdef OPX_HMEM - uint64_t hmem_device; - enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); + if (buf && len) { + uint64_t hmem_device; + enum fi_hmem_iface iface = fi_opx_hmem_get_iface(buf, NULL, &hmem_device); - if (iface != FI_HMEM_SYSTEM) { - opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); - buf = opx_ep->hmem_copy_buf; - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi - .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.inject); + if (iface != FI_HMEM_SYSTEM) { + opx_copy_from_hmem(iface, hmem_device, OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.inject); + } } #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hmem.h b/prov/opx/include/rdma/opx/fi_opx_hmem.h index 677dc509a08..6fa0db63a2b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hmem.h +++ b/prov/opx/include/rdma/opx/fi_opx_hmem.h @@ -65,6 +65,8 @@ enum fi_hmem_iface fi_opx_hmem_get_iface(const void *ptr, const struct fi_opx_mr *desc, uint64_t *device) { + assert(ptr != NULL); + #ifdef OPX_HMEM if (desc) { switch (desc->attr.iface) { @@ -226,7 +228,7 @@ int opx_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device, uint64_t hmem_ } break; #endif - + default: OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "OFI-COPY-FROM-HMEM"); ret = ofi_copy_from_hmem(iface, device, dest, src, len); From 8b915a9982751e17668c1b1badf882dfe367d268 Mon Sep 17 00:00:00 2001 From: Mike Wilkins Date: Fri, 2 Aug 2024 13:47:46 -0500 Subject: [PATCH 012/393] prov/opx: Create GPU-specific SDMA/RZV thresholds Signed-off-by: Mike Wilkins --- prov/opx/include/rdma/opx/fi_opx_hfi1.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index b59ee0054b4..b27a8bb4285 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -91,9 +91,16 @@ #define OPX_MP_EGR_DISABLE_NOT_SET (0) #define OPX_MP_EGR_DISABLE_DEFAULT (OPX_MP_EGR_DISABLE_NOT_SET) -#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) /* Default for payload threshold size for RZV */ -#define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ +/* Default for payload threshold size for RZV */ +#if HAVE_CUDA +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (4096) +#elif HAVE_ROCR +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (256) +#else +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) +#endif #define OPX_RZV_MIN_PAYLOAD_BYTES_MIN 
(FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min value */ +#define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ /* The total size for a single packet used in a multi-packet eager send. This is packet payload plus 64 bytes for the PBC and packet header. @@ -183,11 +190,19 @@ static_assert(OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX >= OPX_MP_EGR_MAX_PAYLOAD_BYTES_D #define FI_OPX_HFI1_SDMA_MAX_COMP_INDEX (128) // This should what opx_ep->hfi->info.sdma.queue_size is set to. +/* Default for payload threshold size for SDMA */ #ifndef FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT +#if HAVE_CUDA +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (4096) +#elif HAVE_ROCR +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (256) +#else #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385) #endif -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1) +#endif +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min Value */ +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1) /* Max Value */ + static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n"); From d516dfb5c989dd37c13c0e34f39b864082a5a7f5 Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Tue, 6 Aug 2024 09:06:57 -0500 Subject: [PATCH 013/393] prov/opx: Fix wrong function used when copying from HMEM/rocr. 
Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_hmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hmem.h b/prov/opx/include/rdma/opx/fi_opx_hmem.h index 6fa0db63a2b..8252a907b78 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hmem.h +++ b/prov/opx/include/rdma/opx/fi_opx_hmem.h @@ -223,7 +223,7 @@ int opx_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device, uint64_t hmem_ } else { /* Perform standard rocr_memcopy*/ OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "AMD-ROCR-MEMCOPY-FROM-HMEM"); - ret = rocr_copy_to_dev(device, dest, src, len); + ret = rocr_copy_from_dev(device, dest, src, len); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "AMD-ROCR-MEMCOPY-FROM-HMEM"); } break; From 4c775de3596108c0e2f5dce4fd2c6c0c963cf219 Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Mon, 5 Aug 2024 09:44:42 -0400 Subject: [PATCH 014/393] github/actions: Add Cornelis Networks internal CI Cornelis Networks maintains a CI system to support libfabric development internally. This workflow file enables its triggering. The workflow will not run on changes proposed to ofiwg/libfabric, only cornelisnetworks/libfabric-internal (a private repo). 
Signed-off-by: Jack Morrison --- .github/workflows/cn.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/cn.yml diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml new file mode 100644 index 00000000000..f28b00ca3f8 --- /dev/null +++ b/.github/workflows/cn.yml @@ -0,0 +1,21 @@ +name: 'Cornelis' + +on: + workflow_dispatch: + pull_request: + types: + - labeled + - opened + - reopened + - synchronize + branches: + - main + paths-ignore: + - 'man/**' + - 'docs/**' + +jobs: + opx-ci: + name: OPX CI + if: github.repository == 'cornelisnetworks/libfabric-internal' + uses: cornelisnetworks/libfabric-devel/.github/workflows/cn.yml@master From 19817f721b3344099c07008c6ad0b344452d230d Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Thu, 8 Aug 2024 19:00:48 -0500 Subject: [PATCH 015/393] prov/opx: Initial 16B header support CN5000 16B header support. Signed-off-by: Bob Cernohous Signed-off-by: Archana Venkatesha Co-authored-by: Archana Venkatesha --- man/fi_opx.7.md | 4 + prov/opx/include/opa_byteorder.h | 65 +- prov/opx/include/rdma/opx/fi_opx.h | 50 +- prov/opx/include/rdma/opx/fi_opx_addr.h | 10 +- prov/opx/include/rdma/opx/fi_opx_atomic.h | 41 +- .../include/rdma/opx/fi_opx_cq_ops_table.h | 20 +- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 1576 +++++----- prov/opx/include/rdma/opx/fi_opx_eq.h | 95 +- .../rdma/opx/fi_opx_fabric_transport.h | 10 +- .../include/rdma/opx/fi_opx_flight_recorder.h | 11 +- prov/opx/include/rdma/opx/fi_opx_hfi1.h | 164 +- .../include/rdma/opx/fi_opx_hfi1_inlines.h | 152 +- prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h | 144 +- .../opx/include/rdma/opx/fi_opx_hfi1_packet.h | 915 +++++- .../include/rdma/opx/fi_opx_hfi1_progress.h | 290 +- prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h | 31 +- .../include/rdma/opx/fi_opx_hfi1_transport.h | 2541 ++++++++++++++--- .../include/rdma/opx/fi_opx_hfi1_version.h | 227 +- prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h | 29 +- 
.../opx/include/rdma/opx/fi_opx_reliability.h | 173 +- prov/opx/include/rdma/opx/fi_opx_rma.h | 37 +- prov/opx/include/rdma/opx/fi_opx_rma_ops.h | 44 +- prov/opx/include/rdma/opx/fi_opx_tagged.h | 44 +- prov/opx/include/rdma/opx/opx_hfi1_sim.h | 17 +- prov/opx/src/fi_opx_atomic.c | 544 +++- prov/opx/src/fi_opx_cntr.c | 12 +- prov/opx/src/fi_opx_cq.c | 69 +- prov/opx/src/fi_opx_cq_ops_table_locking.c | 230 +- .../src/fi_opx_cq_ops_table_locking_8192.c | 222 +- .../src/fi_opx_cq_ops_table_locking_runtime.c | 222 +- .../opx/src/fi_opx_cq_ops_table_non_locking.c | 229 +- .../fi_opx_cq_ops_table_non_locking_8192.c | 223 +- .../fi_opx_cq_ops_table_non_locking_runtime.c | 303 +- prov/opx/src/fi_opx_ep.c | 760 +++-- prov/opx/src/fi_opx_hfi1.c | 1185 ++++++-- prov/opx/src/fi_opx_hfi1_jkr.c | 123 +- prov/opx/src/fi_opx_hfi1_sdma.c | 4 +- prov/opx/src/fi_opx_hfi1_wfr.c | 25 +- prov/opx/src/fi_opx_init.c | 9 +- prov/opx/src/fi_opx_msg.c | 365 ++- prov/opx/src/fi_opx_reliability.c | 809 ++++-- prov/opx/src/fi_opx_rma.c | 448 ++- prov/opx/src/fi_opx_tagged.c | 566 +++- prov/opx/src/opa_proto.c | 12 +- 44 files changed, 9920 insertions(+), 3130 deletions(-) diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md index 4aa6f60a482..90b63dc5eb9 100644 --- a/man/fi_opx.7.md +++ b/man/fi_opx.7.md @@ -247,6 +247,10 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL. The default threshold is 8192. This has no meaning if Libfabric was not configured with GDRCopy or ROCR support. +*FI_OPX_MIXED_NETWORK* +: Integer. Indicates that the network is a mix of OPA100 and CN5000. Needs to be set to 1 + in case of mixed network. Default is 0. + # SEE ALSO [`fabric`(7)](fabric.7.html), diff --git a/prov/opx/include/opa_byteorder.h b/prov/opx/include/opa_byteorder.h index 43bf33cf73b..f943ccc96d2 100644 --- a/prov/opx/include/opa_byteorder.h +++ b/prov/opx/include/opa_byteorder.h @@ -6,7 +6,7 @@ GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021 Cornelis Networks. 
+ Copyright(c) 2021,2024 Cornelis Networks. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -20,7 +20,7 @@ BSD LICENSE Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021 Cornelis Networks. + Copyright(c) 2021,2024 Cornelis Networks. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -51,7 +51,7 @@ */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ -/* Copyright (C) 2021-2024 by Cornelis Networks. */ +/* Copyright (C) 2021,2024 by Cornelis Networks. */ #ifndef OPA_BYTEORDER_H #define OPA_BYTEORDER_H @@ -70,6 +70,8 @@ extern "C" { typedef __u16 __le16; typedef __u16 __be16; +typedef __u32 __le24; /* for readability. Only use 24 of 32 bits */ +typedef __u32 __be24; /* for readability. Only use 24 of 32 bits */ typedef __u32 __le32; typedef __u32 __be32; typedef __u64 __le64; @@ -77,6 +79,8 @@ typedef __u64 __be64; static __inline__ __u16 __hfi_fswab16(__u16) __attribute__ ((always_inline)); +static __inline__ __u32 __hfi_fswab24(__u32) + __attribute__ ((always_inline)); static __inline__ __u32 __hfi_fswab32(__u32) __attribute__ ((always_inline)); static __inline__ __u64 __hfi_fswab64(__u64) @@ -85,7 +89,15 @@ static __inline__ __u64 __hfi_fswab64(__u64) static __inline__ __u16 __hfi_fswab16(__u16 x) { return ((x & (__u16) 0x00ffU) << 8) | ((x & (__u16) 0xff00U) >> 8); -} static __inline__ __u32 __hfi_fswab32(__u32 x) { +} + +static __inline__ __u32 __hfi_fswab24(__u32 x) { + return ((x & (__u32) 0x000000ffUL) << 16) | + ((x & (__u32) 0x0000ff00UL) << 0) | + ((x & (__u32) 0x00ff0000UL) >> 16); +} + +static __inline__ __u32 __hfi_fswab32(__u32 x) { return ((x & (__u32) 0x000000ffUL) << 24) | ((x & (__u32) 0x0000ff00UL) << 8) | ((x & (__u32) 0x00ff0000UL) >> 8) @@ -105,6 +117,8 @@ static __inline__ __u64 __hfi_fswab64(__u64 x) { static __inline__ __u16 
__cpu_to_le16(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_le24(__le24) + __attribute__ ((always_inline)); static __inline__ __u32 __cpu_to_le32(__le32) __attribute__ ((always_inline)); static __inline__ __u64 __cpu_to_le64(__le64) @@ -112,6 +126,8 @@ static __inline__ __u64 __cpu_to_le64(__le64) static __inline__ __u16 __le16_to_cpu(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __le24_to_cpu(__le24) + __attribute__ ((always_inline)); static __inline__ __u32 __le32_to_cpu(__le32) __attribute__ ((always_inline)); static __inline__ __u64 __le64_to_cpu(__le64) @@ -119,6 +135,8 @@ static __inline__ __u64 __le64_to_cpu(__le64) static __inline__ __u16 __cpu_to_be16(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_be24(__be24) + __attribute__ ((always_inline)); static __inline__ __u32 __cpu_to_be32(__be32) __attribute__ ((always_inline)); static __inline__ __u64 __cpu_to_be64(__be64) @@ -126,6 +144,8 @@ static __inline__ __u64 __cpu_to_be64(__be64) static __inline__ __u16 __be16_to_cpu(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __be24_to_cpu(__be24) + __attribute__ ((always_inline)); static __inline__ __u32 __be32_to_cpu(__be32) __attribute__ ((always_inline)); static __inline__ __u64 __be64_to_cpu(__be64) @@ -140,6 +160,10 @@ static __inline__ __le16 __cpu_to_le16(__u16 x) { return x; } +static __inline__ __le24 __cpu_to_le24(__u32 x) { + return x; +} + static __inline__ __le32 __cpu_to_le32(__u32 x) { return x; } @@ -155,6 +179,10 @@ static __inline__ __u16 __le16_to_cpu(__le16 x) { return x; } +static __inline__ __u32 __le24_to_cpu(__le24 x) { + return x; +} + static __inline__ __u32 __le32_to_cpu(__le32 x) { return x; } @@ -170,6 +198,10 @@ static __inline__ __be16 __cpu_to_be16(__u16 x) { return __hfi_fswab16(x); } +static __inline__ __be24 __cpu_to_be24(__u32 x) { + return __hfi_fswab24(x); +} + static __inline__ __be32 __cpu_to_be32(__u32 x) { return __hfi_fswab32(x); 
} @@ -185,6 +217,10 @@ static __inline__ __u16 __be16_to_cpu(__be16 x) { return __hfi_fswab16(x); } +static __inline__ __u32 __be24_to_cpu(__be24 x) { + return __hfi_fswab24(x); +} + static __inline__ __u32 __be32_to_cpu(__be32 x) { return __hfi_fswab32(x); } @@ -202,6 +238,10 @@ static __inline__ __le16 __cpu_to_le16(__u16 x) { return __hfi_fswab16(x); } +static __inline__ __le24 __cpu_to_le24(__u32 x) { + return __hfi_fswab24(x); +} + static __inline__ __le32 __cpu_to_le32(__u32 x) { return __hfi_fswab32(x); } @@ -217,6 +257,10 @@ static __inline__ __u16 __le16_to_cpu(__le16 x) { return __hfi_fswab16(x); } +static __inline__ __u32 __le24_to_cpu(__le24 x) { + return __hfi_fswab24(x); +} + static __inline__ __u32 __le32_to_cpu(__le32 x) { return __hfi_fswab32(x); } @@ -232,6 +276,10 @@ static __inline__ __be16 __cpu_to_be16(__u16 x) { return x; } +static __inline__ __be24 __cpu_to_be24(__u32 x) { + return x; +} + static __inline__ __be32 __cpu_to_be32(__u32 x) { return x; } @@ -247,6 +295,10 @@ static __inline__ __u16 __be16_to_cpu(__be16 x) { return x; } +static __inline__ __u32 __be24_to_cpu(__be24 x) { + return x; +} + static __inline__ __u32 __be32_to_cpu(__be32 x) { return x; } @@ -262,4 +314,9 @@ static __inline__ __u64 __be64_to_cpu(__be64 x) { #ifdef __cplusplus } /* extern "C" */ #endif + +static __inline__ __be24 __le24_to_be24(__le24 x) { return __hfi_fswab24((__u32)x); } +static __inline__ __be24 __be24_to_le24(__be24 x) { return __hfi_fswab24((__u32)x); } + + #endif /* OPA_BYTEORDER_H */ diff --git a/prov/opx/include/rdma/opx/fi_opx.h b/prov/opx/include/rdma/opx/fi_opx.h index 1b344d23ad9..5de50e4e66f 100644 --- a/prov/opx/include/rdma/opx/fi_opx.h +++ b/prov/opx/include/rdma/opx/fi_opx.h @@ -101,12 +101,40 @@ struct fi_opx_daos_hfi_rank { UT_hash_handle hh; /* makes this structure hashable */ }; +/* hfi1 type for bit logic */ enum opx_hfi1_type { OPX_HFI1_UNDEF = 0, // undefined - OPX_HFI1_WFR = 4, // Omni-path (all generations) - OPX_HFI1_JKR = 5 // 
CN5000 (initial generation) + OPX_HFI1_JKR_9B = 1, // CN5000 built for mixed network. Internal use + OPX_HFI1_WFR = 2, // Omni-path (all generations) + OPX_HFI1_JKR = 4 // CN5000 (initial generation) }; +/* Will remove after 16B SDMA support is finished */ +#define OPX_NO_9B_SUPPORT(_hfi1_type) \ +do { \ + if(!(_hfi1_type & OPX_HFI1_JKR)) { \ + fprintf(stderr, "%s NO JKR 9B SUPPORT for %u %s\n", __func__,\ + _hfi1_type, \ + _hfi1_type & OPX_HFI1_WFR ? "OPX_HFI1_WFR" : \ + _hfi1_type & OPX_HFI1_JKR_9B ? "OPX_HFI1_JKR_9B" : \ + "UNKNOWN" ); \ + if(getenv("OPX_9B_ABORT")) abort(); \ + } \ + assert(_hfi1_type != OPX_HFI1_UNDEF); \ +} while(0) + + +#define OPX_NO_16B_SUPPORT(_hfi1_type) \ +do { \ + if(!(_hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B))) { \ + fprintf(stderr, "%s NO 16B SUPPORT for %u %s\n", __func__, \ + _hfi1_type, _hfi1_type & OPX_HFI1_JKR ? "OPX_HFI1_JKR" : \ + "UNKNOWN" ); \ + if(getenv("OPX_16B_ABORT")) abort(); \ + } \ + assert(_hfi1_type != OPX_HFI1_UNDEF); \ +} while(0) + struct fi_opx_hfi_local_info { struct fi_opx_hfi_local_lookup *hfi_local_lookup_hashmap; enum opx_hfi1_type type; @@ -125,19 +153,13 @@ struct fi_opx_hfi_local_info { #undef OPX_SIM_ENABLED #endif -/* Build constant for JKR/WFR path optimization */ -#if (defined(OPX_WFR) && defined(OPX_JKR)) -/* Both JKR and WFR runtime support (not constant) */ #define OPX_HFI1_TYPE fi_opx_global.hfi_local_info.type -#elif defined(OPX_WFR) -#define OPX_HFI1_TYPE OPX_HFI1_WFR -#elif defined(OPX_JKR) -#define OPX_HFI1_TYPE OPX_HFI1_JKR -#else -/* Currently default to WFR (only) */ -#define OPX_WFR -#define OPX_HFI1_TYPE OPX_HFI1_WFR -#endif + + +/* Default is both JKR and WFR runtime support (no constant), + use a local or global variable */ + +#define OPX_PRE_CN5000 1 struct fi_opx_hfi_local_lookup_key { uint16_t lid; diff --git a/prov/opx/include/rdma/opx/fi_opx_addr.h b/prov/opx/include/rdma/opx/fi_opx_addr.h index cdc4f75a450..5527154490c 100644 --- a/prov/opx/include/rdma/opx/fi_opx_addr.h +++ 
b/prov/opx/include/rdma/opx/fi_opx_addr.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -47,8 +47,12 @@ union fi_opx_uid { fi_opx_uid_t fi; struct { - uint16_t endpoint_id;/* node-scoped endpoint identifier */ - uint16_t lid; /* fabric-scoped node identifier (big-endian) */ + uint8_t endpoint_id;/* node-scoped endpoint identifier */ + uint8_t lid_3B; /* fabric-scoped node identifier (3rd byte) */ + uint16_t lid; /* fabric-scoped node identifier (big-endian) */ + } __attribute__((__packed__)); + struct { + uint32_t lid_32; /* fabric-scoped node identifier (big-endian) */ } __attribute__((__packed__)); } __attribute__((__packed__)); diff --git a/prov/opx/include/rdma/opx/fi_opx_atomic.h b/prov/opx/include/rdma/opx/fi_opx_atomic.h index f66a83a0d84..1fb209507f7 100644 --- a/prov/opx/include/rdma/opx/fi_opx_atomic.h +++ b/prov/opx/include/rdma/opx/fi_opx_atomic.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -42,35 +42,35 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) \ - static inline ssize_t fi_opx_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + static inline ssize_t fi_opx_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, void *desc, fi_addr_t dst_addr, \ uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, \ void *context) \ { \ return fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_inject_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_inject_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dst_addr, \ uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op) \ { \ return fi_opx_inject_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, \ - op, LOCK, AV, CAPS, RELIABILITY); \ + op, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_fetch_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_fetch_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, \ void *result_desc, fi_addr_t dest_addr, 
uint64_t addr, uint64_t key, \ enum fi_datatype datatype, enum fi_op op, void *context) \ { \ return fi_opx_fetch_atomic_generic(ep, buf, count, desc, result, result_desc, \ dest_addr, addr, key, datatype, op, context, \ - LOCK, AV, CAPS, RELIABILITY); \ + LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_compare_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_compare_atomic_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t count, void *desc, const void *compare, \ void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, \ uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, \ @@ -79,14 +79,14 @@ return fi_opx_compare_atomic_generic(ep, buf, count, desc, compare, compare_desc, \ result, result_desc, dest_addr, addr, key, \ datatype, op, context, LOCK, AV, CAPS, \ - RELIABILITY); \ + RELIABILITY, HFI1_TYPE); \ } -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY +#define FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE #ifdef __cplusplus extern "C" { @@ -121,7 +121,8 @@ ssize_t fi_opx_fetch_atomic_generic(struct fid_ep *ep, const void *buf, size_t c uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + 
const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, @@ -129,13 +130,15 @@ ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); #ifdef __cplusplus } diff --git a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h index c070c60d7f4..8554faa85c4 100644 --- a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h +++ b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h @@ -46,7 +46,10 @@ /* Number of types in enum fi_cq_format */ #define FI_CQ_FORMAT_COUNT 5 -typedef struct fi_ops_cq op_matrix_t[FI_CQ_FORMAT_COUNT][1 /* OFI_RELIABILITY_KIND_ONLOAD */][FI_OPX_COMMS_COUNT]; +/* Number of types in enum opx_hfi1_type */ +#define OPX_HFI1_TYPE_COUNT 3 + +typedef struct fi_ops_cq op_matrix_t[FI_CQ_FORMAT_COUNT][1 /* OFI_RELIABILITY_KIND_ONLOAD */][FI_OPX_COMMS_COUNT][OPX_HFI1_TYPE_COUNT]; static ssize_t fi_opx_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, uint64_t flags) @@ -118,14 +121,14 @@ fi_opx_cq_strerror(struct fid_cq *cq, int prov_errno, const void *err_data, return NULL; } -#define FI_OPX_CQ_OPS_STRUCT_NAME(FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ - fi_opx_ops_cq_ ## FORMAT ## _ ## LOCK ## _ ## 
RELIABILITY ## _ ## MASK ## _ ## CAPS \ +#define FI_OPX_CQ_OPS_STRUCT_NAME(FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + fi_opx_ops_cq_ ## FORMAT ## _ ## LOCK ## _ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ -#define FI_OPX_CQ_OPS_STRUCT_INIT(FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ +#define FI_OPX_CQ_OPS_STRUCT_INIT(FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ { \ .size = sizeof(struct fi_ops_cq), \ - .read = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, FORMAT, LOCK, RELIABILITY, MASK, CAPS), \ - .readfrom = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, FORMAT, LOCK, RELIABILITY, MASK, CAPS), \ + .read = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE), \ + .readfrom = FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE), \ .readerr = fi_opx_cq_readerr, \ .sread = fi_opx_cq_sread, \ .sreadfrom = fi_opx_cq_sreadfrom, \ @@ -133,9 +136,4 @@ fi_opx_cq_strerror(struct fid_cq *cq, int prov_errno, const void *err_data, .strerror = fi_opx_cq_strerror, \ } -#define FI_OPX_CQ_OPS_STRUCT(FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ -static struct fi_ops_cq \ - FI_OPX_CQ_OPS_STRUCT_NAME(FORMAT, LOCK, RELIABILITY, MASK, CAPS) = \ - FI_OPX_CQ_OPS_STRUCT_INIT(FORMAT, LOCK, RELIABILITY, MASK, CAPS) - #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 9cd3d7d0158..7d54792a133 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -99,12 +99,12 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_MSG_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY) \ - FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) +#define FI_OPX_MSG_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + 
FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) -#define FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) \ +#define FI_OPX_MSG_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ static inline ssize_t \ - fi_opx_send_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_send_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, fi_addr_t dest_addr, void *context) \ { \ @@ -116,19 +116,20 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); 0, /* override_flags */ \ 0, /* flags */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_recv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_recv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, void *buf, size_t len, \ void *desc, fi_addr_t src_addr, void *context) \ { \ return fi_opx_recv_generic(ep, buf, len, desc, \ src_addr, 0, (uint64_t)-1, context, \ - LOCK, AV, FI_MSG, RELIABILITY); \ + LOCK, AV, FI_MSG, RELIABILITY, HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_inject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_inject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ fi_addr_t dest_addr) \ { \ @@ -137,18 +138,19 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); LOCK, /* lock_required */ \ AV, /* av_type */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_recvmsg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_recvmsg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE\ (struct fid_ep *ep, const struct fi_msg *msg, \ uint64_t flags) \ { \ return fi_opx_recvmsg_generic(ep, msg, flags, \ - LOCK, AV, RELIABILITY); \ + LOCK, AV, RELIABILITY, HFI1_TYPE); \ } \ 
static inline ssize_t \ - fi_opx_senddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_senddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, uint64_t data, fi_addr_t dest_addr, \ void *context) \ @@ -161,26 +163,28 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); 0, /* override_flags */ \ 0, /* flags */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ static inline ssize_t \ - fi_opx_injectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_injectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ uint64_t data, fi_addr_t dest_addr) \ - { \ + { \ return fi_opx_ep_tx_inject(ep, buf, len, \ dest_addr, 0, data, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ CAPS | FI_MSG, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } -#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY,HFI1_TYPE) \ + FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY,HFI1_TYPE) -#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY +#define FI_OPX_MSG_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY,HFI1_TYPE) \ + fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE @@ -198,8 +202,9 @@ enum opx_work_type { OPX_WORK_TYPE_TID_SETUP, OPX_WORK_TYPE_LAST }; + OPX_COMPILE_TIME_ASSERT(OPX_WORK_TYPE_SDMA == 0, - "OPX_WORK_TYPE_SDMA needs to be 0/first value in the enum!"); + "OPX_WORK_TYPE_SDMA needs to be 0/first value in the enum!"); static const char * const OPX_WORK_TYPE_STR[] = { [OPX_WORK_TYPE_SDMA] = "SDMA", @@ 
-225,15 +230,18 @@ struct fi_opx_stx { struct fi_opx_reliability_service reliability_service; /* ONLOAD only */ uint8_t reliability_rx; /* ONLOAD only */ - /* == CACHE LINE 4,5,6 == */ + /* == CACHE LINE 4-9 == */ struct { - struct fi_opx_hfi1_txe_scb inject; - struct fi_opx_hfi1_txe_scb send; - struct fi_opx_hfi1_txe_scb rzv; + struct fi_opx_hfi1_txe_scb_9B inject; + struct fi_opx_hfi1_txe_scb_9B send; + struct fi_opx_hfi1_txe_scb_9B rzv; + struct fi_opx_hfi1_txe_scb_16B inject_16B; + struct fi_opx_hfi1_txe_scb_16B send_16B; + struct fi_opx_hfi1_txe_scb_16B rzv_16B; } tx; - /* == CACHE LINE 7 == */ + /* == CACHE LINE 10 == */ struct fi_opx_hfi1_rxe_state rxe_state; /* ignored for ofi tx */ int64_t ref_cnt; @@ -255,7 +263,7 @@ struct fi_opx_stx { */ struct fi_opx_ep_tx { - /* == CACHE LINE 0,1 == */ + /* == CACHE LINE 0 == */ volatile union fi_opx_hfi1_pio_state *pio_state; /* 1 qw = 8 bytes */ volatile uint64_t * pio_scb_sop_first; @@ -263,8 +271,6 @@ struct fi_opx_ep_tx { uint16_t pio_max_eager_tx_bytes; uint16_t pio_flow_eager_tx_bytes; - struct fi_opx_hfi1_txe_scb inject; /* qws 5,6, and 7 specified at runtime */ - volatile uint64_t * pio_credits_addr; /* const; only used to infrequently "refresh" credit information */ volatile uint64_t * pio_scb_first; /* const; only eager and rendezvous */ uint64_t cq_bind_flags; @@ -274,12 +280,25 @@ struct fi_opx_ep_tx { uint8_t force_credit_return; uint8_t use_sdma; - /* == CACHE LINE 2,3 == */ + /* == CACHE LINE 1,2 == */ + struct fi_opx_hfi1_txe_scb_9B inject_9B; /* qws 5,6, and 7 specified at runtime */ - struct fi_opx_hfi1_txe_scb send; - struct fi_opx_hfi1_txe_scb rzv; + /* == CACHE LINE 3,4 == */ + struct fi_opx_hfi1_txe_scb_9B send_9B; - /* == CACHE LINE 4 == */ + /* == CACHE LINE 5,6 == */ + struct fi_opx_hfi1_txe_scb_9B rzv_9B; + + /* == CACHE LINE 7,8 == */ + struct fi_opx_hfi1_txe_scb_16B inject_16B; + + /* == CACHE LINE 9,10 == */ + struct fi_opx_hfi1_txe_scb_16B send_16B; + + /* == CACHE LINE 11,12 == */ + 
struct fi_opx_hfi1_txe_scb_16B rzv_16B; + + /* == CACHE LINE 13 == */ union fi_opx_addr * av_addr; /* only FI_ADDR_TABLE */ uint64_t av_count; /* only FI_ADDR_TABLE */ @@ -290,11 +309,11 @@ struct fi_opx_ep_tx { struct fi_opx_cq * cq; struct fi_opx_context_slist * cq_pending_ptr; /* only rendezvous (typically) */ - /* == CACHE LINE 5 == */ + /* == CACHE LINE 14 == */ struct slist work_pending[OPX_WORK_TYPE_LAST]; - /* == CACHE LINE 6 == */ + /* == CACHE LINE 15 == */ struct slist work_pending_completion; struct ofi_bufpool *work_pending_pool; @@ -306,34 +325,42 @@ struct fi_opx_ep_tx { uint16_t mp_eager_max_payload_bytes; uint8_t unused_cacheline6[6]; - /* == CACHE LINE 7 == */ + /* == CACHE LINE 16 == */ struct opx_sdma_queue sdma_request_queue; struct slist sdma_pending_queue; struct ofi_bufpool *sdma_request_pool; uint64_t unused_cacheline7[2]; - /* == CACHE LINE 8, ... == */ + /* == CACHE LINE 17, ... == */ int64_t ref_cnt; struct fi_opx_stx *stx; - // struct opx_shm_tx is very large and should go last! 
struct opx_shm_tx shm; void *mem; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, send) == (FI_OPX_CACHE_LINE_SIZE * 2), - "Offset of fi_opx_ep_tx->send should start at cacheline 2!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, rzv) == (FI_OPX_CACHE_LINE_SIZE * 3), - "Offset of fi_opx_ep_tx->rzv should start at cacheline 3!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, av_addr) == (FI_OPX_CACHE_LINE_SIZE * 4), - "Offset of fi_opx_ep_tx->av_addr should start at cacheline 4!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending) == (FI_OPX_CACHE_LINE_SIZE * 5), - "Offset of fi_opx_ep_tx->work_pending should start at cacheline 5!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending_completion) == (FI_OPX_CACHE_LINE_SIZE * 6), - "Offset of fi_opx_ep_tx->work_pending_completion should start at cacheline 6!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, sdma_request_queue) == (FI_OPX_CACHE_LINE_SIZE * 7), - "Offset of fi_opx_ep_tx->sdma_request_queue should start at cacheline 7!"); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, ref_cnt) == (FI_OPX_CACHE_LINE_SIZE * 8), - "Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 8!"); + +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, inject_9B) == (FI_OPX_CACHE_LINE_SIZE * 1), + "Offset of fi_opx_ep_tx->inject_9B should start at cacheline 1!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, send_9B) == (FI_OPX_CACHE_LINE_SIZE * 3), + "Offset of fi_opx_ep_tx->send_9B should start at cacheline 3!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, rzv_9B) == (FI_OPX_CACHE_LINE_SIZE * 5), + "Offset of fi_opx_ep_tx->rzv_9B should start at cacheline 5!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, inject_16B) == (FI_OPX_CACHE_LINE_SIZE * 7), + "Offset of fi_opx_ep_tx->inject_16B should start at cacheline 7!"); 
+OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, send_16B) == (FI_OPX_CACHE_LINE_SIZE * 9), + "Offset of fi_opx_ep_tx->send_16B should start at cacheline 9!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, rzv_16B) == (FI_OPX_CACHE_LINE_SIZE * 11), + "Offset of fi_opx_ep_tx->rzv_16B should start at cacheline 11!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, av_addr) == (FI_OPX_CACHE_LINE_SIZE * 13), + "Offset of fi_opx_ep_tx->av_addr should start at cacheline 13!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending) == (FI_OPX_CACHE_LINE_SIZE * 14), + "Offset of fi_opx_ep_tx->work_pending should start at cacheline 14!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending_completion) == (FI_OPX_CACHE_LINE_SIZE * 15), + "Offset of fi_opx_ep_tx->work_pending_completion should start at cacheline 15!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, sdma_request_queue) == (FI_OPX_CACHE_LINE_SIZE * 16), + "Offset of fi_opx_ep_tx->sdma_request_queue should start at cacheline 16!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, ref_cnt) == (FI_OPX_CACHE_LINE_SIZE * 17), + "Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 17!"); struct fi_opx_ep_rx { @@ -399,7 +426,7 @@ struct fi_opx_ep_rx { volatile uint64_t * head_register; } egrq __attribute__((__packed__)); - /* == CACHE LINE 5,6 == */ + /* == CACHE LINE 5,6,7,8,9,10,11,12 == */ /* * NOTE: These cachelines are shared between the application-facing @@ -409,11 +436,12 @@ struct fi_opx_ep_rx { * This 'tx' information is used when sending acks, etc. 
*/ struct { - struct fi_opx_hfi1_txe_scb dput; - struct fi_opx_hfi1_txe_scb cts; + struct fi_opx_hfi1_txe_scb_9B dput_9B; + struct fi_opx_hfi1_txe_scb_9B cts_9B; + struct fi_opx_hfi1_txe_scb_16B dput_16B; + struct fi_opx_hfi1_txe_scb_16B cts_16B; } tx; - /* -- non-critical -- */ uint64_t min_multi_recv; struct fi_opx_domain *domain; @@ -568,6 +596,7 @@ OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep, init_send_cntr) == (FI_OPX_CA OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep, lock) == ((FI_OPX_CACHE_LINE_SIZE * 5)+52), "Offset of fi_opx_ep->lock should start before cacheline 6!"); + /* * A 'scalable endpoint' may not be directly specified in a data movement * functions, such as fi_tsend(), as it is only a container for multiple @@ -622,55 +651,63 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const uint64_t rx_op_flags, const uint64_t is_context_ext, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hf1_type); void fi_opx_ep_rx_process_header_tag (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hf1_type, + uint32_t slid); void fi_opx_ep_rx_process_header_msg (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum 
opx_hfi1_type hf1_type, + uint32_t slid); void fi_opx_ep_rx_reliability_process_packet (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_rs); void fi_opx_ep_rx_append_ue_msg (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters); + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid); void fi_opx_ep_rx_append_ue_tag (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters); + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid); void fi_opx_ep_rx_append_ue_egr (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, - const size_t payload_bytes); + const size_t payload_bytes, + const uint64_t slid); int fi_opx_ep_tx_check (struct fi_opx_ep_tx * tx, enum fi_av_type av_type); @@ -811,10 +848,12 @@ uint64_t fi_opx_ep_is_matching_packet(const uint64_t origin_tag, } + __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *opx_ep, union fi_opx_context * context, - const uint64_t kind) + const uint64_t kind, + const enum opx_hfi1_type hfi1_type) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.match.default_searches); struct fi_opx_hfi1_ue_packet *uepkt = opx_ep->rx->queue[kind].ue.head; @@ -838,7 +877,7 @@ 
struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *o opx_ep, uepkt->daos_info.rank, uepkt->daos_info.rank_inst, - fi_opx_hfi_is_intranode(uepkt->hdr.stl.lrh.slid))) { + opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type))) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.match.default_misses); uepkt = uepkt->next; } @@ -851,21 +890,23 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *o __OPX_FORCE_INLINE__ uint64_t is_match (struct fi_opx_ep * opx_ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, union fi_opx_context * context, uint32_t rank, uint32_t rank_inst, - unsigned is_intranode) + unsigned is_intranode, + const uint64_t slid) { const union fi_opx_addr src_addr = { .fi = context->src_addr }; - const fi_opx_uid_t origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr); + const fi_opx_uid_t origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr, slid); const uint64_t ignore = context->ignore; const uint64_t target_tag = context->tag; const uint64_t origin_tag = hdr->match.ofi_tag; const uint64_t target_tag_and_not_ignore = target_tag & ~ignore; const uint64_t origin_tag_and_not_ignore = origin_tag & ~ignore; + const uint64_t answer = ( (origin_tag_and_not_ignore == target_tag_and_not_ignore) && @@ -879,11 +920,19 @@ uint64_t is_match (struct fi_opx_ep * opx_ep, ) ) ); + #ifdef IS_MATCH_DEBUG fprintf(stderr, "%s:%s():%d context = %p, context->src_addr = 0x%016lx, context->ignore = 0x%016lx, context->tag = 0x%016lx, src_addr.uid.fi = 0x%08x\n", __FILE__, __func__, __LINE__, context, context->src_addr, context->ignore, context->tag, src_addr.uid.fi); - fprintf(stderr, "%s:%s():%d hdr->match.slid = 0x%04x (%u), hdr->match.origin_tx = 0x%02x (%u), origin_uid_fi = 0x%08x\n", __FILE__, __func__, __LINE__, - hdr->match.slid, hdr->match.slid, hdr->match.origin_tx, hdr->match.origin_tx, origin_uid_fi); + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fprintf(stderr, 
"%s:%s():%d hdr->match.slid = 0x%04x (%u), hdr->match.origin_tx = 0x%02x (%u), origin_uid_fi = 0x%08x\n", __FILE__, __func__, __LINE__, + hdr->lrh_9B.slid, hdr->lrh_9B.slid, hdr->match.origin_tx, hdr->match.origin_tx, origin_uid_fi); + } else { + fprintf(stderr, "%s:%s():%d hdr->match.slid = 0x%04x/0x%04lx (%u), hdr->match.origin_tx = 0x%02x (%u), origin_uid_fi = 0x%08x\n", __FILE__, __func__, __LINE__, + htonl((uint64_t)((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))),((uint64_t)((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))), + htonl((uint64_t)((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))), + hdr->match.origin_tx, hdr->match.origin_tx, origin_uid_fi); + } fprintf(stderr, "%s:%s():%d hdr->match.ofi_tag = 0x%016lx, target_tag_and_not_ignore = 0x%016lx, origin_tag_and_not_ignore = 0x%016lx, FI_ADDR_UNSPEC = 0x%08lx\n", __FILE__, __func__, __LINE__, hdr->match.ofi_tag, target_tag_and_not_ignore, origin_tag_and_not_ignore, FI_ADDR_UNSPEC); if (opx_ep->daos_info.hfi_rank_enabled && is_intranode) { @@ -940,6 +989,378 @@ void fi_opx_enqueue_completed(struct fi_opx_context_slist *queue, fi_opx_context_slist_insert_tail(real_context, queue); } +__OPX_FORCE_INLINE__ +void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, + const union fi_opx_hfi1_packet_payload * const payload, + struct fi_opx_ep * opx_ep, + const uint64_t origin_tag, + const uint8_t opcode, + union fi_opx_context *context, + const uint64_t is_context_ext, + const uint64_t is_multi_receive, + const unsigned is_intranode, + const uint64_t is_hmem, + const int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uintptr_t origin_byte_counter_vaddr, + const struct fi_opx_hmem_iov *iov, + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info, + const struct fi_opx_hmem_iov *src_dst_iov, + const uint8_t * const immediate_byte, + const uint64_t * const immediate_qw, + const union cacheline * const immediate_block) +{ + 
assert( (opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) || (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV -- RENDEZVOUS RTS (%X) (begin) context %p is_multi_recv (%lu)\n", + opcode, context, is_multi_receive); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS"); + + const uint64_t ofi_data = hdr->match.ofi_data; + const uint64_t niov = hdr->rendezvous.niov; + const uint64_t xfer_len = hdr->rendezvous.message_length; + const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; + void *recv_buf = context->buf; + struct fi_opx_ep_rx * const rx = opx_ep->rx; + const uint64_t recv_len = context->len; + + if (is_multi_receive) { /* compile-time constant expression */ + assert(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); + const uint8_t u8_rx = hdr->rendezvous.origin_rx; + const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); + union fi_opx_context * original_multi_recv_context = context; + context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + + assert((((uintptr_t)context) & 0x07) == 0); + context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; + context->buf = recv_buf; + context->len = xfer_len; + context->data = ofi_data; + context->tag = 0; /* tag is not valid for multi-receives */ + context->multi_recv_context = original_multi_recv_context; + context->byte_counter = xfer_len; + context->next = NULL; + uint8_t * rbuf = (uint8_t *)recv_buf; + + if (OFI_LIKELY(is_noncontig)) { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_noncontig); + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, niov, + origin_byte_counter_vaddr, + context, + (uintptr_t)(rbuf), /* receive buffer virtual address */ + FI_HMEM_SYSTEM, /* receive buffer iface */ + 0UL, /* receive buffer device */ + 0UL, /* immediate_data */ + 0UL, /* 
immediate_end_block_count */ + iov, + FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + } else { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_contig); + assert(niov == 1); + const uint64_t immediate_byte_count = immediate_info.byte_count; + const uint64_t immediate_qw_count = immediate_info.qw_count; + const uint64_t immediate_block_count = immediate_info.block_count; + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block_count * sizeof(union cacheline); + const uint64_t immediate_end_block_count = immediate_info.end_block_count; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", + immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); + + context->byte_counter -= immediate_total; + + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, niov, + origin_byte_counter_vaddr, + context, + (uintptr_t)(rbuf + immediate_total), /* receive buffer virtual address */ + FI_HMEM_SYSTEM, /* receive buffer iface */ + 0UL, /* receive buffer device */ + immediate_total, + immediate_end_block_count, + src_dst_iov, + FI_OPX_HFI_DPUT_OPCODE_RZV, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + + /* + * copy the immediate payload data + */ + unsigned i; + + if (immediate_byte_count) { + for (i=0; ilen -= bytes_consumed; + original_multi_recv_context->byte_counter++; // re-using the byte counter as a "pending flag" + original_multi_recv_context->tag = (uintptr_t)opx_ep; // re-using tag to store the ep + original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); + assert(context->next == NULL); + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, 
__func__, __LINE__); abort(); } + fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); + + } else if (OFI_LIKELY(xfer_len <= recv_len)) { + + context->len = xfer_len; + context->data = ofi_data; + context->tag = origin_tag; + context->next = NULL; + context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | + ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? FI_TAGGED : FI_MSG); + + const uint8_t u8_rx = hdr->rendezvous.origin_rx; + const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); + + if (OFI_LIKELY(niov == 1)) { + assert(!is_noncontig); + + uint64_t rbuf_device; + enum fi_hmem_iface rbuf_iface; + uint64_t hmem_handle; + if (is_hmem) { /* Branch should compile out */ + struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + rbuf_device = hmem_info->device; + rbuf_iface = hmem_info->iface; + hmem_handle = hmem_info->hmem_dev_reg_handle; + FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode + .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) + ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .recv.rzv); + FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi + .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) + ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .recv.rzv); + } else { + rbuf_device = 0; + hmem_handle = 0; + rbuf_iface = FI_HMEM_SYSTEM; + } + uint8_t * rbuf = (uint8_t *)recv_buf; + + const uint64_t immediate_byte_count = immediate_info.byte_count; + const uint64_t immediate_qw_count = immediate_info.qw_count; + const uint64_t immediate_block_count = immediate_info.block_count; + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block_count * sizeof(union cacheline); + const uint64_t immediate_end_block_count = immediate_info.end_block_count; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", + immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); + context->byte_counter = xfer_len - immediate_total; + + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, 1, + origin_byte_counter_vaddr, + context, + (uintptr_t) (rbuf + immediate_total), + rbuf_iface, + rbuf_device, + immediate_total, + immediate_end_block_count, + src_dst_iov, + FI_OPX_HFI_DPUT_OPCODE_RZV, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + + /* + * copy the immediate payload data + */ + if (is_hmem) { + rbuf = opx_ep->hmem_copy_buf; + } + unsigned i; + + if (immediate_byte_count) { + for (i=0; ihmem_copy_buf) + + (immediate_block_count * sizeof(union cacheline)); + if (immediate_total) { + opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, + recv_buf, opx_ep->hmem_copy_buf, immediate_total, + OPX_HMEM_DEV_REG_RECV_THRESHOLD); + } + } + + /* up to 1 block of immediate end data after the immediate blocks + Copy this to the end of rbuf */ + if (immediate_end_block_count) { + uint8_t *rbuf_start = (uint8_t *)recv_buf; + rbuf_start += xfer_len - (immediate_end_block_count << 6); + if (!is_hmem) { + memcpy(rbuf_start, + 
immediate_block[immediate_block_count].qw, + (immediate_end_block_count << 6)); + } else { + opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, rbuf_start, + immediate_block[immediate_block_count].qw, + (immediate_end_block_count << 6), + OPX_HMEM_DEV_REG_RECV_THRESHOLD); + } + } + + } else { + /*fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__); */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "rendezvous non-contiguous source data not implemented; abort\n"); + abort(); + } + + /* post a pending completion event for the individual receive */ + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } + fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); + + } else { /* truncation - unlikely */ + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "RENDEZVOUS truncation - xfer_len %lu > recv_len %lu posting error\n", xfer_len, recv_len); + + /* Post a CTS Truncation error (FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC) to unblock the Tx of RTS */ + + context->len = xfer_len; + context->data = ofi_data; + context->tag = origin_tag; + context->next = NULL; + context->byte_counter = 0; + context->flags = FI_RECV | FI_REMOTE_CQ_DATA | + ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? 
FI_TAGGED : FI_MSG); + const uint8_t u8_rx = hdr->rendezvous.origin_rx; + const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); + + uint8_t * rbuf = (uint8_t *)recv_buf; + + FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, + hdr, + payload, + u8_rx, 1, + origin_byte_counter_vaddr, + context, + (uintptr_t)(rbuf), /* receive buffer virtual address */ + FI_HMEM_SYSTEM, /* receive buffer iface */ + 0UL, /* receive buffer device */ + 0UL, /* immediate_data */ + 0UL, /* immediate_end_block_count */ + src_dst_iov, + FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, + hfi1_type); + + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } + fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); + + /* Post a E_TRUNC to our local RX error queue because a client called receive + with too small a buffer. Tell them about it via the error cq */ + + struct fi_opx_context_ext * ext = NULL; + if (is_context_ext) { + ext = (struct fi_opx_context_ext *)context; + ext->err_entry.op_context = ext->msg.op_context; + } else { + ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); + if (OFI_UNLIKELY(ext == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Out of memory error.\n"); + abort(); + } + ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; + ext->err_entry.op_context = context; + } + + ext->err_entry.flags = context->flags; + ext->err_entry.len = recv_len; + ext->err_entry.buf = recv_buf; + ext->err_entry.data = ofi_data; + ext->err_entry.tag = origin_tag; + ext->err_entry.olen = xfer_len - recv_len; + ext->err_entry.err = FI_ETRUNC; + ext->err_entry.prov_errno = 0; + ext->err_entry.err_data = NULL; + ext->err_entry.err_data_size = 0; + + ext->opx_context.byte_counter = 0; + ext->opx_context.next = NULL; + + /* post an 'error' completion event */ + if (lock_required) { fprintf(stderr, "%s:%s():%d\n", 
__FILE__, __func__, __LINE__); abort(); } + fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV -- RENDEZVOUS RTS (end) context %p\n",context); +} + /** * \brief Complete a receive operation that has matched the packet header with * the match information @@ -950,7 +1371,7 @@ void fi_opx_enqueue_completed(struct fi_opx_context_slist *queue, */ __OPX_FORCE_INLINE__ void complete_receive_operation_internal (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const uint64_t origin_tag, union fi_opx_context ** context_ptr, @@ -960,7 +1381,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, const unsigned is_intranode, const uint64_t is_hmem, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert((is_hmem && is_context_ext) || !is_hmem); @@ -970,8 +1392,6 @@ void complete_receive_operation_internal (struct fid_ep *ep, struct fi_opx_ep_rx * const rx = opx_ep->rx; union fi_opx_context *context = *context_ptr; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - const uint64_t recv_len = context->len; /* * The context buffer pointer has already been set to the appropriate @@ -981,6 +1401,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, */ void * recv_buf = context->buf; + OPX_DEBUG_PRINT_HDR(hdr, hfi1_type); + if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT || opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1076,7 +1498,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "INJECT send_len %lu <= recv_len %lu; enqueue 
cq (completed)\n", send_len, recv_len); + "INJECT send_len %lu <= recv_len %lu; enqueue cq (completed) ofi_data = %ld tag = %ld\n", + send_len, recv_len, ofi_data, origin_tag); context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) ? FI_TAGGED : FI_MSG); @@ -1229,10 +1652,10 @@ void complete_receive_operation_internal (struct fid_ep *ep, .recv.eager); } - /* fi_opx_hfi1_dump_packet_hdr((union fi_opx_hfi1_packet_hdr *)hdr, __func__, __LINE__); */ + /* fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__); */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "EAGER send_len %lu <= recv_len %lu; enqueue cq (completed)\n", send_len, recv_len); + "EAGER send_len %lu <= recv_len %lu; enqueue cq (completed), tag %#lX/%#lX, ofi_data %#lX \n", send_len, recv_len, context->tag, origin_tag, ofi_data); context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) ? FI_TAGGED : FI_MSG); @@ -1296,7 +1719,13 @@ void complete_receive_operation_internal (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-MP-EAGER-FIRST"); const uint64_t ofi_data = hdr->match.ofi_data; - const uint64_t payload_qws_total = (((uint64_t) ntohs(hdr->stl.lrh.pktlen)) - 15) >> 1; + + uint64_t payload_qws_total; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + payload_qws_total = (((uint64_t) ntohs(hdr->lrh_9B.pktlen)) - 15) >> 1; + } else{ + payload_qws_total = (uint64_t)(hdr->lrh_16B.pktlen - 9); + } const uint64_t packet_payload_len = hdr->mp_eager_first.xfer_bytes_tail + (payload_qws_total << 3); const uint64_t payload_total_len = hdr->mp_eager_first.payload_bytes_total & FI_OPX_HFI1_KDETH_VERSION_OFF_MASK; @@ -1394,10 +1823,15 @@ void complete_receive_operation_internal (struct fid_ep *ep, ext->opx_context.next = NULL; *context_ptr = (union fi_opx_context*)ext; } +#ifndef NDEBUG + if (context->byte_counter == 0) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET 
EAGER FIRST UNEXPECTED COMPLETE\n"); + } +#endif OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-MP-EAGER-FIRST"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- MULTI PACKET EAGER FIRST (end)\n"); + "===================================== RECV -- MULTI PACKET EAGER FIRST byte counter %lu (end)\n",context->byte_counter); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH) { @@ -1405,7 +1839,11 @@ void complete_receive_operation_internal (struct fid_ep *ep, "===================================== RECV -- MULTI PACKET EAGER NTH (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-MP-EAGER-NTH"); - const uint64_t payload_qws_total = (((uint64_t) ntohs(hdr->stl.lrh.pktlen)) - 15) >> 1; + uint64_t payload_qws_total; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + payload_qws_total = (((uint64_t) ntohs(hdr->lrh_9B.pktlen)) - 15) >> 1; + else + payload_qws_total = (uint64_t) hdr->lrh_16B.pktlen - 9; const uint64_t send_len = hdr->mp_eager_nth.xfer_bytes_tail + (payload_qws_total << 3); const uint64_t xfer_len = send_len + hdr->mp_eager_nth.payload_offset; @@ -1487,13 +1925,18 @@ void complete_receive_operation_internal (struct fid_ep *ep, recv_buf, opx_ep->hmem_copy_buf, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); } - /* fi_opx_hfi1_dump_packet_hdr((union fi_opx_hfi1_packet_hdr *)hdr, __func__, __LINE__); */ + /* fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__);*/ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Multi-packet EAGER (nth) send_len %lu <= recv_len %lu; enqueue cq (pending)\n", send_len, recv_len); assert(context->byte_counter >= send_len); context->byte_counter -= send_len; +#ifndef NDEBUG + if (context->byte_counter == 0) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET EAGER NTH COMPLETE\n"); + } +#endif } else { /* truncation - unlikely */ /* We verified the context had enough buffer space for the entire multi-packet payload * when we 
processed the first multi-egr packet. So if xver_len > recv_len, then something @@ -1505,393 +1948,62 @@ void complete_receive_operation_internal (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-MP-EAGER-NTH"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- MULTI PACKET EAGER NTH (end)\n"); - - } else { /* rendezvous packet */ - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- RENDEZVOUS RTS (%X) (begin) context %p is_multi_recv (%lu)\n", - opcode, context, is_multi_receive); - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS"); - - const uint64_t ofi_data = hdr->match.ofi_data; - const uint64_t niov = hdr->rendezvous.niov; - const uint64_t xfer_len = hdr->rendezvous.message_length; + "===================================== RECV -- MULTI PACKET EAGER NTH byte counter %lu (end)\n",context->byte_counter); + } else if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { /* 9B rendezvous packet */ + union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *) payload; + const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; - if (is_multi_receive) { /* compile-time constant expression */ - assert(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); - const uint8_t u8_rx = hdr->rendezvous.origin_rx; - const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - union fi_opx_context * original_multi_recv_context = context; - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + uintptr_t origin_byte_counter_vaddr = (is_noncontig == 1) ? 
p->rendezvous.noncontiguous.origin_byte_counter_vaddr : + p->rendezvous.contiguous.origin_byte_counter_vaddr; - assert((((uintptr_t)context) & 0x07) == 0); - context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; - context->buf = recv_buf; - context->len = xfer_len; - context->data = ofi_data; - context->tag = 0; /* tag is not valid for multi-receives */ - context->multi_recv_context = original_multi_recv_context; - context->byte_counter = xfer_len; - context->next = NULL; - uint8_t * rbuf = (uint8_t *)recv_buf; - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *)payload; - - if (OFI_LIKELY(is_noncontig)) { - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_noncontig); - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, niov, - p->rendezvous.noncontiguous.origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - 0UL, /* immediate_data */ - 0UL, /* immediate_end_block_count */ - &p->rendezvous.noncontiguous.iov[0], - FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - } else { - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_contig); - assert(niov == 1); - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + struct fi_opx_hmem_iov *iov = &p->rendezvous.noncontiguous.iov[0]; + + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { .qw0 = p->rendezvous.contiguous.immediate_info - }; - const uint64_t immediate_byte_count = immediate_info.byte_count; - const uint64_t immediate_qw_count = immediate_info.qw_count; - const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6); - const uint64_t immediate_block_count = immediate_info.block_count; - const uint64_t immediate_total = immediate_byte_count + 
- immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - const uint64_t immediate_end_block_count = immediate_info.end_block_count; - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", - immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); - - context->byte_counter -= immediate_total; - const struct fi_opx_hmem_iov src_iov = { + }; + const struct fi_opx_hmem_iov src_dst_iov = { .buf = p->rendezvous.contiguous.src_vaddr, .len = (p->rendezvous.contiguous.src_blocks << 6), .device = p->rendezvous.contiguous.src_device_id, .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, niov, - p->rendezvous.contiguous.origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf + immediate_total), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - immediate_total, - immediate_end_block_count, - &src_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - - /* - * copy the immediate payload data - */ - unsigned i; - - if (immediate_byte_count) { - const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; - for (i=0; irendezvous.contiguous.immediate_qw; - uint64_t * rbuf_qw = (uint64_t *)rbuf; - for (i=0; irendezvous.contiguous.cache_line_1 + immediate_fragment; - union cacheline * rbuf_block = (union cacheline *)rbuf; - for (i=0; irendezvous.contiguous.cache_line_1 + immediate_fragment; - uint8_t *rbuf_start = (uint8_t *)recv_buf; - rbuf_start += xfer_len - (immediate_end_block_count << 6); - memcpy(rbuf_start, immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6)); - } - } - - uint64_t bytes_consumed 
= ((xfer_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); - original_multi_recv_context->len -= bytes_consumed; - original_multi_recv_context->byte_counter++; // re-using the byte counter as a "pending flag" - original_multi_recv_context->tag = (uintptr_t)opx_ep; // re-using tag to store the ep - original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); - assert(context->next == NULL); - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); - - } else if (OFI_LIKELY(xfer_len <= recv_len)) { - - context->len = xfer_len; - context->data = ofi_data; - context->tag = origin_tag; - context->next = NULL; - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? FI_TAGGED : FI_MSG); - - - const uint8_t u8_rx = hdr->rendezvous.origin_rx; - const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - - if (OFI_LIKELY(niov == 1)) { - assert(!is_noncontig); - assert(payload != NULL); - - uint64_t rbuf_device; - enum fi_hmem_iface rbuf_iface; - uint64_t hmem_handle; - if (is_hmem) { /* Branch should compile out */ - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; - rbuf_device = hmem_info->device; - rbuf_iface = hmem_info->iface; - hmem_handle = hmem_info->hmem_dev_reg_handle; - FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .recv.rzv); - FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) - ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .recv.rzv); - } else { - rbuf_device = 0; - hmem_handle = 0; - rbuf_iface = FI_HMEM_SYSTEM; - } - uint8_t * rbuf = (uint8_t *)recv_buf; - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *)payload; - - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + }; + const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; + const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; + const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.byte_count + 63) >> 6); + const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; + + fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, + context, is_context_ext, is_multi_receive, is_intranode, is_hmem, + lock_required, reliability, hfi1_type, origin_byte_counter_vaddr, + iov, immediate_info, &src_dst_iov, immediate_byte, immediate_qw, immediate_block); + + } else { /* (hfi1_type & OPX_HFI1_JKR) 16B rendezvous packet */ + union fi_opx_hfi1_packet_payload_16B *p = (union fi_opx_hfi1_packet_payload_16B *) payload; + const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; + uintptr_t origin_byte_counter_vaddr = (is_noncontig == 1) ? 
p->rendezvous.noncontiguous.origin_byte_counter_vaddr : + p->rendezvous.contiguous.origin_byte_counter_vaddr; + struct fi_opx_hmem_iov *iov = &p->rendezvous.noncontiguous.iov[0]; + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { .qw0 = p->rendezvous.contiguous.immediate_info - }; - const uint64_t immediate_byte_count = immediate_info.byte_count; - const uint64_t immediate_qw_count = immediate_info.qw_count; - const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6); - const uint64_t immediate_block_count = immediate_info.block_count; - const uint64_t immediate_total = immediate_byte_count + - immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - const uint64_t immediate_end_block_count = immediate_info.end_block_count; - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", - immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); - context->byte_counter = xfer_len - immediate_total; - const struct fi_opx_hmem_iov src_iov = { - .buf = p->rendezvous.contiguous.src_vaddr, - .len = (p->rendezvous.contiguous.src_blocks << 6), - .device = p->rendezvous.contiguous.src_device_id, - .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, 1, - p->rendezvous.contiguous.origin_byte_counter_vaddr, - context, - (uintptr_t) (rbuf + immediate_total), - rbuf_iface, - rbuf_device, - immediate_total, - immediate_end_block_count, - &src_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - - /* - * copy the immediate payload data - */ - if (is_hmem) { - rbuf = opx_ep->hmem_copy_buf; - } - unsigned i; - - if (immediate_byte_count) { - const uint8_t * const 
immediate_byte = p->rendezvous.contiguous.immediate_byte; - for (i=0; irendezvous.contiguous.immediate_qw; - uint64_t * rbuf_qw = (uint64_t *)rbuf; - for (i=0; irendezvous.contiguous.cache_line_1 + immediate_fragment; - union cacheline * rbuf_block = (union cacheline *)rbuf; - for (i=0; ihmem_copy_buf) + - (immediate_block_count * sizeof(union cacheline)); - if (immediate_total) { - opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, - recv_buf, opx_ep->hmem_copy_buf, immediate_total, - OPX_HMEM_DEV_REG_RECV_THRESHOLD); - } - } - - /* up to 1 block of immediate end data after the immediate blocks - Copy this to the end of rbuf */ - if (immediate_end_block_count) { - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - uint8_t *rbuf_start = (uint8_t *)recv_buf; - rbuf_start += xfer_len - (immediate_end_block_count << 6); - if (!is_hmem) { - memcpy(rbuf_start, - immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6)); - } else { - opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, rbuf_start, - immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6), - OPX_HMEM_DEV_REG_RECV_THRESHOLD); - } - } - - } else { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "rendezvous non-contiguous source data not implemented; abort\n"); - abort(); - } - - /* post a pending completion event for the individual receive */ - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); - - - } else { /* truncation - unlikely */ - - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "RENDEZVOUS truncation - xfer_len %lu > recv_len %lu posting error\n", xfer_len, recv_len); - - /* Post a CTS Truncation error (FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC) to unblock the Tx of RTS */ - - context->len = xfer_len; - context->data = ofi_data; - context->tag = origin_tag; - context->next = NULL; - 
context->byte_counter = 0; - context->flags = FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? FI_TAGGED : FI_MSG); - const uint8_t u8_rx = hdr->rendezvous.origin_rx; - const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - - assert(payload != NULL); - uint8_t * rbuf = (uint8_t *)recv_buf; - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *)payload; + }; - const struct fi_opx_hmem_iov dst_iov = { + const struct fi_opx_hmem_iov src_dst_iov = { .buf = p->rendezvous.contiguous.src_vaddr, .len = (p->rendezvous.contiguous.src_blocks << 6), .device = p->rendezvous.contiguous.src_device_id, .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - (const void * const)hdr, - (const void * const)payload, - u8_rx, 1, - p->rendezvous.contiguous.origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - 0UL, /* immediate_data */ - 0UL, /* immediate_end_block_count */ - &dst_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC, - is_intranode, - reliability, /* compile-time constant expression */ - u32_ext_rx); - - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); - - /* Post a E_TRUNC to our local RX error queue because a client called receive - with too small a buffer. 
Tell them about it via the error cq */ - - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = xfer_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; - - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; - - /* post an 'error' completion event */ - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); - } - - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS"); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV -- RENDEZVOUS RTS (end) context %p\n",context); - - } /* rendezvous packet */ - + }; + const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; + const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; + const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.byte_count + 63) >> 6); + const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; + + fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, + context, is_context_ext, is_multi_receive, is_intranode, is_hmem, + lock_required, reliability, hfi1_type, origin_byte_counter_vaddr, + 
iov, immediate_info, &src_dst_iov, immediate_byte, immediate_qw, immediate_block); + } FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); } @@ -1905,7 +2017,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, */ __OPX_FORCE_INLINE__ void complete_receive_operation(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const uint64_t origin_tag, union fi_opx_context * context, @@ -1915,13 +2027,14 @@ void complete_receive_operation(struct fid_ep *ep, const unsigned is_intranode, const uint64_t is_hmem, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { union fi_opx_context * original_context = context; (void) original_context; complete_receive_operation_internal(ep, hdr, payload, origin_tag, &context, opcode, is_context_ext, is_multi_receive, - is_intranode, is_hmem, lock_required, reliability); + is_intranode, is_hmem, lock_required, reliability, hfi1_type); assert(context == original_context); } @@ -1975,12 +2088,13 @@ ssize_t fi_opx_shm_dynamic_tx_connect(const unsigned is_intranode, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- %s RENDEZVOUS CTS (begin)\n", is_intranode ? 
"SHM":"HFI"); @@ -1998,7 +2112,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, const uint32_t niov = hdr->cts.target.vaddr.niov; uint64_t * origin_byte_counter = (uint64_t *)hdr->cts.target.vaddr.origin_byte_counter_vaddr; OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-CTS-HFI:%p", (void *) target_context_vaddr); - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, (const void * const) hdr, (const void * const) payload, 0, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void * const) payload, 0, u8_rx, origin_rs, niov, dput_iov, (const uint8_t) (FI_NOOP - 1), (const uint8_t) (FI_VOID - 1), @@ -2007,7 +2121,8 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, hdr->cts.target.opcode, NULL, is_intranode, /* compile-time constant expression */ reliability, /* compile-time constant expression */ - u32_ext_rx); + u32_ext_rx, + hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-CTS-HFI:%p", (void *) target_context_vaddr); } break; @@ -2017,7 +2132,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, const uintptr_t target_context_vaddr = hdr->cts.target.vaddr.target_context_vaddr; const uint32_t niov = hdr->cts.target.vaddr.niov; uint64_t * origin_byte_counter = (uint64_t *)hdr->cts.target.vaddr.origin_byte_counter_vaddr; - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, (const void * const) hdr, (const void * const) payload, 0, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void * const) payload, 0, u8_rx, origin_rs, niov, dput_iov, (const uint8_t) (FI_NOOP - 1), (const uint8_t) (FI_VOID - 1), @@ -2027,7 +2142,8 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, NULL, is_intranode, /* compile-time constant expression */ reliability, /* compile-time constant expression */ - u32_ext_rx); + u32_ext_rx, + hfi1_type); } break; case FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC: @@ -2068,7 +2184,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, #else const union fi_opx_hfi1_dput_iov 
* const dput_iov_ptr = payload->cts.iov; #endif - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, (const void * const) hdr, (const void * const) payload, 0, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void * const) payload, 0, u8_rx, origin_rs, niov, dput_iov_ptr, hdr->cts.target.mr.op, hdr->cts.target.mr.dt, @@ -2079,12 +2195,13 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, NULL, is_intranode, /* compile-time constant expression */ reliability, /* compile-time constant expression */ - u32_ext_rx); + u32_ext_rx, + hfi1_type); } break; case FI_OPX_HFI_DPUT_OPCODE_FENCE: { - opx_hfi1_dput_fence(opx_ep, hdr, u8_rx, u32_ext_rx); + opx_hfi1_dput_fence(opx_ep, hdr, u8_rx, u32_ext_rx, hfi1_type); } break; default: @@ -2100,13 +2217,14 @@ void fi_opx_atomic_completion_action(union fi_opx_hfi1_deferred_work * work_stat __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- %s RENDEZVOUS DATA Opcode=%0hhX (begin)\n", is_intranode ? "SHM":"HFI", hdr->dput.target.opcode); @@ -2124,7 +2242,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? 
hdr->dput.target.last_bytes : hdr->dput.target.bytes; @@ -2173,18 +2291,28 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * so use actual packet size here reported in LRH as the * number of 4-byte words in the packet; header + payload - icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = - (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const uint16_t bytes = - (uint16_t)(total_bytes_to_copy - - sizeof(union fi_opx_hfi1_packet_hdr)); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + uint16_t bytes; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes_to_copy = + (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + bytes = (uint16_t)(total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); + } else { + lrh_pktlen_le = hdr->lrh_16B.pktlen; + total_bytes_to_copy = + (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + bytes = (uint16_t)((total_bytes_to_copy - + sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B))); + } assert(bytes <= FI_OPX_HFI1_PACKET_MTU); /* SDMA expected receive w/TID will use CTRL 1, 2 or 3. Replays should indicate we are not using TID (CTRL 0) */ - int tidctrl = KDETH_GET(hdr->stl.kdeth.offset_ver_tid, TIDCTRL); + int tidctrl = KDETH_GET(hdr->kdeth.offset_ver_tid, TIDCTRL); assert((tidctrl == 0) || (tidctrl == 1) || (tidctrl == 2) || (tidctrl == 3)); /* Copy only if there's a replay payload and TID direct rdma was NOT done. 
@@ -2216,9 +2344,9 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "NOT REPLAY tidctrl %#x, tid %#X, tid0M %#X, tidoffset %#X rbuf_qws %p, " "sbuf_qws %p, bytes %u/%#x, target_context->byte_counter %p\n", - tidctrl, KDETH_GET(hdr->stl.kdeth.offset_ver_tid, TID), - KDETH_GET(hdr->stl.kdeth.offset_ver_tid, OM), - KDETH_GET(hdr->stl.kdeth.offset_ver_tid, OFFSET), + tidctrl, KDETH_GET(hdr->kdeth.offset_ver_tid, TID), + KDETH_GET(hdr->kdeth.offset_ver_tid, OM), + KDETH_GET(hdr->kdeth.offset_ver_tid, OFFSET), (void*)rbuf_qws, (void*)sbuf_qws, bytes, bytes, &target_context->byte_counter); } @@ -2280,7 +2408,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; assert(bytes <= FI_OPX_HFI1_PACKET_MTU); @@ -2313,7 +2441,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; @@ -2361,7 +2489,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? 
+ const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; @@ -2387,7 +2515,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, // Do the FETCH part of this atomic fetch operation union fi_opx_hfi1_deferred_work *work = - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, (const void * const) hdr, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void * const) payload, bytes, u8_rx, origin_rs, 1, &dput_iov, hdr->dput.target.op, @@ -2399,7 +2527,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, fi_opx_atomic_completion_action, is_intranode, reliability, - u32_ext_rx); + u32_ext_rx, + hfi1_type); if(work == NULL) { // The FETCH completed without being deferred, now do // the actual atomic operation. @@ -2433,7 +2562,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, * in the PSN to indicate this is the last packet. The payload * size of the last packet may be smaller than the other packets * in the multi-packet send, so set the payload bytes accordingly */ - const uint16_t bytes = (ntohl(hdr->stl.bth.psn) & 0x80000000) ? + const uint16_t bytes = (ntohl(hdr->bth.psn) & 0x80000000) ? hdr->dput.target.last_bytes : hdr->dput.target.bytes; @@ -2459,7 +2588,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, // Do the FETCH part of this atomic fetch operation union fi_opx_hfi1_deferred_work *work = - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, (const void * const) hdr, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void * const) payload, bytes, u8_rx, origin_rs, 1, &dput_iov, hdr->dput.target.op, @@ -2471,7 +2600,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, fi_opx_atomic_completion_action, is_intranode, reliability, - u32_ext_rx); + u32_ext_rx, + hfi1_type); if(work == NULL) { // The FETCH completed without being deferred, now do // the actual atomic operation. 
@@ -2511,7 +2641,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_non_eager(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2519,22 +2649,23 @@ void fi_opx_ep_rx_process_header_non_eager(struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - if (opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { fi_opx_ep_rx_process_header_rzv_cts(opx_ep, hdr, payload, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, + hfi1_type); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { fi_opx_ep_rx_process_header_rzv_data(opx_ep, hdr, payload, payload_bytes, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, + hfi1_type); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_ACK) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "unimplemented opcode (%u); abort\n", opcode); @@ -2552,16 +2683,26 @@ void fi_opx_ep_rx_process_header_non_eager(struct fid_ep *ep, "reliability exception with opcode %d, dropped\n", opcode); } else { fprintf(stderr, "unimplemented opcode (%#x); abort\n", opcode); + fprintf(stderr, "%s:%u payload %p, payload bytes %zu, is_instranode %u, %#16.16llX %#16.16llX %#16.16llX %#16.16llX %#16.16llX %#16.16llX %#16.16llX \n", + __func__, __LINE__, payload, payload_bytes, is_intranode, + (long long) hdr->qw_9B[0], + (long long) hdr->qw_9B[1], + (long long) hdr->qw_9B[2], + (long long) hdr->qw_9B[3], + (long long) hdr->qw_9B[4], + (long long) hdr->qw_9B[5], + 
(long long) hdr->qw_9B[6]); abort(); } } __OPX_FORCE_INLINE__ -uint64_t fi_opx_mp_egr_id_from_nth_packet(const union fi_opx_hfi1_packet_hdr *hdr) { - +uint64_t fi_opx_mp_egr_id_from_nth_packet(const union opx_hfi1_packet_hdr *hdr, + const uint64_t slid) +{ return ((uint64_t) hdr->mp_eager_nth.mp_egr_uid) | (((uint64_t)hdr->reliability.origin_tx) << 48) | - (((uint64_t)hdr->stl.lrh.slid) << 32); + (((uint64_t)slid) << 32); } __OPX_FORCE_INLINE__ @@ -2570,7 +2711,8 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, union fi_opx_mp_egr_id mp_egr_id, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; @@ -2580,7 +2722,14 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(length); while (uepkt && context->byte_counter) { - if (fi_opx_mp_egr_id_from_nth_packet(&uepkt->hdr) == mp_egr_id.id) { + uint64_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = (uint64_t)(uepkt->hdr.lrh_9B.slid); + } else { + slid = htons(((uepkt->hdr.lrh_16B.slid20 << 20) | (uepkt->hdr.lrh_16B.slid))); + } + + if (fi_opx_mp_egr_id_from_nth_packet(&uepkt->hdr, slid) == mp_egr_id.id) { complete_receive_operation(ep, &uepkt->hdr, @@ -2593,7 +2742,8 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, OPX_INTRANODE_FALSE, is_hmem, lock_required, - reliability); + reliability, + hfi1_type); /* Remove this packet and get the next one */ uepkt = fi_opx_hfi1_ue_packet_slist_remove_item(uepkt, @@ -2609,7 +2759,7 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union 
opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2617,7 +2767,9 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -2638,7 +2790,8 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, context, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, - is_intranode) + is_intranode, + slid) ) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "context = %p\n", context); prev = context; @@ -2653,12 +2806,12 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, fi_opx_ep_rx_append_ue_tag(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); else fi_opx_ep_rx_append_ue_msg(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); return; } @@ -2677,16 +2830,17 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, OPX_INTRANODE_FALSE, /* Should always be false for mp_eager */ is_hmem, lock_required, - reliability); + reliability, + hfi1_type); const union fi_opx_mp_egr_id mp_egr_id = { .uid = hdr->reliability.psn, .origin_tx = hdr->reliability.origin_tx, - .slid = hdr->stl.lrh.slid, + .slid = slid, .unused = 0}; /* Process any other early arrival packets that are part of this multi-packet egr */ - fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, 
lock_required, reliability); + fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, lock_required, reliability, hfi1_type); /* Only add this to the multi-packet egr queue if we still expect additional packets to come in */ if (context->byte_counter) { @@ -2708,7 +2862,7 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2716,7 +2870,9 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -2724,7 +2880,7 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, /* Search mp-eager queue for the context w/ matching mp-eager ID */ - const uint64_t mp_egr_id = fi_opx_mp_egr_id_from_nth_packet(hdr); + const uint64_t mp_egr_id = fi_opx_mp_egr_id_from_nth_packet(hdr, slid); union fi_opx_context *context = opx_ep->rx->mp_egr_queue.mq.head; union fi_opx_context *prev = NULL; @@ -2745,7 +2901,7 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, "process_header_mp_eager_nth: did not find a match .. 
add this packet to the unexpected queue\n"); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_nth_no_match); - fi_opx_ep_rx_append_ue_egr(opx_ep->rx, hdr, payload, payload_bytes); + fi_opx_ep_rx_append_ue_egr(opx_ep->rx, hdr, payload, payload_bytes, slid); return; } @@ -2765,7 +2921,8 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, is_intranode, context->flags & FI_OPX_CQ_CONTEXT_HMEM, lock_required, - reliability); + reliability, + hfi1_type); if (!context->byte_counter) { /* Remove from the mp-eager queue */ @@ -2785,7 +2942,7 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, static inline void fi_opx_ep_rx_process_header (struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint64_t static_flags, @@ -2793,7 +2950,9 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -2803,7 +2962,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, static_flags, opcode, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, hfi1_type); return; } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { @@ -2811,7 +2970,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, static_flags, opcode, origin_rs, is_intranode, - lock_required, reliability); + lock_required, reliability, + hfi1_type, slid); return; } else if (opcode == FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH) { @@ -2819,12 +2979,10 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, static_flags, opcode, origin_rs, 
is_intranode, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); return; } - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - assert(opcode >= FI_OPX_HFI_BTH_OPCODE_MSG_INJECT); /* search the match queue */ @@ -2841,7 +2999,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, context, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, - is_intranode)) { + is_intranode, + slid)) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "context = %p\n", context); prev = context; context = context->next; @@ -2854,12 +3013,12 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, fi_opx_ep_rx_append_ue_tag(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); } else { fi_opx_ep_rx_append_ue_msg(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, - FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep)); + FI_OPX_DEBUG_COUNTERS_GET_PTR(opx_ep), slid); } return; @@ -2894,7 +3053,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, is_intranode, rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM, lock_required, - reliability); + reliability, + hfi1_type); return; @@ -2917,7 +3077,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, is_intranode, OPX_HMEM_FALSE, lock_required, - reliability); + reliability, + hfi1_type); if (context->len < opx_ep->rx->min_multi_recv) { /* after processing this message there is not @@ -3097,11 +3258,12 @@ void fi_opx_ep_do_pending_work(struct fi_opx_ep *opx_ep) fi_opx_ep_do_pending_sdma_work(opx_ep); } -static inline -void fi_opx_ep_rx_poll (struct fid_ep *ep, - const uint64_t caps, - const enum ofi_reliability_kind reliability, - const uint64_t hdrq_mask) +__OPX_FORCE_INLINE__ +void fi_opx_ep_rx_poll_internal (struct fid_ep *ep, + const uint64_t caps, + const enum ofi_reliability_kind 
reliability, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -3111,16 +3273,20 @@ void fi_opx_ep_rx_poll (struct fid_ep *ep, if (OFI_LIKELY(hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME)) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME); + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, + hfi1_type); } else if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048); + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, + hfi1_type); } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { /* constant compile-time expression */ FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192); + OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, + hfi1_type); } else { FI_OPX_FABRIC_POLL_MANY(ep, FI_OPX_LOCK_NOT_REQUIRED, rx_caps, - OFI_RELIABILITY_KIND_ONLOAD, hdrq_mask); + OFI_RELIABILITY_KIND_ONLOAD, hdrq_mask, + hfi1_type); } fi_opx_ep_do_pending_work(opx_ep); @@ -3132,6 +3298,24 @@ void fi_opx_ep_rx_poll (struct fid_ep *ep, } } +static inline +void fi_opx_ep_rx_poll (struct fid_ep *ep, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + fi_opx_ep_rx_poll_internal(ep, caps, reliability, hdrq_mask, OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + fi_opx_ep_rx_poll_internal(ep, caps, reliability, hdrq_mask, OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + fi_opx_ep_rx_poll_internal(ep, caps, reliability, hdrq_mask, OPX_HFI1_JKR_9B); + } else { + abort(); + } +} + __OPX_FORCE_INLINE__ int fi_opx_ep_cancel_context(struct fi_opx_ep * opx_ep, 
const uint64_t cancel_context, @@ -3189,7 +3373,8 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, const uint64_t is_context_ext, const uint64_t is_hmem, const int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(static_flags & (FI_TAGGED | FI_MSG)); const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; @@ -3202,7 +3387,7 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, "searching unexpected queue\n"); __attribute__((__unused__)) bool from_hash_queue = false; - struct fi_opx_hfi1_ue_packet *uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind); + struct fi_opx_hfi1_ue_packet *uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind, hfi1_type); #ifndef FI_OPX_MATCH_HASH_DISABLE if (!uepkt && kind == FI_OPX_KIND_TAG) { @@ -3214,36 +3399,44 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, #endif if (uepkt) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "found a match, uepkt = %p\n", uepkt); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "UEPKT found a match, uepkt = %p\n", uepkt); - uint8_t is_mp_eager = (uepkt->hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - uepkt->hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); + uint8_t is_mp_eager = (uepkt->hdr.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || + uepkt->hdr.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); - const unsigned is_intranode = fi_opx_hfi_is_intranode(uepkt->hdr.stl.lrh.slid); + const unsigned is_intranode = opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type); if (is_mp_eager) { complete_receive_operation_internal(ep, &uepkt->hdr, &uepkt->payload, uepkt->hdr.match.ofi_tag, &context, - uepkt->hdr.stl.bth.opcode, + uepkt->hdr.bth.opcode, is_context_ext, OPX_MULTI_RECV_FALSE, is_intranode, is_hmem, lock_required, - 
reliability); + reliability, + hfi1_type); /* Since this is the first multi-packet eager packet, the uid portion of the mp_egr_id will be this packet's PSN */ + uint64_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = (uint64_t)uepkt->hdr.lrh_9B.slid; + } else { + slid = htons((uint64_t)((uepkt->hdr.lrh_16B.slid20 << 20) | (uepkt->hdr.lrh_16B.slid))); + } const union fi_opx_mp_egr_id mp_egr_id = { .uid = uepkt->hdr.reliability.psn, .origin_tx = uepkt->hdr.reliability.origin_tx, - .slid = uepkt->hdr.stl.lrh.slid, + .slid = slid, .unused = 0 }; - fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, lock_required, reliability); + fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, + lock_required, reliability, hfi1_type); if (context->byte_counter) { context->mp_egr_id = mp_egr_id; @@ -3266,13 +3459,14 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, &uepkt->payload, uepkt->hdr.match.ofi_tag, context, - uepkt->hdr.stl.bth.opcode, + uepkt->hdr.bth.opcode, is_context_ext, OPX_MULTI_RECV_FALSE, is_intranode, is_hmem, lock_required, - reliability); + reliability, + hfi1_type); } #ifndef FI_OPX_MATCH_HASH_DISABLE @@ -3285,7 +3479,6 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, fi_opx_hfi1_ue_packet_slist_remove_item(uepkt, &opx_ep->rx->queue[kind].ue); #endif - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); return 0; } @@ -3298,7 +3491,6 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, context->next = NULL; fi_opx_context_slist_insert_tail(context, &opx_ep->rx->queue[kind].mq); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); return 0; } @@ -3321,11 +3513,10 @@ int fi_opx_ep_rx_process_context ( const uint64_t rx_op_flags, const uint64_t is_context_ext, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) + const enum 
ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - if (cancel_context) { /* branch should compile out */ int rc = fi_opx_ep_cancel_context(opx_ep, cancel_context, context, rx_op_flags, is_context_ext, lock_required); @@ -3339,13 +3530,13 @@ int fi_opx_ep_rx_process_context ( return fi_opx_ep_process_context_match_ue_packets(opx_ep, static_flags, context, OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_TRUE, - lock_required, reliability); + lock_required, reliability, hfi1_type); } return fi_opx_ep_process_context_match_ue_packets(opx_ep, static_flags, context, OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type); } else { /* @@ -3356,10 +3547,9 @@ int fi_opx_ep_rx_process_context ( "process peek, claim, or multi-receive context\n"); fi_opx_ep_rx_process_context_noinline(opx_ep, static_flags, - context, rx_op_flags, is_context_ext, is_hmem, lock_required, av_type, reliability); + context, rx_op_flags, is_context_ext, is_hmem, lock_required, av_type, reliability, hfi1_type); } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); return 0; } @@ -3400,7 +3590,8 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t static_flags, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(((static_flags & (FI_TAGGED | FI_MSG)) == FI_TAGGED) || ((static_flags & (FI_TAGGED | FI_MSG)) == FI_MSG)); @@ -3474,7 +3665,8 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, OPX_HMEM_TRUE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } else #endif { @@ -3487,7 +3679,8 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, OPX_HMEM_FALSE, lock_required, av_type, - 
reliability); + reliability, + hfi1_type); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECV RETURN\n"); @@ -3510,7 +3703,8 @@ static inline ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, const struct fi_msg *msg, uint64_t flags, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG\n"); FI_OPX_DEBUG_COUNTERS_INC_COND(!(flags & FI_MULTI_RECV), opx_ep->debug_counters.recv.posted_recv_msg); @@ -3549,7 +3743,8 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); return rc; @@ -3571,7 +3766,8 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); return rc; @@ -3627,7 +3823,8 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_TRUE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG (HMEM) RETURN\n"); return rc; @@ -3654,7 +3851,8 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); return rc; @@ 
-3684,7 +3882,8 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); @@ -3763,106 +3962,153 @@ ssize_t fi_opx_hfi1_tx_send_try_mp_egr (struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; assert (!fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)); - assert (len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE); + assert (len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)); const uint64_t bth_rx = ((uint64_t)addr.hfi1_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); /* Write the first packet */ uint32_t first_packet_psn; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (begin)\n"); uint8_t *buf_bytes_ptr = (uint8_t *) buf; - ssize_t rc = fi_opx_hfi1_tx_send_mp_egr_first (opx_ep, (void **) &buf_bytes_ptr, len, desc, - opx_ep->hmem_copy_buf, pbc_dlid, bth_rx, lrh_dlid, - addr, tag, data, lock_required, - caps, reliability, &first_packet_psn, - hmem_iface, hmem_device); + ssize_t rc; + rc = fi_opx_hfi1_tx_send_mp_egr_first_common (opx_ep, (void **) &buf_bytes_ptr, len, desc, + opx_ep->hmem_copy_buf, pbc_dlid, bth_rx, lrh_dlid, + addr, tag, data, lock_required, + caps, reliability, &first_packet_psn, + hmem_iface, hmem_device, hfi1_type); if (rc != FI_SUCCESS) { 
FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_fall_back_to_rzv); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (return %zd)\n", rc); + return rc; } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_first_packets); /* The first packet was successful. We're now committed to finishing this */ - ssize_t payload_remaining = len - FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - uint32_t payload_offset = FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; + ssize_t payload_remaining = len - FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + uint32_t payload_offset = FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER FIRST NTH (payload_remaining %zu)\n", payload_remaining); /* Write all the full nth packets */ - while (payload_remaining >= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE) { - rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, + while (payload_remaining >= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)) { + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, - lock_required, reliability); + lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_nth_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, + first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, + lock_required, reliability, hfi1_type); + } if (rc != FI_SUCCESS) { if (rc == -FI_ENOBUFS) { /* Insufficient credits. Try forcing a credit return and retry. 
*/ - fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps); + fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps, hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_force_cr); } else { - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); } do { - rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_nth(opx_ep, (void *)buf_bytes_ptr, payload_offset, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, - lock_required, reliability); + lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_nth_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, + first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, + lock_required, reliability, hfi1_type); + } + if (rc == -FI_EAGAIN) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } while (rc != FI_SUCCESS); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_packets); - payload_remaining -= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; - payload_offset += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE; + payload_remaining -= FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + buf_bytes_ptr += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + payload_offset += FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (payload_remaining %zu)\n", payload_remaining); } + /* Write all the last packet (if necessary) 
*/ if (payload_remaining > 0) { - rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER LAST (payload_remaining %zu)\n", payload_remaining); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + payload_remaining, + first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, + lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_last_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, payload_remaining, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, addr, - lock_required, reliability); + lock_required, reliability, hfi1_type); + } if (rc != FI_SUCCESS) { if (rc == -FI_ENOBUFS) { /* Insufficient credits. Try forcing a credit return and retry. */ - fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps); + fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps,hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_force_cr); } else { - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); } do { - rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + rc = fi_opx_hfi1_tx_send_mp_egr_last(opx_ep, (void *)buf_bytes_ptr, payload_offset, + payload_remaining, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, + addr, lock_required, reliability, hfi1_type); + } else { + rc = fi_opx_hfi1_tx_send_mp_egr_last_16B(opx_ep, (void *)buf_bytes_ptr, payload_offset, payload_remaining, first_packet_psn, pbc_dlid, bth_rx, lrh_dlid, - addr, lock_required, reliability); + addr, lock_required, reliability, hfi1_type); + } if 
(rc == -FI_EAGAIN) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_full_replay_buffer_rx_poll); - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } while (rc != FI_SUCCESS); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_nth_packets); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER LAST (payload_remaining %zu)\n", payload_remaining); + } if (OFI_LIKELY(do_cq_completion)) { fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER USER (end)\n"); + return FI_SUCCESS; } @@ -3885,28 +4131,28 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const bool mp_eager_fallback) + const bool mp_eager_fallback, + const enum opx_hfi1_type hfi1_type) { ssize_t rc; - if(is_contiguous) { rc = FI_OPX_FABRIC_TX_SEND_EGR(ep, buf, len, - desc, addr.fi, tag, context, data, - lock_required, - override_flags, tx_op_flags, addr.hfi1_rx, - caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + desc, addr.fi, tag, context, data, + lock_required, + override_flags, tx_op_flags, addr.hfi1_rx, + caps, reliability, do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } else { rc = FI_OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + hmem_iface, hmem_device, hfi1_type); } - if (OFI_LIKELY(rc == FI_SUCCESS)) { return rc; + #ifndef FI_OPX_MP_EGR_DISABLE } else if (rc == -FI_ENOBUFS && mp_eager_fallback) { /* Insufficient credits. 
If the payload is big enough, @@ -3918,11 +4164,11 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, if (rc == -FI_ENOBUFS) { /* Insufficient credits. Try forcing a credit return and retry. */ - fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps); + fi_opx_force_credit_return(ep, addr.fi, addr.hfi1_rx, caps,hfi1_type); } else { /* Likely full replay buffers or waiting for reliability handshake init. A poll might help */ - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } /* Note that we'll only iterate this loop more than once if we got here @@ -3931,20 +4177,20 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, do { if(is_contiguous) { rc = FI_OPX_FABRIC_TX_SEND_EGR(ep, buf, len, - desc, addr.fi, tag, context, data, - lock_required, - override_flags, tx_op_flags, addr.hfi1_rx, - caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + desc, addr.fi, tag, context, data, + lock_required, + override_flags, tx_op_flags, addr.hfi1_rx, + caps, reliability, do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } else { rc = FI_OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + hmem_iface, hmem_device, hfi1_type); } - fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } while (rc == -FI_ENOBUFS && loop++ < FI_OPX_EP_TX_SEND_EAGER_MAX_RETRIES); return rc; @@ -3964,7 +4210,8 @@ ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = 
container_of(ep, struct fi_opx_ep, ep_fid); union fi_opx_context * opx_context = (union fi_opx_context *)context; @@ -3988,24 +4235,44 @@ ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, } do { - if (is_contiguous) { - rc = FI_OPX_FABRIC_TX_SEND_RZV( - ep, buf, len, desc, addr.fi, tag, context, data, - lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + if (is_contiguous) { + rc = FI_OPX_FABRIC_TX_SEND_RZV( + ep, buf, len, desc, addr.fi, tag, context, data, + lock_required, override_flags, tx_op_flags, addr.hfi1_rx, + byte_counter_ptr, + byte_counter, + caps, reliability, hmem_iface, hmem_device, hfi1_type); + } else { + rc = FI_OPX_FABRIC_TX_SENDV_RZV( + ep, local_iov, niov, total_len, desc, addr.fi, tag, + context, data, lock_required, override_flags, tx_op_flags, + addr.hfi1_rx, + byte_counter_ptr, + byte_counter, + caps, reliability, hmem_iface, hmem_device, hfi1_type); + } } else { - rc = FI_OPX_FABRIC_TX_SENDV_RZV( - ep, local_iov, niov, total_len, desc, addr.fi, tag, - context, data, lock_required, override_flags, tx_op_flags, - addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device); + if (is_contiguous) { + rc = FI_OPX_FABRIC_TX_SEND_RZV_16B( + ep, buf, len, desc, addr.fi, tag, context, data, + lock_required, override_flags, tx_op_flags, addr.hfi1_rx, + byte_counter_ptr, + byte_counter, + caps, reliability, hmem_iface, hmem_device, hfi1_type); + } else { + /*rc = FI_OPX_FABRIC_TX_SENDV_RZV( + ep, local_iov, niov, total_len, desc, addr.fi, tag, + context, data, lock_required, override_flags, tx_op_flags, + addr.hfi1_rx, + byte_counter_ptr, + byte_counter, + caps, reliability, hfi1_type); */ + abort(); + } } if (OFI_UNLIKELY(rc == -EAGAIN)) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, 
OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } while (rc == -EAGAIN); @@ -4028,7 +4295,8 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND (begin)\n"); @@ -4081,7 +4349,7 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, addr.hfi1_rx, addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND"); return -FI_EAGAIN; } @@ -4103,15 +4371,14 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, fi_opx_ep_tx_do_cq_completion(opx_ep, override_flags, tx_op_flags); if (total_len < opx_ep->tx->rzv_min_payload_bytes) { - const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE && + const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) && total_len <= opx_ep->tx->mp_eager_max_payload_bytes); if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) { - rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov, niov, total_len, data, lock_required, is_contiguous, override_flags, tx_op_flags, caps, reliability, do_cq_completion, hmem_iface, hmem_device, - mp_eager_fallback); + mp_eager_fallback, hfi1_type); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); return rc; @@ -4131,7 +4398,7 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, rc = fi_opx_hfi1_tx_send_try_mp_egr(ep, buf, len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, caps, reliability, do_cq_completion, - FI_HMEM_SYSTEM, 0ul); + 
FI_HMEM_SYSTEM, 0ul, hfi1_type); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); return rc; @@ -4162,7 +4429,8 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, caps, reliability, do_cq_completion, - hmem_iface, hmem_device); + hmem_iface, hmem_device, + hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -4182,7 +4450,8 @@ ssize_t fi_opx_ep_tx_send(struct fid_ep *ep, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -4191,7 +4460,8 @@ ssize_t fi_opx_ep_tx_send(struct fid_ep *ep, ssize_t rc = fi_opx_ep_tx_send_internal(ep, buf, len, desc, dest_addr, tag, context, data, FI_OPX_LOCK_NOT_REQUIRED, av_type, is_contiguous, override_flags, - tx_op_flags, caps, reliability); + tx_op_flags, caps, reliability, + hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -4209,7 +4479,8 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { // Exactly one of FI_MSG or FI_TAGGED should be on assert((caps & (FI_MSG | FI_TAGGED)) && @@ -4231,7 +4502,8 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, FI_SELECTIVE_COMPLETION, // op flags to turn off context caps, - reliability); + reliability, + hfi1_type); } else { assert(len <= FI_OPX_HFI1_PACKET_IMM); } @@ -4252,7 +4524,7 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, const union fi_opx_addr addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dest_addr); const ssize_t rc = FI_OPX_FABRIC_TX_INJECT(ep, buf, len, addr.fi, tag, data, - 
lock_required, addr.hfi1_rx, caps, reliability); + lock_required, addr.hfi1_rx, caps, reliability, hfi1_type); if (OFI_UNLIKELY(rc == -EAGAIN)) { // In this case we are probably out of replay buffers. To deal @@ -4260,7 +4532,7 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, // process any incoming ACKs, hopefully releasing a buffer for // reuse. fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, - FI_OPX_HDRQ_MASK_RUNTIME); + FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "INJECT"); @@ -4280,7 +4552,8 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -4288,7 +4561,7 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, ssize_t rc = fi_opx_ep_tx_inject_internal(ep, buf, len, dest_addr, tag, data, FI_OPX_LOCK_NOT_REQUIRED, av_type, - caps, reliability); + caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -4301,14 +4574,15 @@ ssize_t fi_opx_recv_generic(struct fid_ep *ep, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t static_flags, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_ep_rx_recv_internal(opx_ep, buf, len, desc, src_addr, tag, ignore, context, FI_OPX_LOCK_NOT_REQUIRED, av_type, - static_flags, reliability); + static_flags, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -4318,12 +4592,14 @@ __OPX_FORCE_INLINE__ ssize_t 
fi_opx_recvmsg_generic(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type ) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_ep_rx_recvmsg_internal(opx_ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, reliability); + ssize_t rc = fi_opx_ep_rx_recvmsg_internal(opx_ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; diff --git a/prov/opx/include/rdma/opx/fi_opx_eq.h b/prov/opx/include/rdma/opx/fi_opx_eq.h index d77289f1de6..597f8031389 100644 --- a/prov/opx/include/rdma/opx/fi_opx_eq.h +++ b/prov/opx/include/rdma/opx/fi_opx_eq.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2022 Cornelis Networks. + * Copyright (C) 2022-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -46,53 +46,53 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FORMAT, RELIABILITY, MASK, CAPS) \ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS) +#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) -#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS)\ +#define FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE)\ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_read_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_read_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count) \ { \ return fi_opx_cq_read_generic_non_locking(cq, buf, count, \ - FORMAT, RELIABILITY, MASK, CAPS); \ + FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_readfrom_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_readfrom_ ## FORMAT ## _0_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count, \ fi_addr_t *src_addr) \ { \ return fi_opx_cq_readfrom_generic_non_locking(cq, buf, count, \ src_addr, FORMAT, RELIABILITY, MASK, \ - CAPS); \ + CAPS, HFI1_TYPE); \ } \ -#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FORMAT, RELIABILITY, MASK, CAPS) \ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS) +#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE) -#define FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS)\ +#define 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING_(FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE)\ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_read_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_read_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count) \ { \ return fi_opx_cq_read_generic_locking(cq, buf, count, \ - FORMAT, RELIABILITY, MASK, CAPS); \ + FORMAT, RELIABILITY, MASK, CAPS, HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_cq_readfrom_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS \ + fi_opx_cq_readfrom_ ## FORMAT ## _1_ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE \ (struct fid_cq *cq, void *buf, size_t count, \ fi_addr_t *src_addr) \ { \ return fi_opx_cq_readfrom_generic_locking(cq, buf, count, \ src_addr, FORMAT, RELIABILITY, MASK, \ - CAPS); \ + CAPS, HFI1_TYPE); \ } \ -#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ - FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS) +#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) -#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS) \ - fi_opx_ ## TYPE ## _ ## FORMAT ## _ ## LOCK ## _ ## RELIABILITY ## _ ## MASK ## _ ## CAPS +#define FI_OPX_CQ_SPECIALIZED_FUNC_NAME_(TYPE, FORMAT, LOCK, RELIABILITY, MASK, CAPS, HFI1_TYPE) \ + fi_opx_ ## TYPE ## _ ## FORMAT ## _ ## LOCK ## _ ## RELIABILITY ## _ ## MASK ## _ ## CAPS ## _ ## HFI1_TYPE #ifdef __cplusplus @@ -180,27 +180,33 @@ int fi_opx_cq_enqueue_err (struct fi_opx_cq * opx_cq, struct fi_ops_cq * fi_opx_cq_select_non_locking_2048_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * 
fi_opx_cq_select_non_locking_8192_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_non_locking_runtime_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_locking_2048_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_locking_8192_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type); struct fi_ops_cq * fi_opx_cq_select_locking_runtime_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps); + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type); void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); @@ -396,7 +402,8 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, const int lock_required, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { ssize_t num_entries = 0; @@ -423,35 +430,35 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ for (i=0; iprogress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048, hfi1_type); 
fi_opx_unlock(&opx_cq->progress.ep[i]->lock); } } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { for (i=0; iprogress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192, hfi1_type); fi_opx_unlock(&opx_cq->progress.ep[i]->lock); } } else { for (i=0; iprogress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); fi_opx_unlock(&opx_cq->progress.ep[i]->lock); } } } else { if (hdrq_mask == FI_OPX_HDRQ_MASK_2048) { /* constant compile-time expression */ for (i=0; iprogress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_2048, hfi1_type); } } else if (hdrq_mask == FI_OPX_HDRQ_MASK_8192) { for (i=0; iprogress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192, hfi1_type); } } else { for (i=0; iprogress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } } } @@ -504,9 +511,10 @@ ssize_t fi_opx_cq_read_generic_non_locking (struct fid_cq *cq, void *buf, size_t const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { - return fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps); + return fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); } __OPX_FORCE_INLINE__ @@ -514,11 +522,12 @@ ssize_t 
fi_opx_cq_read_generic_locking (struct fid_cq *cq, void *buf, size_t cou const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { int ret; fi_opx_lock(&((struct fi_opx_cq *) cq)->lock); - ret = fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps); + ret = fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); fi_opx_unlock(&((struct fi_opx_cq *) cq)->lock); return ret; @@ -529,10 +538,11 @@ ssize_t fi_opx_cq_readfrom_generic_non_locking (struct fid_cq *cq, void *buf, si const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { int ret; - ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps); + ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); if (ret > 0) { unsigned n; for (n=0; nlock); - ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps); + ret = fi_opx_cq_poll_inline(cq, buf, count, src_addr, format, FI_OPX_LOCK_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); fi_opx_unlock(&((struct fi_opx_cq *) cq)->lock); if (ret > 0) { unsigned n; diff --git a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h index 7222d220b09..2de99f8b765 100644 --- a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,13 +39,15 @@ #include "rdma/opx/fi_opx_hfi1_transport.h" #define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject -#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr -#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr +#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr_select +#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr_select #define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv #define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv #define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts #define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts -#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput +#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput + +#define FI_OPX_FABRIC_TX_SEND_RZV_16B fi_opx_hfi1_tx_send_rzv_16B #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h b/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h index 658b99bf321..249b3d1d328 100644 --- a/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h +++ b/prov/opx/include/rdma/opx/fi_opx_flight_recorder.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -41,7 +41,8 @@ #include "fi_opx_hfi1_packet.h" #include "fi_opx_timer.h" -#define FLIGHT_RECORDER_ENTRY_DATA_LEN (sizeof(union fi_opx_hfi1_packet_payload) + sizeof(union fi_opx_hfi1_packet_hdr)) +#define FLIGHT_RECORDER_ENTRY_DATA_LEN (sizeof(union fi_opx_hfi1_packet_payload) + sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)) + #ifndef FLIGHT_RECORDER_ENTRY_COUNT #define FLIGHT_RECORDER_ENTRY_COUNT (1024) #endif @@ -187,8 +188,8 @@ void flight_recorder_dump (struct flight_recorder * fr) } if (entry[i].type == FR_ENTRY_TYPE_PACKET_HDR) { fprintf(stderr, "PACKET HDR|\n"); - fi_opx_hfi1_dump_packet_hdr((union fi_opx_hfi1_packet_hdr *)entry[i].data, - "#FLIGHT_RECORDER", 0); + //fi_opx_hfi1_dump_packet_hdr((union opx_hfi1_packet_hdr *)entry[i].data, + // "#FLIGHT_RECORDER", 0); } else if (entry[i].type == FR_ENTRY_TYPE_PACKET) { flight_recorder_dump_packet_payload(entry); } else { @@ -220,7 +221,7 @@ void flight_recorder_dump (struct flight_recorder * fr) flight_recorder_init_next_entry((fr), (event_id), \ FR_ENTRY_TYPE_PACKET_HDR); \ memcpy((void *)next->data, (void *) &(packet_hdr), \ - sizeof(union fi_opx_hfi1_packet_hdr)); \ + sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); \ if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \ flight_recorder_dump((fr)); \ } diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index b27a8bb4285..22fd27eccf1 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -102,8 +102,11 @@ #define OPX_RZV_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min value */ #define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ -/* The total size for a single packet used in a multi-packet eager send. - This is packet payload plus 64 bytes for the PBC and packet header. +/* The PBC length to use for a single packet in a multi-packet eager send. 
+ + This is packet payload plus the PBC plus the packet header plus + tail (16B only). + All packets in a multi-packet eager send will be this size, except possibly the last one, which may be smaller. @@ -112,14 +115,25 @@ #define FI_OPX_MP_EGR_CHUNK_SIZE (4160) /* For full MP-Eager chunks, we pack 16 bytes of payload data in the - packet header. So the actual payload size for a full chunk is the - total chunk size minus 64 bytes for PBC and packet header, plus 16 - bytes for the space we use for payload data in the packet header. - Or, more simply, 48 bytes less than the total chunk size. */ -#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE (FI_OPX_MP_EGR_CHUNK_SIZE - 48) -#define FI_OPX_MP_EGR_CHUNK_CREDITS (FI_OPX_MP_EGR_CHUNK_SIZE >> 6) -#define FI_OPX_MP_EGR_CHUNK_DWS (FI_OPX_MP_EGR_CHUNK_SIZE >> 2) -#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS (FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE >> 3) + packet header. + + So the actual user payload __consumed__ for a full chunk is the + FI_OPX_MP_EGR_CHUNK_SIZE minus the PBC minus the header minus + the tail (16B only) plus 16 bytes payload packed in the header. + + The payload itself will be FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE - 16 + */ + +#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) \ + ((hfi1_type & OPX_HFI1_JKR) ? 
\ + (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 64 /* hdr */ + 8 /* tail */) - 16 /* payload */)) :\ + (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 56 /* hdr */) - 16 /* payload */))) + /* PAYLOAD BYTES CONSUMED */ + +#define FI_OPX_MP_EGR_CHUNK_CREDITS (FI_OPX_MP_EGR_CHUNK_SIZE >> 6) /* PACKET CREDITS TOTAL */ +#define FI_OPX_MP_EGR_CHUNK_DWS (FI_OPX_MP_EGR_CHUNK_SIZE >> 2) /* PBC DWS */ +#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type) \ + ((FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)) >> 3) /* PAYLOAD QWS CONSUMED */ #define FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL 16 #define FI_OPX_MP_EGR_XFER_BYTES_TAIL 0x0010000000000000ull @@ -227,20 +241,106 @@ abort(); return 0; } -struct fi_opx_hfi1_txe_scb { +/* Also refer to union opx_hfi1_packet_hdr comment - union { - uint64_t qw0; /* a.k.a. 'struct hfi_pbc' */ - //struct hfi_pbc pbc; - }; - union fi_opx_hfi1_packet_hdr hdr; + SCB (Send Control Block) is 8 QW's written to PIO SOP. + + Optimally, store 8 contiguous QW's. + + Cannot define a common 9B/16B structure that is contiguous, + so send code is 9B/16B aware. + + TX SCB + ===================================================== + GENERIC 9B 16B + ========= ================== =================== +QW[0] PBC +QW[1] HDR qw_9B[0] LRH qw_16B[0] LRH +QW[2] HDR qw_9B[1] BTH qw_16B[1] LRH +QW[3] HDR qw_9B[2] BTH/KDETH qw_16B[2] BTH +QW[4] HDR qw_9B[3] KDETH qw_16B[3] BTH/KDETH +QW[5] HDR qw_9B[4] USER/SW qw_16B[4] KDETH +QW[6] HDR qw_9B[5] USER/SW qw_16B[5] USER/SW +QW[7] HDR qw_9B[6] USER/SW qw_16B[6] USER/SW + + qw_16B[7] USER/SW + +Generic example + +// faster than memcpy() for this amount of data. 
+// SCB (PIO or UREG) COPY ONLY (STORE) +static inline void fi_opx_store_scb_qw(volatile uint64_t dest[8], const uint64_t source[8]) +{ + OPX_HFI1_BAR_STORE(&dest[0], source[0]); + OPX_HFI1_BAR_STORE(&dest[1], source[1]); + OPX_HFI1_BAR_STORE(&dest[2], source[2]); + OPX_HFI1_BAR_STORE(&dest[3], source[3]); + OPX_HFI1_BAR_STORE(&dest[4], source[4]); + OPX_HFI1_BAR_STORE(&dest[5], source[5]); + OPX_HFI1_BAR_STORE(&dest[6], source[6]); + OPX_HFI1_BAR_STORE(&dest[7], source[7]); +} + + +9B/16B example, must be hfi1-aware + + struct fi_opx_hfi1_txe_scb_9B model_9B = opx_ep->reliability->service.tx.hfi1.ping_model_9B; + struct fi_opx_hfi1_txe_scb_16B model_16B = opx_ep->reliability->service.tx.hfi1.ping_model_16B; -} __attribute__((__aligned__(8))); + volatile uint64_t * const scb = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + if ((hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B))) { + OPX_HFI1_BAR_STORE(&scb[0], (model_9B.qw0 | OPX_PBC_CR(0x1, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_9B.hdr.qw_9B[0] | lrh_dlid)); + OPX_HFI1_BAR_STORE(&scb[2], (model_9B.hdr.qw_9B[1] | bth_rx)); +<...> + } else { + OPX_HFI1_BAR_STORE(&scb[0], (model_16B.qw0 | OPX_PBC_CR(1, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_16B.hdr.qw_16B[0] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[2], (model_16B.hdr.qw_16B[1] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[3], model_16B.hdr.qw_16B[2] | bth_rx); +<...> + } + +*/ + +/* Only 8 QWs valid in 16 QW storage. */ +struct fi_opx_hfi1_txe_scb_9B { + + union { /* 15 QWs union*/ + + /* pbc is qw0. 
it overlays hdr's unused_pad_9B */ + struct { + uint64_t qw0; + uint64_t qw[14]; + } __attribute__((__packed__)) __attribute__((__aligned__(8))); + + union opx_hfi1_packet_hdr hdr; /* 1 QW unused + 7 QWs 9B header + 7 QWs unused*/ + + } __attribute__((__packed__)) __attribute__((__aligned__(8))); + + uint64_t pad; /* 1 QW pad (to 16 QWs) */ +} __attribute__((__aligned__(8))) __attribute__((packed)); + +/* 16 QW valid in 16 QW storage. */ +struct fi_opx_hfi1_txe_scb_16B { + uint64_t qw0; /* PBC */ + union opx_hfi1_packet_hdr hdr; /* 15 QWs 16B header */ +} __attribute__((__aligned__(8))) __attribute__((packed)); + +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == sizeof(struct fi_opx_hfi1_txe_scb_16B)), "storge for scbs should match"); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == (sizeof(uint64_t)*16)), "16 qw scb storage"); + +/* Storage for a scb. Use HFI1 type to access the correct structure */ +union opx_hfi1_txe_scb_union { + struct fi_opx_hfi1_txe_scb_9B scb_9B; + struct fi_opx_hfi1_txe_scb_16B scb_16B; +}; struct fi_opx_hfi1_rxe_hdr { - union fi_opx_hfi1_packet_hdr hdr; + union opx_hfi1_packet_hdr hdr; uint64_t rhf; } __attribute__((__aligned__(64))); @@ -403,7 +503,7 @@ struct fi_opx_hfi1_context { } info; int fd; - uint16_t lid; + uint32_t lid; struct _hfi_ctrl * ctrl; //struct hfi1_user_info_dep user_info; enum opx_hfi1_type hfi_hfi1_type; @@ -500,12 +600,12 @@ void fi_opx_consume_credits(union fi_opx_hfi1_pio_state *pio_state, size_t count } #define FI_OPX_HFI1_CREDITS_IN_USE(pio_state) fi_opx_credits_in_use(&pio_state) -#define FI_OPX_HFI1_UPDATE_CREDITS(pio_state, pio_credits_addr) fi_opx_update_credits(&pio_state, pio_credits_addr); +#define FI_OPX_HFI1_UPDATE_CREDITS(pio_state, pio_credits_addr) fi_opx_update_credits(&pio_state, pio_credits_addr) #define FI_OPX_HFI1_PIO_SCB_HEAD(pio_scb_base, pio_state) fi_opx_pio_scb_base(pio_scb_base, &pio_state) #define FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, force_credit_return, credits_needed) 
fi_opx_credits_avail(&pio_state, force_credit_return, credits_needed) #define FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) fi_opx_reliability_credits_avail(&pio_state) #define FI_OPX_HFI1_CONSUME_CREDITS(pio_state, count) fi_opx_consume_credits(&pio_state, count) -#define FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state) FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); +#define FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state) FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1) __OPX_FORCE_INLINE__ @@ -531,7 +631,7 @@ int fi_opx_hfi1_get_lid_local_unit(uint16_t lid) } __OPX_FORCE_INLINE__ -bool fi_opx_hfi_is_intranode(uint16_t lid) +bool opx_lid_is_intranode(uint16_t lid) { if (fi_opx_global.hfi_local_info.lid == lid) { return true; @@ -540,6 +640,19 @@ bool fi_opx_hfi_is_intranode(uint16_t lid) return fi_opx_hfi1_get_lid_local(lid); } +__OPX_FORCE_INLINE__ +bool opx_lrh_is_intranode(union opx_hfi1_packet_hdr *hdr, const enum opx_hfi1_type hfi1_type) +{ + uint32_t lid_be; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lid_be = hdr->lrh_9B.slid; + } else { + lid_be = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + } + return opx_lid_is_intranode(lid_be); +} + struct fi_opx_hfi1_context * fi_opx_hfi1_context_open (struct fid_ep *ep, uuid_t unique_job_key); int init_hfi1_rxe_state (struct fi_opx_hfi1_context * context, @@ -552,7 +665,10 @@ void fi_opx_init_hfi_lookup(); */ #define FI_OPX_SHM_FIFO_SIZE (1024) #define FI_OPX_SHM_BUFFER_MASK (FI_OPX_SHM_FIFO_SIZE-1) -#define FI_OPX_SHM_PACKET_SIZE (FI_OPX_HFI1_PACKET_MTU + sizeof(struct fi_opx_hfi1_stl_packet_hdr)) + + +#define FI_OPX_SHM_PACKET_SIZE (FI_OPX_HFI1_PACKET_MTU + sizeof(union opx_hfi1_packet_hdr)) + #ifndef NDEBUG #define OPX_BUF_FREE(x) \ @@ -589,7 +705,7 @@ void opx_print_context(struct fi_opx_hfi1_context *context) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context info.sdma.queue_size %#X\n",context->info.sdma.queue_size); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context info.sdma.completion_queue 
%p errcode %#X status %#X\n",context->info.sdma.completion_queue, context->info.sdma.completion_queue->errcode, - context->info.sdma.completion_queue->status); + context->info.sdma.completion_queue->status); /* Not printing Context info.sdma.queued_entries); */ FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context info.rxe.hdrq.base_addr %p \n",context->info.rxe.hdrq.base_addr); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h index cd2310f0be3..300340ec1aa 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h @@ -40,7 +40,7 @@ __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_put( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t op64, @@ -50,12 +50,20 @@ size_t opx_hfi1_dput_write_header_and_payload_put( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_PUT | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_PUT | (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); - tx_hdr->qw[5] = key; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_9B[5] = key; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_PUT | + (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); + hdr->qw_16B[6] = key; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } if (tx_payload) { assert(!iov); @@ -100,7 +108,7 @@ void opx_hfi1_dput_write_payload_atomic_fetch( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_atomic_fetch( struct fi_opx_ep *opx_ep, - union 
fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t op64, @@ -113,12 +121,20 @@ size_t opx_hfi1_dput_write_header_and_payload_atomic_fetch( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH | + (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); + hdr->qw_9B[5] = key; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH | (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); - tx_hdr->qw[5] = key; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_16B[6] = key; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } size_t dput_bytes = payload_bytes - sizeof(struct fi_opx_hfi1_dput_fetch); @@ -184,7 +200,7 @@ void opx_hfi1_dput_write_payload_atomic_compare_fetch( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t op64, @@ -200,12 +216,20 @@ size_t opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH | + (dt64 << 
16) | (op64 << 24) | (payload_bytes << 48); + hdr->qw_9B[5] = key; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH | (dt64 << 16) | (op64 << 24) | (payload_bytes << 48); - tx_hdr->qw[5] = key; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_16B[6] = key; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } size_t dput_bytes = payload_bytes - sizeof(struct fi_opx_hfi1_dput_fetch); size_t dput_bytes_half = dput_bytes >> 1; @@ -242,7 +266,7 @@ size_t opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_get( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t dt64, @@ -251,12 +275,20 @@ size_t opx_hfi1_dput_write_header_and_payload_get( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_GET | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_GET | + (dt64 << 16) | (payload_bytes << 48); + hdr->qw_9B[5] = rma_request_vaddr; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_GET | (dt64 << 16) | (payload_bytes << 48); - tx_hdr->qw[5] = rma_request_vaddr; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + hdr->qw_16B[6] = rma_request_vaddr; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } if (tx_payload) { assert(!iov); @@ -289,7 +321,7 @@ size_t opx_hfi1_dput_write_header_and_payload_get( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload_rzv( struct fi_opx_ep *opx_ep, - union 
fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint64_t op64, @@ -300,11 +332,18 @@ size_t opx_hfi1_dput_write_header_and_payload_rzv( uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + enum opx_hfi1_type hfi1_type) { - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | (opcode) | (payload_bytes << 48); - tx_hdr->qw[5] = target_byte_counter_vaddr; - tx_hdr->qw[6] = fi_opx_dput_rbuf_out(*rbuf); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | (opcode) | (payload_bytes << 48); + hdr->qw_9B[5] = target_byte_counter_vaddr; + hdr->qw_9B[6] = fi_opx_dput_rbuf_out(*rbuf); + } else { + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | (opcode) | (payload_bytes << 48); + hdr->qw_16B[6] = target_byte_counter_vaddr; + hdr->qw_16B[7] = fi_opx_dput_rbuf_out(*rbuf); + } if (tx_payload) { assert(!iov); @@ -325,7 +364,7 @@ size_t opx_hfi1_dput_write_header_and_payload_rzv( __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, struct iovec *iov, const uint32_t opcode, @@ -347,50 +386,63 @@ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { uint64_t psn = (uint64_t) htonl((uint32_t)psn_orig); - tx_hdr->qw[0] = opx_ep->rx->tx.dput.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.dput.hdr.qw[1] | bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.dput.hdr.qw[2] | psn; - tx_hdr->qw[3] = opx_ep->rx->tx.dput.hdr.qw[3]; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | 
lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[2] | psn; + hdr->qw_9B[3] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[3]; + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + hdr->qw_16B[0] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[3] | psn; + hdr->qw_16B[4] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[4]; + } switch(opcode) { case FI_OPX_HFI_DPUT_OPCODE_RZV: case FI_OPX_HFI_DPUT_OPCODE_RZV_TID: case FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG: return opx_hfi1_dput_write_header_and_payload_rzv( - opx_ep, tx_hdr, tx_payload, iov, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, opcode, target_byte_counter_vaddr, sbuf, - sbuf_iface, sbuf_device, rbuf); + sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_GET: return opx_hfi1_dput_write_header_and_payload_get( - opx_ep, tx_hdr, tx_payload, iov, + opx_ep, hdr, tx_payload, iov, dt64, payload_bytes, rma_request_vaddr, - sbuf, sbuf_iface, sbuf_device, rbuf); + sbuf, sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_PUT: return opx_hfi1_dput_write_header_and_payload_put( - opx_ep, tx_hdr, tx_payload, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, - key, sbuf, sbuf_iface, sbuf_device, rbuf); + key, sbuf, sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH: return opx_hfi1_dput_write_header_and_payload_atomic_fetch( - opx_ep, tx_hdr, tx_payload, iov, op64, dt64, + opx_ep, hdr, tx_payload, iov, op64, 
dt64, payload_bytes, key, fetch_vaddr, rma_request_vaddr, bytes_sent, sbuf, - sbuf_iface, sbuf_device, rbuf); + sbuf_iface, sbuf_device, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH: return opx_hfi1_dput_write_header_and_payload_atomic_compare_fetch( - opx_ep, tx_hdr, tx_payload, iov, op64, dt64, + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, key, fetch_vaddr, rma_request_vaddr, bytes_sent, sbuf, sbuf_iface, - sbuf_device, cbuf, cbuf_iface, cbuf_device, rbuf); + sbuf_device, cbuf, cbuf_iface, cbuf_device, rbuf, hfi1_type); break; default: FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -402,7 +454,7 @@ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_payload( struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, union fi_opx_hfi1_packet_payload *tx_payload, const uint32_t opcode, const int64_t psn_orig, @@ -423,20 +475,21 @@ size_t opx_hfi1_dput_write_header_and_payload( uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, - uintptr_t *rbuf) + uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { - return opx_hfi1_dput_write_packet(opx_ep, tx_hdr, tx_payload, NULL, + return opx_hfi1_dput_write_packet(opx_ep, hdr, tx_payload, NULL, opcode, psn_orig, lrh_dws, op64, dt64, lrh_dlid, bth_rx, payload_bytes, key, fetch_vaddr, target_byte_counter_vaddr, rma_request_vaddr, bytes_sent, sbuf, sbuf_iface, sbuf_device, - cbuf, cbuf_iface, cbuf_device, rbuf); + cbuf, cbuf_iface, cbuf_device, rbuf, hfi1_type); } __OPX_FORCE_INLINE__ size_t opx_hfi1_dput_write_header_and_iov(struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_packet_hdr *tx_hdr, + union opx_hfi1_packet_hdr *hdr, struct iovec *iov, const uint32_t opcode, const uint16_t lrh_dws, @@ -452,19 +505,20 @@ size_t opx_hfi1_dput_write_header_and_iov(struct fi_opx_ep *opx_ep, uint64_t bytes_sent, uint8_t **sbuf, uint8_t **cbuf, - uintptr_t *rbuf) 
+ uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { /* When we're just setting the IOV * 1. Use a PSN of 0, because the caller will set that later * 2. The sbuf/cbuf iface and device are not used, so just pass in system/0 */ - return opx_hfi1_dput_write_packet(opx_ep, tx_hdr, NULL, iov, opcode, 0, + return opx_hfi1_dput_write_packet(opx_ep, hdr, NULL, iov, opcode, 0, lrh_dws, op64, dt64, lrh_dlid, bth_rx, payload_bytes, key, fetch_vaddr, target_byte_counter_vaddr, rma_request_vaddr, bytes_sent, sbuf, FI_HMEM_SYSTEM, 0ul, cbuf, FI_HMEM_SYSTEM, 0ul, - rbuf); + rbuf, hfi1_type); } #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h index 64e654c9dfe..e80420ca8f5 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h @@ -98,23 +98,22 @@ /* Fields that unused on JKR (zero will be OR'd) */ #define OPX_PBC_JKR_UNUSED 0UL -#define OPX_PBC_JKR_DLID(_dlid) (((unsigned long long)(_dlid & OPX_PBC_JKR_DLID_MASK) << OPX_PBC_JKR_DLID_SHIFT) << OPX_PBC_MSB_SHIFT) -#define OPX_PBC_JKR_SCTXT(_ctx) (((unsigned long long)(_ctx & OPX_PBC_JKR_SCTXT_MASK) << OPX_PBC_JKR_SCTXT_SHIFT) << OPX_PBC_MSB_SHIFT) +#define OPX_PBC_JKR_DLID(_dlid) (((unsigned long long)(_dlid & OPX_PBC_JKR_DLID_MASK) << OPX_PBC_JKR_DLID_SHIFT) << OPX_MSB_SHIFT) +#define OPX_PBC_JKR_SCTXT(_ctx) (((unsigned long long)(_ctx & OPX_PBC_JKR_SCTXT_MASK) << OPX_PBC_JKR_SCTXT_SHIFT) << OPX_MSB_SHIFT) #define OPX_PBC_JKR_L2COMPRESSED(_c) OPX_PBC_JKR_UNUSED /* unused until 16B headers are optimized */ #define OPX_PBC_JKR_PORTIDX(_pidx) (((OPX_JKR_PHYS_PORT_TO_INDEX(_pidx)) & OPX_PBC_JKR_PORT_MASK) << OPX_PBC_JKR_PORT_SHIFT) #define OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(_dlid) OPX_PBC_JKR_DLID(htons(_dlid >> 16)) +#define OPX_PBC_JKR_INSERT_NON9B_ICRC (1<<24) #ifndef NDEBUG __OPX_FORCE_INLINE__ uint32_t opx_pbc_jkr_l2type(unsigned _type) { - /* 16B not supported yet */ - assert(_type == OPX_PBC_JKR_L2TYPE_9B); return 
(_type & OPX_PBC_JKR_L2TYPE_MASK) << OPX_PBC_JKR_L2TYPE_SHIFT; } #define OPX_PBC_JKR_L2TYPE(_type) opx_pbc_jkr_l2type(_type) #else -#define OPX_PBC_JKR_L2TYPE(_type) ((OPX_PBC_JKR_L2TYPE_9B & OPX_PBC_JKR_L2TYPE_MASK) << OPX_PBC_JKR_L2TYPE_SHIFT) /* 16B not supported yet */ +#define OPX_PBC_JKR_L2TYPE(_type) ((_type & OPX_PBC_JKR_L2TYPE_MASK) << OPX_PBC_JKR_L2TYPE_SHIFT) #endif #define OPX_PBC_JKR_RUNTIME(_dlid, _pidx) OPX_PBC_JKR_UNUSED @@ -187,6 +186,18 @@ static inline int opx_bth_rc2_val() #define OPX_BTH_JKR_RC2(_rc2) ((_rc2 & OPX_BTH_JKR_RC2_MASK) << OPX_BTH_JKR_RC2_SHIFT) #define OPX_BTH_JKR_RC2_VAL opx_bth_rc2_val() + +/* LRH */ +#define OPX_LRH_JKR_16B_DLID_MASK_16B 0x0FFFFF +#define OPX_LRH_JKR_16B_DLID_SHIFT_16B OPX_MSB_SHIFT + +#define OPX_LRH_JKR_16B_DLID20_MASK_16B 0xF00000 +#define OPX_LRH_JKR_16B_DLID20_SHIFT_16B (20 - 12) // shift right 20 (dlid bits) and left 12 (lrh bits) + +#define OPX_LRH_JKR_16B_RX_MASK_16B 0xFF +#define OPX_LRH_JKR_16B_RX_SHIFT_16B (7*8) // 7 bytes + + /* RHF */ /* JKR * @@ -204,8 +215,8 @@ static inline int opx_bth_rc2_val() #define OPX_JKR_RHF_SEQ_NOT_MATCH(_seq, _rhf) (_seq != (_rhf & 0x0F00000000000000ul)) #define OPX_JKR_RHF_SEQ_INCREMENT(_seq) ((_seq < 0x0D00000000000000ul) * _seq + 0x0100000000000000ul) -#define OPX_JKR_IS_ERRORED_RHF(_rhf) (_rhf & 0x8000000000000000ul) -#define OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf) (_seq == (_rhf & 0x0F00000000000000ul)) +#define OPX_JKR_IS_ERRORED_RHF(_rhf, _hfi1_type) (_rhf & 0x8000000000000000ul) /* does not check RHF.KHdrLenErr */ +#define OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) (_seq == (_rhf & 0x0F00000000000000ul)) #define OPX_JKR_RHF_SEQ_INIT_VAL (0x0100000000000000ul) #define OPX_JKR_RHF_IS_USE_EGR_BUF(_rhf) ((_rhf & 0x00008000ul) == 0x00008000ul) @@ -238,10 +249,11 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union 
opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -#define OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) \ - opx_jkr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) +#define OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) \ + opx_jkr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) // Common to both JKR/WFR @@ -250,22 +262,122 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, #define OPX_JKR_RHF_RCV_TYPE_OTHER(_rhf) ((_rhf & 0x00006000ul) != 0x00000000ul) /* Common (jkr) handler to WFR/JKR 9B (for now) */ -int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr); +int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -__OPX_FORCE_INLINE__ int opx_jkr_rhf_check_header(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr) +__OPX_FORCE_INLINE__ int opx_jkr_9B_rhf_check_header(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { /* RHF error */ - if (OFI_UNLIKELY(OPX_JKR_IS_ERRORED_RHF(rhf_rcvd))) return 1; /* error */ + if (OFI_UNLIKELY(OPX_JKR_IS_ERRORED_RHF(rhf_rcvd, OPX_HFI1_JKR))) return 1; /* error */ /* Bad packet header */ if (OFI_UNLIKELY((!OPX_JKR_RHF_IS_USE_EGR_BUF(rhf_rcvd)) && - (ntohs(hdr->stl.lrh.pktlen) > 0x15) && + (ntohs(hdr->lrh_9B.pktlen) > 0x15) && !(OPX_JKR_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)))) - return opx_jkr_rhf_error_handler(rhf_rcvd, hdr); /* error */ + return opx_jkr_rhf_error_handler(rhf_rcvd, hdr, hfi1_type); /* error */ else return 0; /* no error*/ } -#define OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) opx_jkr_rhf_check_header(_rhf_rcvd, _hdr) +__OPX_FORCE_INLINE__ int 
opx_jkr_16B_rhf_check_header(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) +{ + /* RHF error */ + if (OFI_UNLIKELY(OPX_JKR_IS_ERRORED_RHF(rhf_rcvd, OPX_HFI1_JKR))) return 1; /* error */ + + /* Bad packet header */ + if (OFI_UNLIKELY((!OPX_JKR_RHF_IS_USE_EGR_BUF(rhf_rcvd)) && + (hdr->lrh_16B.pktlen > 0x9) && + !(OPX_JKR_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)))) + return opx_jkr_rhf_error_handler(rhf_rcvd, hdr, hfi1_type); /* error */ + else + return 0; /* no error*/ +} + +#define OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr, _hfi1_type) ((_hfi1_type & OPX_HFI1_JKR_9B) ? \ + opx_jkr_9B_rhf_check_header(_rhf_rcvd, _hdr, _hfi1_type) : opx_jkr_16B_rhf_check_header(_rhf_rcvd, _hdr, _hfi1_type)) + +union opx_jkr_pbc{ + uint64_t raw64b; + uint32_t raw32b[2]; + + __le64 qw; + __le32 dw[2]; + __le16 w[1]; + + struct { + __le64 LengthDWs:12; + __le64 Vl:4; + __le64 PortIdx:2; + __le64 Reserved_2:1; + __le64 L2Compressed:1; + __le64 L2Type:2; + __le64 Fecnd:1; + __le64 TestBadLcrc:1; + __le64 InsertNon9bIcrc:1; + __le64 CreditReturn:1; + __le64 InsertHcrc:2; + __le64 Reserved_1:1; + __le64 TestEbp:1; + __le64 Sc4:1; + __le64 Intr:1; + __le64 Dlid: 24; + __le64 SendCtxt: 8; + }; + +}; + +#ifndef NDEBUG + #define OPX_PRINT_RHF(a) opx_print_rhf((opx_jkr_rhf)(a),__func__,__LINE__) +#else + #define OPX_PRINT_RHF(a) +#endif + +union opx_jkr_rhf { + uint64_t qw; + uint32_t dw[2]; + uint16_t w[4]; + struct { + uint64_t PktLen:12; + uint64_t RcvType:3; + uint64_t UseEgrBfr:1; + uint64_t EgrIndex:14; + uint64_t Rsvd:1; + uint64_t KHdrLenErr:1; + uint64_t EgrOffset:12; + uint64_t HdrqOffset:9; + uint64_t L2Type9bSc4:1; + uint64_t L2Type:2; + uint64_t RcvSeq:4; + uint64_t RcvPort:2; + uint64_t SendPacing:1; + uint64_t RheValid:1; + }; +}; + + +static inline void opx_print_rhf(union opx_jkr_rhf rhf, const char* func, const unsigned line) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: %s \n", func, line, __func__); + + 
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RheValid = %#x\n", func, line, rhf.RheValid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.SendPacing = %#x\n", func, line, rhf.SendPacing); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RcvPort = %#x\n", func, line, rhf.RcvPort); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RcvSeq = %#x\n", func, line, rhf.RcvSeq); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s:%u: RHF.L2Type = %#x%s\n", func, line, rhf.L2Type, + (rhf.L2Type == 0x3 ? " 9B": + (rhf.L2Type == 0x2 ? " 16B": + (rhf.L2Type == 0x1 ? " 10B": + (rhf.L2Type == 0x0 ? " 8B":" INVALID"))))); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.L2Type9bSc4 = %#x\n", func, line, rhf.L2Type9bSc4); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.HdrqOffset = %#x\n", func, line, rhf.HdrqOffset); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.EgrOffset = %#x\n", func, line, rhf.EgrOffset); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.Rsvd = %#x\n", func, line, rhf.Rsvd); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.KHdrLenErr = %#x\n", func, line, rhf.KHdrLenErr); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.EgrIndex = %#x\n", func, line, rhf.EgrIndex); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.UseEgrBfr = %#x\n", func, line, rhf.UseEgrBfr); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.RcvType = %#x\n", func, line, rhf.RcvType); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s:%u: RHF.PktLen = %#x\n", func, line, rhf.PktLen); +} #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 87a5a65d9f5..353b1db6399 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -53,6 +53,7 @@ #define FI_OPX_ADDR_SEP_RX_MAX (4) #define FI_OPX_HFI1_PACKET_MTU 
(8192) #define OPX_HFI1_TID_PAGESIZE (PAGE_SIZE) /* assume 4K, no hugepages*/ + #define FI_OPX_HFI1_PACKET_IMM (16) /* opcodes (0x00..0xBF) are reserved */ @@ -110,19 +111,19 @@ static inline const char* opx_hfi1_bth_opcode_to_string(uint16_t opcode) } return FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[sizeof(FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS)/sizeof(char*)-1]; /* INVALID */ } -#define FI_OPX_HFI1_PACKET_SLID(packet_hdr) \ - (((packet_hdr).qw[0] & 0xFFFF000000000000ul) >> 48) -#define FI_OPX_HFI1_PACKET_PSN(packet_hdr) \ - (((packet_hdr)->stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ - ? ntohl((packet_hdr)->stl.bth.psn) & 0x00FFFFFF \ - : (packet_hdr)->reliability.psn) -#define FI_OPX_HFI1_PACKET_ORIGIN_TX(packet_hdr) \ - (((packet_hdr)->stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ - ? (packet_hdr)->dput.target.origin_tx \ + +#define FI_OPX_HFI1_PACKET_ORIGIN_TX(packet_hdr) \ + (((packet_hdr)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ + ? (packet_hdr)->dput.target.origin_tx \ : (packet_hdr)->reliability.origin_tx) +#define FI_OPX_HFI1_PACKET_PSN(packet_hdr) \ + (((packet_hdr)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) \ + ? ntohl((packet_hdr)->bth.psn) & 0x00FFFFFF \ + : (packet_hdr)->reliability.psn) + #define FI_OPX_HFI_UD_OPCODE_FIRST_INVALID (0x00) #define FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING (0x01) #define FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK (0x02) @@ -214,7 +215,9 @@ static inline const char* opx_hfi1_dput_opcode_to_string(uint8_t opcode) #define HFI_KHDR_KVER_SHIFT 30 #define HFI_KHDR_KVER_MASK 0x3 -struct fi_opx_hfi1_stl_packet_hdr { + +/* "Legacy" header with 9DWs of KDETH */ +struct fi_opx_hfi1_stl_packet_hdr_9B { /* == quadword 0 == */ union { @@ -226,8 +229,8 @@ struct fi_opx_hfi1_stl_packet_hdr { uint16_t dlid; /* lrh.w[1] - big-endian! */ uint16_t pktlen; /* lrh.w[2] - big-endian! */ uint16_t slid; /* lrh.w[3] - big-endian! 
*/ - } __attribute__((packed)); - } lrh; + } __attribute__((__packed__)); + } lrh_9B; /* == quadword 1 == */ union { @@ -245,7 +248,7 @@ struct fi_opx_hfi1_stl_packet_hdr { /* == quadword 2 == */ uint32_t psn; /* bth.dw[2] ..... the 'psn' field is unused for 'eager' packets -----> reliability::psn, etc */ - } __attribute__((packed)); + } __attribute__((__packed__)); } bth; union { @@ -259,11 +262,87 @@ struct fi_opx_hfi1_stl_packet_hdr { uint16_t jkey; /* kdeth.w[2] */ uint16_t hcrc; /* kdeth.w[3] */ uint32_t unused; /* kdeth.dw[2] -----> immediate data (32b) */ - } __attribute__((packed)); + } __attribute__((__packed__)); } kdeth; /* == quadword 4,5,6 == */ - uint64_t unused[3]; + uint64_t kdeth_sw[3]; + +} __attribute__((__packed__)); + +/* "Legacy" header with 9DWs of KDETH */ +struct fi_opx_hfi1_stl_packet_hdr_16B { + + /* == quadword 0,1 == */ + union { + struct { + __le64 qw0; + __le64 qw1; + }; + __le64 qw[2]; + __le32 dw[4]; + __le16 w[8]; + struct { /* 16B header */ + __le32 slid:20; /* dw[0] qw[0]*/ + /* This is the packet length and is in units of flits (QWs) for 8B, 10B and 16B + formats, but in units of DWs for 9B formats.*/ + __le32 pktlen:11; + __le32 b:1; + + __le32 dlid:20; /* dw[1] */ + __le32 sc:5; + __le32 rc:3; + __le32 f:1; + __le32 l2:2; + __le32 lt:1; + + __le32 l4:8; /* dw[2] qw[1] */ + __le32 slid20:4; + __le32 dlid20:4; + __le32 pkey:16; + + __le32 entropy:16; /* dw[3] */ + __le32 age:3; + __le32 cspec:5; + __le32 r:8; + }; + }lrh_16B; + + /* == quadword 2 == */ + union { + uint32_t dw[3]; + uint16_t w[6]; + uint8_t hw[12]; + struct { + uint8_t opcode; /* bth.hw[0] */ + uint8_t bth_1; /* bth.hw[1] */ + uint16_t pkey; /* bth.w[1] - big-endian! */ + uint8_t ecn; /* bth.hw[4] (FECN, BECN, (CSPEC and RC2 for JKR) and reserved) */ + uint8_t qp; /* bth.hw[5] */ + uint8_t unused; /* bth.hw[6] -----> inject::message_length, send::xfer_bytes_tail */ + uint8_t rx; /* bth.hw[7] */ + + /* == quadword 3 == */ + uint32_t psn; /* bth.dw[2] ..... 
the 'psn' field is unused for 'eager' packets -----> reliability::psn, etc */ + } __attribute__((__packed__)); + } bth; + + union { + uint32_t dw[3]; + uint16_t w[6]; + uint8_t hw[12]; + struct { + uint32_t offset_ver_tid; /* kdeth.dw[0] .... the 'offset' field is unused for 'eager' packets */ + + /* == quadword 4 == */ + uint16_t jkey; /* kdeth.w[2] */ + uint16_t hcrc; /* kdeth.w[3] */ + uint32_t unused; /* kdeth.dw[2] -----> immediate data (32b) */ + } __attribute__((__packed__)); + } kdeth; + + /* == quadword 5,6,7 == */ + uint64_t kdeth_sw[3]; } __attribute__((__packed__)); @@ -337,9 +416,10 @@ struct fi_opx_hfi1_stl_packet_hdr { #define FI_OPX_PKT_RZV_FLAGS_NONCONTIG (1ul) #define FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK (FI_OPX_PKT_RZV_FLAGS_NONCONTIG << FI_OPX_PKT_RZV_FLAGS_SHIFT) +#if 0 #ifndef NDEBUG static inline -void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, +void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr_9B * hdr, const char * fn, const unsigned ln) { #if __GNUC__ > 9 @@ -351,10 +431,10 @@ void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, #endif fprintf(stderr, "%s():%u ==== dump stl packet header @ %p [%016lx %016lx %016lx %016lx]\n", fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); - fprintf(stderr, "%s():%u .lrh.flags ............. 0x%04hx\n", fn, ln, hdr->lrh.flags); - fprintf(stderr, "%s():%u .lrh.dlid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh.dlid, hdr->lrh.dlid, ntohs(hdr->lrh.dlid)); - fprintf(stderr, "%s():%u .lrh.pktlen ............ 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh.pktlen, hdr->lrh.pktlen, ntohs(hdr->lrh.pktlen)); - fprintf(stderr, "%s():%u .lrh.slid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh.slid, hdr->lrh.slid, ntohs(hdr->lrh.slid)); + fprintf(stderr, "%s():%u .lrh.flags ............. 0x%04hx\n", fn, ln, hdr->lrh_9B.flags); + fprintf(stderr, "%s():%u .lrh.dlid .............. 
0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh.dlid)); + fprintf(stderr, "%s():%u .lrh.pktlen ............ 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh.pktlen)); + fprintf(stderr, "%s():%u .lrh.slid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.slid, hdr->lrh_9B.slid, ntohs(hdr->lrh.slid)); fprintf(stderr, "%s():%u\n", fn, ln); fprintf(stderr, "%s():%u .bth.opcode ............ 0x%02x \n", fn, ln, hdr->bth.opcode); fprintf(stderr, "%s():%u .bth.bth_1 ............. 0x%02x \n", fn, ln, hdr->bth.bth_1); @@ -374,7 +454,7 @@ void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, return; } #endif - +#endif /** @@ -383,17 +463,17 @@ void fi_opx_hfi1_dump_stl_packet_hdr (struct fi_opx_hfi1_stl_packet_hdr * hdr, * The HFI1 packet header is consumed in many places and sometimes overloaded * for cache and memory allocation reasons. */ -union fi_opx_hfi1_packet_hdr { +union fi_opx_hfi1_packet_hdr_9B { uint64_t qw[7]; - struct fi_opx_hfi1_stl_packet_hdr stl; + struct fi_opx_hfi1_stl_packet_hdr_9B stl; struct { /* == quadword 0 == */ uint16_t reserved_0[3]; - uint16_t slid; + uint16_t _slid; /* == quadword 1 == */ uint64_t reserved_1; @@ -413,7 +493,7 @@ union fi_opx_hfi1_packet_hdr { struct { /* == quadword 0 == */ uint16_t reserved_0[3]; - uint16_t slid; /* used for FI_DIRECTED_RECV; identifies the node - big-endian! */ + uint16_t _slid; /* used for FI_DIRECTED_RECV; identifies the node - big-endian! 
*/ /* == quadword 1 == */ uint64_t reserved_1; @@ -724,7 +804,7 @@ union fi_opx_hfi1_packet_hdr { struct { /* == quadword 0 == */ uint16_t reserved_0[3]; - uint16_t slid; /* stl.lrh.slid */ + uint16_t _slid; /* stl.lrh.slid */ /* == quadword 1 == */ uint64_t reserved_1; @@ -746,16 +826,477 @@ union fi_opx_hfi1_packet_hdr { } __attribute__((__packed__)) service; /* "reliability service" */ } __attribute__((__aligned__(8))); -static_assert(((offsetof(union fi_opx_hfi1_packet_hdr, rendezvous.flags) % 8) * 8) == FI_OPX_PKT_RZV_FLAGS_SHIFT, - "struct fi_opx_hfi1_packet_hdr.rendezvous.flags offset inconsistent with FLAGS_SHIFT!"); -static inline -fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union fi_opx_hfi1_packet_hdr * const hdr) { +static_assert(((offsetof(union fi_opx_hfi1_packet_hdr_9B, rendezvous.flags) % 8) * 8) == FI_OPX_PKT_RZV_FLAGS_SHIFT, + "struct opx_hfi1_packet_hdr.rendezvous.flags offset inconsistent with FLAGS_SHIFT!"); + + + +/* + HEADER UNION RX POLL + ===================== ============= + 9B 16B 9B 16B + ========= ========== ===== ===== +QW[0] (pad) LRH <-- | <-- RX header union pointer +QW[1] LRH LRH | | +QW[2] BTH BTH | | <- RX common OPX headers +QW[3] BTH/KDETH BTH/KDETH | | +QW[4] KDETH KDETH | | +QW[5] USER/SW USER/SW | | +QW[6] USER/SW USER/SW | | +QW[7] USER/SW USER/SW | | + RHF RHF + + (*) HDRQ entries are 128 bytes (16 quadwords) and include HEADER + RHF + + In RX POLL, pull SLID, DLID and PKTLEN out of 9B/16B LRH. + All other RX stack can use the common OPX headers to access OPX fields. +*/ + +/** + * \brief Converged HFI1 packet header for 9B & 16B (JKR) + * + * The HFI1 packet header is consumed in many places and sometimes overloaded + * for cache and memory allocation reasons. 
+ */ +union opx_hfi1_packet_hdr { + /* STL UNION */ + union opx_hfi1_stl_packet_hdr { + struct { + uint64_t qw0; + struct fi_opx_hfi1_stl_packet_hdr_9B hdr_9B; /* 9B legacy w/ 9 DW KDETH */ + uint64_t qwn[7]; /* 9B(+) QW's */ + } __attribute__((__packed__)) fi_opx_hfi1_stl_packet_hdr_9BP; /* 9B(+14 DWs of KDETH ) */ + struct { + struct fi_opx_hfi1_stl_packet_hdr_16B hdr_16B; /* 16B legacy w/ 9 DW KDETH */ + uint64_t qwn[7]; /* 16B(+) QW's */ + } __attribute__((__packed__)) fi_opx_hfi1_stl_packet_hdr_16BP; /* 16B(+14 DWs of KDETH */ + } __attribute__((__packed__)) stl; /* for alignment/sizes*/ + + /* QUADWORD UNION */ + struct { + uint64_t unused_pad_9B; + uint64_t qw_9B[7]; /* 9B QW's */ + uint64_t qw_9BP[7]; /* 9B(+) QW's */ + }; + uint64_t qw_16B[15]; /* 16B QW's */ + + /* Standard (new) Headers - LRH, BTH, KDETH, SW defined (KDETH) + 15 quadwords */ + struct { + /* LRH union for (padded) 9B and 16B LRH */ + union { + struct { + uint64_t unused_pad_qw0; + union { + uint64_t qw[1]; /* 9B LRH is 1 quadword */ + + struct { /* 9B LRH */ + uint16_t flags; + uint16_t dlid; + uint16_t pktlen; + uint16_t slid; + } __attribute__((__packed__)); + } lrh_9B; + }; + union { + __le64 qw[2]; /* 16B is 2 quadwords */ + + struct { /* 16B LRH */ + __le32 slid:20; + /* This is the packet length and is in units of flits (QWs) for 8B, 10B and 16B + formats, but in units of DWs for 9B formats.*/ + __le32 pktlen:11; + __le32 b:1; + + __le32 dlid:20; + __le32 sc:5; + __le32 rc:3; + __le32 f:1; + __le32 l2:2; + __le32 lt:1; + + __le32 l4:8; + __le32 slid20:4; + __le32 dlid20:4; + __le32 pkey:16; + + __le32 entropy:16; + __le32 age:3; + __le32 cspec:5; + __le32 r:8; + }; + } lrh_16B; + } ; + + /* QW[2-3] BTH 1 1/2 quadwords, 3 dwords */ + struct { + uint8_t opcode; + uint8_t bth_1; + uint16_t pkey; + uint8_t ecn; /* (FECN, BECN, (CSPEC and RC2 for JKR) and reserved) */ + uint8_t qp; + uint8_t unused; + uint8_t rx; + + /* QW[3] starts */ + uint32_t psn; + } __attribute__((__packed__)) bth; 
+ + /* QW[3-4] KDETH 1 1/2 quadwords, 3 dwords */ + struct { + uint32_t offset_ver_tid; + + /* QW[4] starts */ + uint16_t jkey; + uint16_t hcrc; + uint32_t unused; + } __attribute__((__packed__)) kdeth; + + /* QW[5-7] 9B SW defined */ + /* QW[8-14] 9B(+) SW defined */ + /* QW[5-14] 16B SW defined */ + uint64_t sw_defined[10]; + } __attribute__((__packed__)); + + + /* OPX headers + * + * overlay/redefine some standard header fields + * and the SW defined header */ + + + /* OPX RELIABILITY HEADER */ + struct { + uint64_t reserved[3]; /* QW[0-2] */ + + /* QW[3] BTH/KDETH (psn,offset_ver_tid)*/ + uint32_t psn : 24; + uint32_t origin_tx : 8; + uint8_t unused; /* WHY? unused but zeroed in model */ + uint8_t reserved_1[3]; + + uint64_t reserved_n[10]; /* QW[4-14] KDETH/SW */ + + } __attribute__((__packed__)) reliability; + + + /* OPX MATCH HEADER */ + struct { + uint64_t reserved[3]; /* QW[0-2] */ + + /* QW[3] BTH/KDETH (psn) */ + uint8_t reserved_0[3]; + uint8_t origin_tx; /* used for FI_DIRECTED_RECV; identifies the endpoint on the node */ + uint32_t reserved_1; + + /* QW[4] KDETH (unused) */ + uint32_t reserved_2; + uint32_t ofi_data; /* used for FI_RX_CQ_DATA */ + + uint64_t reserved_3[2]; /* QW[5-6] SW */ + + uint64_t ofi_tag; /* QW[7] SW last 9B quadword */ + uint64_t reserved_n[6]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) match; + + + /* OPX INJECT HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t message_length; /* only need 5 bits; maximum inject message size is 16 bytes */ + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH*/ + uint64_t reserved_3[2]; + + /* QW[5-6] SW */ + union { + uint8_t app_data_u8[16]; + uint16_t app_data_u16[8]; + uint32_t app_data_u32[4]; + uint64_t app_data_u64[2]; + }; + + uint64_t reserved_n[7]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) inject; + + + /* OPX SEND HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ 
+ uint16_t reserved_1[3]; + uint8_t xfer_bytes_tail; /* only need 4 bits; maximum tail size is 8 bytes (or is it 7?) */ + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH*/ + uint64_t reserved_3[2]; + + /* QW[5] SW */ + uint16_t unused[3]; + uint16_t payload_qws_total; /* TODO - use stl.lrh.pktlen instead (num dws); only need 11 bits; maximum number of payload qw is 10240 / 8 = 1280 */ + + /* QW[6] SW */ + uint64_t xfer_tail; + + uint64_t reserved_n[7]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) send; + + /* OPX MP EAGER 1ST HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t xfer_bytes_tail; /* Maximum tail size is 16 bytes */ + uint8_t reserved_2; + + /* QW[3] BTH/KDETH (offset_ver_tid) */ + uint32_t reserved_3; + uint32_t payload_bytes_total; /* Total length of payload across all mp-eager packets */ + + /* QW[4] KDETH */ + uint64_t reserved_4; + + /* QW[5-6] SW */ + uint64_t xfer_tail[2]; + + uint64_t reserved_n[7]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) mp_eager_first; + + /* OPX MP EAGER NTH HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t xfer_bytes_tail; /* Maximum tail size is 16 bytes */ + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH */ + uint64_t reserved_3[2]; + + /* QW[5-6] SW */ + uint64_t xfer_tail[2]; + + /* QW[7] SW last 9B quadword */ + uint32_t payload_offset; + uint32_t mp_egr_uid; + + uint64_t reserved_n[6]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) mp_eager_nth; + + /* OPX RENDEZVOUS HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t origin_rx; + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH */ + uint64_t reserved_3[2]; + + /* QW[5] SW */ + uint16_t origin_rs; + uint8_t flags; + uint8_t unused[3]; + uint16_t niov; /* number of non-contiguous buffers */ + + /* QW[6] SW */ + uint64_t message_length; /* 
total length in bytes of all non-contiguous buffers and immediate data */ + + uint64_t reserved_n[7]; /* QW[7-14] SW */ + + } __attribute__((__packed__)) rendezvous; + + /* OPX CTS HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t origin_rx; + uint8_t reserved_2; + + /* QW[3-4] BTH/KDETH */ + uint64_t reserved_3[2]; + + /* QW[5-14] SW */ + union { + uint8_t opcode; + struct { + /* QW[5] SW */ + uint8_t opcode; + uint8_t unused0; + uint16_t unused1; + uint16_t ntidpairs; /* number of tidpairs described in the packet payload */ + uint16_t niov; /* number of non-contiguous buffers described in the packet payload */ + + /* QW[6-7] SW */ + uintptr_t origin_byte_counter_vaddr; + uintptr_t target_context_vaddr; + } vaddr; + struct { + /* QW[5] SW */ + uint8_t opcode; + uint8_t unused0; + uint16_t unused1; + uint8_t dt; + uint8_t op; + uint16_t niov; /* number of non-contiguous buffers described in the packet payload */ + + /* QW[6-7] SW */ + uintptr_t rma_request_vaddr; + uint64_t key; + } mr; + struct { + /* QW[5] SW */ + uint8_t opcode; + uint8_t unused0; + uint16_t unused1; + uint8_t unused2; + uint8_t unused3; + uint16_t unused4; /* number of non-contiguous buffers described in the packet payload */ + + /* QW[6-7] SW */ + uintptr_t completion_counter; + uint64_t bytes_to_fence; + } fence; + } target; + + uint64_t reserved_n[6]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) cts; + + /* OPX DPUT HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t origin_rx; + uint8_t reserved_o2; + + /* == quadword 2 == */ + uint64_t reserved_3; + + /* == quadword 3 == */ + uint64_t reserved_4; + + union { + /* QW[5] SW */ + /* Common fields */ + struct { + uint8_t opcode; + uint8_t origin_tx; + uint8_t dt; + uint8_t op; + uint16_t last_bytes; + uint16_t bytes; + + uint64_t reserved[2]; /* op-specific */ + }; + + struct { + /* QW[5] SW */ 
+ uint64_t reserved; /* Common fields */ + + /* QW[6] SW */ + uintptr_t rma_request_vaddr; + /* QW[7] SW */ + uintptr_t rbuf; + } get; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6] SW */ + uintptr_t completion_vaddr; /* struct fi_opx_rzv_completion * */ + /* QW[7] SW */ + uintptr_t rbuf; + } rzv; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6-7] SW */ + uintptr_t key; + uintptr_t offset; + } mr; + + struct { + /* QW[5] SW */ + uint64_t reserved; /* Common fields */ + + /* QW[6-7] SW */ + uintptr_t completion_counter; + uint64_t bytes_to_fence; + } fence; + } target; + + uint64_t reserved_n[6]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) dput; + + /* OPX UD HEADER */ + struct { + uint64_t reserved[2]; /* QW[0-1] */ + + /* QW[2] BTH (unused)*/ + uint16_t reserved_1[3]; + uint8_t opcode; + uint8_t reserved_2; + uint64_t reserved_n[11]; /* QW[3-14] SW */ + + } __attribute__((__packed__)) ud; + + /* OPX SERVICE HEADER */ + struct { + uint64_t reserved[3]; /* QW[0-2] */ + + /* QW[3] BTH/KDETH (psn,offset_ver_tid) */ + uint32_t range_count; + uint8_t origin_reliability_rx; + uint8_t reserved_1[3]; + + /* QW[4] KDETH (unused) */ + uint32_t reserved_2; + uint32_t unused; /* WHY? 
unused but zeroed in model */ + + /* QW[5-7] SW */ + uint64_t psn_count; + uint64_t psn_start; + uint64_t key; /* fi_opx_reliability_service_flow_key */ + + uint64_t reserved_n[6]; /* QW[8-14] SW */ + + } __attribute__((__packed__)) service; /* "reliability service" */ +} __attribute__((__packed__)) __attribute__((__aligned__(8))); + + +static inline +fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union opx_hfi1_packet_hdr * const hdr, + const uint64_t slid) { const union fi_opx_uid uid = { .endpoint_id = hdr->reliability.origin_tx, /* node-scoped endpoint id */ - .lid = hdr->match.slid /* job-scoped node id */ + .lid_3B = 0, + .lid = slid /* job-scoped node id */ }; return uid.fi; @@ -763,10 +1304,10 @@ fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union fi_opx_hfi1_packet_hdr * co static inline size_t -fi_opx_hfi1_packet_hdr_message_length (const union fi_opx_hfi1_packet_hdr * const hdr) +fi_opx_hfi1_packet_hdr_message_length (const union opx_hfi1_packet_hdr * const hdr) { size_t message_length = 0; - switch (hdr->stl.bth.opcode) { + switch (hdr->bth.opcode) { case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: message_length = hdr->inject.message_length; @@ -786,8 +1327,8 @@ fi_opx_hfi1_packet_hdr_message_length (const union fi_opx_hfi1_packet_hdr * cons break; default: fprintf(stderr, "%s:%s():%d abort. 
hdr->stl.bth.opcode = %02x (%u)\n", - __FILE__, __func__, __LINE__, hdr->stl.bth.opcode, - hdr->stl.bth.opcode); + __FILE__, __func__, __LINE__, hdr->bth.opcode, + hdr->bth.opcode); abort(); break; } @@ -795,6 +1336,191 @@ fi_opx_hfi1_packet_hdr_message_length (const union fi_opx_hfi1_packet_hdr * cons return message_length; } +#ifndef NDEBUG + +#define OPX_JKR_PRINT_16B_PBC(a) opx_jkr_print_16B_pbc((a),__func__) +#define OPX_JKR_PRINT_16B_LRH(a,b) opx_jkr_print_16B_lrh((a),(b),__func__) +#define OPX_JKR_PRINT_16B_BTH(a,b) opx_jkr_print_16B_bth((a),(b),__func__) + +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func); +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func); +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func); + + +static inline +void fi_opx_hfi1_dump_stl_packet_hdr (const union opx_hfi1_packet_hdr * hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + +#if __GNUC__ > 9 +#pragma GCC diagnostic ignored "=Waddress-of-packed-member" +#endif + const uint64_t * const qw = (uint64_t *)hdr; +#if __GNUC__ > 9 +#pragma GCC diagnostic pop +#endif + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u ==== dump stl packet header @ %p [%016lx %016lx %016lx %016lx]\n", fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.flags ............. 0x%04hx\n", fn, ln, hdr->lrh_9B.flags); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.dlid .............. 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh_9B.dlid)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.pktlen ............ 0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh_9B.pktlen)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .lrh.slid .............. 
0x%04hx (be: %5hu, le: %5hu)\n", fn, ln, hdr->lrh_9B.slid, hdr->lrh_9B.slid, ntohs(hdr->lrh_9B.slid)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u\n", fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.opcode ............ 0x%02x \n", fn, ln, hdr->bth.opcode); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.bth_1 ............. 0x%02x \n", fn, ln, hdr->bth.bth_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.pkey .............. 0x%04hx \n", fn, ln, hdr->bth.pkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.ecn ............... 0x%02x \n", fn, ln, hdr->bth.ecn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.qp ................ 0x%02x \n", fn, ln, hdr->bth.qp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.unused ............ 0x%02x \n", fn, ln, hdr->bth.unused); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.rx ................ 0x%02x \n", fn, ln, hdr->bth.rx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u\n", fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .bth.psn ............... 0x%08x \n", fn, ln, hdr->bth.psn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.offset_ver_tid .. 0x%08x\n", fn, ln, hdr->kdeth.offset_ver_tid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u\n", fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.jkey ............ 0x%04hx\n", fn, ln, hdr->kdeth.jkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.hcrc ............ 0x%04hx\n", fn, ln, hdr->kdeth.hcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"%s():%u .kdeth.unused .......... 
0x%08x\n", fn, ln, hdr->kdeth.unused); + + return; +} + +static inline +void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + const uint64_t * const qw = (uint64_t *)hdr; + const pid_t pid = getpid(); + //fi_opx_hfi1_dump_stl_packet_hdr (hdr, hfi1_type, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ==== dump packet header @ %p [%016lx %016lx %016lx %016lx]\n", pid, fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.flags ........... 0x%04hx\n", pid, fn, ln, hdr->lrh_9B.flags); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.dlid ............ 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh_9B.dlid)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.pktlen .......... 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh_9B.pktlen)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.slid ............ 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.slid, hdr->lrh_9B.slid, ntohs(hdr->lrh_9B.slid)); + } else { + OPX_JKR_PRINT_16B_LRH(hdr->qw_16B[0], hdr->qw_16B[1]); + OPX_JKR_PRINT_16B_BTH(hdr->qw_16B[2], hdr->qw_16B[2]); + } + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .stl.bth.opcode ...... 0x%02x (%s)\n", pid, fn, ln, + hdr->bth.opcode, opx_hfi1_bth_opcode_to_string((uint16_t)hdr->bth.opcode)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.bth_1 .......... 0x%02x \n", pid, fn, ln, hdr->bth.bth_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.pkey .......... 
0x%04hx\n", pid, fn, ln, hdr->bth.pkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.ecn .......... 0x%02x \n", pid, fn, ln, hdr->bth.ecn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.qp .......... 0x%02x \n", pid, fn, ln, hdr->bth.qp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.unused .......... 0x%02x \n", pid, fn, ln, hdr->bth.unused); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.rx .......... 0x%02x \n", pid, fn, ln, hdr->bth.rx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .bth.psn .......... 0x%08x \n", pid, fn, ln, hdr->bth.psn); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .kdeth.offset_ver_tid. 0x%08x \n", pid, fn, ln, hdr->kdeth.offset_ver_tid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .jkey .......... 0x%04hx \n", pid, fn, ln, hdr->kdeth.jkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .hcrc .......... 0x%04hx \n", pid, fn, ln, hdr->kdeth.hcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .unused .......... 0x%08x \n", pid, fn, ln, hdr->kdeth.unused); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ofi_tag, last 9B QW... 0x%16.16lx\n", pid, fn, ln, hdr->qw_9B[6]); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .match.origin_tx ..... 0x%02x \n", pid, fn, ln, hdr->match.origin_tx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .match.ofi_data ...... 0x%08x \n", pid, fn, ln, hdr->match.ofi_data); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .match.ofi_tag ....... 
0x%016lx \n", pid, fn, ln, hdr->match.ofi_tag); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.message_length 0x%04x \n", pid, fn, ln, hdr->inject.message_length); + + switch (hdr->bth.opcode) { + case FI_OPX_HFI_BTH_OPCODE_UD: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .ud.opcode ... 0x%02x (%s) \n", pid, fn, ln, + hdr->ud.opcode, opx_hfi1_ud_opcode_to_string(hdr->ud.opcode)); + break; + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.message_length ... 0x%02x \n", pid, fn, ln, hdr->inject.message_length); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.app_data_u64[0] .. 0x%016lx \n", pid, fn, ln, hdr->inject.app_data_u64[0]); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.app_data_u64[1] .. 0x%016lx \n", pid, fn, ln, hdr->inject.app_data_u64[1]); + break; + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.xfer_bytes_tail .... 0x%02x \n", pid, fn, ln, hdr->send.xfer_bytes_tail); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.payload_qws_total .. 0x%04x \n", pid, fn, ln, hdr->send.payload_qws_total); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.xfer_tail .......... 0x%016lx \n", pid, fn, ln, hdr->send.xfer_tail); + break; + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: /* calculate (?) total bytes to be transfered */ + case FI_OPX_HFI_BTH_OPCODE_RZV_CTS: + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.origin .......... 0x%x \n", pid, fn, ln, hdr->cts.origin_rx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.target.vaddr.ntidpairs .......... 
0x%x \n", pid, fn, ln, hdr->cts.target.vaddr.ntidpairs); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.target.opcode .......... 0x%x (%s) \n", pid, fn, ln, + hdr->cts.target.opcode, opx_hfi1_dput_opcode_to_string(hdr->cts.target.opcode)); + break; + default: + break; + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u\n", pid, fn, ln); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ==== QWs 4-7 : [%016lx %016lx %016lx %016lx]\n", pid, fn, ln, qw[4], qw[5], qw[6], qw[7]); + + return; +} + +#define OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ + __hdr->qw_16B[1]); \ + OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ + __hdr->qw_16B[3]); \ + } else { \ + fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ + __func__, __LINE__); \ + } + +#define OPX_DEBUG_PRINT_PBC_HDR(__pbc,__hdr,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_PBC(__pbc); \ + OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ + __hdr->qw_16B[1]); \ + OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ + __hdr->qw_16B[3]); \ + } else { \ + fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ + __func__, __LINE__);\ + } + +#define OPX_DEBUG_PRINT_PBC_HDR_QW(q0,q1,q2,q3,q4,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_PBC(q0); \ + OPX_JKR_PRINT_16B_LRH(q1,q2); \ + OPX_JKR_PRINT_16B_BTH(q3,q4); \ + } + +#else +// Disable the macros +#define OPX_JKR_PRINT_16B_PBC(a) +#define OPX_JKR_PRINT_16B_LRH(a,b) +#define OPX_JKR_PRINT_16B_BTH(a,b) + +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func); +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func); +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func); + +static inline +void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + return; +} + +#define 
OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) +#define OPX_DEBUG_PRINT_PBC_HDR(__pbc,__hdr,__hfi1_type) +#define OPX_DEBUG_PRINT_PBC_HDR_QW(q0,q1,q2,q3,q4,__hfi1_type) + +#endif + + union cacheline { uint64_t qw[8]; uint32_t dw[16]; @@ -863,8 +1589,110 @@ union fi_opx_hfi1_rzv_rts_immediate_info { }; }; + +/* Cache "blocked" payloads in 16B are currently "tricky". + * The sender will always send 1 QW of header after SOP so STORE'ing + * a full cacheline block is not possible. The payload will + * arrive cacheline aligned in the eager buffer but not in the + * same "blocks" as written. + * + * For example, contiguous rzv: + * + * STORE(tag + 7 qw's of CACHELINE 0 unused[1], not unused[2] as in 9B above) + * fi_opx_init_hfi_lookupoptionally STORE(icrc/tail) if no more immediate data + * + * STORE(full block of immediate fragment unaligned data) + * STORe(full block of immediate data) + * STORE(full block of immediate end data) + * STORE(icrc/tail) + */ +union fi_opx_hfi1_packet_payload_16B { + uint8_t byte[FI_OPX_HFI1_PACKET_MTU]; + uint64_t qw[FI_OPX_HFI1_PACKET_MTU>>3]; + union { + struct { + /* ==== CACHE LINE 0 ==== */ + + uintptr_t src_vaddr; + uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ + uint64_t src_device_id; + uint64_t src_iface; + uint64_t immediate_info; + uintptr_t origin_byte_counter_vaddr; + uint64_t unused[1]; + + /* Not cacheline aligned after the first block */ + union { + struct { + uint8_t immediate_byte[8]; + uint64_t immediate_qw[7]; + }; + + union cacheline cache_line_1; + }; + + union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; + + } contiguous; + struct { + /* ==== CACHE LINE 0 ==== */ + + uintptr_t src_vaddr; + uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ + uint64_t src_device_id; + uint64_t src_iface; + uint64_t immediate_info; + uintptr_t origin_byte_counter_vaddr; + uint64_t unused[1]; + + union { + struct { + uint8_t immediate_byte[8]; + uint64_t 
immediate_qw[7]; + }; + + union cacheline cache_line_1; + }; + + union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; + + } contiguous_16B; + struct { + /* ==== CACHE LINE 0 ==== */ + + uintptr_t origin_byte_counter_vaddr; + struct fi_opx_hmem_iov iov[2]; + + /* ==== CACHE LINE 1-127 (for 8k mtu) ==== */ + struct fi_opx_hmem_iov iov_ext[FI_OPX_MAX_HMEM_IOV - 2]; + size_t unused; + + } noncontiguous; + } rendezvous; + + struct { + union fi_opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; + } cts; + + /* tid_cts extends cts*/ + struct { + /* ==== CACHE LINE 0 ==== */ + union fi_opx_hfi1_dput_iov iov[1]; + uint32_t tid_offset; + uint32_t ntidpairs; + int32_t origin_byte_counter_adjust; + uint32_t unused; + + /* ==== CACHE LINE 1 ==== */ + uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; + } tid_cts; + +} __attribute__((__aligned__(32))); + +/* 9B and common payload structure */ union fi_opx_hfi1_packet_payload { uint8_t byte[FI_OPX_HFI1_PACKET_MTU]; + uint64_t qw[FI_OPX_HFI1_PACKET_MTU>>3]; union { struct { /* ==== CACHE LINE 0 ==== */ @@ -921,7 +1749,8 @@ union fi_opx_hfi1_packet_payload { /* ==== CACHE LINE 1 ==== */ uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; } tid_cts; - + /* Union with 16B payload */ + union fi_opx_hfi1_packet_payload_16B payload_16B; } __attribute__((__aligned__(32))); static_assert(sizeof(union fi_opx_hfi1_packet_payload) <= FI_OPX_HFI1_PACKET_MTU, @@ -942,7 +1771,6 @@ static_assert((offsetof(union fi_opx_hfi1_packet_payload, tid_cts.tidpairs) + "If you added/removed fields in struct tid_cts, you need to adjust FI_OPX_MAX_DPUT_TIDPAIRS!"); - struct fi_opx_hfi1_ue_packet_slist; struct fi_opx_hfi1_ue_packet { /* == CACHE LINE 0 == */ @@ -968,17 +1796,18 @@ struct fi_opx_hfi1_ue_packet { uint32_t unused_cacheline0; - /* == CACHE LINE 1 == */ + /* == CACHE LINE 1, 2 == */ uint64_t unused_cacheline1; - union fi_opx_hfi1_packet_hdr hdr; + union opx_hfi1_packet_hdr hdr; - /* == CACHE LINE 2 == */ - union 
fi_opx_hfi1_packet_payload payload; + /* == CACHE LINE 3 == */ + union fi_opx_hfi1_packet_payload payload; } __attribute__((__packed__)) __attribute__((aligned(64))); static_assert(offsetof(struct fi_opx_hfi1_ue_packet, unused_cacheline1) == 64, "struct fi_opx_hfi1_ue_packet->unused_cacheline1 should be aligned on cache boundary!"); -static_assert(offsetof(struct fi_opx_hfi1_ue_packet, payload) == 128, + +static_assert(offsetof(struct fi_opx_hfi1_ue_packet, payload) == 192, "struct fi_opx_hfi1_ue_packet->payload should be aligned on cache boundary!"); struct fi_opx_hfi1_ue_packet_slist { @@ -1087,6 +1916,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_hfi1_ue_packet_slist_remove_item (struct fi return next_item; } +#if 0 static inline void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr, const char * fn, const unsigned ln) { @@ -1206,5 +2036,6 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr return; } +#endif #endif /* _FI_PROV_OPX_HFI1_PACKET_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h index 72ae427de94..370f5306e33 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h @@ -92,7 +92,8 @@ unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep *opx_ep, volatile uint64_t *rhe_ptr, volatile uint32_t *rhf_ptr, const uint32_t rhf_msb, const uint32_t rhf_lsb, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ void fi_opx_hfi1_update_hdrq_head_register(struct fi_opx_ep *opx_ep, const uint64_t hdrq_offset) @@ -107,12 +108,16 @@ void fi_opx_hfi1_update_hdrq_head_register(struct fi_opx_ep *opx_ep, const uint6 __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_ud_eager_packet(struct fi_opx_ep *opx_ep, - const 
union fi_opx_hfi1_packet_hdr *const hdr, - const uint64_t rhf) + const union opx_hfi1_packet_hdr *const hdr, + const uint64_t rhf, + const uint64_t slid, + const uint64_t dlid, + const uint16_t pktlen, + const enum opx_hfi1_type hfi1_type) { /* "eager" packet - has payload */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); - const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,hfi1_type); + const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf,hfi1_type); const uint8_t *const __attribute__((unused)) payload = (uint8_t *)((uintptr_t)opx_ep->rx->egrq.base_addr + (uintptr_t)egrbfr_index * (uintptr_t)opx_ep->rx->egrq.elemsz + @@ -120,13 +125,6 @@ void fi_opx_hfi1_handle_ud_eager_packet(struct fi_opx_ep *opx_ep, assert(payload != NULL); - /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t __attribute__((unused)) total_bytes_to_copy = - (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t __attribute__((unused)) payload_bytes_to_copy = - total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); - /* currently no eager UD packets are defined */ fprintf(stderr, "%s:%s():%d bad ud eager packet; abort.\n", __FILE__, __func__, __LINE__); @@ -142,7 +140,8 @@ void fi_opx_hfi1_handle_ud_eager_packet(struct fi_opx_ep *opx_ep, static void fi_opx_hfi1_handle_ud_ping(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const uint64_t slid) { struct fi_opx_reliability_service *service = opx_ep->reliability->state.service; @@ -165,7 +164,7 @@ void fi_opx_hfi1_handle_ud_ping(struct fi_opx_ep *opx_ep, ->pending_rx_reliability_pool); assert(ping_op != NULL); ping_op->ud_opcode = hdr->ud.opcode; - ping_op->slid = (uint64_t)hdr->stl.lrh.slid; + ping_op->slid = slid; ping_op->rx = 
(uint64_t)hdr->service.origin_reliability_rx; ping_op->key.key = hdr->service.key; ping_op->psn_count = hdr->service.psn_count; @@ -185,7 +184,7 @@ void fi_opx_hfi1_handle_ud_ping(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_ud_ack(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { const uint64_t key = hdr->service.key; const uint64_t psn_count = hdr->service.psn_count; @@ -198,7 +197,7 @@ void fi_opx_hfi1_handle_ud_ack(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_ud_nack(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { const uint64_t key = hdr->service.key; const uint64_t psn_count = hdr->service.psn_count; @@ -211,15 +210,19 @@ void fi_opx_hfi1_handle_ud_nack(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_handle_ud_packet(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, - const uint64_t rhf) + const uint64_t rhf, + const uint64_t slid, + const uint64_t dlid, + const uint16_t pktlen, + const enum opx_hfi1_type hfi1_type) { - if (OFI_LIKELY(!OPX_RHF_IS_USE_EGR_BUF(rhf))) { - /* "header only" packet - no payload */ + /* "header only" packet - no payload */ + if (OFI_LIKELY(!OPX_RHF_IS_USE_EGR_BUF(rhf, hfi1_type))) { switch(hdr->ud.opcode) { case FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING: - fi_opx_hfi1_handle_ud_ping(opx_ep, hdr); + fi_opx_hfi1_handle_ud_ping(opx_ep, hdr, slid); break; case FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK: fi_opx_hfi1_handle_ud_ack(opx_ep, hdr); @@ -248,12 +251,12 @@ unsigned fi_opx_hfi1_handle_ud_packet(struct fi_opx_ep *opx_ep, fprintf(stderr, "%s:%s():%d bad ud header packet; abort.\n", __FILE__, __func__, __LINE__); abort(); - }; + } } else { - fi_opx_hfi1_handle_ud_eager_packet(opx_ep, hdr, rhf); + 
fi_opx_hfi1_handle_ud_eager_packet(opx_ep, hdr, rhf, slid, dlid, pktlen, hfi1_type); } - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; @@ -264,7 +267,7 @@ unsigned fi_opx_hfi1_handle_ud_packet(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_error_inject(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf) { @@ -273,11 +276,11 @@ unsigned fi_opx_hfi1_error_inject(struct fi_opx_ep *opx_ep, * Error injection .. purposefully drop packet */ if (OFI_UNLIKELY(FI_OPX_RELIABILITY_RX_DROP_PACKET(&opx_ep->reliability->state, hdr))) { - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,OPX_HFI1_TYPE); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; - if (OPX_RHF_IS_USE_EGR_BUF(rhf)) { /* eager */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); + if (OPX_RHF_IS_USE_EGR_BUF(rhf,OPX_HFI1_TYPE)) { /* eager */ + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,OPX_HFI1_TYPE); const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { OPX_HFI1_BAR_STORE(opx_ep->rx->egrq.head_register, @@ -296,27 +299,30 @@ unsigned fi_opx_hfi1_error_inject(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, - uint8_t *origin_rx, const uint64_t rhf) + uint8_t *origin_rx, const uint64_t rhf, + const uint64_t slid, + const uint16_t pktlen, + const enum opx_hfi1_type hfi1_type) { /* * 
Check for 'reliability' exceptions */ - const uint64_t slid = hdr->stl.lrh.slid; const uint64_t origin_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(hdr); const uint64_t psn = FI_OPX_HFI1_PACKET_PSN(hdr); + if (OFI_UNLIKELY(fi_opx_reliability_rx_check(&opx_ep->reliability->state, slid, origin_tx, psn, origin_rx) == FI_OPX_RELIABILITY_EXCEPTION)) { - if (!OPX_RHF_IS_USE_EGR_BUF(rhf)) { + if (!OPX_RHF_IS_USE_EGR_BUF(rhf,hfi1_type)) { /* no payload */ fi_opx_reliability_rx_exception(&opx_ep->reliability->state, slid, - origin_tx, psn, &opx_ep->ep_fid, hdr, NULL); + origin_tx, psn, &opx_ep->ep_fid, hdr, NULL, pktlen, hfi1_type); } else { /* has payload */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); - const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,hfi1_type); + const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf,hfi1_type); const uint8_t *const payload = (uint8_t *)((uintptr_t)opx_ep->rx->egrq.base_addr + (uintptr_t)egrbfr_index * @@ -326,7 +332,7 @@ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, assert(payload != NULL); fi_opx_reliability_rx_exception(&opx_ep->reliability->state, slid, origin_tx, psn, &opx_ep->ep_fid, hdr, - payload); + payload, pktlen, hfi1_type); const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { @@ -336,7 +342,7 @@ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, } } - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; fi_opx_hfi1_update_hdrq_head_register(opx_ep, hdrq_offset); @@ -348,40 +354,43 @@ unsigned fi_opx_hfi1_handle_reliability(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, - const union fi_opx_hfi1_packet_hdr *const hdr, + 
const union opx_hfi1_packet_hdr *const hdr, const uint64_t rhf_seq, const uint64_t hdrq_offset, const int lock_required, const enum ofi_reliability_kind reliability, const uint8_t origin_rx, - const uint64_t rhf) + const uint64_t rhf, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid, + const uint16_t pktlen) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "================ received a packet from the fabric\n"); - if (!OPX_RHF_IS_USE_EGR_BUF(rhf)) { + if (!OPX_RHF_IS_USE_EGR_BUF(rhf,hfi1_type)) { if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { /* "header only" packet - no payload */ fi_opx_ep_rx_process_header(&opx_ep->ep_fid, hdr, NULL, 0, FI_TAGGED, FI_OPX_HFI_BTH_OPCODE_TAG_INJECT, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); - + lock_required, reliability, + hfi1_type, slid); } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { /* all other "tag" packets */ fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, NULL, 0, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } else { fi_opx_ep_rx_process_header_msg(&opx_ep->ep_fid, hdr, NULL, 0, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } } else { /* "eager" packet - has payload */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf); - const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf,hfi1_type); + const uint32_t egrbfr_offset = OPX_RHF_EGR_OFFSET(rhf,hfi1_type); const uint8_t *const payload = (uint8_t *)((uintptr_t)opx_ep->rx->egrq.base_addr + (uintptr_t)egrbfr_index * (uintptr_t)opx_ep->rx->egrq.elemsz + @@ -390,11 +399,21 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, assert(payload != NULL); /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = 
ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = - (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = - total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = + total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + } else { + lrh_pktlen_le = pktlen; + total_bytes_to_copy = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing tail/icrc QW*/ + payload_bytes_to_copy = + total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + } if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)) { fi_opx_ep_rx_process_header( @@ -403,18 +422,20 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, payload_bytes_to_copy, FI_TAGGED, FI_OPX_HFI_BTH_OPCODE_TAG_EAGER, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, + hfi1_type, + slid); } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) { /* all other "tag" packets */ fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, payload, payload_bytes_to_copy, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } else { fi_opx_ep_rx_process_header_msg(&opx_ep->ep_fid, hdr, payload, payload_bytes_to_copy, opcode, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability); + lock_required, reliability, hfi1_type, slid); } const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { @@ -426,7 +447,7 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, FLIGHT_RECORDER_PACKET_HDR(opx_ep->fr, FR_EVENT_HFI1_POLL_ONCE, hdr); } - 
opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; fi_opx_hfi1_update_hdrq_head_register(opx_ep, hdrq_offset); @@ -449,10 +470,10 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, opx_ep->reliability->state.rx, psn - opx_ep->reliability->service.preemptive_ack_rate + 1, /* psn_start */ opx_ep->reliability->service.preemptive_ack_rate, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); - } else if (hdr->stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA && - ((ntohl(hdr->stl.bth.psn) & 0x80000000) || + } else if (hdr->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA && + ((ntohl(hdr->bth.psn) & 0x80000000) || (hdr->dput.target.opcode == FI_OPX_HFI_DPUT_OPCODE_PUT))) { /* Send preemptive ACKs on Rendezvous FI_OPX_HFI_DPUT_OPCODE_PUT or * on the final packet of a Rendezvous SDMA writev (the high bit @@ -466,16 +487,10 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, opx_ep->reliability->state.rx, psn - psn_count + 1, /* psn_start */ psn_count, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); } } -/* - * ============================================================================ - * Write CSR software trigger from host software by writing MISC_GPIO_OUT = 0x4 - * ============================================================================ -*/ - /* * ============================================================================ * THIS IS THE HFI POLL FUNCTION @@ -484,7 +499,8 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, __OPX_FORCE_INLINE__ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, const enum ofi_reliability_kind reliability, - const uint64_t hdrq_mask) + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep 
*opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const uint64_t local_hdrq_mask = (hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME) ? @@ -496,60 +512,92 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, volatile uint32_t *rhf_ptr = opx_ep->rx->hdrq.rhf_base + hdrq_offset; const uint64_t rhf_rcvd = *((volatile uint64_t *)rhf_ptr); + uint32_t slid, dlid; + uint16_t pktlen; const uint64_t rhf_seq = opx_ep->rx->state.hdrq.rhf_seq; /* The software must look at the RHF.RcvSeq. * If it detects the next sequence number in the entry, the new header * was written into memory. Otherwise, do not process RHF - no packet. */ - if (OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd)) { + if (OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type)) { const uint32_t rhf_msb = rhf_rcvd >> 32; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "OPX_RHF_SEQ_MATCH = %d rhf_rcvd = %#lx rhf_seq = %#lx\n", - OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd), rhf_rcvd, rhf_seq); + OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type), rhf_rcvd, rhf_seq); const uint64_t hdrq_offset_dws = (rhf_msb >> 12) & 0x01FFu; - uint32_t *pkt = (uint32_t *)rhf_ptr - FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS + - 2 + /* rhf field size in dw */ - hdrq_offset_dws; - - const union fi_opx_hfi1_packet_hdr *const hdr = (union fi_opx_hfi1_packet_hdr *)pkt; + uint32_t *pkt; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + assert(hdrq_offset_dws); /* need padding before this header */ + pkt = (uint32_t *)rhf_ptr - FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS + + 2 /* rhf field size in dw */ + - 2 /* sizeof(uint64_t) in dw, offset back to align + for the 9B padding in the header union */ + + hdrq_offset_dws; + } else { + assert(((union opx_jkr_rhf)rhf_rcvd).L2Type == 0x2); + pkt = (uint32_t *)rhf_ptr - FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS + + 2 /* rhf field size in dw */ + + hdrq_offset_dws; + /* Assert we got full expected kdeth split header. 
+ * In the future, we may handle this so this is + * not part of OPX_RHF_CHECK_HEADER */ + assert(!(((union opx_jkr_rhf)rhf_rcvd).KHdrLenErr)); + } - const uint8_t opcode = hdr->stl.bth.opcode; + const union opx_hfi1_packet_hdr *const hdr = (union opx_hfi1_packet_hdr *)pkt; + const uint8_t opcode = hdr->bth.opcode; /* If there's an RHF/RHE error or a bad header detected, handle the error and return */ - if(OPX_RHF_CHECK_HEADER(rhf_rcvd, hdr)) { + if(OPX_RHF_CHECK_HEADER(rhf_rcvd, hdr, hfi1_type)) { const uint32_t rhf_lsb = rhf_rcvd & 0xFFFFFFFF; volatile uint64_t *rhe_ptr = opx_ep->rx->hdrq.rhe_base; - return fi_opx_hfi1_handle_poll_error(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr); + return fi_opx_hfi1_handle_poll_error(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr, hfi1_type); } + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = (uint32_t)hdr->lrh_9B.slid; + pktlen = (uint32_t)hdr->lrh_9B.pktlen; /* pass it down unchanged. lower layers handle BE/LE */ + dlid = (uint32_t)hdr->lrh_9B.dlid; + } else { + slid = htons((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid)); /* BE for lower layers */ + pktlen = (uint16_t) hdr->lrh_16B.pktlen; /* pass it down unchanged. lower layers handle BE/LE */ + dlid = htons(((hdr->lrh_16B.dlid20 << 20) | (hdr->lrh_16B.dlid))); /* BE for lower layers */ + } + + if (OFI_UNLIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_UD)) { assert(reliability == OFI_RELIABILITY_KIND_ONLOAD); /* * process "unreliable datagram" packets first - before all the * software reliability protocol checks. 
*/ - return fi_opx_hfi1_handle_ud_packet(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd); + return fi_opx_hfi1_handle_ud_packet(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd, + slid, dlid, pktlen, hfi1_type); } uint8_t origin_rx; /* - * check for software reliability events - */ + * check for software reliability events + */ /* This error inject call will compile out in optimized builds */ unsigned rc = fi_opx_hfi1_error_inject(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd); if (OFI_UNLIKELY(rc != -1)) { return rc; } + rc = fi_opx_hfi1_handle_reliability(opx_ep, hdr, rhf_seq, - hdrq_offset, &origin_rx, rhf_rcvd); + hdrq_offset, &origin_rx, rhf_rcvd, slid, pktlen, hfi1_type); if (OFI_UNLIKELY(rc != -1)) { return rc; } + fi_opx_hfi1_handle_packet(opx_ep, opcode, hdr, rhf_seq, - hdrq_offset, lock_required, reliability, origin_rx, rhf_rcvd); + hdrq_offset, lock_required, reliability, origin_rx, rhf_rcvd, + hfi1_type, slid, pktlen); return 1; /* one packet was processed */ } return 0; @@ -561,23 +609,34 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, * ============================================================================ */ static inline -void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) +void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); uint64_t pos; struct opx_shm_packet* packet = opx_shm_rx_next(&opx_ep->rx->shm, &pos); - union fi_opx_hfi1_packet_hdr * hdr = (packet) ? - (union fi_opx_hfi1_packet_hdr *) packet->data : NULL; + union opx_hfi1_packet_hdr * hdr = (packet) ? 
+ (union opx_hfi1_packet_hdr *) packet->data : NULL; + uint32_t slid; while (hdr != NULL) { - const uint8_t opcode = hdr->stl.bth.opcode; + const uint8_t opcode = hdr->bth.opcode; uint32_t origin_reliability_rx = hdr->service.origin_reliability_rx; /* DAOS HFI Rank Support: */ if (!opx_ep->daos_info.hfi_rank_enabled) { - assert(hdr->stl.lrh.dlid == opx_ep->rx->self.uid.lid); - assert(hdr->stl.bth.rx == opx_ep->rx->self.hfi1_rx || - hdr->stl.bth.rx == opx_ep->rx->self.reliability_rx); +#ifndef NDEBUG + uint32_t dlid __attribute__ ((unused)); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + dlid = hdr->lrh_9B.dlid; + } else { + dlid = htons((hdr->lrh_16B.dlid20 << 20) | (hdr->lrh_16B.dlid)); + } + + assert(dlid == opx_ep->rx->self.uid.lid); + assert(hdr->bth.rx == opx_ep->rx->self.hfi1_rx || + hdr->bth.rx == opx_ep->rx->self.reliability_rx); +#endif } else { /* DAOS Persistent Address Support: * No Context Resource Management Framework is supported by OPX to @@ -596,8 +655,16 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) * change due to support for Persistent Addressing. The only reliable field * in the fi_addr is the hfi1_unit. 
*/ - assert(hdr->stl.lrh.dlid == opx_ep->rx->self.uid.lid); +#ifndef NDEBUG + uint32_t dlid __attribute__ ((unused)); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + dlid = hdr->lrh_9B.dlid; + } else { + dlid = htons(hdr->lrh_16B.dlid20 << 20 | hdr->lrh_16B.dlid); + } + assert(dlid == opx_ep->rx->self.uid.lid); +#endif /* origin_reliability_rx is HFI rank instead of HFI rx */ origin_reliability_rx = packet->origin_rank; @@ -610,6 +677,12 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) opx_ep->daos_info.rank, opx_ep->rx->shm.segment_key); } + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = hdr->lrh_9B.slid; + } else { + slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + } + if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { fi_opx_ep_rx_process_header(ep, hdr, NULL, 0, FI_TAGGED, @@ -617,7 +690,9 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, lock_required, - OFI_RELIABILITY_KIND_NONE); + OFI_RELIABILITY_KIND_NONE, + hfi1_type, + slid); } else if (opcode == FI_OPX_HFI_BTH_OPCODE_UD) { const uint8_t ud_opcode = hdr->ud.opcode; @@ -642,9 +717,19 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) const uint8_t * const payload = (uint8_t *)(hdr+1); /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = total_bytes_to_copy - 
sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + } else { + lrh_pktlen_le = hdr->lrh_16B.pktlen; + total_bytes_to_copy = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing tail/icrc QW*/ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + } if (opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { @@ -652,7 +737,8 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) payload_bytes_to_copy, opcode, (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, - lock_required, OFI_RELIABILITY_KIND_NONE); + lock_required, OFI_RELIABILITY_KIND_NONE, + hfi1_type, slid); } else { @@ -660,13 +746,15 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required) payload_bytes_to_copy, opcode, (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, - lock_required, OFI_RELIABILITY_KIND_NONE); + lock_required, OFI_RELIABILITY_KIND_NONE, + hfi1_type, slid); } } opx_shm_rx_advance(&opx_ep->rx->shm, (void *)hdr, pos); packet = opx_shm_rx_next(&opx_ep->rx->shm, &pos); - hdr = (packet) ? (union fi_opx_hfi1_packet_hdr *) packet->data : NULL; + hdr = (packet) ? 
+ (union opx_hfi1_packet_hdr *) packet->data : NULL; } } @@ -677,7 +765,8 @@ void fi_opx_hfi1_poll_many (struct fid_ep *ep, const int lock_required, const uint64_t caps, const enum ofi_reliability_kind reliability, - const uint64_t hdrq_mask) + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) { /* All callers to this function should have already obtained the necessary lock */ assert(!lock_required); @@ -690,14 +779,16 @@ void fi_opx_hfi1_poll_many (struct fid_ep *ep, if ((caps & FI_LOCAL_COMM) || (caps == 0)) { - fi_opx_shm_poll_many(ep, 0); + fi_opx_shm_poll_many(ep, 0, hfi1_type); } if ((caps & FI_REMOTE_COMM) || (caps == 0)) { do { - packets = fi_opx_hfi1_poll_once(ep, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask); + packets = fi_opx_hfi1_poll_once(ep, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, hfi1_type); } while ((packets > 0) && (hfi1_poll_count++ < hfi1_poll_max)); + + if (reliability == OFI_RELIABILITY_KIND_ONLOAD) { /* compile-time constant expression */ struct fi_opx_reliability_service *service = opx_ep->reliability->state.service; @@ -718,7 +809,6 @@ void fi_opx_hfi1_poll_many (struct fid_ep *ep, service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); }// End timer fired - } } diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index 692a0ed625f..dbd1c6d06b5 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -48,9 +48,11 @@ #define OPX_SDMA_REPLAY_DATA_IOV_COUNT (1) #define OPX_SDMA_REPLAY_IOV_COUNT (OPX_SDMA_REPLAY_DATA_IOV_COUNT + 1) #define OPX_SDMA_HFI_MAX_IOVS_PER_WRITE (64) + OPX_COMPILE_TIME_ASSERT((OPX_SDMA_HFI_MAX_IOVS_PER_WRITE + 1) == OPX_DEBUG_COUNTERS_WRITEV_MAX, "OPX_DEBUG_COUNTERS_WRITEV_MAX should be OPX_SDMA_HFI_MAX_IOVS_PER_WRITE + 1!\n"); + // Driver limit of the number of TIDs that can be used in a single SDMA request #define OPX_SDMA_MAX_TIDS_PER_REQUEST (1024) @@ 
-60,6 +62,7 @@ OPX_COMPILE_TIME_ASSERT((OPX_SDMA_HFI_MAX_IOVS_PER_WRITE + 1) == OPX_DEBUG_COUNT #define OPX_SDMA_MEMINFO_SIZE (136) #define OPX_SDMA_MEMINFO_SIZE_QWS (OPX_SDMA_MEMINFO_SIZE >> 3) + OPX_COMPILE_TIME_ASSERT((OPX_SDMA_MEMINFO_SIZE & 0x7) == 0, "OPX_SDMA_MEMINFO_SIZE must be a multiple of 8!"); #ifdef OPX_HMEM OPX_COMPILE_TIME_ASSERT(sizeof(struct sdma_req_meminfo) == OPX_SDMA_MEMINFO_SIZE, @@ -67,6 +70,7 @@ OPX_COMPILE_TIME_ASSERT(sizeof(struct sdma_req_meminfo) == OPX_SDMA_MEMINFO_SIZE #endif + static const uint16_t OPX_SDMA_REQ_SET_MEMINFO[2] = {0, #ifdef OPX_HMEM ((uint16_t) 1) << HFI1_SDMA_REQ_MEMINFO_SHIFT @@ -96,7 +100,7 @@ struct fi_opx_hfi1_sdma_header_vec { } hmem; }; - struct fi_opx_hfi1_txe_scb scb; + struct fi_opx_hfi1_txe_scb_9B scb; }; static const size_t OPX_SDMA_REQ_INFO_OFFSET[2] = { @@ -467,7 +471,8 @@ __OPX_FORCE_INLINE__ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, void *requester, enum opx_sdma_comp_state *requester_comp_state, - struct fi_opx_hfi1_txe_scb *source_scb, + struct fi_opx_hfi1_txe_scb_9B *source_scb, +/* struct opx_hfi1_txe_scb_union *source_scb, */ struct iovec *iovs, const uint16_t num_iovs, const uint16_t num_packets, @@ -510,9 +515,11 @@ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, /* Set the Acknowledge Request Bit if we're only sending one packet */ uint64_t set_ack_bit = (num_packets == 1) ? 
(uint64_t)htonl(0x80000000) : 0; + + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); request->header_vec.scb = *source_scb; - request->header_vec.scb.hdr.qw[2] |= ((uint64_t)kdeth << 32) | set_ack_bit; - request->header_vec.scb.hdr.qw[4] |= (last_packet_bytes << 32); + request->header_vec.scb.hdr.qw_9B[2] |= ((uint64_t)kdeth << 32) | set_ack_bit; + request->header_vec.scb.hdr.qw_9B[4] |= (last_packet_bytes << 32); request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo]; request->iovecs[0].iov_base = req_info; @@ -539,11 +546,13 @@ int opx_hfi1_sdma_enqueue_replay(struct fi_opx_ep *opx_ep, assert(replay->use_iov); assert(replay->iov->iov_len == payload_bytes); + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.replay_requests); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing replay for SDMA Send\n"); return opx_hfi1_sdma_enqueue_request(opx_ep, we, &we->comp_state, - &replay->scb, replay->iov, + &replay->scb_9B, replay->iov, OPX_SDMA_REPLAY_DATA_IOV_COUNT, 1, // num_packets, (payload_bytes + 63) & 0xFFC0, // Frag_size @@ -585,10 +594,12 @@ uint16_t opx_hfi1_sdma_register_replays(struct fi_opx_ep *opx_ep, we->dlid, we->rx, we->rs, &we->psn_ptr, we->num_packets); + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + uint32_t fragsize = 0; for (int i = 0; i < we->num_packets; ++i) { fragsize = MAX(fragsize, we->packets[i].length); - we->packets[i].replay->scb.hdr.qw[2] |= (uint64_t)htonl((uint32_t)psn); + we->packets[i].replay->scb_9B.hdr.qw_9B[2] |= (uint64_t)htonl((uint32_t)psn); we->packets[i].replay->sdma_we_use_count = we->bounce_buf.use_count; we->packets[i].replay->sdma_we = replay_back_ptr; we->packets[i].replay->hmem_iface = we->hmem.iface; @@ -617,11 +628,13 @@ void opx_hfi1_sdma_enqueue_dput(struct fi_opx_ep *opx_ep, .iov_len = (we->total_payload + 3) & -4 }; + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.nontid_requests); 
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing non-tid request for SDMA Send\n"); opx_hfi1_sdma_enqueue_request(opx_ep, we, &we->comp_state, - &we->packets[0].replay->scb, + &we->packets[0].replay->scb_9B, &payload_iov, OPX_SDMA_NONTID_DATA_IOV_COUNT, we->num_packets, @@ -680,11 +693,13 @@ void opx_hfi1_sdma_enqueue_dput_tid(struct fi_opx_ep *opx_ep, } }; + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.tid_requests); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing tid request for SDMA Send\n"); opx_hfi1_sdma_enqueue_request(opx_ep, we, &we->comp_state, - &we->packets[0].replay->scb, + &we->packets[0].replay->scb_9B, payload_tid_iovs, OPX_SDMA_TID_DATA_IOV_COUNT, we->num_packets, diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 46d607322f7..a4530ca0961 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -108,8 +108,9 @@ void fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, } // faster than memcpy() for this amount of data. -// DOES NOT SUPPORT SCB (PIO or UREG) COPY (SIM) -static inline void fi_opx_copy_cacheline(volatile uint64_t dest[8], uint64_t source[8]) +// DOES NOT SUPPORT SCB (PIO or UREG) (does not support SIM/BAR) +// Unstructured copy - for payloads or other memcpy replacement +static inline void fi_opx_copy_cacheline(uint64_t dest[8], uint64_t source[8]) { dest[0] = source[0]; dest[1] = source[1]; @@ -122,9 +123,65 @@ static inline void fi_opx_copy_cacheline(volatile uint64_t dest[8], uint64_t sou } +// faster than memcpy() for this amount of data. 
+// DOES NOT SUPPORT SCB (PIO or UREG) (does not support SIM/BAR) +// Structured copy - for headers +static inline void fi_opx_copy_hdr9B_cacheline(struct fi_opx_hfi1_txe_scb_9B * dest, const uint64_t *source) +{ + dest->qw0 = source[0]; + dest->hdr.qw_9B[0] = source[1]; + dest->hdr.qw_9B[1] = source[2]; + dest->hdr.qw_9B[2] = source[3]; + dest->hdr.qw_9B[3] = source[4]; + dest->hdr.qw_9B[4] = source[5]; + dest->hdr.qw_9B[5] = source[6]; + dest->hdr.qw_9B[6] = source[7]; +} + +static inline void fi_opx_copy_hdr16B_cacheline(struct fi_opx_hfi1_txe_scb_16B * dest, const uint64_t *source) +{ + dest->qw0 = source[0]; + dest->hdr.qw_16B[0] = source[1]; + dest->hdr.qw_16B[1] = source[2]; + dest->hdr.qw_16B[2] = source[3]; + dest->hdr.qw_16B[3] = source[4]; + dest->hdr.qw_16B[4] = source[5]; + dest->hdr.qw_16B[5] = source[6]; + dest->hdr.qw_16B[6] = source[7]; + dest->hdr.qw_16B[7] = source[8]; // cacheline + 1 spillover + dest->hdr.qw_16B[8] = 0UL; + dest->hdr.qw_16B[9] = 0UL; + dest->hdr.qw_16B[10] = 0UL; + dest->hdr.qw_16B[11] = 0UL; + dest->hdr.qw_16B[12] = 0UL; + dest->hdr.qw_16B[13] = 0UL; + dest->hdr.qw_16B[14] = 0UL; +} + + +static inline void fi_opx_copy_hdr16B_2cacheline(struct fi_opx_hfi1_txe_scb_16B * dest, const uint64_t *source) +{ + dest->qw0 = source[0]; + dest->hdr.qw_16B[0] = source[1]; + dest->hdr.qw_16B[1] = source[2]; + dest->hdr.qw_16B[2] = source[3]; + dest->hdr.qw_16B[3] = source[4]; + dest->hdr.qw_16B[4] = source[5]; + dest->hdr.qw_16B[5] = source[6]; + dest->hdr.qw_16B[6] = source[7]; + dest->hdr.qw_16B[7] = source[8]; + dest->hdr.qw_16B[8] = source[9]; + dest->hdr.qw_16B[9] = source[10]; + dest->hdr.qw_16B[10] = source[11]; + dest->hdr.qw_16B[11] = source[12]; + dest->hdr.qw_16B[12] = source[13]; + dest->hdr.qw_16B[13] = source[14]; + dest->hdr.qw_16B[14] = source[15]; +} + // faster than memcpy() for this amount of data. 
// SCB (PIO or UREG) COPY ONLY (STORE) -static inline void fi_opx_copy_scb(volatile uint64_t dest[8], uint64_t source[8]) +static inline void fi_opx_store_scb_qw(volatile uint64_t dest[8], const uint64_t source[8]) { OPX_HFI1_BAR_STORE(&dest[0], source[0]); OPX_HFI1_BAR_STORE(&dest[1], source[1]); @@ -136,10 +193,67 @@ static inline void fi_opx_copy_scb(volatile uint64_t dest[8], uint64_t source[8] OPX_HFI1_BAR_STORE(&dest[7], source[7]); } + +// Use this to fill out an SCB before the data is copied to local storage. +// (The local copy is usually used for setting up replay buffers or for log +// messages.) +static inline void fi_opx_store_and_copy_scb_9B(volatile uint64_t scb[8], + struct fi_opx_hfi1_txe_scb_9B *local, + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, uint64_t d6, uint64_t d7) +{ + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], d6); + OPX_HFI1_BAR_STORE(&scb[7], d7); + local->qw0 = d0; + local->hdr.qw_9B[0] = d1; + local->hdr.qw_9B[1] = d2; + local->hdr.qw_9B[2] = d3; + local->hdr.qw_9B[3] = d4; + local->hdr.qw_9B[4] = d5; + local->hdr.qw_9B[5] = d6; + local->hdr.qw_9B[6] = d7; +} + // Use this to fill out an SCB before the data is copied to local storage. // (The local copy is usually used for setting up replay buffers or for log // messages.) 
-static inline void fi_opx_set_scb(volatile uint64_t scb[8], uint64_t local[8], +static inline void fi_opx_store_and_copy_scb_16B(volatile uint64_t scb[8], + struct fi_opx_hfi1_txe_scb_16B *local, + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, uint64_t d6, uint64_t d7) +{ + + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], d6); + OPX_HFI1_BAR_STORE(&scb[7], d7); + + local->qw0 = d0; + local->hdr.qw_16B[0] = d1; + local->hdr.qw_16B[1] = d2; + local->hdr.qw_16B[2] = d3; + local->hdr.qw_16B[3] = d4; + local->hdr.qw_16B[4] = d5; + local->hdr.qw_16B[5] = d6; + local->hdr.qw_16B[6] = d7; + +} +// Use this to fill out a payload before the data is copied to local storage. +// (The local copy is usually used for setting up replay buffers or for log +// messages.) +// +// Common to 9B/16B for temporary local storage (generic QW[] scb's) +static inline void fi_opx_store_and_copy_qw(volatile uint64_t scb[8], uint64_t local[8], uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, uint64_t d4, uint64_t d5, uint64_t d6, uint64_t d7) { @@ -193,14 +307,97 @@ void fi_opx_duff_copy(char *to, const char *from, int64_t len) { } } +// Use this to fill out an SCB before the data is copied to local storage. +// (The local copy is usually used for setting up replay buffers or for log +// messages.) 
+// +// Different from fi_opx_store_and_copy_qw because it moves < 1 QW of data +// into the correct qw for 9B headers +static inline void fi_opx_store_and_copy_qw_9B(volatile uint64_t scb[8], + uint64_t local[8], + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, const void *buf, size_t len, uint64_t d7) +{ + // less than a qw to store + local[6] = 0; + // the purpose of this is to quickly copy the contents of buf into + // the 6th DWORD of the SCB and the local copy. + if (len > 7) { + fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); + abort(); + } else if (len > 0) { + fi_opx_duff_copy((char*)&local[6], buf, len); + } + + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], local[6]); + OPX_HFI1_BAR_STORE(&scb[7], d7); + local[0] = d0; + local[1] = d1; + local[2] = d2; + local[3] = d3; + local[4] = d4; + local[5] = d5; +// local[6] = d6; + local[7] = d7; +} + // Use this to fill out an SCB before the data is copied to local storage. // (The local copy is usually used for setting up replay buffers or for log // messages.) // -// This version embeds up to 16 bytes of immediate data into the SCB. +// Different from fi_opx_store_and_copy_qw because it moves < 1 QW of data +// into the correct qw for 16B headers +static inline void fi_opx_store_and_copy_qw_16B(volatile uint64_t scb[8], + uint64_t local[8], + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, uint64_t d6, const void *buf, size_t len) +{ + // less than a qw to store + local[7] = 0; + // the purpose of this is to quickly copy the contents of buf into + // the 7th DWORD of the SCB and the local copy. 
+ if (len > 7) { + fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); + abort(); + } else if (len > 0) { + fi_opx_duff_copy((char*)&local[7], buf, len); + } + + OPX_HFI1_BAR_STORE(&scb[0], d0); + OPX_HFI1_BAR_STORE(&scb[1], d1); + OPX_HFI1_BAR_STORE(&scb[2], d2); + OPX_HFI1_BAR_STORE(&scb[3], d3); + OPX_HFI1_BAR_STORE(&scb[4], d4); + OPX_HFI1_BAR_STORE(&scb[5], d5); + OPX_HFI1_BAR_STORE(&scb[6], d6); + OPX_HFI1_BAR_STORE(&scb[7], local[7]); + local[0] = d0; + local[1] = d1; + local[2] = d2; + local[3] = d3; + local[4] = d4; + local[5] = d5; + local[6] = d6; +// local[7] = d7; +} + +// Use this to fill out a PIO SOP SCB before the data is copied to local +// storage. (The local copy is usually used for setting up replay buffers +// or for log messages.) +// +// These versions embeds up to 16 bytes of immediate data into the SCB. +// Header only - no additional payload expected - +// 9B is one call/one cacheline __OPX_FORCE_INLINE__ -void fi_opx_set_scb_special(volatile uint64_t scb[8], uint64_t local[8], +void fi_opx_store_inject_and_copy_scb_9B(volatile uint64_t scb[8], + uint64_t *local, uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, uint64_t d4, const void *buf, size_t len, uint64_t d7) { @@ -306,36 +503,145 @@ void fi_opx_set_scb_special(volatile uint64_t scb[8], uint64_t local[8], local[7] = d7; } -// Use this to fill out an SCB before the data is copied to local storage. -// (The local copy is usually used for setting up replay buffers or for log -// messages.) -static inline void fi_opx_set_scb_special2(volatile uint64_t scb[8], uint64_t local[8], - uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, - uint64_t d4, uint64_t d5, const void *buf, size_t len, uint64_t d7) +// Use this to fill out a PIO SOP SCB before the data is copied to local +// storage. (The local copy is usually used for setting up replay buffers +// or for log messages.) +// +// These versions embeds up to 16 bytes of immediate data into the SCB. 
+// Header only - no additional payload expected - +// 16B is two calls/two cachelines, second cacheline is padded out +__OPX_FORCE_INLINE__ +void fi_opx_store_inject_and_copy_scb_16B(volatile uint64_t scb[8], + uint64_t *local, + uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3, + uint64_t d4, uint64_t d5, const void *buf, size_t len) { - local[6] = 0; - memcpy((void*)&local[6], buf, len); - OPX_HFI1_BAR_STORE(&scb[0], d0); - OPX_HFI1_BAR_STORE(&scb[1], d1); - OPX_HFI1_BAR_STORE(&scb[2], d2); - OPX_HFI1_BAR_STORE(&scb[3], d3); - OPX_HFI1_BAR_STORE(&scb[4], d4); - OPX_HFI1_BAR_STORE(&scb[5], d5); - OPX_HFI1_BAR_STORE(&scb[6], local[6]); - OPX_HFI1_BAR_STORE(&scb[7], d7); + // the purpose of this is to quickly copy the contents of buf into + // the 5th and 6th DWORDs of the SCB and the local copy. + switch (len) { + case 0: + local[6] = 0; + local[7] = 0; + break; + case 1: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 1); + local[7] = 0; + break; + case 2: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 2); + local[7] = 0; + break; + case 3: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 3); + local[7] = 0; + break; + case 4: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 4); + local[7] = 0; + break; + case 5: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 5); + local[7] = 0; + break; + case 6: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 6); + local[7] = 0; + break; + case 7: + local[6] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 7); + local[7] = 0; + break; + case 8: + local[6] = *((uint64_t*)buf); + local[7] = 0; + break; + case 9: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 9); + break; + case 10: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 10); + break; + case 11: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 11); + break; + case 12: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 12); + break; + case 13: + local[7] = 0; + 
fi_opx_duff_copy((char*)&local[6], buf, 13); + break; + case 14: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 14); + break; + case 15: + local[7] = 0; + fi_opx_duff_copy((char*)&local[6], buf, 15); + break; + case 16: + local[6] = *((uint64_t*)buf); + local[7] = *((uint64_t*)buf+1); + break; + default: + fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); + break; + } + + //OPX_DEBUG_PRINT_PBC_HDR_QW(d0, d1, d2, d3, d4, OPX_HFI1_JKR); + + // 1st cacheline PIO SOP + OPX_HFI1_BAR_STORE(&scb[0], d0); //pbc + OPX_HFI1_BAR_STORE(&scb[1], d1); //lrh + OPX_HFI1_BAR_STORE(&scb[2], d2); //lrh + OPX_HFI1_BAR_STORE(&scb[3], d3); //bth + OPX_HFI1_BAR_STORE(&scb[4], d4); //bth + kdeth + OPX_HFI1_BAR_STORE(&scb[5], d5); //kdeth + OPX_HFI1_BAR_STORE(&scb[6], local[6]); //data 1 + OPX_HFI1_BAR_STORE(&scb[7], local[7]); //data 2 + local[0] = d0; local[1] = d1; local[2] = d2; local[3] = d3; local[4] = d4; local[5] = d5; - // local[6] = d6; - local[7] = d7; + // local[6] + // local[7] +} + +__OPX_FORCE_INLINE__ +void fi_opx_store_inject_and_copy_scb2_16B(volatile uint64_t scb[8], + uint64_t *local, uint64_t d8) +{ + // 2nd cacheline PIO (only) padded out + + OPX_HFI1_BAR_STORE(&scb[0], d8); // tag + OPX_HFI1_BAR_STORE(&scb[1], 0); + OPX_HFI1_BAR_STORE(&scb[2], 0); + OPX_HFI1_BAR_STORE(&scb[3], 0); + OPX_HFI1_BAR_STORE(&scb[4], 0); + OPX_HFI1_BAR_STORE(&scb[5], 0); + OPX_HFI1_BAR_STORE(&scb[6], 0); + OPX_HFI1_BAR_STORE(&scb[7], 0); + + local[8] = d8; } void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, - const void * const hdr, const void * const payload, + const union opx_hfi1_packet_hdr * const hdr, const void * const payload, const uint8_t u8_rx, const uint64_t niov, uintptr_t origin_byte_counter_vaddr, union fi_opx_context *const target_context, @@ -348,11 +654,12 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, uint8_t opcode, const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx); 
+ const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type); union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ep, struct fi_opx_mr * opx_mr, - const void * const hdr, const void * const payload, + const union opx_hfi1_packet_hdr * const hdr, const void * const payload, size_t payload_bytes_to_copy, const uint8_t u8_rx, const uint8_t origin_rs, const uint32_t niov, @@ -366,7 +673,8 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx void (*completion_action)(union fi_opx_hfi1_deferred_work * work_state), const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type ); union fi_opx_hfi1_deferred_work; @@ -437,7 +745,6 @@ struct fi_opx_hfi1_dput_params { }; }; }; - OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_hfi1_dput_params, compare_iov) & 7) == 0, "compare_iov not 8-byte aligned!"); struct fi_opx_hfi1_rx_rzv_rts_params { @@ -543,9 +850,10 @@ union fi_opx_hfi1_deferred_work { int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work); void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint8_t u8_rx, - const uint32_t u32_extended_rx); + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type); int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work *work); int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work *work); @@ -593,17 +901,18 @@ __OPX_FORCE_INLINE__ void fi_opx_force_credit_return(struct fid_ep *ep, fi_addr_t dest_addr, const uint64_t dest_rx, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t pbc_dws 
= 16; + const uint64_t pbc_dws = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 16 : 20; const uint16_t lrh_dws = htons(pbc_dws-1); + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ - const uint64_t force_credit_return = OPX_PBC_CR(0x1); + const uint64_t force_credit_return = OPX_PBC_CR(0x1, hfi1_type); /* * Write the 'start of packet' (hw+sw header) 'send control block' @@ -620,33 +929,67 @@ void fi_opx_force_credit_return(struct fid_ep *ep, * credits will be returned soon naturally anyway, and sending a no-op packet * forcing a credit return would just add unnecessary traffic. */ + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + uint64_t available_credits = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1); - while (OFI_UNLIKELY(available_credits < 1)) { + while (OFI_UNLIKELY(available_credits < credits_needed)) { if (loop++ & 0x10) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; + return; } FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - available_credits = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1); + available_credits = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed); } volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; /* WHY BOTHER? 
TODO: REMOVE */ + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - fi_opx_set_scb(scb, tmp, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | ((uint64_t)FI_OPX_HFI_UD_OPCODE_RELIABILITY_NOOP << 48) - | (uint64_t)FI_OPX_HFI_BTH_OPCODE_UD, - opx_ep->tx->send.hdr.qw[2], - opx_ep->tx->send.hdr.qw[3], - opx_ep->tx->send.hdr.qw[4], - 0, 0); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | ((uint64_t)FI_OPX_HFI_UD_OPCODE_RELIABILITY_NOOP << 48) + | (uint64_t)FI_OPX_HFI_BTH_OPCODE_UD, + opx_ep->tx->send_9B.hdr.qw_9B[2], + opx_ep->tx->send_9B.hdr.qw_9B[3], + opx_ep->tx->send_9B.hdr.qw_9B[4], + 0, 0); + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | ((uint64_t)FI_OPX_HFI_UD_OPCODE_RELIABILITY_NOOP << 48) + | (uint64_t)FI_OPX_HFI_BTH_OPCODE_UD, + opx_ep->tx->send_16B.hdr.qw_16B[3], + opx_ep->tx->send_16B.hdr.qw_16B[4], + opx_ep->tx->send_16B.hdr.qw_16B[5], + 0); + + volatile uint64_t * scb_payload = 
FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_and_copy_qw(scb_payload, local_temp, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL, + 0UL); + } - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + FI_OPX_HFI1_CONSUME_CREDITS(pio_state, credits_needed); opx_ep->tx->pio_state->qw0 = pio_state.qw0; FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); @@ -659,7 +1002,7 @@ uint64_t fi_opx_hfi1_tx_is_intranode(struct fi_opx_ep *opx_ep, const union fi_op the source lid is the same as the destination lid) */ return ((caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) == FI_LOCAL_COMM) || (((caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) == (FI_LOCAL_COMM | FI_REMOTE_COMM)) && - (fi_opx_hfi_is_intranode(addr.uid.lid))); + (opx_lid_is_intranode(addr.uid.lid))); } __OPX_FORCE_INLINE__ @@ -668,14 +1011,16 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, const uint32_t data, int lock_required, const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; const uint64_t bth_rx = dest_rx << 56; - const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(addr.fi); + const uint64_t lrh_dlid_9B = FI_OPX_ADDR_TO_HFI1_LRH_DLID(addr.fi); + uint32_t dlid = htons(lrh_dlid_9B >> 16); if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -683,7 +1028,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-INJECT-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); @@ -705,22 +1050,41 @@ ssize_t 
fi_opx_hfi1_tx_inject (struct fid_ep *ep, } } #endif - hdr->qw[0] = opx_ep->tx->inject.hdr.qw[0] | lrh_dlid; - - hdr->qw[1] = opx_ep->tx->inject.hdr.qw[1] | bth_rx | (len << 48) | - ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT); - hdr->qw[2] = opx_ep->tx->inject.hdr.qw[2]; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->tx->inject_9B.hdr.qw_9B[0] | lrh_dlid_9B; - hdr->qw[3] = opx_ep->tx->inject.hdr.qw[3] | (((uint64_t)data) << 32); - - hdr->qw[4] = 0; - hdr->qw[5] = 0; - fi_opx_hfi1_memcpy8((void*)&hdr->qw[4], buf, len); - - hdr->qw[6] = tag; + hdr->qw_9B[1] = opx_ep->tx->inject_9B.hdr.qw_9B[1] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT); + + hdr->qw_9B[2] = opx_ep->tx->inject_9B.hdr.qw_9B[2]; + + hdr->qw_9B[3] = opx_ep->tx->inject_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + + hdr->qw_9B[4] = 0; + hdr->qw_9B[5] = 0; + fi_opx_hfi1_memcpy8((void*)&hdr->qw_9B[4], buf, len); + + hdr->qw_9B[6] = tag; + } else { + hdr->qw_16B[0] = opx_ep->tx->inject_16B.hdr.qw_16B[0] | + ((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B); + hdr->qw_16B[1] = opx_ep->tx->inject_16B.hdr.qw_16B[1] | + (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? 
/* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT); + + hdr->qw_16B[3] = opx_ep->tx->inject_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->inject_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + hdr->qw_16B[5] = 0; + hdr->qw_16B[6] = 0; + fi_opx_hfi1_memcpy8((void*)&hdr->qw_16B[5], buf, len); + hdr->qw_16B[7] = tag; + } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -738,10 +1102,13 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1)) { + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < + credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; - if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1) { + + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < 1) { return -FI_EAGAIN; } } @@ -751,7 +1118,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, int64_t psn; psn = fi_opx_reliability_get_replay(ep, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-INJECT-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); @@ -779,42 +1146,66 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8] = {0}; + uint64_t local_temp[16] = {0}; + + if (hfi1_type & (OPX_HFI1_WFR | 
OPX_HFI1_JKR_9B)) { + fi_opx_store_inject_and_copy_scb_9B(scb, local_temp, + opx_ep->tx->inject_9B.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_9B, hfi1_type), + opx_ep->tx->inject_9B.hdr.qw_9B[0] | lrh_dlid_9B, + + opx_ep->tx->inject_9B.hdr.qw_9B[1] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT), + + opx_ep->tx->inject_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->inject_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + buf, len, tag); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } else { + // 1st cacheline + fi_opx_store_inject_and_copy_scb_16B(scb, local_temp, + opx_ep->tx->inject_16B.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_9B, hfi1_type), + opx_ep->tx->inject_16B.hdr.qw_16B[0] | ((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B), + opx_ep->tx->inject_16B.hdr.qw_16B[1] | (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), - fi_opx_set_scb_special(scb, tmp, - opx_ep->tx->inject.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->inject.hdr.qw[0] | lrh_dlid, + opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT), - opx_ep->tx->inject.hdr.qw[1] | bth_rx | (len << 48) | - ((caps & FI_MSG) ? 
/* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT), + opx_ep->tx->inject_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->inject_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + buf, len ); - opx_ep->tx->inject.hdr.qw[2] | psn, - opx_ep->tx->inject.hdr.qw[3] | (((uint64_t)data) << 32), - buf, len, tag); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + // 2nd cacheline + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + fi_opx_store_inject_and_copy_scb2_16B(scb2, local_temp, tag ); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } - /* consume one credit */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); /* save the updated txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - replay->scb.qw0 = tmp[0]; - replay->scb.hdr.qw[0] = tmp[1]; - replay->scb.hdr.qw[1] = tmp[2]; - replay->scb.hdr.qw[2] = tmp[3]; - replay->scb.hdr.qw[3] = tmp[4]; - replay->scb.hdr.qw[4] = tmp[5]; - replay->scb.hdr.qw[5] = tmp[6]; - replay->scb.hdr.qw[6] = tmp[7]; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + } else { + fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); + } - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.uid.lid, addr.reliability_rx, dest_rx, psn_ptr, replay, reliability); + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-INJECT-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -883,7 +1274,8 @@ bool fi_opx_hfi1_fill_from_iov8(const struct iovec *iov, /* In: iovec array * return false; } -static 
inline void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required); +static inline void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, + const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_check_credits(struct fi_opx_ep *opx_ep, @@ -912,156 +1304,525 @@ ssize_t fi_opx_hfi1_tx_check_credits(struct fi_opx_ep *opx_ep, return (ssize_t) total_credits_available; } + __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t niov, - size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context, const uint32_t data, int lock_required, - const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, - const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, - const uint64_t hmem_device) +ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, + const struct iovec *iov, size_t niov, + const uint16_t lrh_dws, + const uint64_t lrh_dlid, + const uint64_t bth_rx, + size_t total_len, + const size_t payload_qws_total, + const size_t xfer_bytes_tail, + void *desc, + const union fi_opx_addr *addr, + uint64_t tag, + void *context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { - assert(lock_required == 0); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - const union fi_opx_addr addr = { .fi = dest_addr }; - const size_t xfer_bytes_tail = total_len & 0x07ul; - const size_t payload_qws_total = total_len >> 3; - const size_t payload_qws_tail = payload_qws_total & 0x07ul; - - const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; - const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - uint16_t full_block_credits_needed = (total_len >> 6); - uint16_t 
total_credits_needed = 1 + /* packet header */ - full_block_credits_needed; /* full blocks */ - - if(payload_qws_tail || xfer_bytes_tail) { - total_credits_needed += 1; - } - - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lhr */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - ((total_credits_needed-1) << 4); - - /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - const uint16_t lrh_dws = htons(pbc_dws - 1); struct iovec *iov_ptr = (struct iovec *) iov; size_t *niov_ptr = &niov; - if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- EAGER (begin)\n"); - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-SHM"); - uint64_t pos; - ssize_t rc; - union fi_opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( - &opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, - opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, SHM -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( + &opx_ep->tx->shm, addr->hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, + opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); - if (!hdr) return rc; + if (!hdr) return rc; #ifdef OPX_HMEM - /* Note: This code is duplicated in the internode and intranode + /* Note: This code is duplicated in the internode and intranode paths at points in the code where we know we'll be able to proceed with the send, so that we don't waste cycles doing this, only to EAGAIN because we couldn't get a SHM packet or credits/replay/psn */ - size_t hmem_niov = 1; - struct iovec hmem_iov; + size_t hmem_niov = 1; + struct iovec hmem_iov; - /* If the IOVs are GPU-resident, copy all their data to the HMEM + /* If the IOVs are 
GPU-resident, copy all their data to the HMEM bounce buffer, and then proceed as if we only have a single IOV that points to the bounce buffer. */ - if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; - unsigned iov_total_len = 0; - for (int i = 0; i < niov; ++i) { - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, - &opx_ep->hmem_copy_buf[iov_total_len], - iov[i].iov_base, iov[i].iov_len, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); - iov_total_len += iov[i].iov_len; - } - - hmem_iov.iov_base = opx_ep->hmem_copy_buf; - hmem_iov.iov_len = iov_total_len; - iov_ptr = &hmem_iov; - niov_ptr = &hmem_niov; - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode - .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] - .send.eager_noncontig); - } -#endif - hdr->qw[0] = opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw[1] = opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); - hdr->qw[2] = opx_ep->tx->send.hdr.qw[2]; - hdr->qw[3] = opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32); - hdr->qw[4] = opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48); - - /* Fill QW 5 from the iovec */ - uint8_t *buf = (uint8_t *)&hdr->qw[5]; - ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; - - if (xfer_bytes_tail) { - ssize_t tail_len = xfer_bytes_tail; - remain = total_len - tail_len; - while (false == - fi_opx_hfi1_fill_from_iov8( - iov_ptr, /* In: iovec array */ - *niov_ptr, /* In: total iovecs */ - buf, /* In: target buffer to fill */ - &tail_len, /* In/Out: buffer length to fill */ - &iov_idx, /* In/Out: start index, returns end */ - &iov_base_offset)) { /* In/Out: start offset, returns offset */ - // copy until done; - } - assert(tail_len == 0); + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + unsigned iov_total_len = 0; + for (int i = 0; i < niov; ++i) { + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, + &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + iov_total_len += iov[i].iov_len; } - hdr->qw[6] = tag; - - union fi_opx_hfi1_packet_payload *const payload = - (union fi_opx_hfi1_packet_payload *)(hdr + 1); - buf = payload->byte; - while (false == - fi_opx_hfi1_fill_from_iov8( + hmem_iov.iov_base = opx_ep->hmem_copy_buf; + hmem_iov.iov_len = iov_total_len; + iov_ptr = &hmem_iov; + niov_ptr = &hmem_niov; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager_noncontig); + } +#endif + hdr->qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + hdr->qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + /* Fill QW 5 from the iovec */ + uint8_t *buf = (uint8_t *)&hdr->qw_9B[5]; + ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; + + if (xfer_bytes_tail) { + ssize_t tail_len = xfer_bytes_tail; + remain = total_len - tail_len; + while (false == + fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(tail_len == 0); + } + hdr->qw_9B[6] = tag; + + union fi_opx_hfi1_packet_payload *const payload = + (union fi_opx_hfi1_packet_payload *)(hdr + 1); + + buf = payload->byte; + while (false == + fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(remain == 0); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); + + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, + lock_required, tag, caps); + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, SHM -- EAGER (end)\n"); + return FI_SUCCESS; +} +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t 
niov, + size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + const size_t xfer_bytes_tail = total_len & 0x07ul; + const size_t payload_qws_total = total_len >> 3; + const size_t payload_qws_tail = payload_qws_total & 0x07ul; + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + uint16_t full_block_credits_needed = (total_len >> 6); + uint16_t total_credits_needed = 1 + /* packet header */ + full_block_credits_needed; /* full blocks */ + + if(payload_qws_tail || xfer_bytes_tail) { + total_credits_needed += 1; + } + + const uint64_t pbc_dws = 2 + /* pbc */ + 2 + /* lhr */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + ((total_credits_needed-1) << 4); + + /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 1); + + struct iovec *iov_ptr = (struct iovec *) iov; + size_t *niov_ptr = &niov; + + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + return fi_opx_hfi1_tx_sendv_egr_intranode(ep, + iov, niov, + lrh_dws, + lrh_dlid, + bth_rx, + total_len, + payload_qws_total, + xfer_bytes_tail, + desc, + &addr, + tag, + context, + data, + lock_required, + dest_rx, + caps, + do_cq_completion, + iface, + hmem_device, + hfi1_type); + } + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, HFI -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-HFI"); 
+ + // Even though we're using the reliability service to pack this buffer + // we still want to make sure it will have enough credits available to send + // and allow the user to poll and quiesce the fabric some + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < 0)) { + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(ep, &opx_ep->reliability->state, addr.uid.lid, + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if(OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } + +#ifdef OPX_HMEM + size_t hmem_niov = 1; + struct iovec hmem_iov; + + /* If the IOVs are GPU-resident, copy all their data to the HMEM + bounce buffer, and then proceed as if we only have a single IOV + that points to the bounce buffer. */ + if (iface != FI_HMEM_SYSTEM) { + unsigned iov_total_len = 0; + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + for (int i = 0; i < niov; ++i) { + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + iov_total_len += iov[i].iov_len; + } + + hmem_iov.iov_base = opx_ep->hmem_copy_buf; + hmem_iov.iov_len = iov_total_len; + iov_ptr = &hmem_iov; + niov_ptr = &hmem_niov; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager_noncontig); + } +#endif + ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; + + OPX_NO_16B_SUPPORT(hfi1_type); + + replay->scb_9B.qw0 = opx_ep->tx->send_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + replay->scb_9B.hdr.qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + replay->scb_9B.hdr.qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + replay->scb_9B.hdr.qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2] | psn; + replay->scb_9B.hdr.qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + replay->scb_9B.hdr.qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + if (xfer_bytes_tail) { + ssize_t tail_len = xfer_bytes_tail; + remain = total_len - tail_len; + while (false == + fi_opx_hfi1_fill_from_iov8( iov_ptr, /* In: iovec array */ *niov_ptr, /* In: total iovecs */ - buf, /* In: target buffer to fill */ - &remain, /* In/Out: buffer length to fill */ + &replay->scb_9B.hdr.qw_9B[5], /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ &iov_idx, /* In/Out: start index, returns end */ &iov_base_offset)) { /* In/Out: start offset, returns offset */ // copy until done; } - assert(remain == 0); - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); - fi_opx_shm_poll_many(&opx_ep->ep_fid, 0); + assert(tail_len == 0); + } + replay->scb_9B.hdr.qw_9B[6] = tag; + + remain = total_len - xfer_bytes_tail; + uint64_t *payload = replay->payload; + while (false == + fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + payload, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start 
index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } - if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, + fi_opx_reliability_client_replay_register_no_update( + &opx_ep->reliability->state, addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); + + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } + + fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV, HFI -- EAGER (end)\n"); + + + return FI_SUCCESS; +} + + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, + const struct iovec *iov, size_t niov, + const uint16_t lrh_qws, + const uint64_t lrh_dlid, + const uint64_t bth_rx, + size_t total_len, + const size_t payload_qws_total, + const size_t xfer_bytes_tail, + void *desc, + const union fi_opx_addr *addr, + uint64_t tag, + void *context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + + struct iovec *iov_ptr = (struct iovec *) iov; + size_t *niov_ptr = &niov; + + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV 16B, SHM -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( + &opx_ep->tx->shm, addr->hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, + opx_ep->daos_info.rank, 
opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) return rc; + +#ifdef OPX_HMEM + /* Note: This code is duplicated in the internode and intranode + paths at points in the code where we know we'll be able to + proceed with the send, so that we don't waste cycles doing + this, only to EAGAIN because we couldn't get a SHM packet + or credits/replay/psn */ + size_t hmem_niov = 1; + struct iovec hmem_iov; + + /* If the IOVs are GPU-resident, copy all their data to the HMEM + bounce buffer, and then proceed as if we only have a single IOV + that points to the bounce buffer. */ + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + unsigned iov_total_len = 0; + for (int i = 0; i < niov; ++i) { + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, + &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + iov_total_len += iov[i].iov_len; } - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); - FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + hmem_iov.iov_base = opx_ep->hmem_copy_buf; + hmem_iov.iov_len = iov_total_len; + iov_ptr = &hmem_iov; + niov_ptr = &hmem_niov; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager_noncontig); + } +#endif + hdr->qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20); + hdr->qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? 
/* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + hdr->qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); + + /* Fill QW 6 from the iovec */ + uint8_t *buf = (uint8_t *)&hdr->qw_16B[6]; + ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; + + if (xfer_bytes_tail) { + ssize_t tail_len = xfer_bytes_tail; + remain = total_len - tail_len; + while (false == + fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(tail_len == 0); + } + hdr->qw_16B[7] = tag; + + union fi_opx_hfi1_packet_payload *const payload = + (union fi_opx_hfi1_packet_payload *)(hdr + 1); + + buf = payload->byte; + while (false == + fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ + // copy until done; + } + assert(remain == 0); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); + + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, + lock_required, tag, caps); + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SENDV 16B, SHM -- EAGER (end)\n"); + return FI_SUCCESS; +} + + +__OPX_FORCE_INLINE__ +ssize_t 
fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, + size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + const size_t xfer_bytes_tail = total_len & 0x07ul; + const size_t payload_qws_total = total_len >> 3; + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + const uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + /* 16B PBC is dws */ + const uint64_t pbc_dws = + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth */ + + /* PIO is everything else */ + 2 + /* kdeth9 remaining 2 dws */ + //--------------------- header split point KDETH 9 DWS + (payload_qws_total << 1) + /* one packet payload */ + 2 ; /* tail 1 qws/2 dws */ + + const uint16_t total_credits_needed = (pbc_dws + 15 ) >> 4; /* round up to full blocks */ + + /* 16B LRH is qws */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + + struct iovec *iov_ptr = (struct iovec *) iov; + size_t *niov_ptr = &niov; + + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + return fi_opx_hfi1_tx_sendv_egr_intranode_16B(ep, + iov, niov, + lrh_qws, + lrh_dlid_16B, + bth_rx, + payload_qws_total, + total_len, + xfer_bytes_tail, + desc, + &addr, + tag, + context, + data, + lock_required, + 
dest_rx, + caps, + do_cq_completion, + iface, + hmem_device, + hfi1_type); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- EAGER (begin)\n"); + "===================================== SENDV 16B, HFI -- EAGER (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-EAGER-HFI"); // Even though we're using the reliability service to pack this buffer // we still want to make sure it will have enough credits available to send // and allow the user to poll and quiesce the fabric some union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < 0)) { return -FI_ENOBUFS; @@ -1072,7 +1833,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz int64_t psn; psn = fi_opx_reliability_get_replay(ep, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -1105,17 +1866,21 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz #endif ssize_t remain = total_len, iov_idx = 0, iov_base_offset = 0; - replay->scb.qw0 = opx_ep->tx->send.qw0 | - OPX_PBC_LEN(pbc_dws) | - OPX_PBC_CR(opx_ep->tx->force_credit_return) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid); - replay->scb.hdr.qw[0] = opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - replay->scb.hdr.qw[1] = opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | + OPX_NO_9B_SUPPORT(hfi1_type); + + replay->scb_16B.qw0 = opx_ep->tx->send_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid; //OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_16B, hfi1_type); + + replay->scb_16B.hdr.qw_16B[0] = 
opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); + replay->scb_16B.hdr.qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] |((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + replay->scb_16B.hdr.qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); - replay->scb.hdr.qw[2] = opx_ep->tx->send.hdr.qw[2] | psn; - replay->scb.hdr.qw[3] = opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32); - replay->scb.hdr.qw[4] = opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48); + replay->scb_16B.hdr.qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3] | psn; + replay->scb_16B.hdr.qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + replay->scb_16B.hdr.qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); if (xfer_bytes_tail) { ssize_t tail_len = xfer_bytes_tail; remain = total_len - tail_len; @@ -1123,7 +1888,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz fi_opx_hfi1_fill_from_iov8( iov_ptr, /* In: iovec array */ *niov_ptr, /* In: total iovecs */ - &replay->scb.hdr.qw[5], /* In: target buffer to fill */ + &replay->scb_16B.hdr.qw_16B[6], /* In: target buffer to fill */ &tail_len, /* In/Out: buffer length to fill */ &iov_idx, /* In/Out: start index, returns end */ &iov_base_offset)) { /* In/Out: start offset, returns offset */ @@ -1131,7 +1896,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz } assert(tail_len == 0); } - replay->scb.hdr.qw[6] = tag; + replay->scb_16B.hdr.qw_16B[7] = tag; remain = total_len - xfer_bytes_tail; uint64_t *payload = replay->payload; @@ -1147,8 +1912,8 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz } 
fi_opx_reliability_client_replay_register_no_update( - &opx_ep->reliability->state, addr.uid.lid, addr.reliability_rx, - dest_rx, psn_ptr, replay, reliability); + &opx_ep->reliability->state, addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); if (OFI_LIKELY(do_cq_completion)) { fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, @@ -1161,27 +1926,86 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- EAGER (end)\n"); + "===================================== SENDV 16B, HFI -- EAGER (end)\n"); return FI_SUCCESS; } + __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, - const void *buf, - size_t len, - void *desc, - fi_addr_t dest_addr, - uint64_t tag, - void * context, - const uint32_t data, - int lock_required, - const uint64_t dest_rx, - const uint64_t caps, +ssize_t fi_opx_hfi1_tx_sendv_egr_select(struct fid_ep *ep, + const struct iovec *iov, size_t niov, size_t total_len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + return fi_opx_hfi1_tx_sendv_egr(ep, + iov, niov, total_len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + return fi_opx_hfi1_tx_sendv_egr_16B(ep, + iov, niov, total_len, desc, + dest_addr, tag, context, + data, 
lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + return fi_opx_hfi1_tx_sendv_egr(ep, + iov, niov, total_len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR_9B); + } + abort(); + return (ssize_t)-1L; +} + + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, + const void *buf, + size_t len, + void *desc, + fi_addr_t dest_addr, + uint64_t tag, + void * context, + const uint32_t data, + int lock_required, + const uint64_t dest_rx, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; @@ -1206,7 +2030,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); @@ -1222,36 +2046,144 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, if (iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, - buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.eager); } #endif - hdr->qw[0] = opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw[1] = opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + hdr->qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + hdr->qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + + /* only if is_contiguous */ + if (OFI_LIKELY(len > 7)) { + /* safe to blindly qw-copy the first portion of the source buffer */ + hdr->qw_9B[5] = *((uint64_t *)buf); + } else { + hdr->qw_9B[5] = 0; + memcpy((void*)&hdr->qw_9B[5], buf, xfer_bytes_tail); + } + + hdr->qw_9B[6] = tag; + + union fi_opx_hfi1_packet_payload * const payload = + (union fi_opx_hfi1_packet_payload *)(hdr+1); + + memcpy((void*)payload->byte, + (const void *)((uintptr_t)buf + xfer_bytes_tail), + payload_qws_total * sizeof(uint64_t)); + + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); + + if (do_cq_completion) { + fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, + tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-SHM"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, SHM -- EAGER (end)\n"); + + return FI_SUCCESS; + +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, + const void *buf, + size_t len, + void *desc, + fi_addr_t dest_addr, + uint64_t tag, + void * context, + const uint32_t data, + int 
lock_required, + const uint64_t dest_rx, + const uint64_t caps, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device) +{ + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; - hdr->qw[2] = opx_ep->tx->send.hdr.qw[2]; + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = htons(dest_addr >> 40); + + const size_t xfer_bytes_tail = len & 0x07ul; + const size_t payload_qws_total = len >> 3; - hdr->qw[3] = opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32); + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth + unused */ + 2 + /* second cacheline */ + ((payload_qws_total) << 1) + + 2; //ICRC + Tail - hdr->qw[4] = opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48); + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, SHM -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr * const hdr = + opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, + opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-EAGER-SHM"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, SHM -- EAGER (end) - No packet available.\n"); + return rc; + } + +#ifdef OPX_HMEM + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.intranode + .kind[(caps & 
FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager); + } +#endif + + hdr->qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); + hdr->qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + hdr->qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); /* only if is_contiguous */ if (OFI_LIKELY(len > 7)) { /* safe to blindly qw-copy the first portion of the source buffer */ - hdr->qw[5] = *((uint64_t *)buf); + hdr->qw_16B[6] = *((uint64_t *)buf); } else { - hdr->qw[5] = 0; - memcpy((void*)&hdr->qw[5], buf, xfer_bytes_tail); + hdr->qw_16B[6] = 0; + memcpy((void*)&hdr->qw_16B[6], buf, xfer_bytes_tail); } - hdr->qw[6] = tag; + hdr->qw_16B[7] = tag; union fi_opx_hfi1_packet_payload * const payload = (union fi_opx_hfi1_packet_payload *)(hdr+1); @@ -1269,7 +2201,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, SHM -- EAGER (end)\n"); + "===================================== SEND 16B, SHM -- EAGER (end)\n"); return FI_SUCCESS; @@ -1278,11 +2210,11 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, 
const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, - const uint16_t lrh_dws, + const uint16_t lrh_packet_length, /* 9B dws, 16B qws and little/big-endian as required */ const uint64_t pbc_dlid, const uint64_t pbc_dws, const ssize_t len, @@ -1291,7 +2223,8 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, const uint32_t psn, const uint32_t data, const uint64_t tag, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { /* * Write the 'start of packet' (hw+sw header) 'send control block' @@ -1302,46 +2235,129 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, /* only if is_contiguous */ if (OFI_LIKELY(len > 7)) { - /* safe to blindly qw-copy the first portion of the source buffer */ - fi_opx_set_scb(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48), - *((uint64_t *)buf), tag); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + /* safe to blindly qw-copy the first portion of the source buffer */ + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_packet_length << 32), + + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? 
+ (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48), + *((uint64_t *)buf), tag); + + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_packet_length << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? /* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48), + *((uint64_t *)buf)); + } } else { - fi_opx_set_scb_special2(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw_9B(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_packet_length << 32), + + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? 
+ (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48), + buf, xfer_bytes_tail, tag); + + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw_16B(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_packet_length << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48), + buf, + xfer_bytes_tail); - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? 
- (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), - - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->send.hdr.qw[4] | (payload_qws_total << 48), - buf, xfer_bytes_tail, tag); + } } FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); return 1; // Consumed 1 credit + +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep, + union fi_opx_hfi1_pio_state *pio_state, + uint64_t *local_storage, + uint64_t *buf_qws, + const size_t hdr_and_payload_qws, + const uint64_t tag) +{ + assert(pio_state->credits_total - pio_state->scb_head_index); + assert(hdr_and_payload_qws <= 8); + + union fi_opx_hfi1_pio_state pio_local = *pio_state; + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_local); + + // spill from 1st cacheline (SOP) + OPX_HFI1_BAR_STORE(&scb_payload[0], tag); // header + local_storage[8] = tag; /* todo: pretty sure it's already there */ + + int i; + + for (i = 1; i < hdr_and_payload_qws ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], buf_qws[i-1]); + local_storage[8 + i] = buf_qws[i-1]; + } + if (hdr_and_payload_qws < 8) { /* less than a full block stored? 
pad it out */ + for (; i<8 ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], -1UL); + local_storage[8 + i] = -1UL; + } + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + FI_OPX_HFI1_CONSUME_CREDITS(pio_local, 1); + pio_state->qw0 = pio_local.qw0; + return 1; + } + + __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, +ssize_t fi_opx_hfi1_tx_egr_store_full_payload_blocks(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, uint64_t *buf_qws, uint16_t full_block_credits_needed, @@ -1375,6 +2391,7 @@ ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); scb_payload += 8; buf_qws += 8; + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); } FI_OPX_HFI1_CONSUME_CREDITS(pio_local, contiguous_full_blocks_to_write); @@ -1402,6 +2419,7 @@ ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); scb_payload += 8; buf_qws += 8; + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); } FI_OPX_HFI1_CONSUME_CREDITS(pio_local, full_block_credits_needed); @@ -1414,45 +2432,49 @@ ssize_t fi_opx_hfi1_tx_egr_write_full_payload_blocks(struct fi_opx_ep *opx_ep, } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_egr_write_payload_tail(struct fi_opx_ep *opx_ep, +ssize_t fi_opx_hfi1_tx_egr_store_payload_tail(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, uint64_t *buf_qws, const size_t payload_qws_tail) { volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, *pio_state); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "buf_qws %p, payload_qws_tail %zu\n", + buf_qws, payload_qws_tail); + unsigned i = 0; for (; iscb.qw0 = tmp[0]; - replay->scb.hdr.qw[0] = tmp[1]; - replay->scb.hdr.qw[1] = tmp[2]; - replay->scb.hdr.qw[2] = tmp[3]; - replay->scb.hdr.qw[3] = tmp[4]; - replay->scb.hdr.qw[4] 
= tmp[5]; - replay->scb.hdr.qw[5] = tmp[6]; - replay->scb.hdr.qw[6] = tmp[7]; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_source); + else + fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_source); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); uint64_t * payload = replay->payload; @@ -1461,9 +2483,9 @@ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, payload[i] = buf_qws[i]; } - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.uid.lid, + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, addr.hfi1_rx, psn_ptr, replay, - reliability); + reliability, hfi1_type); } __OPX_FORCE_INLINE__ @@ -1477,51 +2499,216 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + + OPX_NO_16B_SUPPORT(hfi1_type); + + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + return fi_opx_hfi1_tx_send_egr_intranode(ep, buf, len, desc, dest_addr, + tag, context, data, lock_required, dest_rx, caps, do_cq_completion, + iface, hmem_device); + } + + const size_t xfer_bytes_tail = len & 0x07ul; + const size_t payload_qws_total = len >> 3; + + const size_t payload_qws_tail = payload_qws_total & 0x07ul; + + uint16_t full_block_credits_needed = (uint16_t)(payload_qws_total >> 3); + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + + assert(hfi1_type != OPX_HFI1_JKR); + /* 9B PBC is dws */ + const uint64_t pbc_dws = + 
/* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 2 + /* lhr */ + 3 + /* bth */ + 3 + /* kdeth */ + 6 + /* software kdeth */ + //--------------------- header split point KDETH 9 DWS + + /* PIO is everything else */ + (payload_qws_total << 1); /* one packet payload */ + + /* 9B LRH is dws */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + + assert(lock_required == 0); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, HFI -- EAGER (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-HFI"); + + /* first check for sufficient credits to inject the entire packet */ + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + uint16_t total_credits_needed = + 1 + /* PIO SOP -- 1 credit */ + full_block_credits_needed + /* PIO full blocks -- payload */ + (payload_qws_tail > 0); /* PIO partial block -- 1 credit */ + + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < 0)) { + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int32_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } + +#ifdef OPX_HMEM + if (iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf = opx_ep->hmem_copy_buf; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.eager); + } +#endif + + uint64_t local_temp[16] = {0}; +#ifndef NDEBUG + unsigned credits_consumed = +#endif + fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_dws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, + payload_qws_total, psn, data, tag, caps, hfi1_type); + + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); + + if (OFI_LIKELY(full_block_credits_needed)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, full_block_credits_needed, + total_credits_available - 1); + } + + if (OFI_LIKELY(payload_qws_tail)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws + (full_block_credits_needed << 3), + payload_qws_tail); + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + +#ifndef NDEBUG + assert(credits_consumed == total_credits_needed); +#endif + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, + xfer_bytes_tail, local_temp, buf, payload_qws_total, reliability, hfi1_type); + + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_inject_completion(ep, context, len, + lock_required, tag, caps); + } + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, HFI -- EAGER (end)\n"); + + return FI_SUCCESS; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, + const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void* context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t 
do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; + OPX_NO_9B_SUPPORT(hfi1_type); + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - return fi_opx_hfi1_tx_send_egr_intranode(ep, buf, len, desc, dest_addr, + return fi_opx_hfi1_tx_send_egr_intranode_16B(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, dest_rx, caps, do_cq_completion, iface, hmem_device); } const size_t xfer_bytes_tail = len & 0x07ul; const size_t payload_qws_total = len >> 3; - const size_t payload_qws_tail = payload_qws_total & 0x07ul; - uint16_t full_block_credits_needed = (uint16_t)(payload_qws_total >> 3); + /* 16B (RcvPktCtrl=9) has 1 QW of KDETH and 1 QW of tail in PIO (non-SOP) */ + const size_t kdeth9_qws_total = 1; + const size_t tail_qws_total = 1; + + + /* Full 64 byte/8 qword blocks -- 1 credit per block */ + uint16_t full_block_credits_needed = (uint16_t)((kdeth9_qws_total + payload_qws_total + tail_qws_total) >> 3); + /* Remaining tail qwords (< 8) after full blocks */ + size_t tail_partial_block_qws = (kdeth9_qws_total + payload_qws_total + tail_qws_total) & 0x07ul; const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid); + const uint64_t pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + assert(hfi1_type & OPX_HFI1_JKR); + /* 16B PBC is dws */ const uint64_t pbc_dws = - 2 + /* pbc */ - 2 + /* lhr */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (payload_qws_total << 1); - - const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* 
software kdeth */ + + /* PIO is everything else */ + (kdeth9_qws_total << 1) + /* kdeth9 remaining 2 dws */ + //--------------------- header split point KDETH 9 DWS + (payload_qws_total << 1) + /* one packet payload */ + (tail_qws_total << 1) ; /* tail 1 qws/2 dws */ + + /* 16B LRH is qws */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ assert(lock_required == 0); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- EAGER (begin)\n"); + "===================================== SEND 16B, HFI -- EAGER (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-EAGER-HFI"); /* first check for sufficient credits to inject the entire packet */ - union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - - const uint16_t total_credits_needed = - 1 + /* packet header */ - full_block_credits_needed + /* full payload blocks */ - (payload_qws_tail > 0); /* partial payload block */ + uint16_t total_credits_needed = + 1 + /* PIO SOP -- 1 credit */ + full_block_credits_needed + /* PIO full blocks -- kdeth9/payload/tail */ + (tail_partial_block_qws > 0); /* PIO partial block -- 1 credit */ ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < 0)) { @@ -1533,7 +2720,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, int32_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -1542,7 +2729,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, if (iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, - buf, len, 
OPX_HMEM_DEV_REG_SEND_THRESHOLD); + buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] @@ -1550,32 +2737,48 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, } #endif - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; #ifndef NDEBUG unsigned credits_consumed = #endif - fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, tmp, buf, bth_rx, lrh_dlid, - lrh_dws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, - payload_qws_total, psn, data, tag, caps); + fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_qws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, + payload_qws_total, psn, data, tag, caps, hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); + assert(hfi1_type & OPX_HFI1_JKR); + + /* write one block of PIO non-SOP, either one full block (8 qws) or the partial qws/block */ + const size_t first_block_qws = full_block_credits_needed ? 
8 : tail_partial_block_qws ; +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(opx_ep, &pio_state, local_temp, buf_qws, + first_block_qws, tag); + + buf_qws = buf_qws + first_block_qws - 1 /* not the kdeth qword */; + /* adjust full or partial for what we just consumed */ + if (full_block_credits_needed) full_block_credits_needed--; + else tail_partial_block_qws = 0; + + if (OFI_LIKELY(full_block_credits_needed)) { #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, buf_qws, full_block_credits_needed, - total_credits_available - 1); + total_credits_available - 2); } - if (OFI_LIKELY(payload_qws_tail)) { + if (OFI_LIKELY(tail_partial_block_qws)) { #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_payload_tail(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, buf_qws + (full_block_credits_needed << 3), - payload_qws_tail); + tail_partial_block_qws); } FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); @@ -1588,7 +2791,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - xfer_bytes_tail, tmp, buf, payload_qws_total, reliability); + xfer_bytes_tail, local_temp, buf, payload_qws_total, reliability, hfi1_type); if (OFI_LIKELY(do_cq_completion)) { fi_opx_ep_tx_cq_inject_completion(ep, context, len, @@ -1597,11 +2800,71 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- EAGER (end)\n"); + "===================================== SEND 16B, HFI -- EAGER (end)\n"); return FI_SUCCESS; } +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_egr_select(struct fid_ep *ep, 
+ const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void* context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + return fi_opx_hfi1_tx_send_egr(ep, + buf, len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_WFR); + } else if (hfi1_type & OPX_HFI1_JKR) { + return fi_opx_hfi1_tx_send_egr_16B(ep, + buf, len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + return fi_opx_hfi1_tx_send_egr(ep, + buf, len, desc, + dest_addr, tag, context, + data, lock_required, + override_flags, tx_op_flags, + dest_rx, + caps, + reliability, + do_cq_completion, + iface, + hmem_device, + OPX_HFI1_JKR_9B); + } + abort(); + return (ssize_t)-1L; +} + + + /* * Write the initial packet header of a multi-packet eager send. This will include the size of * the entire multi-packet eager payload. 
@@ -1609,7 +2872,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, @@ -1620,7 +2883,8 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ const uint32_t psn, const uint32_t data, const uint64_t tag, - const uint64_t caps) + const uint64_t caps, + const enum opx_hfi1_type hfi1_type) { /* * Write the 'start of packet' (hw+sw header) 'send control block' @@ -1631,18 +2895,39 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ /* For a multi-packet eager, the *first* packet's payload length should always be > 15 bytes, so we should be safe to blindly copy 2 qws out of buf */ - fi_opx_set_scb(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST), - opx_ep->tx->send.hdr.qw[2] | (payload_bytes_total << 32) | psn, - opx_ep->tx->send.hdr.qw[3] | (((uint64_t)data) << 32), - *((uint64_t *)buf), - *((uint64_t *)buf + 1), tag); + opx_ep->tx->send_9B.hdr.qw_9B[2] | (payload_bytes_total << 32) | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + *((uint64_t *)buf), + *((uint64_t *)buf + 1), tag); + } else { + uint32_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | + ((caps & FI_MSG) ? 
/* compile-time constant expression */ + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST), + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn | (payload_bytes_total << 32), + opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + *((uint64_t *)buf), + *((uint64_t *)buf + 1)); + } + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -1650,6 +2935,36 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ return 1; /* Consumed 1 credit */ } +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(struct fi_opx_ep *opx_ep, + union fi_opx_hfi1_pio_state *pio_state, + uint64_t *local_storage, + const uint64_t tag, + uint64_t *buf_qws) +{ + union fi_opx_hfi1_pio_state pio_local = *pio_state; + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_local); + + // spill from 1st cacheline (SOP) + OPX_HFI1_BAR_STORE(&scb_payload[0], tag); // header + local_storage[8] = tag; /* todo: pretty sure it's already there */ + + int i; + + for (i = 1; i <= 7 ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], buf_qws[i-1]); + local_storage[8 + i] = buf_qws[i-1]; + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + FI_OPX_HFI1_CONSUME_CREDITS(pio_local, 1); + pio_state->qw0 = pio_local.qw0; + return 1; + +} + /* * Write the nth packet header of a multi-packet eager send where the remaining payload data is * more than 16 bytes. 
This means we'll use all 16 bytes of tail space in the packet header, and @@ -1658,7 +2973,7 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, @@ -1668,20 +2983,40 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, const ssize_t xfer_bytes_tail, const uint32_t payload_offset, const uint32_t psn, - const uint32_t mp_egr_uid) + const uint32_t mp_egr_uid, + const enum opx_hfi1_type hfi1_type) { volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, *pio_state); - fi_opx_set_scb(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3], - *((uint64_t *)buf), - *((uint64_t *)buf + 1), - (((uint64_t) mp_egr_uid) << 32) | payload_offset); + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3], + *((uint64_t *)buf), + *((uint64_t *)buf + 1), + (((uint64_t) mp_egr_uid) << 32) | payload_offset); + } else { + uint32_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + 
fi_opx_store_and_copy_qw(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4], + *((uint64_t *)buf), + *((uint64_t *)buf + 1)); + } FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -1696,7 +3031,7 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, - uint64_t local_target[8], + uint64_t *local_storage, const void *buf, const uint64_t bth_rx, const uint64_t lrh_dlid, @@ -1706,44 +3041,79 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(struct fi_opx_e const ssize_t xfer_bytes_tail, const uint32_t payload_offset, const uint32_t psn, - const uint32_t mp_egr_uid) + const uint32_t mp_egr_uid, + const enum opx_hfi1_type hfi1_type) { volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, *pio_state); - fi_opx_set_scb_special(scb, local_target, - opx_ep->tx->send.qw0 | OPX_PBC_LEN(pbc_dws) | OPX_PBC_CR(opx_ep->tx->force_credit_return) | - pbc_dlid, - opx_ep->tx->send.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->send.hdr.qw[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, - opx_ep->tx->send.hdr.qw[2] | psn, - opx_ep->tx->send.hdr.qw[3], - buf, 
xfer_bytes_tail, - (((uint64_t) mp_egr_uid) << 32) | payload_offset); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_inject_and_copy_scb_9B(scb, local_storage, + opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->send_9B.hdr.qw_9B[3], + buf, xfer_bytes_tail, + (((uint64_t) mp_egr_uid) << 32) | payload_offset); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + return 1; /* Consumed 1 credit */ + } else { + uint32_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); - FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + // 1st cacheline + fi_opx_store_inject_and_copy_scb_16B(scb, local_storage, + opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + pbc_dlid, + opx_ep->tx->send_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->send_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, + opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->send_16B.hdr.qw_16B[4], + buf, xfer_bytes_tail); - return 1; /* Consumed 1 credit */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); + + // 2nd cacheline + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, *pio_state); + + fi_opx_store_inject_and_copy_scb2_16B(scb2, local_storage, (((uint64_t) 
mp_egr_uid) << 32) | payload_offset ); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); + + opx_ep->tx->pio_state->qw0 = pio_state->qw0; + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + return 2; /* Consumed 2 credit */ + } } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, - void **buf, - const uint64_t payload_bytes_total, - const void *desc, - uint8_t *hmem_bounce_buf, - const uint64_t pbc_dlid, - const uint64_t bth_rx, - const uint64_t lrh_dlid, - const union fi_opx_addr addr, - uint64_t tag, - const uint32_t data, - int lock_required, - const uint64_t caps, - const enum ofi_reliability_kind reliability, - uint32_t *psn_out, - const enum fi_hmem_iface iface, - const uint64_t hmem_device) +ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common(struct fi_opx_ep *opx_ep, + void **buf, + const uint64_t payload_bytes_total, + const void *desc, + uint8_t *hmem_bounce_buf, + const uint64_t pbc_dlid, + const uint64_t bth_rx, + const uint64_t lrh_dlid, + const union fi_opx_addr addr, + uint64_t tag, + const uint32_t data, + int lock_required, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + uint32_t *psn_out, + const enum fi_hmem_iface iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -1752,6 +3122,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-FIRST-HFI"); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); if (OFI_UNLIKELY(total_credits_available < 0)) { return -FI_ENOBUFS; @@ -1763,12 +3134,12 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + addr.hfi1_rx, 
addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } - *psn_out = psn; /* This will be the UID used in the remaining packets */ + *psn_out = psn; /* This will be the UID used in the remaining packets */ #ifdef OPX_HMEM /* If the source buf resides in GPU memory, copy the entire payload to @@ -1778,40 +3149,74 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, if (iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, hmem_bounce_buf, - *buf, payload_bytes_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + *buf, payload_bytes_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); *buf = hmem_bounce_buf; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi - .kind[FI_OPX_KIND_TAG] - .send.mp_eager); + .kind[FI_OPX_KIND_TAG] + .send.mp_eager); } #endif void *buf_ptr = *buf; - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; + + const uint16_t lrh_dws = (hfi1_type & OPX_HFI1_JKR) ? 
(FI_OPX_MP_EGR_CHUNK_DWS - 2) >> 1 : htons(FI_OPX_MP_EGR_CHUNK_DWS - 1); #ifndef NDEBUG unsigned credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(opx_ep, &pio_state, tmp, buf_ptr, bth_rx, lrh_dlid, - htons(FI_OPX_MP_EGR_CHUNK_DWS - 1), - pbc_dlid, - FI_OPX_MP_EGR_CHUNK_DWS, - payload_bytes_total, - psn, - data, - tag, - caps); + fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(opx_ep, &pio_state, local_temp, buf_ptr, bth_rx, lrh_dlid, + lrh_dws, + pbc_dlid, + FI_OPX_MP_EGR_CHUNK_DWS, + payload_bytes_total, + psn, + data, + tag, + caps, + hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf_ptr + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); + if (hfi1_type & OPX_HFI1_JKR) { + /* write header and payload */ + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, tag, buf_qws); + + buf_qws = buf_qws + 7; + + uint32_t full_block_credits_needed = FI_OPX_MP_EGR_CHUNK_CREDITS - 3; // the last block needs to include icrc, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, - buf_qws, - FI_OPX_MP_EGR_CHUNK_CREDITS - 1, - total_credits_available - 1); + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + full_block_credits_needed, + total_credits_available - 2); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + buf_qws = buf_qws + (full_block_credits_needed << 3); + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws, + 7 ); // 7 QW data + 1 QW ICRC + } else { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + FI_OPX_MP_EGR_CHUNK_CREDITS - 1, + total_credits_available - 1); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + } + #ifndef NDEBUG assert(credits_consumed == FI_OPX_MP_EGR_CHUNK_CREDITS); #endif @@ -1823,9 
+3228,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, tmp, buf_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS, reliability); - + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf_ptr, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-FIRST-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, HFI -- MULTI-PACKET EAGER FIRST (end)\n"); @@ -1843,7 +3247,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, const uint64_t lrh_dlid, const union fi_opx_addr addr, int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -1852,6 +3257,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-HFI"); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); if (OFI_UNLIKELY(total_credits_available < 0)) { OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-HFI"); @@ -1863,23 +3269,24 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, int32_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; #ifndef NDEBUG unsigned credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, tmp, buf, bth_rx, lrh_dlid, 
+ fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, htons(FI_OPX_MP_EGR_CHUNK_DWS - 1), pbc_dlid, FI_OPX_MP_EGR_CHUNK_DWS, FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, payload_offset, psn, - mp_egr_uid); + mp_egr_uid, + hfi1_type); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); @@ -1887,7 +3294,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, buf_qws, FI_OPX_MP_EGR_CHUNK_CREDITS - 1, total_credits_available - 1); @@ -1902,8 +3309,107 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, tmp, buf, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS, reliability); + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-HFI"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND, HFI -- MULTI-PACKET EAGER NTH (end)\n"); + + return FI_SUCCESS; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_mp_egr_nth_16B (struct fi_opx_ep *opx_ep, + const void *buf, + const uint32_t payload_offset, + const uint32_t mp_egr_uid, + const uint64_t pbc_dlid, + const uint64_t bth_rx, + const uint64_t lrh_dlid, + const union fi_opx_addr addr, + int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER NTH (begin)\n"); + 
OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-HFI"); + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); + if (OFI_UNLIKELY(total_credits_available < 0)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-HFI"); + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int32_t psn; + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } + + uint64_t local_temp[16] = {0}; +#ifndef NDEBUG + unsigned credits_consumed = +#endif + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + (FI_OPX_MP_EGR_CHUNK_DWS - 2) >> 1, + pbc_dlid, + FI_OPX_MP_EGR_CHUNK_DWS, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, + payload_offset, + psn, + mp_egr_uid, + hfi1_type); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); + + /* header and payload */ +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, + (((uint64_t) mp_egr_uid) << 32) | payload_offset, buf_qws); + buf_qws = (uint64_t*)((uintptr_t)buf + 56); + + uint16_t full_block_credits_needed = FI_OPX_MP_EGR_CHUNK_CREDITS - 3; +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + full_block_credits_needed, + total_credits_available - 2); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + buf_qws = buf_qws + (full_block_credits_needed << 3); + +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, 
&pio_state, + buf_qws, + 7 ); // 7 QW data + 1 QW ICRC + +#ifndef NDEBUG + assert(credits_consumed == FI_OPX_MP_EGR_CHUNK_CREDITS); +#endif + + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1923,7 +3429,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, const uint64_t lrh_dlid, const union fi_opx_addr addr, int lock_required, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -1958,6 +3465,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < 0)) { OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-LAST"); @@ -1969,13 +3477,13 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, int32_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-MP-EAGER-NTH-LAST"); return -FI_EAGAIN; } - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; #ifndef NDEBUG unsigned credits_consumed; @@ -1985,16 +3493,16 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct 
fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(opx_ep, &pio_state, tmp, buf, bth_rx, + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, lrh_dws, pbc_dlid, pbc_dws, len, payload_offset, - psn, mp_egr_uid); + psn, mp_egr_uid, hfi1_type); } else { #ifndef NDEBUG credits_consumed = #endif - fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, tmp, buf, bth_rx, lrh_dlid, - lrh_dws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid); + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_dws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid, hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); @@ -2002,7 +3510,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, buf_qws, full_block_credits_needed, total_credits_available - 1); @@ -2012,7 +3520,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_payload_tail(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, buf_qws + (full_block_credits_needed << 3), payload_qws_tail); } @@ -2028,8 +3536,160 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - xfer_bytes_tail, tmp, buf, - payload_qws_total, reliability); + xfer_bytes_tail, local_temp, buf, + payload_qws_total, reliability, hfi1_type); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-LAST"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + 
"===================================== SEND, HFI -- MULTI-PACKET EAGER LAST (end)\n"); + + return FI_SUCCESS; +} + +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, + const void *buf, + const uint32_t payload_offset, + const ssize_t len, + const uint32_t mp_egr_uid, + const uint64_t pbc_dlid, + const uint64_t bth_rx, + const uint64_t lrh_dlid, + const union fi_opx_addr addr, + int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) +{ + assert(lock_required == 0); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER LAST (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-LAST"); + + size_t xfer_bytes_tail; + if (len <= FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL) { + xfer_bytes_tail = len; + } else if (!(len & 0x07ul)) { + /* Length is a multiple of 8 bytes and must be at least 24. + We can store 16 bytes of that in tail */ + xfer_bytes_tail = FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL; + } else { + /* Length is not a multiple of 8 bytes, and it's greater than 16. 
+ We can store 8 + n bytes in tail (where n == len % 8) */ + xfer_bytes_tail = 8 + (len & 0x07ul); + } + + const size_t payload_qws_total = (len - xfer_bytes_tail) >> 3; + /* 16B (RcvPktCtrl=9) has 1 QW of KDETH and 1 QW of tail in PIO (non-SOP) */ + const size_t kdeth9_qws_total = 1; + const size_t tail_qws_total = 1; + + /* Full 64 byte/8 qword blocks -- 1 credit per block */ + uint16_t full_block_credits_needed = (uint16_t)((kdeth9_qws_total + payload_qws_total + tail_qws_total) >> 3); + /* Remaining tail qwords (< 8) after full blocks */ + size_t tail_partial_block_qws = (kdeth9_qws_total + payload_qws_total + tail_qws_total) & 0x07ul; + + const uint64_t pbc_dws = + /* PIO SOP is 16 DWS/8 QWS*/ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth */ + /* PIO is everything else */ + (kdeth9_qws_total << 1) + /* kdeth9 remaining 2 dws */ + //--------------------- header split point KDETH 9 DWS + (payload_qws_total << 1) + /* one packet payload */ + (tail_qws_total << 1) ; /* tail 1 qws/2 dws */ + + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + + uint16_t total_credits_needed = + 1 + /* PIO SOP -- 1 credit */ + full_block_credits_needed + /* PIO full blocks -- kdeth9/payload/tail */ + (tail_partial_block_qws > 0); /* PIO partial block -- 1 credit */ + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < 0)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-LAST"); + return -FI_ENOBUFS; + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int32_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + addr.hfi1_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if (OFI_UNLIKELY(psn == 
-1)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND-MP-EAGER-NTH-LAST"); + return -FI_EAGAIN; + } + + uint64_t local_temp[16] = {0}; + +#ifndef NDEBUG + unsigned credits_consumed; +#endif + + if (OFI_UNLIKELY(len <= FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL)) { +#ifndef NDEBUG + credits_consumed = +#endif + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload(opx_ep, &pio_state, local_temp, buf, bth_rx, + lrh_dlid, lrh_qws, pbc_dlid, pbc_dws, len, payload_offset, + psn, mp_egr_uid, hfi1_type); + } else { +#ifndef NDEBUG + credits_consumed = +#endif + fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, + lrh_qws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid, hfi1_type); + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); + + /* header and payload */ +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, + (((uint64_t) mp_egr_uid) << 32) | payload_offset, buf_qws); + buf_qws = (uint64_t*)((uintptr_t)buf + 56); + + if (full_block_credits_needed) full_block_credits_needed--; + + if (OFI_LIKELY(full_block_credits_needed)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, + buf_qws, + full_block_credits_needed, + total_credits_available - 2); + } + + if (OFI_LIKELY(tail_partial_block_qws)) { +#ifndef NDEBUG + credits_consumed += +#endif + fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, + buf_qws + (full_block_credits_needed << 3), + tail_partial_block_qws); + } + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + +#ifndef NDEBUG + assert(credits_consumed == total_credits_needed); +#endif + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, + xfer_bytes_tail, local_temp, buf, + payload_qws_total, reliability, 
hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-LAST"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -2038,6 +3698,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, return FI_SUCCESS; } + static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, const uint8_t dest_hfi_unit, const uint64_t dest_rx, @@ -2057,26 +3718,26 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, ssize_t rc; /* DAOS support - rank_inst field has been depricated and will be phased out. * The value is always zero. */ - union fi_opx_hfi1_packet_hdr * tx_hdr = opx_shm_tx_next( + union opx_hfi1_packet_hdr * hdr = opx_shm_tx_next( &opx_ep->tx->shm, dest_hfi_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, dest_extended_rx, 0, &rc); /* Potential infinite loop, unable to return result to application */ - while(OFI_UNLIKELY(tx_hdr == NULL)) { //TODO: Verify that all callers of this function can tolderate a NULL rc - fi_opx_shm_poll_many(&opx_ep->ep_fid, FI_OPX_LOCK_NOT_REQUIRED); - tx_hdr = opx_shm_tx_next( + while(OFI_UNLIKELY(hdr == NULL)) { //TODO: Verify that all callers of this function can tolderate a NULL rc + fi_opx_shm_poll_many(&opx_ep->ep_fid, FI_OPX_LOCK_NOT_REQUIRED, OPX_HFI1_TYPE); + hdr = opx_shm_tx_next( &opx_ep->tx->shm, dest_hfi_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, dest_extended_rx, 0, &rc); } - tx_hdr->qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.cts.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); - tx_hdr->qw[5] = (uintptr_t)cc; - tx_hdr->qw[6] = bytes_to_sync; + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = 
opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_9B[5] = (uintptr_t)cc; + hdr->qw_9B[6] = bytes_to_sync; - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); } ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, @@ -2087,7 +3748,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz uint64_t *origin_byte_counter_value, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device); + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t tag, void *context, @@ -2097,7 +3759,18 @@ ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, uint64_t *origin_byte_counter_value, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device); + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type); +ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, + uint64_t *origin_byte_counter_value, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type); #endif /* _FI_PROV_OPX_HFI1_TRANSPORT_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h index 
1d87a9244d4..e3b2fd8503d 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_version.h @@ -42,209 +42,108 @@ // RHF changes // Common to both JKR/WFR -#define OPX_RHF_RCV_TYPE_EXPECTED_RCV(_rhf) ((_rhf & 0x00007000ul) == 0x00000000ul) -#define OPX_RHF_RCV_TYPE_EAGER_RCV(_rhf) ((_rhf & 0x00001000ul) == 0x00001000ul) -#define OPX_RHF_RCV_TYPE_OTHER(_rhf) ((_rhf & 0x00006000ul) != 0x00000000ul) +#define OPX_RHF_RCV_TYPE_EXPECTED_RCV(_rhf, _noop) ((_rhf & 0x00007000ul) == 0x00000000ul) +#define OPX_RHF_RCV_TYPE_EAGER_RCV(_rhf, _noop) ((_rhf & 0x00001000ul) == 0x00001000ul) +#define OPX_RHF_RCV_TYPE_OTHER(_rhf, _noop) ((_rhf & 0x00006000ul) != 0x00000000ul) -#define OPX_PBC_CR(cr) ((cr & FI_OPX_HFI1_PBC_CR_MASK) << FI_OPX_HFI1_PBC_CR_SHIFT) -#define OPX_PBC_LEN(len) (len) -#define OPX_PBC_VL(vl) ((vl & FI_OPX_HFI1_PBC_VL_MASK) << FI_OPX_HFI1_PBC_VL_SHIFT) +#define OPX_PBC_CR(cr, _noop) ((cr & FI_OPX_HFI1_PBC_CR_MASK) << FI_OPX_HFI1_PBC_CR_SHIFT) +#define OPX_PBC_LEN(len, _noop) (len) +#define OPX_PBC_VL(vl, _noop) ((vl & FI_OPX_HFI1_PBC_VL_MASK) << FI_OPX_HFI1_PBC_VL_SHIFT) /* Note: double check JKR sc bits */ -#define OPX_PBC_SC(sc) (((sc >> FI_OPX_HFI1_PBC_SC4_SHIFT) & FI_OPX_HFI1_PBC_SC4_MASK) << FI_OPX_HFI1_PBC_DCINFO_SHIFT) +#define OPX_PBC_SC(sc, _noop) (((sc >> FI_OPX_HFI1_PBC_SC4_SHIFT) & FI_OPX_HFI1_PBC_SC4_MASK) << FI_OPX_HFI1_PBC_DCINFO_SHIFT) /* PBC most significant bits shift (32 bits) defines */ -#define OPX_PBC_MSB_SHIFT 32 +#define OPX_MSB_SHIFT 32 -#if (defined(OPX_WFR) && !defined(OPX_JKR)) /***************************************************************/ -/* WFR Build specific definitions */ +/* Both JKR and WFR runtime is now supported (no longer doing */ +/* build-time constants) */ +/* */ +/* Runtime support relies on a local variable "hfi1_type", */ +/* which is likely passed down through macro and function */ +/* constants which are selected/optimized inline with */ +/* function tables. 
*/ /***************************************************************/ - #define OPX_PBC_DLID OPX_PBC_WFR_DLID - #define OPX_PBC_SCTXT OPX_PBC_WFR_SCTXT - #define OPX_PBC_L2COMPRESSED OPX_PBC_WFR_L2COMPRESSED - #define OPX_PBC_PORTIDX OPX_PBC_WFR_PORTIDX - #define OPX_PBC_L2TYPE OPX_PBC_WFR_L2TYPE - #define OPX_PBC_RUNTIME OPX_PBC_WFR_RUNTIME - #define OPX_PBC_LRH_DLID_TO_PBC_DLID OPX_PBC_WFR_LRH_DLID_TO_PBC_DLID +#define OPX_PBC_DLID(dlid, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_DLID(dlid) : OPX_PBC_JKR_DLID(dlid)) -#elif (defined(OPX_JKR) && !defined(OPX_WFR)) -/***************************************************************/ -/* JKR Build specific definitions */ -/***************************************************************/ - - #define OPX_PBC_DLID OPX_PBC_JKR_DLID - #define OPX_PBC_SCTXT OPX_PBC_JKR_SCTXT - #define OPX_PBC_L2COMPRESSED OPX_PBC_JKR_L2COMPRESSED - #define OPX_PBC_PORTIDX OPX_PBC_JKR_PORTIDX - #define OPX_PBC_L2TYPE OPX_PBC_JKR_L2TYPE - #define OPX_PBC_RUNTIME OPX_PBC_JKR_RUNTIME - #define OPX_PBC_LRH_DLID_TO_PBC_DLID OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID +#define OPX_PBC_SCTXT(ctx, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_SCTXT(ctx) : OPX_PBC_JKR_SCTXT(ctx)) -#elif (defined(OPX_JKR) && defined(OPX_WFR)) -/***************************************************************/ -/* Both JKR and WFR runtime support (not build-time constants) */ -/***************************************************************/ +#define OPX_PBC_L2COMPRESSED(c, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_L2COMPRESSED(c) : OPX_PBC_JKR_L2COMPRESSED(c)) - #define OPX_PBC_DLID(dlid) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_DLID(dlid) : OPX_PBC_WFR_DLID(dlid)) +#define OPX_PBC_PORTIDX(pidx, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_PORTIDX(pidx) : OPX_PBC_JKR_PORTIDX(pidx)) - #define OPX_PBC_SCTXT(ctx) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? 
\ - OPX_PBC_JKR_SCTXT(ctx) : OPX_PBC_WFR_SCTXT(ctx)) +#define OPX_PBC_LRH_DLID_TO_PBC_DLID(dlid, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ +OPX_PBC_WFR_LRH_DLID_TO_PBC_DLID(dlid) : OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(dlid)) - #define OPX_PBC_L2COMPRESSED(c) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_L2COMPRESSED(c) : OPX_PBC_WFR_L2COMPRESSED(c)) - #define OPX_PBC_PORTIDX(pidx) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_PORTIDX(pidx) : OPX_PBC_WFR_PORTIDX(pidx)) +#define OPX_PBC_L2TYPE(type, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_PBC_WFR_L2TYPE(type) : OPX_PBC_JKR_L2TYPE(type)) - #define OPX_PBC_LRH_DLID_TO_PBC_DLID(dlid) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(dlid) : OPX_PBC_WFR_LRH_DLID_TO_PBC_DLID(dlid)) - - -/* Mixed WFR/JKR header support must be 9B */ -#ifndef NDEBUG - - __OPX_FORCE_INLINE__ - uint32_t opx_pbc_l2type(unsigned type) - { - assert(type == OPX_PBC_JKR_L2TYPE_9B); - return ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? - OPX_PBC_JKR_L2TYPE(type) : OPX_PBC_WFR_L2TYPE(type)); - } - #define OPX_PBC_L2TYPE(type) opx_pbc_l2type(type) -#else - - #define OPX_PBC_L2TYPE(type) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_PBC_JKR_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) : \ - OPX_PBC_WFR_L2TYPE(OPX_PBC_JKR_L2TYPE_9B)) /* OPX_PBC_WFR_UNUSED */ -#endif +/* One runtime check for mutiple fields - DLID, PORT, L2TYPE */ +#define OPX_PBC_RUNTIME(_dlid, _pidx, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + (OPX_PBC_WFR_DLID(_dlid) | OPX_PBC_WFR_PORTIDX(_pidx)) : \ + (OPX_PBC_JKR_DLID(_dlid) | OPX_PBC_JKR_PORTIDX(_pidx))) - /* One runtime check for mutiple fields - DLID, PORT, L2TYPE */ - #define OPX_PBC_RUNTIME(dlid, pidx) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? 
\ - (OPX_PBC_JKR_DLID(dlid) | OPX_PBC_JKR_PORTIDX(pidx)) : \ - (OPX_PBC_WFR_DLID(dlid) | OPX_PBC_WFR_PORTIDX(pidx)) ) -#else /* ERROR */ - #warning Should not happen Not WFR and Not JKR - #error "NOT WFR AND NOT JKR" -#endif - #define OPX_BTH_UNUSED 0 // Default unsupported values to 0 -#if (defined(OPX_JKR) && !defined(OPX_WFR)) -/***************************************************************/ -/* JKR Build specific definitions */ -/***************************************************************/ - -#define OPX_BTH_CSPEC(_cspec) OPX_BTH_JKR_CSPEC(_cspec) -#define OPX_BTH_RC2(_rc2) OPX_BTH_JKR_RC2(_rc2) -#define OPX_BTH_CSPEC_DEFAULT OPX_BTH_UNUSED // Cspec is not used in 9B header -#define OPX_BTH_RC2_VAL OPX_BTH_JKR_RC2_VAL - -#elif (defined(OPX_WFR) && !defined(OPX_JKR)) -/***************************************************************/ -/* WKR Build specific definitions */ -/***************************************************************/ - -#define OPX_BTH_RC2(_rc2) OPX_BTH_UNUSED -#define OPX_BTH_CSPEC(_cspec) OPX_BTH_UNUSED -#define OPX_BTH_CSPEC_DEFAULT OPX_BTH_UNUSED -#define OPX_BTH_RC2_VAL OPX_BTH_UNUSED - -#elif (defined(OPX_JKR) && defined(OPX_WFR)) -/***************************************************************/ -/* Both JKR and WFR runtime support (not build-time constants) */ -/***************************************************************/ - -#define OPX_BTH_RC2(_rc2) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ +#define OPX_BTH_RC2(_rc2, _hfi1_type) ((_hfi1_type & OPX_HFI1_JKR) ? \ OPX_BTH_JKR_RC2(_rc2) : OPX_BTH_UNUSED) -#define OPX_BTH_CSPEC(_cspec) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ +#define OPX_BTH_CSPEC(_cspec, _hfi1_type) ((_hfi1_type & OPX_HFI1_JKR) ? \ OPX_BTH_JKR_CSPEC(_cspec) : OPX_BTH_UNUSED) #define OPX_BTH_CSPEC_DEFAULT OPX_BTH_UNUSED // Cspec is not used in 9B header -#define OPX_BTH_RC2_VAL ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ +#define OPX_BTH_RC2_VAL(_hfi1_type) ((_hfi1_type & OPX_HFI1_JKR) ? 
\ OPX_BTH_JKR_RC2_VAL : OPX_BTH_UNUSED) -#endif +#define OPX_RHF_SEQ_NOT_MATCH(_seq, _rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_NOT_MATCH(_seq, _rhf) : OPX_JKR_RHF_SEQ_NOT_MATCH(_seq, _rhf)) -#if (defined(OPX_JKR) && !defined(OPX_WFR)) -/***************************************************************/ -/* JKR Build specific definitions */ -/***************************************************************/ -#define OPX_RHF_SEQ_NOT_MATCH OPX_JKR_RHF_SEQ_NOT_MATCH -#define OPX_RHF_SEQ_INCREMENT OPX_JKR_RHF_SEQ_INCREMENT -#define OPX_IS_ERRORED_RHF OPX_JKR_IS_ERRORED_RHF -#define OPX_RHF_SEQ_MATCH OPX_JKR_RHF_SEQ_MATCH -#define OPX_RHF_SEQ_INIT_VAL OPX_JKR_RHF_SEQ_INIT_VAL -#define OPX_RHF_IS_USE_EGR_BUF OPX_JKR_RHF_IS_USE_EGR_BUF -#define OPX_RHF_EGR_INDEX OPX_JKR_RHF_EGR_INDEX -#define OPX_RHF_EGR_OFFSET OPX_JKR_RHF_EGR_OFFSET -#define OPX_RHF_HDRQ_OFFSET OPX_JKR_RHF_HDRQ_OFFSET - -#define OPX_RHE_DEBUG OPX_JKR_RHE_DEBUG -#define OPX_RHF_CHECK_HEADER OPX_JKR_RHF_CHECK_HEADER - -#elif (defined(OPX_WFR) && !defined(OPX_JKR)) -/***************************************************************/ -/* WKR Build specific definitions */ -/***************************************************************/ -#define OPX_RHF_SEQ_NOT_MATCH OPX_WFR_RHF_SEQ_NOT_MATCH -#define OPX_RHF_SEQ_INCREMENT OPX_WFR_RHF_SEQ_INCREMENT -#define OPX_IS_ERRORED_RHF OPX_WFR_IS_ERRORED_RHF -#define OPX_RHF_SEQ_MATCH OPX_WFR_RHF_SEQ_MATCH -#define OPX_RHF_SEQ_INIT_VAL OPX_WFR_RHF_SEQ_INIT_VAL -#define OPX_RHF_IS_USE_EGR_BUF OPX_WFR_RHF_IS_USE_EGR_BUF -#define OPX_RHF_EGR_INDEX OPX_WFR_RHF_EGR_INDEX -#define OPX_RHF_EGR_OFFSET OPX_WFR_RHF_EGR_OFFSET -#define OPX_RHF_HDRQ_OFFSET OPX_WFR_RHF_HDRQ_OFFSET - -#define OPX_RHE_DEBUG OPX_WFR_RHE_DEBUG -#define OPX_RHF_CHECK_HEADER OPX_WFR_RHF_CHECK_HEADER - -#elif (defined(OPX_JKR) && defined(OPX_WFR)) -/***************************************************************/ -/* Both JKR and WFR runtime support (not build-time constants) */ -/* 
Constant macro magic will be used later for this */ -/***************************************************************/ -#define OPX_RHF_SEQ_NOT_MATCH(_seq, _rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_NOT_MATCH(_seq, _rhf) : OPX_WFR_RHF_SEQ_NOT_MATCH(_seq, _rhf)) +#define OPX_RHF_SEQ_INCREMENT(_seq, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_INCREMENT(_seq) : OPX_JKR_RHF_SEQ_INCREMENT(_seq)) -#define OPX_RHF_SEQ_INCREMENT(_seq) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_INCREMENT(_seq) : OPX_WFR_RHF_SEQ_INCREMENT(_seq)) +#define OPX_IS_ERRORED_RHF(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_IS_ERRORED_RHF(_rhf, _hfi1_type) : OPX_JKR_IS_ERRORED_RHF(_rhf, _hfi1_type)) -#define OPX_IS_ERRORED_RHF(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_IS_ERRORED_RHF(_rhf) : OPX_WFR_IS_ERRORED_RHF(_rhf)) +#define OPX_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) : OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type)) -#define OPX_RHF_SEQ_MATCH(_seq, _rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_MATCH(_seq, _rhf) : OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf)) +/* Init-time, let it use the variable - not optimized */ +#define OPX_RHF_SEQ_INIT_VAL(_hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_SEQ_INIT_VAL : OPX_JKR_RHF_SEQ_INIT_VAL) -#define OPX_RHF_SEQ_INIT_VAL ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_SEQ_INIT_VAL : OPX_WFR_RHF_SEQ_INIT_VAL) +#define OPX_RHF_IS_USE_EGR_BUF(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_IS_USE_EGR_BUF(_rhf) : OPX_JKR_RHF_IS_USE_EGR_BUF(_rhf)) -#define OPX_RHF_IS_USE_EGR_BUF(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_IS_USE_EGR_BUF(_rhf) : OPX_WFR_RHF_IS_USE_EGR_BUF(_rhf)) +#define OPX_RHF_EGR_INDEX(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? 
\ + OPX_WFR_RHF_EGR_INDEX(_rhf) : OPX_JKR_RHF_EGR_INDEX(_rhf)) -#define OPX_RHF_EGR_INDEX(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_EGR_INDEX(_rhf) : OPX_WFR_RHF_EGR_INDEX(_rhf)) +#define OPX_RHF_EGR_OFFSET(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_EGR_OFFSET(_rhf) : OPX_JKR_RHF_EGR_OFFSET(_rhf)) -#define OPX_RHF_EGR_OFFSET(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_EGR_OFFSET(_rhf) : OPX_WFR_RHF_EGR_OFFSET(_rhf)) +#define OPX_RHF_HDRQ_OFFSET(_rhf, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_HDRQ_OFFSET(_rhf) : OPX_JKR_RHF_HDRQ_OFFSET(_rhf)) -#define OPX_RHF_HDRQ_OFFSET(_rhf) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_HDRQ_OFFSET(_rhf) : OPX_WFR_RHF_HDRQ_OFFSET(_rhf)) +#define OPX_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) \ + ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) : \ + OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type)) -#define OPX_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) \ - ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) : \ - OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr)) +#define OPX_RHF_CHECK_HEADER(_rhf_rcvd, _pktlen, _hfi1_type) ((_hfi1_type & OPX_HFI1_WFR) ? \ + OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _pktlen, _hfi1_type) : OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _pktlen, _hfi1_type)) -#define OPX_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) ((OPX_HFI1_TYPE == OPX_HFI1_JKR) ? \ - OPX_JKR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) : OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) -#endif +#define OPX_HEADER_SIZE (8 * 8) // doesn't include PBC. 
For 9B it includes the unused_pad qw. #endif + + + diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h index df8e6ea13af..086795afc23 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_wfr.h @@ -52,8 +52,6 @@ __OPX_FORCE_INLINE__ uint32_t opx_pbc_wfr_l2type(unsigned _type) { - /* Just verify WFR isn't attempting non-9B */ - assert(_type == _OPX_PBC_JKR_L2TYPE_9B_); return OPX_PBC_WFR_UNUSED; } #define OPX_PBC_WFR_L2TYPE(_type) opx_pbc_wfr_l2type(_type) @@ -64,7 +62,7 @@ #define OPX_PBC_WFR_RUNTIME(_dlid, _pidx) OPX_PBC_WFR_UNUSED /* Unused WFR field - always initialized with PBC to 0. - #define OPX_PBC_STATICRCC(srcc) (((unsigned long long)(dlid & OPX_PBC_WFR_STATICRCC_MASK) << OPX_PBC_WFR_STATICRCC_SHIFT) << OPX_PBC_MSB_SHIFT) + #define OPX_PBC_STATICRCC(srcc) (((unsigned long long)(dlid & OPX_PBC_WFR_STATICRCC_MASK) << OPX_PBC_WFR_STATICRCC_SHIFT) << OPX_MSB_SHIFT) */ /* WFR @@ -83,8 +81,8 @@ #define OPX_WFR_RHF_SEQ_NOT_MATCH(_seq, _rhf) (_seq != (_rhf & 0xF0000000ul)) #define OPX_WFR_RHF_SEQ_INCREMENT(_seq) ((_seq < 0xD0000000ul) * _seq + 0x10000000ul) -#define OPX_WFR_IS_ERRORED_RHF(_rhf) (_rhf & 0xBFE0000000000000ul) -#define OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf) (_seq == (_rhf & 0xF0000000ul)) +#define OPX_WFR_IS_ERRORED_RHF(_rhf, _hfi1_type) (_rhf & 0xBFE0000000000000ul) +#define OPX_WFR_RHF_SEQ_MATCH(_seq, _rhf, _hfi1_type) (_seq == (_rhf & 0xF0000000ul)) #define OPX_WFR_RHF_SEQ_INIT_VAL (0x10000000ul) #define OPX_WFR_RHF_IS_USE_EGR_BUF(_rhf) ((_rhf & 0x00008000ul) == 0x00008000ul) #define OPX_WFR_RHF_EGRBFR_INDEX_MASK (0x7FF) @@ -112,10 +110,11 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -#define OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, 
_rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) \ - opx_wfr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr) +#define OPX_WFR_RHE_DEBUG(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) \ + opx_wfr_rhe_debug(_opx_ep, _rhe_ptr, _rhf_ptr, _rhf_msb, _rhf_lsb, _rhf_seq, _hdrq_offset, _rhf_rcvd, _hdr, _hfi1_type) // Common to both JKR/WFR @@ -124,22 +123,24 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, #define OPX_WFR_RHF_RCV_TYPE_OTHER(_rhf) ((_rhf & 0x00006000ul) != 0x00000000ul) /* Common (jkr) handler to WFR/JKR 9B (for now) */ -int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr); +int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type); -__OPX_FORCE_INLINE__ int opx_wfr_rhf_check_header(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr) +__OPX_FORCE_INLINE__ int opx_wfr_rhf_check_header(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { /* RHF error */ - if (OFI_UNLIKELY(OPX_WFR_IS_ERRORED_RHF(rhf_rcvd))) return 1; /* error */ + if (OFI_UNLIKELY(OPX_WFR_IS_ERRORED_RHF(rhf_rcvd, OPX_HFI1_WFR))) return 1; /* error */ /* Bad packet header */ if (OFI_UNLIKELY((!OPX_WFR_RHF_IS_USE_EGR_BUF(rhf_rcvd)) && - (ntohs(hdr->stl.lrh.pktlen) > 0x15) && + (ntohs(hdr->lrh_9B.pktlen) > 0x15) && !(OPX_WFR_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)))) - return opx_jkr_rhf_error_handler(rhf_rcvd, hdr); /* error */ + return opx_jkr_rhf_error_handler(rhf_rcvd, hdr, hfi1_type); /* error */ else return 0; /* no error*/ } -#define OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr) opx_wfr_rhf_check_header(_rhf_rcvd, _hdr) +#define OPX_WFR_RHF_CHECK_HEADER(_rhf_rcvd, _hdr, _hfi1_type) opx_wfr_rhf_check_header(_rhf_rcvd, _hdr, _hfi1_type) #endif diff --git 
a/prov/opx/include/rdma/opx/fi_opx_reliability.h b/prov/opx/include/rdma/opx/fi_opx_reliability.h index d6bd81b65ef..5e690377dd8 100644 --- a/prov/opx/include/rdma/opx/fi_opx_reliability.h +++ b/prov/opx/include/rdma/opx/fi_opx_reliability.h @@ -153,9 +153,12 @@ struct fi_opx_reliability_service { volatile uint64_t * pio_scb_first; /* == CACHE LINE == */ - struct fi_opx_hfi1_txe_scb ping_model; - struct fi_opx_hfi1_txe_scb ack_model; - struct fi_opx_hfi1_txe_scb nack_model; + struct fi_opx_hfi1_txe_scb_9B ping_model_9B; + struct fi_opx_hfi1_txe_scb_9B ack_model_9B; + struct fi_opx_hfi1_txe_scb_9B nack_model_9B; + struct fi_opx_hfi1_txe_scb_16B ping_model_16B; + struct fi_opx_hfi1_txe_scb_16B ack_model_16B; + struct fi_opx_hfi1_txe_scb_16B nack_model_16B; } hfi1; } tx __attribute__((__packed__));; @@ -281,13 +284,22 @@ struct fi_opx_reliability_tx_replay { /* == CACHE LINE == */ /* --- MUST BE 64 BYTE ALIGNED --- */ - struct fi_opx_hfi1_txe_scb scb; + union { + struct fi_opx_hfi1_txe_scb_9B scb_9B; + struct fi_opx_hfi1_txe_scb_16B scb_16B; + }; + uint8_t data[]; } __attribute__((__aligned__(64))); +#define OPX_REPLAY_HDR(_replay) OPX_REPLAY_HDR_TYPE(_replay, OPX_HFI1_TYPE) + +#define OPX_REPLAY_HDR_TYPE(_replay,_hfi1_type) ((_hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 
\ + (&((_replay)->scb_9B.hdr)) : (&((_replay)->scb_16B.hdr)) ) + OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_reliability_tx_replay, sdma_we) == FI_OPX_CACHE_LINE_SIZE, "Reliability Replay sdma_we should start on first cacheline!"); -OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_reliability_tx_replay, scb) & (FI_OPX_CACHE_LINE_SIZE - 1)) == 0, +OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_reliability_tx_replay, scb_9B) & (FI_OPX_CACHE_LINE_SIZE - 1)) == 0, "Reliability Replay scb must be 64-byte aligned!"); struct fi_opx_reliability_resynch_flow { @@ -392,6 +404,8 @@ void fi_opx_rbt_key_value(RbtHandle h, RbtIterator it, void **key, void **val) { uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * service, uuid_t unique_job_key, struct fi_opx_hfi1_context * hfi1, const enum ofi_reliability_kind reliability_kind); +void fi_opx_reliability_model_init_16B (struct fi_opx_reliability_service * service, + struct fi_opx_hfi1_context * hfi1); void fi_opx_reliability_service_fini (struct fi_opx_reliability_service * service); void fi_reliability_service_ping_remote (struct fid_ep *ep, struct fi_opx_reliability_service * service); @@ -416,7 +430,7 @@ struct fi_opx_reliability_rx_uepkt { /* == CACHE LINE == */ uint64_t unused_1; - union fi_opx_hfi1_packet_hdr hdr; /* 56 bytes */ + union opx_hfi1_packet_hdr hdr; /* 56 bytes */ /* == CACHE LINE == */ @@ -547,7 +561,7 @@ struct fi_opx_reliability_client_state { // 88 bytes struct fi_opx_reliability_service * service; void (*process_fn)(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_rs); // 104 bytes @@ -569,7 +583,7 @@ void fi_opx_reliability_client_init (struct fi_opx_reliability_client_state * st const uint8_t rx, const uint8_t tx, void (*process_fn)(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const 
uint8_t * const payload, const uint8_t origin_reliability_rx)); @@ -584,31 +598,34 @@ unsigned fi_opx_reliability_client_active (struct fi_opx_reliability_client_stat static inline void fi_reliability_service_process_command (struct fi_opx_reliability_client_state *state, - struct fi_opx_reliability_tx_replay * replay) + struct fi_opx_reliability_tx_replay * replay, + uint32_t slid, uint32_t dlid, + uint8_t tx, uint8_t rx, + const enum opx_hfi1_type hfi1_type) { union fi_opx_reliability_service_flow_key key = { - .slid = replay->scb.hdr.stl.lrh.slid, - .tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(&replay->scb.hdr), - .dlid = replay->scb.hdr.stl.lrh.dlid, - .rx = replay->scb.hdr.stl.bth.rx + .slid = slid, + .tx = tx, + .dlid = dlid, + .rx = rx }; void * itr = NULL; #ifdef OPX_RELIABILITY_DEBUG - fprintf(stderr, "(tx) packet %016lx %08u posted.\n", key.value, FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr)); + fprintf(stderr, "(tx) packet %016lx %08u posted.\n", key.value, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR_TYPE(replay, hfi1_type))); #endif #ifndef NDEBUG itr = fi_opx_rbt_find(state->tx_flow_rbtree, (void*)key.value); if (itr == NULL) { - fprintf(stderr, "(%d) %s:%s():%d [%016lX] [slid=%04hX tx=%08X dlid=%04hX rx=%0hhX] Error trying to register replay for flow with no handshake!\n", + fprintf(stderr, "(%d) %s:%s():%d [%016lX] [slid=%08X tx=%08X dlid=%08X rx=%0hhX] Error trying to register replay for flow with no handshake!\n", getpid(), __FILE__, __func__, __LINE__, key.value, - replay->scb.hdr.stl.lrh.slid, - FI_OPX_HFI1_PACKET_ORIGIN_TX(&replay->scb.hdr), - replay->scb.hdr.stl.lrh.dlid, - replay->scb.hdr.stl.bth.rx); + slid, + FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, hfi1_type)), + dlid, + OPX_REPLAY_HDR_TYPE(replay, hfi1_type)->bth.rx); assert(itr); } #endif @@ -658,7 +675,7 @@ void fi_reliability_service_process_command (struct fi_opx_reliability_client_st // Debugging tool that deliberately drops packets. 
static inline uint16_t fi_opx_reliability_rx_drop_packet (struct fi_opx_reliability_client_state * state, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { /* * Two variations of when to drop packets. The first drops a percentage of the @@ -667,7 +684,7 @@ uint16_t fi_opx_reliability_rx_drop_packet (struct fi_opx_reliability_client_sta * use either of these or code up something different depending on what you're * trying to debug. */ -#if 0 +#if 0 // drops a percentage of the packets based on drop_mask. const uint16_t tmp = state->drop_count & state->drop_mask; @@ -701,7 +718,8 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_init(struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, - const uint64_t opcode); + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, const uint64_t key, @@ -717,9 +735,16 @@ size_t fi_opx_reliability_replay_get_payload_size(struct fi_opx_reliability_tx_r } /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - return total_bytes - sizeof(union fi_opx_hfi1_packet_hdr); + /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const uint16_t lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + return (total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); + } else { + const uint16_t lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + const size_t total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + return (total_bytes - sizeof(struct
fi_opx_hfi1_stl_packet_hdr_16B)); + } } __OPX_FORCE_INLINE__ @@ -751,7 +776,7 @@ void fi_opx_reliability_create_rx_flow(struct fi_opx_reliability_client_state * __OPX_FORCE_INLINE__ void fi_opx_reliability_handle_ud_init(struct fid_ep *ep, struct fi_opx_reliability_client_state *state, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { union fi_opx_reliability_service_flow_key key = { .value = hdr->service.key @@ -770,12 +795,13 @@ void fi_opx_reliability_handle_ud_init(struct fid_ep *ep, #endif } - fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.slid, origin_rx, FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT_ACK); + fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.slid, origin_rx, FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT_ACK, + OPX_HFI1_TYPE); } __OPX_FORCE_INLINE__ void fi_opx_reliability_handle_ud_init_ack(struct fi_opx_reliability_client_state *state, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { /* Find the flow for this communication in flow_rbtree */ union fi_opx_reliability_service_flow_key key = { @@ -854,31 +880,35 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * state, uint64_t slid, uint64_t origin_tx, uint32_t psn, - struct fid_ep *ep, const union fi_opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload); + struct fid_ep *ep, const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, + const uint16_t pktlen, const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t psn_count, - const uint64_t opcode); + const uint64_t opcode, const enum opx_hfi1_type hfi1_type); void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t dlid, const uint64_t reliability_rx, 
const uint64_t psn_start, const uint64_t psn_count, - const union fi_opx_hfi1_packet_hdr *const hdr, - const uint8_t origin_rx); + const union opx_hfi1_packet_hdr *const hdr, + const uint8_t origin_rx, + uint32_t slid, + const enum opx_hfi1_type hfi1_type); void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, uint32_t origin_reliability_rx, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr); void fi_opx_hfi1_rx_reliability_ack_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, - const union fi_opx_hfi1_packet_hdr *const hdr); + const union opx_hfi1_packet_hdr *const hdr); void opx_reliability_handshake_init(struct fid_ep *ep, union fi_opx_reliability_service_flow_key key, - const uint64_t target_reliability_rx); + const uint64_t target_reliability_rx, + const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ int32_t fi_opx_reliability_tx_max_outstanding () { @@ -920,7 +950,7 @@ bool opx_reliability_ready(struct fid_ep *ep, { /* Not using reliability, or it's Intranode */ - if (fi_opx_hfi_is_intranode(dlid)) + if (opx_lid_is_intranode(dlid)) return true; union fi_opx_reliability_service_flow_key key = { @@ -933,7 +963,7 @@ bool opx_reliability_ready(struct fid_ep *ep, void * itr = fi_opx_rbt_find(state->tx_flow_rbtree, (void*)key.value); if (OFI_UNLIKELY(!itr)) { /* Reliability handshake is incomplete, initiate it */ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, OPX_HFI1_TYPE); return false; } @@ -965,7 +995,7 @@ int32_t fi_opx_reliability_tx_available_psns (struct fid_ep *ep, /* We've never sent to this receiver, so initiate a reliability handshake with them. Once they create the receive flow on their end, and we receive their ack, we'll create the flow on our end and be able to send. 
*/ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, OPX_HFI1_TYPE); OPX_TRACER_TRACE_SDMA(OPX_TRACER_END_EAGAIN_SDMA_PSNS, "GET_PSNS"); return -1; } @@ -1028,7 +1058,7 @@ int32_t fi_opx_reliability_tx_next_psn (struct fid_ep *ep, /* We've never sent to this receiver, so initiate a reliability handshake with them. Once they create the receive flow on their end, and we receive their ack, we'll create the flow on our end and be able to send. */ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, OPX_HFI1_TYPE); return -1; } else { *psn_ptr = (union fi_opx_reliability_tx_psn *)fi_opx_rbt_value_ptr(state->tx_flow_rbtree, itr); @@ -1104,8 +1134,8 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, const uint64_t target_reliability_rx, union fi_opx_reliability_tx_psn **psn_ptr, struct fi_opx_reliability_tx_replay **replay, - const enum ofi_reliability_kind reliability - ) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { union fi_opx_reliability_service_flow_key key = { @@ -1120,7 +1150,7 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, /* We've never sent to this receiver, so initiate a reliability handshake with them. Once they create the receive flow on their end, and we receive their ack, we'll create the flow on our end and be able to send. 
*/ - opx_reliability_handshake_init(ep, key, target_reliability_rx); + opx_reliability_handshake_init(ep, key, target_reliability_rx, hfi1_type); return -1; } @@ -1175,13 +1205,34 @@ void fi_opx_reliability_client_replay_deallocate(struct fi_opx_reliability_clien static inline void fi_opx_reliability_client_replay_register_no_update (struct fi_opx_reliability_client_state * state, - const uint16_t dlid, const uint8_t rs, const uint8_t rx, union fi_opx_reliability_tx_psn *psn_ptr, + const uint8_t rs, const uint8_t rx, union fi_opx_reliability_tx_psn *psn_ptr, struct fi_opx_reliability_tx_replay * replay, - const enum ofi_reliability_kind reliability_kind) + const enum ofi_reliability_kind reliability_kind, + const enum opx_hfi1_type hfi1_type) { - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + uint32_t hdr_dlid; + uint8_t hdr_tx; + uint8_t hdr_rx; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + hdr_dlid = replay->scb_9B.hdr.lrh_9B.dlid; + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)), + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)->bth.rx; + } else { + lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + hdr_dlid = htons(replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb_16B.hdr.lrh_16B.dlid); + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)); + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)->bth.rx; + } psn_ptr->psn.bytes_outstanding += total_bytes; + replay->target_reliability_rx = rs; replay->psn_ptr = 
psn_ptr; @@ -1206,7 +1257,7 @@ void fi_opx_reliability_client_replay_register_no_update (struct fi_opx_reliabil fi_opx_atomic_fifo_produce(&state->fifo, (uint64_t)replay | TX_CMD); } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD || reliability_kind == OFI_RELIABILITY_KIND_RUNTIME) { /* constant compile-time expression */ - fi_reliability_service_process_command(state, replay); + fi_reliability_service_process_command(state, replay, state->lid_be, hdr_dlid, hdr_tx, hdr_rx, hfi1_type); } else { fprintf(stderr, "%s():%d abort\n", __func__, __LINE__); abort(); } @@ -1222,8 +1273,29 @@ void fi_opx_reliability_client_replay_register_with_update (struct fi_opx_reliab struct fi_opx_completion_counter * counter, uint64_t value, const enum ofi_reliability_kind reliability_kind) { - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + uint32_t hdr_dlid; + uint8_t hdr_tx; + uint8_t hdr_rx; + + /* global note: runtime HFI1 type - may need macro/inlining/const parameter hfi1_type to be branchless */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + hdr_dlid = replay->scb_9B.hdr.lrh_9B.dlid; + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)), + hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)->bth.rx; + } else { + lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + hdr_dlid = htons(replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb_16B.hdr.lrh_16B.dlid); + /* hardcoded replay hfi type for macros */ + hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)); + hdr_rx = 
OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)->bth.rx; + } + psn_ptr->psn.bytes_outstanding += total_bytes; replay->target_reliability_rx = rs; replay->psn_ptr = psn_ptr; @@ -1254,8 +1326,7 @@ void fi_opx_reliability_client_replay_register_with_update (struct fi_opx_reliab fi_opx_atomic_fifo_produce(&state->fifo, (uint64_t)replay | TX_CMD); } else if (reliability_kind == OFI_RELIABILITY_KIND_ONLOAD || reliability_kind == OFI_RELIABILITY_KIND_RUNTIME) { /* constant compile-time expression */ - - fi_reliability_service_process_command(state, replay); + fi_reliability_service_process_command(state, replay, state->lid_be, hdr_dlid, hdr_tx, hdr_rx, OPX_HFI1_TYPE); } else { fprintf(stderr, "%s():%d abort\n", __func__, __LINE__); abort(); diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index 2a21e70ddc1..8e53211e3c2 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -67,7 +67,8 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const uint32_t opcode, const int lock_required, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { union fi_opx_hfi1_deferred_work *work = @@ -86,12 +87,14 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, params->dest_rx = opx_target_addr.hfi1_rx; params->bth_rx = params->dest_rx << 56; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_target_addr.fi); - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->pbc_dws = 2 + /* pbc */ 2 + /* lrh */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ + /* lrh does not include pbc (8 bytes/2 dws), but does include icrc (4 bytes/1 dws), + so subtract 1 dws */ params->lrh_dws = htons(params->pbc_dws - 1); params->is_intranode 
= fi_opx_hfi1_tx_is_intranode(opx_ep, opx_target_addr, caps); params->reliability = reliability; @@ -163,7 +166,8 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const uint64_t tx_op_flags, const uint64_t is_hmem, const int lock_required, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert(niov == 1); // TODO, support something ... bigger assert(op == FI_NOOP || op < OFI_ATOMIC_OP_LAST); @@ -179,7 +183,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, params->work_elem.complete = false; params->opx_ep = opx_ep; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_dst_addr.fi); - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->slid = opx_dst_addr.uid.lid; params->origin_rs = opx_dst_addr.reliability_rx; params->dt = dt == FI_VOID ? FI_VOID-1 : dt; @@ -214,7 +218,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, */ ssize_t rc = fi_opx_shm_dynamic_tx_connect(params->is_intranode, opx_ep, params->u32_extended_rx, opx_dst_addr.hfi1_unit); assert(rc == FI_SUCCESS); - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); fi_opx_hfi1_dput_sdma_init(opx_ep, params, iov->len, 0, 0, NULL, is_hmem); FI_OPX_DEBUG_COUNTERS_INC_COND(is_hmem && params->is_intranode, @@ -260,37 +264,42 @@ ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, size_t l fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t 
dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_writev_generic(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability); + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - - const enum ofi_reliability_kind reliability); + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_readmsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability); + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum 
opx_hfi1_type hfi1_type); #ifdef __cplusplus } diff --git a/prov/opx/include/rdma/opx/fi_opx_rma_ops.h b/prov/opx/include/rdma/opx/fi_opx_rma_ops.h index fd0b118b241..3c0ec8f916f 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma_ops.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma_ops.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,61 +38,61 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_RMA_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_RMA_SPECIALIZED_FUNC(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY) \ - static inline ssize_t fi_opx_writemsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ +#define FI_OPX_RMA_SPECIALIZED_FUNC_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + static inline ssize_t fi_opx_writemsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) \ { \ - return fi_opx_writemsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY); \ + return fi_opx_writemsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_writev_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_writev_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, \ fi_addr_t dest_addr, uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_writev_generic(ep, iov, desc, count, dest_addr, addr_offset, key, \ - context, LOCK, 
AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, \ uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_write_generic(ep, buf, len, desc, dst_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_inject_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_inject_write_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dst_addr, \ uint64_t addr_offset, uint64_t key) \ { \ return fi_opx_inject_write_generic(ep, buf, len, dst_addr, addr_offset, key, LOCK, \ - AV, CAPS, RELIABILITY); \ + AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_readmsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_readmsg_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) \ { \ - return fi_opx_readmsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY); \ + return fi_opx_readmsg_generic(ep, msg, flags, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t fi_opx_readv_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_readv_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, \ fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_writev_generic(ep, iov, desc, count, src_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } \ - static inline ssize_t 
fi_opx_read_##LOCK##_##AV##_##CAPS##_##RELIABILITY( \ + static inline ssize_t fi_opx_read_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE( \ struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, \ uint64_t addr_offset, uint64_t key, void *context) \ { \ return fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, \ - context, LOCK, AV, CAPS, RELIABILITY); \ + context, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE); \ } -#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY +#define FI_OPX_RMA_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_##TYPE##_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE /* diff --git a/prov/opx/include/rdma/opx/fi_opx_tagged.h b/prov/opx/include/rdma/opx/fi_opx_tagged.h index 11bdfb5391a..38143f24278 100644 --- a/prov/opx/include/rdma/opx/fi_opx_tagged.h +++ b/prov/opx/include/rdma/opx/fi_opx_tagged.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021 Cornelis Networks. + * Copyright (C) 2021,2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -38,12 +38,12 @@ * C requires another indirection for expanding macros since * operands of the token pasting operator are not expanded */ -#define FI_OPX_TAGGED_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY) \ - FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) +#define FI_OPX_TAGGED_SPECIALIZED_FUNC(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) -#define FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY) \ +#define FI_OPX_TAGGED_SPECIALIZED_FUNC_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tsend_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_tsend_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, fi_addr_t dest_addr, \ uint64_t tag, void *context) \ @@ -56,32 +56,34 @@ 0, /* override_flags */ \ 0, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_trecv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_trecv_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, void *buf, size_t len, \ void *desc, fi_addr_t src_addr, uint64_t tag, \ uint64_t ignore, void *context) \ { \ return fi_opx_recv_generic(ep, buf, len, desc, \ src_addr, tag, ignore, context, \ - LOCK, AV, FI_TAGGED, RELIABILITY); \ + LOCK, AV, FI_TAGGED, RELIABILITY, HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tinject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_tinject_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ fi_addr_t dest_addr, uint64_t tag) \ - { \ + { \ return fi_opx_ep_tx_inject(ep, buf, len, \ dest_addr, tag, 0, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ CAPS | FI_TAGGED, \ - 
RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tsenddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_tsenddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, uint64_t data, fi_addr_t dest_addr, \ uint64_t tag, void *context) \ @@ -94,26 +96,28 @@ 0, /* override_flags */ \ 0, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } \ __OPX_FORCE_INLINE__ ssize_t \ - fi_opx_tinjectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY \ + fi_opx_tinjectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ uint64_t data, fi_addr_t dest_addr, \ uint64_t tag) \ - { \ + { \ return fi_opx_ep_tx_inject(ep, buf, len, \ dest_addr, tag, data, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ CAPS | FI_TAGGED, \ - RELIABILITY); \ + RELIABILITY, \ + HFI1_TYPE); \ } -#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) +#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY +#define FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME_(TYPE, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_ ## TYPE ## _ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE #endif /* _FI_PROV_OPX_TAGGED_H_ */ diff --git a/prov/opx/include/rdma/opx/opx_hfi1_sim.h b/prov/opx/include/rdma/opx/opx_hfi1_sim.h index e9be731cb52..0f2906b6ef9 100644 --- a/prov/opx/include/rdma/opx/opx_hfi1_sim.h +++ b/prov/opx/include/rdma/opx/opx_hfi1_sim.h @@ -49,15 +49,15 @@ void 
opx_sim_store(uint64_t offset, uint64_t *value, const char* func, const int line) { long ret, loffset = (long) offset; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "%s:%u FI_OPX_HFI1_BAR_STORE: offset %#16.16lX\n", func,line,offset); ret = lseek(fi_opx_global.hfi_local_info.sim_fd, offset, SEEK_SET); if (ret != loffset) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "%s:%u FI_OPX_HFI1_BAR_STORE: offset %#16.16lX\n", func,line,offset); perror("FI_OPX_HFI1_BAR_STORE: Unable to lseek BAR: "); sleep(5); abort(); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "FI_OPX_HFI1_BAR_STORE: value %#16.16lX\n", *value); + "%s:%u FI_OPX_HFI1_BAR_STORE: %#16.16lX value [%#16.16lX]\n", func,line,offset, *value); if (write(fi_opx_global.hfi_local_info.sim_fd, value, sizeof(*value)) < 0) { perror("FI_OPX_HFI1_BAR_STORE: Unable to write BAR: "); sleep(5); abort(); @@ -101,10 +101,7 @@ assert(unit < 2); const char* filename = sim_barfiles[unit]; - #if (!defined(OPX_WFR) && !defined(OPX_JKR)) - fprintf(stderr, "Simulator MUST be built with OPX_WFR or OPX_JKR\n"); - abort(); - #endif + if (getenv("HFI_FNAME")) { filename = getenv("HFI_FNAME"); } @@ -134,7 +131,7 @@ #define OPX_HFI1_INIT_PIO_SOP(context, input) ({ \ volatile uint64_t * __pio_sop; \ do { \ - if(OPX_HFI1_TYPE == OPX_HFI1_WFR) { \ + if(OPX_HFI1_TYPE & OPX_HFI1_WFR) { \ __pio_sop = (uint64_t *) \ (OPX_TXE_PIO_SEND + \ (context * (64*1024L)) + \ @@ -152,7 +149,7 @@ #define OPX_HFI1_INIT_PIO(context, input) ({ \ volatile uint64_t * __pio; \ do { \ - if(OPX_HFI1_TYPE == OPX_HFI1_WFR) { \ + if(OPX_HFI1_TYPE & OPX_HFI1_WFR) { \ __pio = (uint64_t *)(OPX_TXE_PIO_SEND + \ (context * (64*1024L))); \ } else { \ @@ -167,7 +164,7 @@ #define OPX_HFI1_INIT_UREGS(context, input) ({ \ volatile uint64_t * __uregs; \ do { \ - if(OPX_HFI1_TYPE == OPX_HFI1_WFR) { \ + if(OPX_HFI1_TYPE & OPX_HFI1_WFR) { \ __uregs = (uint64_t *)(OPX_WFR_RXE_PER_CONTEXT_OFFSET + \ ((context) * OPX_WFR_RXE_UCTX_STRIDE)); \ } else { \ diff --git 
a/prov/opx/src/fi_opx_atomic.c b/prov/opx/src/fi_opx_atomic.c index a69512a34a9..596b899df2f 100644 --- a/prov/opx/src/fi_opx_atomic.c +++ b/prov/opx/src/fi_opx_atomic.c @@ -122,7 +122,8 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const int lock_required, const uint64_t caps, const enum ofi_reliability_kind reliability, const uint64_t is_hmem, - const uint64_t is_intranode) + const uint64_t is_intranode, + const enum opx_hfi1_type hfi1_type) { if (tx_op_flags & FI_INJECT) { assert((tx_op_flags & (FI_COMPLETION | FI_TRANSMIT_COMPLETE)) != @@ -145,7 +146,7 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, params->work_elem.complete = false; params->opx_ep = opx_ep; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_dst_addr.fi); - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->slid = opx_dst_addr.uid.lid; params->origin_rs = opx_dst_addr.reliability_rx; params->dt = dt == FI_VOID ? 
FI_VOID-1 : dt; @@ -193,7 +194,7 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, rma_request->hmem_device = fetch_iov->device; params->rma_request_vaddr = (uintptr_t) rma_request; - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); fi_opx_hfi1_dput_sdma_init(opx_ep, params, buf_iov->len, 0, 0, NULL, is_hmem); @@ -256,7 +257,8 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const unsigned is_compare, const void *compare_vaddr, const uint64_t tx_op_flags, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { assert((is_fetch == 0) || (is_fetch == 1)); assert((is_compare == 0) || (is_compare == 1)); @@ -275,7 +277,7 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, (union fi_opx_context *)context, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, FI_OPX_HFI_DPUT_OPCODE_GET, - lock_required, caps, reliability); + lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC READ (end)\n"); return count; @@ -309,7 +311,7 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, NULL, (union fi_opx_context *)context, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, - lock_required, caps, reliability, is_hmem, is_intranode); + lock_required, caps, reliability, is_hmem, is_intranode, hfi1_type); } else { struct fi_opx_hmem_iov compare_iov; @@ -326,7 +328,7 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, &compare_iov, (union fi_opx_context *)context, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, - lock_required, caps, reliability, is_hmem, is_intranode); + lock_required, caps, reliability, is_hmem, 
is_intranode, hfi1_type); } FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC %s (end)\n", @@ -348,7 +350,8 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, (union fi_opx_context *)NULL, cc, datatype, op, opx_ep->tx->op_flags, is_hmem, lock_required, caps, - reliability); + reliability, + hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC WRITE (end)\n"); @@ -359,7 +362,8 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dst_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability) + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; @@ -379,7 +383,7 @@ ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, opx_addr.hfi1_rx, opx_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -400,7 +404,7 @@ ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, size_t xfer __attribute__((unused)); xfer = fi_opx_atomic_internal(opx_ep, buf, count, opx_addr, addr, key, datatype, op, context, cc, 0, NULL, 0, NULL, opx_ep->tx->op_flags, - lock_required, av_type, caps, reliability); + lock_required, av_type, caps, reliability, hfi1_type); assert(xfer == count); return 0; @@ -413,7 +417,8 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind 
reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -442,7 +447,7 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -482,7 +487,7 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, op, NULL, cc, 0, NULL, 0, NULL, flags, lock_required, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -518,7 +523,8 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -547,7 +553,7 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -595,7 +601,8 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, op, NULL, cc, 1, (const void *)rst_iov_vaddr, 0, NULL, flags, - lock_required, av_type, caps, reliability); + lock_required, av_type, caps, reliability, + hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -637,7 +644,7 @@ ssize_t 
fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal( opx_ep, NULL, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, op, NULL, cc, 1, (const void *)rst_iov_vaddr, - 0, NULL, flags, lock_required, av_type, caps, reliability); + 0, NULL, flags, lock_required, av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -678,7 +685,8 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -707,7 +715,7 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -759,7 +767,7 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, op, NULL, cc, 1, (const void *)rst_iov_vaddr, 1, (const void *)cmp_iov_vaddr, flags, lock_required, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -815,7 +823,8 @@ ssize_t fi_opx_fetch_compare_atomic_generic( void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; @@ -835,7 +844,7 @@ 
ssize_t fi_opx_fetch_compare_atomic_generic( opx_addr.hfi1_rx, opx_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -856,7 +865,7 @@ ssize_t fi_opx_fetch_compare_atomic_generic( size_t xfer __attribute__((unused)); xfer = fi_opx_atomic_internal(opx_ep, buf, count, opx_addr, addr, key, datatype, op, context, cc, 1, result, compare!=NULL, compare, opx_ep->tx->op_flags, - lock_required, av_type, caps, reliability); + lock_required, av_type, caps, reliability, hfi1_type); assert(xfer == count); return 0; @@ -866,12 +875,14 @@ ssize_t fi_opx_fetch_atomic_generic(struct fid_ep *ep, const void *buf, size_t c uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { return fi_opx_fetch_compare_atomic_generic(ep, buf, count, desc, NULL, NULL, result, result_desc, dest_addr, addr, key, datatype, op, context, lock_required, av_type, caps, - reliability); + reliability, + hfi1_type); } ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, void *desc, @@ -880,12 +891,13 @@ ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { return fi_opx_fetch_compare_atomic_generic(ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, key, datatype, op, context, lock_required, av_type, - caps, 
reliability); + caps, reliability, hfi1_type); } ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, @@ -893,7 +905,8 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t enum fi_datatype datatype, enum fi_op op, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -915,7 +928,7 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -937,7 +950,7 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr, key, NULL, cc, datatype, op, opx_ep->tx->op_flags | FI_INJECT, - is_hmem, lock_required, caps, reliability); + is_hmem, lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC INJECT WRITE (end)\n"); @@ -960,10 +973,30 @@ ssize_t fi_opx_atomic(struct fid_ep *ep, const void *buf, size_t count, void *de ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, - context, FI_OPX_LOCK_NOT_REQUIRED, - opx_ep->av_type, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 
0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -987,16 +1020,47 @@ ssize_t fi_opx_fetch_atomic(struct fid_ep *ep, const void *buf, size_t count, vo fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_fetch_atomic_generic( - ep, buf, count, desc, result, result_desc, dest_addr, addr, key, - datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } else { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, 
dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + rc = fi_opx_fetch_atomic_generic( + ep, buf, count, desc, result, result_desc, dest_addr, addr, key, + datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } } else { - rc = fi_opx_fetch_atomic_generic( - ep, buf, count, desc, result, result_desc, dest_addr, addr, key, - datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1022,16 +1086,47 @@ ssize_t fi_opx_compare_atomic(struct fid_ep *ep, const void *buf, size_t count, fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_compare_atomic_generic( - ep, buf, count, desc, compare, compare_desc, result, result_desc, - dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = 
fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } else { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + rc = fi_opx_compare_atomic_generic( + ep, buf, count, desc, compare, compare_desc, result, result_desc, + dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); + } } else { - rc = fi_opx_compare_atomic_generic( - ep, buf, count, desc, compare, compare_desc, result, result_desc, - dest_addr, addr, key, datatype, op, context, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, 
FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1055,17 +1150,55 @@ ssize_t fi_opx_inject_atomic(struct fid_ep *ep, const void *buf, size_t count, f fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, - datatype, op, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, + datatype, op, FI_OPX_LOCK_NOT_REQUIRED, + 
FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_inject_atomic_generic(ep, buf, count, dest_addr, addr, key, - datatype, op, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1093,16 +1226,53 @@ ssize_t fi_opx_atomic_writemsg(struct fid_ep *ep, const struct fi_msg_atomic *ms fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, - 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { 
+ rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, - 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1126,20 +1296,63 @@ ssize_t fi_opx_atomic_readwritemsg(struct fid_ep *ep, const struct fi_msg_atomic fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, - flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, - 0x0018000000000000ull, - OPX_RELIABILITY); + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if 
(opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, + flags, FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, + 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_atomic_readwritemsg_generic(ep, msg, resultv, result_count, - flags, FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, - 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EINVAL; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EINVAL\n"); + abort(); } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1162,18 +1375,59 @@ ssize_t fi_opx_atomic_compwritemsg(struct fid_ep *ep, const struct fi_msg_atomic fi_opx_lock_if_required(&opx_ep->lock, lock_required); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); - if (opx_ep->av_type == FI_AV_MAP) { - rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, - resultv, result_count, flags, - FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_MAP, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } else { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + 
resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (opx_ep->av_type == FI_AV_MAP) { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_MAP, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, + resultv, result_count, flags, + FI_OPX_LOCK_NOT_REQUIRED, + FI_AV_TABLE, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - rc = fi_opx_atomic_compwritemsg_generic(ep, msg, comparev, compare_count, - resultv, result_count, flags, - FI_OPX_LOCK_NOT_REQUIRED, - FI_AV_TABLE, 0x0018000000000000ull, - OPX_RELIABILITY); + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -1333,24 +1587,58 @@ int fi_opx_finalize_atomic_ops(struct fid_ep *ep) return 0; } -FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY) +FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR) +FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B) +FI_OPX_ATOMIC_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR) ssize_t fi_opx_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, 
OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)( - ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(atomic, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, count, desc, dest_addr, addr, key, datatype, op, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_inject_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, OPX_RELIABILITY)( - ep, buf, count, dest_addr, addr, key, datatype, op); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, count, dest_addr, addr, key, datatype, op); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, count, dest_addr, 
addr, key, datatype, op); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(inject_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, count, dest_addr, addr, key, datatype, op); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_fetch_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, @@ -1358,11 +1646,31 @@ ssize_t fi_opx_fetch_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, si fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, - OPX_RELIABILITY)(ep, buf, count, desc, result, - result_desc, dest_addr, addr, - key, datatype, op, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, buf, count, desc, result, + result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, buf, count, desc, result, + result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(fetch_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, buf, count, desc, result, + result_desc, dest_addr, addr, + key, datatype, op, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) 
-FI_EPERM; } ssize_t fi_opx_compare_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t count, @@ -1371,8 +1679,26 @@ ssize_t fi_opx_compare_atomic_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, OPX_RELIABILITY)( - ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, - key, datatype, op, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, + key, datatype, op, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_ATOMIC_SPECIALIZED_FUNC_NAME(compare_atomic, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, + key, datatype, op, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_cntr.c b/prov/opx/src/fi_opx_cntr.c index 04bde0cb991..fa628fc6209 100644 --- a/prov/opx/src/fi_opx_cntr.c +++ b/prov/opx/src/fi_opx_cntr.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 by Cornelis Networks. 
+ * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -79,12 +79,12 @@ static uint64_t fi_opx_cntr_read(struct fid_cntr *cntr) if (OFI_UNLIKELY(opx_cntr->lock_required)) { for (i=0; iprogress.ep[i]->lock); - fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); fi_opx_unlock(&opx_cntr->progress.ep[i]->lock); } } else { for (i=0; iprogress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); } } } @@ -153,14 +153,16 @@ fi_opx_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int timeout) fi_opx_lock(&opx_cntr->progress.ep[i]->lock); fi_opx_ep_rx_poll(&opx_cntr->progress.ep[i]->ep_fid, 0, OPX_RELIABILITY, - FI_OPX_HDRQ_MASK_RUNTIME); + FI_OPX_HDRQ_MASK_RUNTIME, + OPX_HFI1_TYPE); fi_opx_unlock(&opx_cntr->progress.ep[i]->lock); } } else { for (i=0; iprogress.ep[i]->ep_fid, 0, OPX_RELIABILITY, - FI_OPX_HDRQ_MASK_RUNTIME); + FI_OPX_HDRQ_MASK_RUNTIME, + OPX_HFI1_TYPE); } } } diff --git a/prov/opx/src/fi_opx_cq.c b/prov/opx/src/fi_opx_cq.c index 364b3b64d48..f2712436572 100644 --- a/prov/opx/src/fi_opx_cq.c +++ b/prov/opx/src/fi_opx_cq.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -203,7 +203,8 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, const uint64_t rcvhdrcnt, const uint64_t caps, - const enum fi_progress progress) + const enum fi_progress progress, + const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_CQ, "(called)\n"); @@ -222,19 +223,51 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, abort(); } - const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); - - switch(rcvhdrcnt) { - case 2048: - return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps) : - fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps); - case 8192: - return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps) : - fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps); - default: - FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); - return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps) : - fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps); + const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); + + if (hfi1_type & OPX_HFI1_WFR) { + + switch(rcvhdrcnt) { + case 2048: + return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 0) : + fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 0); + case 8192: + return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 0) : + fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 0); + default: + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. 
Optimal values are 2048 and 8192\n"); + return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 0) : + fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 0); + } + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + switch(rcvhdrcnt) { + case 2048: + return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 1) : + fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 1); + case 8192: + return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 1) : + fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 1); + default: + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 1) : + fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 1); + } + } else if (hfi1_type & OPX_HFI1_JKR) { + switch(rcvhdrcnt) { + case 2048: + return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 2) : + fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 2); + case 8192: + return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 2) : + fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 2); + default: + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + return lock_required ? 
fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 2) : + fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 2); + } + } else { + FI_WARN(fi_opx_global.prov, FI_LOG_CQ, "Invalid HFI type %d\n", hfi1_type); + return NULL; } } @@ -370,7 +403,8 @@ void fi_opx_cq_finalize_ops(struct fid_ep *ep) fi_opx_select_reliability(opx_ep), opx_ep->hfi->info.rxe.hdrq.elemcnt, opx_cq->ep_comm_caps, - opx_cq->domain->data_progress); + opx_cq->domain->data_progress, + OPX_HFI1_TYPE); } if (opx_ep->tx->cq && (opx_ep->tx->cq != opx_ep->rx->cq)) { @@ -381,7 +415,8 @@ void fi_opx_cq_finalize_ops(struct fid_ep *ep) fi_opx_select_reliability(opx_ep), opx_ep->hfi->info.rxe.hdrq.elemcnt, opx_cq->ep_comm_caps, - opx_cq->domain->data_progress); + opx_cq->domain->data_progress, + OPX_HFI1_TYPE); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_CQ, "(end)\n"); diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking.c b/prov/opx/src/fi_opx_cq_ops_table_locking.c index f7b59b0f54f..ffadda5d946 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking.c @@ -32,78 +32,202 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" - /* HDRQ_MASK = 2k value (2047 * 0x20) */ + +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + +/* JKR 16B */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ 
- FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, 
FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + static struct fi_ops_cq fi_opx_cq_locking_2048_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, 
FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + - // Format: FI_CQ_FORMAT_UNSPEC - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - // Format: FI_CQ_FORMAT_CONTEXT - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - // Format: FI_CQ_FORMAT_MSG - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - //Format: FI_CQ_FORMAT_DATA - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - // Format: FI_CQ_FORMAT_TAGGED - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_locking_2048_ops = (op_matrix_t *)&fi_opx_cq_locking_2048_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_2048_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_locking_2048_ops)[format][0][comm_caps]; + return &(*fi_opx_cq_locking_2048_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c b/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c index 198f93a8616..41cf35f3aba 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking_8192.c @@ -32,74 +32,202 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" - /* HDRQ_MASK = 8k value (8191 * 0x20) */ + +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + +/* JKR 16B */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) static struct fi_ops_cq fi_opx_cq_locking_8192_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_locking_8192_ops = (op_matrix_t *)&fi_opx_cq_locking_8192_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_8192_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_locking_8192_ops)[format][0][comm_caps]; + return &(*fi_opx_cq_locking_8192_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c b/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c index 16d2b67fb2d..7377ef549d0 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c +++ b/prov/opx/src/fi_opx_cq_ops_table_locking_runtime.c @@ -33,71 +33,201 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" /* HDRQ_MASK = runtime value (not 2047 or 8191, won't be optimal) */ + +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- 
OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + +/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + +/* JKR 16B */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + 
FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + static struct fi_ops_cq fi_opx_cq_locking_runtime_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, 
FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), }; static op_matrix_t *fi_opx_cq_locking_runtime_ops = (op_matrix_t *)&fi_opx_cq_locking_runtime_ops_table; struct fi_ops_cq * fi_opx_cq_select_locking_runtime_ops(const enum fi_cq_format format, const enum ofi_reliability_kind 
reliability, - const uint64_t comm_caps) + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_locking_runtime_ops)[format][0][comm_caps]; + return &(*fi_opx_cq_locking_runtime_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking.c index 50b4c6f03e5..5caaaefe9a9 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking.c @@ -33,78 +33,201 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" /* HDRQ_MASK = 2k value (2047 * 0x20) */ + +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + 
+/* JKR 9B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + +/* JKR 16B */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* 
----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) static struct fi_ops_cq fi_opx_cq_non_locking_2048_ops_table[] = { + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), - // Format: FI_CQ_FORMAT_UNSPEC - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - // Format: FI_CQ_FORMAT_CONTEXT - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - // Format: FI_CQ_FORMAT_MSG - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - //Format: 
FI_CQ_FORMAT_DATA - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), - - // Format: FI_CQ_FORMAT_TAGGED - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE), -}; + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, 
OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_2048, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + +}; + static op_matrix_t *fi_opx_cq_non_locking_2048_ops = (op_matrix_t *)&fi_opx_cq_non_locking_2048_ops_table; struct fi_ops_cq * fi_opx_cq_select_non_locking_2048_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_non_locking_2048_ops)[format][0][comm_caps]; + return &(*fi_opx_cq_non_locking_2048_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c index 7b1ad22c6ac..ee4e94483ff 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking_8192.c @@ -33,70 +33,201 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" /* HDRQ_MASK = 8k value (8191 * 0x20) */ + +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + +/* JKR 9B */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + +/* JKR 16B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + +/* CAPS = 
FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + static struct fi_ops_cq fi_opx_cq_non_locking_8192_ops_table[] = { - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, 
FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, 
FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_8192, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + }; static op_matrix_t *fi_opx_cq_non_locking_8192_ops = (op_matrix_t *)&fi_opx_cq_non_locking_8192_ops_table; struct fi_ops_cq * fi_opx_cq_select_non_locking_8192_ops(const enum fi_cq_format format, const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_non_locking_8192_ops)[format][0][comm_caps]; + return 
&(*fi_opx_cq_non_locking_8192_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c b/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c index 5ca74f424b9..8efa55b2e3b 100644 --- a/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c +++ b/prov/opx/src/fi_opx_cq_ops_table_non_locking_runtime.c @@ -32,98 +32,281 @@ #include "rdma/opx/fi_opx_cq_ops_table.h" - /* HDRQ_MASK = runtime value (not 2047 or 8191, won't be optimal) */ + +/* WFR 9B headers */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR) + +/* JKR 9B */ /* CAPS = FI_OPX_COMMS_NONE (runtime) */ /* ----- 
OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_LOCAL (only local) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + 
FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B) /* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ /* ----- OFI_RELIABILITY_KIND_ONLOAD */ - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) - FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B) + +/* JKR 16B */ +/* CAPS = FI_OPX_COMMS_NONE (runtime) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR) + +/* CAPS = FI_OPX_COMMS_LOCAL (only local) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR) + +/* CAPS = FI_OPX_COMMS_REMOTE (only remote) */ +/* ----- 
OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR) + +/* CAPS = FI_OPX_COMMS_LOCAL_REMOTE (local and remote) */ +/* ----- OFI_RELIABILITY_KIND_ONLOAD */ + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_UNSPEC, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_CONTEXT, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_MSG, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_DATA, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) + FI_OPX_CQ_SPECIALIZED_FUNC_NON_LOCKING(FI_CQ_FORMAT_TAGGED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR) static struct fi_ops_cq fi_opx_cq_non_locking_runtime_ops_table[] = { - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), - - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE), - FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + 
FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_UNSPEC, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, 
FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_CONTEXT, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_MSG, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, 
FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_DATA, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_NONE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, 
OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_REMOTE, OPX_HFI1_JKR), + + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_WFR), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR_9B), + FI_OPX_CQ_OPS_STRUCT_INIT(FI_CQ_FORMAT_TAGGED, FI_OPX_LOCK_NOT_REQUIRED, OFI_RELIABILITY_KIND_ONLOAD, FI_OPX_HDRQ_MASK_RUNTIME, FI_OPX_COMMS_LOCAL_REMOTE, OPX_HFI1_JKR), + + }; ssize_t fi_opx_cq_read_FABRIC_DIRECT(struct fid_cq *cq, void *buf, size_t count) { - return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, - OPX_CQ_FORMAT, - OPX_LOCK, - OPX_RELIABILITY, - OPX_MASK, - OPX_CQ_CAPS) + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_WFR) + (cq, buf, count); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR_9B) (cq, buf, count); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_read, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR) + (cq, buf, count); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + 
return (ssize_t) -FI_EPERM; } ssize_t fi_opx_cq_readfrom_FABRIC_DIRECT(struct fid_cq *cq, void *buf, size_t count, fi_addr_t *src_addr) { - return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, - OPX_CQ_FORMAT, - OPX_LOCK, - OPX_RELIABILITY, - OPX_MASK, - OPX_CQ_CAPS) + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_WFR) + (cq, buf, count, src_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR_9B) + (cq, buf, count, src_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_CQ_SPECIALIZED_FUNC_NAME(cq_readfrom, + OPX_CQ_FORMAT, + OPX_LOCK, + OPX_RELIABILITY, + OPX_MASK, + OPX_CQ_CAPS, + OPX_HFI1_JKR) (cq, buf, count, src_addr); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } static op_matrix_t *fi_opx_cq_non_locking_runtime_ops = (op_matrix_t *)&fi_opx_cq_non_locking_runtime_ops_table; struct fi_ops_cq * fi_opx_cq_select_non_locking_runtime_ops(const enum fi_cq_format format, - const enum ofi_reliability_kind reliability, - const uint64_t comm_caps) + const enum ofi_reliability_kind reliability, + const uint64_t comm_caps, + const uint32_t hfi1_type) { - return &(*fi_opx_cq_non_locking_runtime_ops)[format][0][comm_caps]; + return &(*fi_opx_cq_non_locking_runtime_ops)[format][0][comm_caps][hfi1_type]; } diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index d2890e1b42a..ddbb07cfd90 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -212,65 +212,69 @@ static struct fi_ops_ep fi_opx_stx_ep_ops = { void fi_opx_ep_tx_model_init (struct fi_opx_hfi1_context 
* hfi, const uint8_t reliability_rx, - struct fi_opx_hfi1_txe_scb * inject, - struct fi_opx_hfi1_txe_scb * send, - struct fi_opx_hfi1_txe_scb * rendezvous) { + struct fi_opx_hfi1_txe_scb_9B * inject_9B, + struct fi_opx_hfi1_txe_scb_9B * send_9B, + struct fi_opx_hfi1_txe_scb_9B * rendezvous_9B) { /* * fi_send*() model - eager */ - + /* Setup the 9B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = (OPX_HFI1_TYPE & OPX_HFI1_WFR) ? OPX_HFI1_WFR : OPX_HFI1_JKR_9B; /* PBC data */ - memset(send, 0, sizeof(*send)); - memset(inject, 0, sizeof(*inject)); - memset(rendezvous, 0, sizeof(*rendezvous)); - send->qw0 = OPX_PBC_LEN(0) /* length_dws */ | - OPX_PBC_VL(hfi->vl) | - OPX_PBC_SC(hfi->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0) | - OPX_PBC_PORTIDX(hfi->hfi_port) | - OPX_PBC_SCTXT(hfi->send_ctxt); + memset(send_9B, 0, sizeof(*send_9B)); + memset(inject_9B, 0, sizeof(*inject_9B)); + memset(rendezvous_9B, 0, sizeof(*rendezvous_9B)); + send_9B->qw0 = OPX_PBC_LEN(0,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type) | + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type); /* LRH header */ - send->hdr.stl.lrh.flags = + send_9B->hdr.lrh_9B.flags = htons(FI_OPX_HFI1_LRH_BTH | ((hfi->sl & FI_OPX_HFI1_LRH_SL_MASK) << FI_OPX_HFI1_LRH_SL_SHIFT) | ((hfi->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); - send->hdr.stl.lrh.dlid = 0; /* set at runtime */ - send->hdr.stl.lrh.pktlen = 0; /* set at runtime */ - send->hdr.stl.lrh.slid = htons(hfi->lid); + send_9B->hdr.lrh_9B.dlid = 0; /* set at runtime */ + send_9B->hdr.lrh_9B.pktlen = 0; /* set at runtime */ + send_9B->hdr.lrh_9B.slid = htons((uint16_t)hfi->lid); /* BTH header */ - send->hdr.stl.bth.opcode = 0; - send->hdr.stl.bth.bth_1 = 0; - send->hdr.stl.bth.pkey = htons(hfi->pkey); 
- send->hdr.stl.bth.ecn = (uint8_t)(OPX_BTH_RC2(OPX_BTH_RC2_VAL) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT)); - send->hdr.stl.bth.qp = hfi->bthqp; - send->hdr.stl.bth.unused = 0; - send->hdr.stl.bth.rx = 0; /* set at runtime */ + send_9B->hdr.bth.opcode = 0; + send_9B->hdr.bth.bth_1 = 0; + send_9B->hdr.bth.pkey = htons(hfi->pkey); + send_9B->hdr.bth.ecn = (uint8_t)(OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)),hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT,hfi1_type)); + send_9B->hdr.bth.qp = hfi->bthqp; + send_9B->hdr.bth.unused = 0; + send_9B->hdr.bth.rx = 0; /* set at runtime */ - send->hdr.reliability.psn = 0; - send->hdr.reliability.origin_tx = hfi->send_ctxt; + send_9B->hdr.reliability.psn = 0; + send_9B->hdr.reliability.origin_tx = hfi->send_ctxt; /* KDETH header */ - send->hdr.stl.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ - send->hdr.stl.kdeth.jkey = hfi->jkey; - send->hdr.stl.kdeth.hcrc = 0; - send->hdr.stl.kdeth.unused = 0; + send_9B->hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + send_9B->hdr.kdeth.jkey = hfi->jkey; + send_9B->hdr.kdeth.hcrc = 0; + send_9B->hdr.kdeth.unused = 0; /* OFI header */ - send->hdr.match.ofi_data = 0; /* set at runtime */ - send->hdr.match.ofi_tag = 0; /* set at runtime */ + send_9B->hdr.match.ofi_data = 0; /* set at runtime */ + send_9B->hdr.match.ofi_tag = 0; /* set at runtime */ /* * fi_send*() model - rendezvous */ - *rendezvous = *send; - rendezvous->hdr.rendezvous.origin_rs = reliability_rx; + *rendezvous_9B = *send_9B; + rendezvous_9B->hdr.rendezvous.origin_rs = reliability_rx; + + /* clone from send model, then adjust */ + *inject_9B = *send_9B; /* * fi_inject() model @@ -281,24 +285,130 @@ void fi_opx_ep_tx_model_init (struct fi_opx_hfi1_context * hfi, 3 + /* bth */ 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - inject->qw0 = OPX_PBC_LEN(inject_pbc_dws) /* length_dws */ | - OPX_PBC_VL(hfi->vl) | - 
OPX_PBC_SC(hfi->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0)| - OPX_PBC_PORTIDX(hfi->hfi_port) | - OPX_PBC_SCTXT(hfi->send_ctxt); + inject_9B->qw0 = OPX_PBC_LEN(inject_pbc_dws,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type)| + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type); + + /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + inject_9B->hdr.lrh_9B.pktlen = htons(inject_pbc_dws-1); + + /* specified at runtime */ + inject_9B->hdr.inject.message_length = 0; + inject_9B->hdr.inject.app_data_u64[0] = 0; + inject_9B->hdr.inject.app_data_u64[1] = 0; +} + +void fi_opx_ep_tx_model_init_16B (struct fi_opx_hfi1_context * hfi, + const uint8_t reliability_rx, + struct fi_opx_hfi1_txe_scb_16B * inject_16B, + struct fi_opx_hfi1_txe_scb_16B * send_16B, + struct fi_opx_hfi1_txe_scb_16B * rendezvous_16B) { + + /* + * fi_send*() model - eager + */ + /* Setup the 16B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = OPX_HFI1_JKR; + + /* PBC data */ + memset(send_16B, 0, sizeof(*send_16B)); + memset(inject_16B, 0, sizeof(*inject_16B)); + memset(rendezvous_16B, 0, sizeof(*rendezvous_16B)); + send_16B->qw0 = OPX_PBC_LEN(0,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type) | + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; + + /* LRH header */ + send_16B->hdr.lrh_16B.qw[0] = 0UL; + send_16B->hdr.lrh_16B.qw[1] = 0UL; + + send_16B->hdr.lrh_16B.sc = hfi->sc; + send_16B->hdr.lrh_16B.entropy = 0; + send_16B->hdr.lrh_16B.lt = 0; // need to add env variable to change + send_16B->hdr.lrh_16B.l2 = OPX_PBC_JKR_L2TYPE_16B; 
+ send_16B->hdr.lrh_16B.l4 = 9; + send_16B->hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; + send_16B->hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ + send_16B->hdr.lrh_16B.pkey = hfi->pkey; + + send_16B->hdr.lrh_16B.slid = hfi->lid & 0xFFFFF; + send_16B->hdr.lrh_16B.slid20 = (hfi->lid) >> 20; + + /* BTH header */ + send_16B->hdr.bth.opcode = 0; + send_16B->hdr.bth.bth_1 = 0; + send_16B->hdr.bth.pkey = htons(hfi->pkey); + send_16B->hdr.bth.ecn = (uint8_t)(OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)),hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT,hfi1_type)); + send_16B->hdr.bth.qp = hfi->bthqp; + send_16B->hdr.bth.unused = 0; + send_16B->hdr.bth.rx = 0; /* set at runtime */ + + send_16B->hdr.reliability.psn = 0; + send_16B->hdr.reliability.origin_tx = hfi->send_ctxt; + + /* KDETH header */ + send_16B->hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + send_16B->hdr.kdeth.jkey = hfi->jkey; + send_16B->hdr.kdeth.hcrc = 0; + send_16B->hdr.kdeth.unused = 0; + + /* OFI header */ + send_16B->hdr.match.ofi_data = 0; /* set at runtime */ + send_16B->hdr.match.ofi_tag = 0; /* set at runtime */ + + + /* + * fi_send*() model - rendezvous + */ + *rendezvous_16B = *send_16B; + rendezvous_16B->hdr.rendezvous.origin_rs = reliability_rx; + + + /* + * fi_inject() model + */ /* clone from send model, then adjust */ - inject->hdr = send->hdr; + *inject_16B = *send_16B; + + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth + unused */ + 2 + /* ICRC and tail */ + 2 ; /* second cacheline */ + + inject_16B->qw0 = OPX_PBC_LEN(pbc_dws,hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi->vl,hfi1_type) | + OPX_PBC_SC(hfi->sc,hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B,hfi1_type) | + OPX_PBC_L2COMPRESSED(0,hfi1_type)| + OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | + OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; + + const uint32_t packetLength 
= (pbc_dws - 2) * 4; + const uint32_t lrh_qws = (packetLength >> 3) + + ((packetLength & 0x07u) != 0); + /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - inject->hdr.stl.lrh.pktlen = htons(inject_pbc_dws-1); + inject_16B->hdr.lrh_16B.pktlen = lrh_qws; /* specified at runtime */ - inject->hdr.inject.message_length = 0; - inject->hdr.inject.app_data_u64[0] = 0; - inject->hdr.inject.app_data_u64[1] = 0; + inject_16B->hdr.inject.message_length = 0; + inject_16B->hdr.inject.app_data_u64[0] = 0; } int fi_opx_stx_init (struct fi_opx_domain *opx_domain, struct fi_tx_attr *attr, @@ -353,6 +463,12 @@ int fi_opx_stx_init (struct fi_opx_domain *opx_domain, struct fi_tx_attr *attr, &opx_stx->tx.inject, &opx_stx->tx.send, &opx_stx->tx.rzv); + + fi_opx_ep_tx_model_init_16B(opx_stx->hfi, + opx_stx->reliability_rx, + &opx_stx->tx.inject_16B, + &opx_stx->tx.send_16B, + &opx_stx->tx.rzv_16B); fi_opx_ref_inc(&opx_domain->ref_cnt, "domain"); @@ -480,7 +596,7 @@ static int fi_opx_close_ep(fid_t fid) fi_reliability_service_ping_remote(&opx_ep->ep_fid, service); service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); } - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); compare = fi_opx_timer_now(timestamp, timer); } } @@ -837,14 +953,21 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, /* initialize the models */ fi_opx_ep_tx_model_init(hfi, opx_ep->reliability->rx, - &opx_ep->tx->inject, - &opx_ep->tx->send, - &opx_ep->tx->rzv); + &opx_ep->tx->inject_9B, + &opx_ep->tx->send_9B, + &opx_ep->tx->rzv_9B); + + fi_opx_ep_tx_model_init_16B(hfi, + opx_ep->reliability->rx, + &opx_ep->tx->inject_16B, + &opx_ep->tx->send_16B, + &opx_ep->tx->rzv_16B); - opx_ep->tx->inject.hdr.reliability.unused = 0; - opx_ep->tx->rzv.hdr.reliability.unused = 0; + opx_ep->tx->inject_9B.hdr.reliability.unused = 0; + 
opx_ep->tx->rzv_9B.hdr.reliability.unused = 0; - opx_ep->tx->rzv.hdr.rendezvous.origin_rx = hfi->info.rxe.id; + opx_ep->tx->rzv_9B.hdr.rendezvous.origin_rx = hfi->info.rxe.id; + opx_ep->tx->rzv_16B.hdr.rendezvous.origin_rx = hfi->info.rxe.id; // these 3 lines should move to ep init ? opx_ep->threading = (uint32_t) opx_domain->threading; @@ -1069,7 +1192,7 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) opx_ep->rx->egrq.head_register = hfi1->info.rxe.egrq.head_register; opx_ep->rx->self.raw64b = 0; - opx_ep->rx->self.uid.lid = htons(hfi1->lid); + opx_ep->rx->self.uid.lid = htons(hfi1->lid); // lid needs to be changed to uint32 opx_ep->rx->self.hfi1_rx = hfi1->info.rxe.id; opx_ep->rx->self.hfi1_unit = (uint8_t)hfi1->hfi_unit; opx_ep->rx->self.uid.endpoint_id = hfi1->send_ctxt; @@ -1081,80 +1204,153 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) /* Initialize hash table used to lookup info on any HFI units on the node */ fi_opx_global.hfi_local_info.hfi_unit = (uint8_t)hfi1->hfi_unit; fi_opx_global.hfi_local_info.lid = htons(hfi1->lid); - fi_opx_global.hfi_local_info.type = opx_ep->hfi->hfi_hfi1_type; - if(fi_opx_global.hfi_local_info.type != OPX_HFI1_TYPE) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Runtime HFI type (%u) doesn't match build type (%u)\n", - fi_opx_global.hfi_local_info.type, OPX_HFI1_TYPE); - abort(); - } + fi_opx_init_hfi_lookup(); + /* * initialize tx for acks, etc */ { /* rendezvous CTS packet model */ - memset(&opx_ep->rx->tx.cts, 0, sizeof(opx_ep->rx->tx.cts)); + /* Setup the 9B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = (OPX_HFI1_TYPE & OPX_HFI1_WFR) ? 
OPX_HFI1_WFR : OPX_HFI1_JKR_9B; + + memset(&opx_ep->rx->tx.cts_9B, 0, sizeof(opx_ep->rx->tx.cts_9B)); /* PBC data */ - opx_ep->rx->tx.cts.qw0 = OPX_PBC_LEN(0) /* length_dws */ | - OPX_PBC_VL(hfi1->vl) | - OPX_PBC_SC(hfi1->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0) | - OPX_PBC_PORTIDX(hfi1->hfi_port) | - OPX_PBC_SCTXT(hfi1->send_ctxt); + opx_ep->rx->tx.cts_9B.qw0 = OPX_PBC_LEN(0, hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type); /* LRH header */ - opx_ep->rx->tx.cts.hdr.stl.lrh.flags = + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.flags = htons(FI_OPX_HFI1_LRH_BTH | ((hfi1->sl & FI_OPX_HFI1_LRH_SL_MASK) << FI_OPX_HFI1_LRH_SL_SHIFT) | ((hfi1->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); - opx_ep->rx->tx.cts.hdr.stl.lrh.dlid = 0; /* set at runtime */ - opx_ep->rx->tx.cts.hdr.stl.lrh.pktlen = 0; /* set at runtime */ - opx_ep->rx->tx.cts.hdr.stl.lrh.slid = htons(hfi1->lid); + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.dlid = 0; /* set at runtime */ + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.pktlen = 0; /* set at runtime */ + opx_ep->rx->tx.cts_9B.hdr.lrh_9B.slid = htons(hfi1->lid); /* BTH header */ - opx_ep->rx->tx.cts.hdr.stl.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_CTS; - opx_ep->rx->tx.cts.hdr.stl.bth.bth_1 = 0; - opx_ep->rx->tx.cts.hdr.stl.bth.pkey = htons(hfi1->pkey); - opx_ep->rx->tx.cts.hdr.stl.bth.ecn = (uint8_t) (OPX_BTH_RC2(OPX_BTH_RC2_VAL) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT)); - opx_ep->rx->tx.cts.hdr.stl.bth.qp = hfi1->bthqp; - opx_ep->rx->tx.cts.hdr.stl.bth.unused = 0; - opx_ep->rx->tx.cts.hdr.stl.bth.rx = 0; /* set at runtime */ + opx_ep->rx->tx.cts_9B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_CTS; + opx_ep->rx->tx.cts_9B.hdr.bth.bth_1 = 0; + opx_ep->rx->tx.cts_9B.hdr.bth.pkey = htons(hfi1->pkey); + 
opx_ep->rx->tx.cts_9B.hdr.bth.ecn = (uint8_t) (OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + opx_ep->rx->tx.cts_9B.hdr.bth.qp = hfi1->bthqp; + opx_ep->rx->tx.cts_9B.hdr.bth.unused = 0; + opx_ep->rx->tx.cts_9B.hdr.bth.rx = 0; /* set at runtime */ - opx_ep->rx->tx.cts.hdr.reliability.psn = 0; - opx_ep->rx->tx.cts.hdr.reliability.origin_tx = hfi1->send_ctxt; + opx_ep->rx->tx.cts_9B.hdr.reliability.psn = 0; + opx_ep->rx->tx.cts_9B.hdr.reliability.origin_tx = hfi1->send_ctxt; /* KDETH header */ - opx_ep->rx->tx.cts.hdr.stl.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ - opx_ep->rx->tx.cts.hdr.stl.kdeth.jkey = hfi1->jkey; - opx_ep->rx->tx.cts.hdr.stl.kdeth.hcrc = 0; - opx_ep->rx->tx.cts.hdr.stl.kdeth.unused = 0; + opx_ep->rx->tx.cts_9B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + opx_ep->rx->tx.cts_9B.hdr.kdeth.jkey = hfi1->jkey; + opx_ep->rx->tx.cts_9B.hdr.kdeth.hcrc = 0; + opx_ep->rx->tx.cts_9B.hdr.kdeth.unused = 0; /* OFI header */ - opx_ep->rx->tx.cts.hdr.cts.origin_rx = hfi1->info.rxe.id; - opx_ep->rx->tx.cts.hdr.cts.target.opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; + opx_ep->rx->tx.cts_9B.hdr.cts.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.cts_9B.hdr.cts.target.opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; } { /* rendezvous DPUT packet model */ /* tagged model */ - memset(&opx_ep->rx->tx.dput, 0, - sizeof(opx_ep->rx->tx.dput)); + memset(&opx_ep->rx->tx.dput_9B, 0, + sizeof(opx_ep->rx->tx.dput_9B)); - opx_ep->rx->tx.dput = opx_ep->rx->tx.cts; - opx_ep->rx->tx.dput.hdr.reliability.origin_tx = 0; - opx_ep->rx->tx.dput.hdr.dput.target.origin_tx = hfi1->send_ctxt; - opx_ep->rx->tx.dput.hdr.dput.target.dt = 0; - opx_ep->rx->tx.dput.hdr.dput.target.op = 0; - opx_ep->rx->tx.dput.hdr.dput.target.last_bytes = 0; - opx_ep->rx->tx.dput.hdr.dput.target.bytes = 0; - opx_ep->rx->tx.dput.hdr.dput.origin_rx = 
hfi1->info.rxe.id; - opx_ep->rx->tx.dput.hdr.stl.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_DATA; + opx_ep->rx->tx.dput_9B = opx_ep->rx->tx.cts_9B; + opx_ep->rx->tx.dput_9B.hdr.reliability.origin_tx = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.origin_tx = hfi1->send_ctxt; + opx_ep->rx->tx.dput_9B.hdr.dput.target.dt = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.op = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.last_bytes = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.target.bytes = 0; + opx_ep->rx->tx.dput_9B.hdr.dput.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.dput_9B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_DATA; } + { /* rendezvous CTS packet model for 16B*/ + /* Setup the 16B models whether or not they'll be used */ + + uint64_t hfi1_type = OPX_HFI1_JKR; + + memset(&opx_ep->rx->tx.cts_16B, 0, sizeof(opx_ep->rx->tx.cts_16B)); + /* PBC data */ + opx_ep->rx->tx.cts_16B.qw0 = OPX_PBC_LEN(0, hfi1_type) /* length_dws */ | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; + + /* LRH header */ + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.qw[0] = 0; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.qw[1] = 0; + + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.sc = hfi1->sc; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.entropy = 0; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.lt = 0; // need to add env variable to change + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.l2 = OPX_PBC_JKR_L2TYPE_16B; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.l4 = 9; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.pkey = hfi1->pkey; + + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.slid = hfi1->lid & 0xFFFFF; + opx_ep->rx->tx.cts_16B.hdr.lrh_16B.slid20 = (hfi1->lid) >> 20; + + /* BTH header */ + 
opx_ep->rx->tx.cts_16B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_CTS; + opx_ep->rx->tx.cts_16B.hdr.bth.bth_1 = 0; + opx_ep->rx->tx.cts_16B.hdr.bth.pkey = htons(hfi1->pkey); + opx_ep->rx->tx.cts_16B.hdr.bth.ecn = (uint8_t) (OPX_BTH_RC2(OPX_BTH_RC2_VAL(hfi1_type), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + opx_ep->rx->tx.cts_16B.hdr.bth.qp = hfi1->bthqp; + opx_ep->rx->tx.cts_16B.hdr.bth.unused = 0; + opx_ep->rx->tx.cts_16B.hdr.bth.rx = 0; /* set at runtime */ + + opx_ep->rx->tx.cts_16B.hdr.reliability.psn = 0; + opx_ep->rx->tx.cts_16B.hdr.reliability.origin_tx = hfi1->send_ctxt; + + /* KDETH header */ + opx_ep->rx->tx.cts_16B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; /* no flags */ + opx_ep->rx->tx.cts_16B.hdr.kdeth.jkey = hfi1->jkey; + opx_ep->rx->tx.cts_16B.hdr.kdeth.hcrc = 0; + opx_ep->rx->tx.cts_16B.hdr.kdeth.unused = 0; + + /* OFI header */ + opx_ep->rx->tx.cts_16B.hdr.cts.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.cts_16B.hdr.cts.target.opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; + } + + { /* rendezvous DPUT packet model */ + + /* tagged model */ + memset(&opx_ep->rx->tx.dput_16B, 0, + sizeof(opx_ep->rx->tx.dput_16B)); + + + opx_ep->rx->tx.dput_16B = opx_ep->rx->tx.cts_16B; + opx_ep->rx->tx.dput_16B.hdr.reliability.origin_tx = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.origin_tx = hfi1->send_ctxt; + opx_ep->rx->tx.dput_16B.hdr.dput.target.dt = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.op = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.last_bytes = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.target.bytes = 0; + opx_ep->rx->tx.dput_16B.hdr.dput.origin_rx = hfi1->info.rxe.id; + opx_ep->rx->tx.dput_16B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_RZV_DATA; + } + + if ((opx_ep->rx->caps & FI_LOCAL_COMM) || ((opx_ep->rx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)) == 0)) { char buffer[128]; @@ -1224,28 +1420,23 @@ static int fi_opx_apply_info_and_init_ops(struct fi_opx_ep *opx_ep) { opx_ep->rx->op_flags |= 
info->rx_attr ? info->rx_attr->op_flags : 0; // Init oprations per endpoint - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); int ret; ret = fi_opx_init_cm_ops(&opx_ep->ep_fid.fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_msg_ops(&opx_ep->ep_fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_rma_ops(&opx_ep->ep_fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_tagged_ops(&opx_ep->ep_fid, info); if (ret) goto err; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); ret = fi_opx_init_atomic_ops(&opx_ep->ep_fid, info); if (ret) goto err; @@ -1326,7 +1517,6 @@ static void fi_opx_apply_bind_flags(struct fi_opx_ep *opx_ep) { opx_ep->is_rx_cq_bound = true; } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); } static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) @@ -1372,6 +1562,18 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) return -errno; } fi_opx_ref_inc(&opx_ep->hfi->ref_cnt, "HFI context"); + + fi_opx_global.hfi_local_info.type = opx_ep->hfi->hfi_hfi1_type; + + int mixed_network = 0; + if (fi_param_get_int(fi_opx_global.prov, "mixed_network", &mixed_network) == FI_SUCCESS) { + if ((mixed_network == 1) && (fi_opx_global.hfi_local_info.type == OPX_HFI1_JKR)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Runtime HFI type is 9B JKR\n"); + fi_opx_global.hfi_local_info.type = OPX_HFI1_JKR_9B; + opx_ep->hfi->hfi_hfi1_type = OPX_HFI1_JKR_9B; + } + } + FI_INFO(fi_opx_global.prov, FI_LOG_EP_DATA, "Opened hfi %p, HFI type %#X/%#X, unit %#X, port %#X, ref_cnt %#lX," " rcv ctxt %#X, send ctxt %#X, \n", @@ -1381,12 +1583,13 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) opx_ep->hfi->ctrl->ctxt_info.ctxt, opx_ep->hfi->ctrl->ctxt_info.send_ctxt); - if (OPX_HFI1_TYPE == OPX_HFI1_JKR) { + if (OPX_HFI1_TYPE & OPX_HFI1_JKR || OPX_HFI1_TYPE 
& OPX_HFI1_JKR_9B) { OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "*****HFI type is JKR (CN5000)\n"); } else { OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "*****HFI type is WFR (Omni-path)\n"); } + void *mem = NULL; mem = malloc(sizeof(struct fi_opx_ep_reliability) + FI_OPX_CACHE_LINE_SIZE); if (!mem) { @@ -1414,6 +1617,8 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) fi_opx_reliability_service_init(&opx_ep->reliability->service, opx_domain->unique_job_key, opx_ep->hfi, OFI_RELIABILITY_KIND_ONLOAD); + fi_opx_reliability_model_init_16B(&opx_ep->reliability->service, + opx_ep->hfi); opx_ep->reliability->rx = opx_ep->hfi->info.rxe.id; fi_opx_reliability_client_init(&opx_ep->reliability->state, &opx_ep->reliability->service, @@ -1480,7 +1685,7 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) fprintf(stderr, "%s:%s():%d bad structure alignment !\n", __FILE__, __func__, __LINE__); abort(); } - alignment_check = (uintptr_t)&opx_ep->tx->send; + alignment_check = (uintptr_t)&opx_ep->tx->send_9B; if ((alignment_check & 0x03Full) != 0) { fprintf(stderr, "%s:%s():%d bad structure alignment !\n", __FILE__, __func__, __LINE__); abort(); } @@ -2208,19 +2413,13 @@ int fi_opx_endpoint_rx_tx (struct fid_domain *dom, struct fi_info *info, goto err; } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - ret = fi_opx_fid_check(&dom->fid, FI_CLASS_DOMAIN, "domain"); if (ret) return ret; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - ret = fi_opx_check_info(info); if (ret) return ret; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - void *mem = NULL; mem = malloc(sizeof(struct fi_opx_ep) + FI_OPX_CACHE_LINE_SIZE); if (!mem) { @@ -2246,8 +2445,6 @@ int fi_opx_endpoint_rx_tx (struct fid_domain *dom, struct fi_info *info, opx_ep->fr = fr; #endif - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - opx_ep->ep_fid.fid.fclass = FI_CLASS_EP; opx_ep->ep_fid.fid.context = context; opx_ep->ep_fid.fid.ops = &fi_opx_fi_ops; @@ -2406,7 +2603,8 
@@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const uint64_t rx_op_flags, const uint64_t is_context_ext, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, - const enum ofi_reliability_kind reliability) { + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fid_ep * ep = &opx_ep->ep_fid; @@ -2425,7 +2623,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "rx_op_flags & FI_PEEK searching unexpected queue\n"); __attribute__((__unused__)) bool from_hash_queue = false; - struct fi_opx_hfi1_ue_packet * uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind); + struct fi_opx_hfi1_ue_packet * uepkt = fi_opx_ep_find_matching_packet(opx_ep, context, kind, hfi1_type); #ifndef FI_OPX_MATCH_HASH_DISABLE if (!uepkt && kind == FI_OPX_KIND_TAG) { @@ -2519,20 +2717,21 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, struct fi_opx_hfi1_ue_packet * claimed_pkt = context->claim; const unsigned is_intranode = - fi_opx_hfi_is_intranode(claimed_pkt->hdr.stl.lrh.slid); + opx_lrh_is_intranode(&(claimed_pkt->hdr), hfi1_type); complete_receive_operation(ep, &claimed_pkt->hdr, (union fi_opx_hfi1_packet_payload *)&claimed_pkt->payload, claimed_pkt->hdr.match.ofi_tag, context, - claimed_pkt->hdr.stl.bth.opcode, + claimed_pkt->hdr.bth.opcode, rx_op_flags & FI_OPX_CQ_CONTEXT_EXT, OPX_MULTI_RECV_FALSE, is_intranode, rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM, lock_required, - reliability); + reliability, + hfi1_type); /* ... and prepend the claimed uepkt to the ue free list. 
claimed_pkt->next should have been set to NULL at the time we @@ -2561,7 +2760,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const union fi_opx_addr src_addr = { .fi = context->src_addr }; while (uepkt != NULL) { - unsigned is_intranode = fi_opx_hfi_is_intranode(uepkt->hdr.stl.lrh.slid); + unsigned is_intranode = opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type); if (fi_opx_ep_is_matching_packet(uepkt->tag, uepkt->origin_uid_fi, FI_OPX_MATCH_IGNORE_ALL, @@ -2596,13 +2795,14 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, (union fi_opx_hfi1_packet_payload *)&uepkt->payload, uepkt->hdr.match.ofi_tag, context, - uepkt->hdr.stl.bth.opcode, + uepkt->hdr.bth.opcode, OPX_CONTEXT_EXTENDED_FALSE, OPX_MULTI_RECV_TRUE, OPX_HMEM_FALSE, is_intranode, lock_required, - reliability); + reliability, + hfi1_type); /* remove this item from the ue list and prepend * the (now) completed uepkt to the ue free list. */ @@ -2648,14 +2848,16 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, void fi_opx_ep_rx_process_header_tag (struct fid_ep * ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) { + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + uint32_t slid) { fi_opx_ep_rx_process_header(ep, hdr, (const union fi_opx_hfi1_packet_payload * const )payload, @@ -2665,18 +2867,22 @@ void fi_opx_ep_rx_process_header_tag (struct fid_ep * ep, origin_rs, is_intranode, lock_required, - reliability); + reliability, + hfi1_type, + slid); } void fi_opx_ep_rx_process_header_msg (struct fid_ep * ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t 
payload_bytes, const uint8_t opcode, const uint8_t origin_rs, const unsigned is_intranode, const int lock_required, - const enum ofi_reliability_kind reliability) { + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + uint32_t slid) { fi_opx_ep_rx_process_header(ep, hdr, (const union fi_opx_hfi1_packet_payload * const )payload, @@ -2686,22 +2892,39 @@ void fi_opx_ep_rx_process_header_msg (struct fid_ep * ep, origin_rs, is_intranode, lock_required, - reliability); + reliability, + hfi1_type, + slid); } void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_rs) { OPX_LOG_PKT(FI_LOG_DEBUG, FI_LOG_EP_DATA, "================ received a packet from the reliability service\n"); - const uint8_t opcode = hdr->stl.bth.opcode; + const uint8_t opcode = hdr->bth.opcode; /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes = total_bytes - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes; + size_t payload_bytes; + uint32_t slid; + + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes = total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + slid = hdr->lrh_9B.slid; + } else { + lrh_pktlen_le = hdr->lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + payload_bytes = total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + slid = htons(((hdr->lrh_16B.slid20 
<< 20) | (hdr->lrh_16B.slid))); + } if (OFI_LIKELY(opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { fi_opx_ep_rx_process_header(ep, hdr, @@ -2712,7 +2935,9 @@ void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, origin_rs, OPX_INTRANODE_FALSE, FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_ONLOAD); + OFI_RELIABILITY_KIND_ONLOAD, + OPX_HFI1_TYPE, + slid); } else { fi_opx_ep_rx_process_header(ep, hdr, (const union fi_opx_hfi1_packet_payload * const) payload, @@ -2722,22 +2947,25 @@ void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, origin_rs, OPX_INTRANODE_FALSE, FI_OPX_LOCK_NOT_REQUIRED, - OFI_RELIABILITY_KIND_ONLOAD); + OFI_RELIABILITY_KIND_ONLOAD, + OPX_HFI1_TYPE, + slid); } } __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_ep_rx_append_ue (struct fi_opx_ep_rx * const rx, struct fi_opx_hfi1_ue_packet_slist * ue, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, - const uint32_t rank_inst) + const uint32_t rank_inst, + const uint64_t slid) { struct fi_opx_hfi1_ue_packet *uepkt = ofi_buf_alloc(rx->ue_packet_pool); - memcpy((void *)&uepkt->hdr, (const void *)hdr, sizeof(union fi_opx_hfi1_packet_hdr)); + memcpy((void *)&(uepkt->hdr), (const void *)hdr, OPX_HEADER_SIZE); if (payload != NULL) { @@ -2745,7 +2973,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_rx_append_ue (struct fi_opx_ep_rx * cons } uepkt->tag = hdr->match.ofi_tag; - uepkt->origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr); + uepkt->origin_uid_fi = fi_opx_hfi1_packet_hdr_uid(hdr, slid); /* DAOS Persistent Address Support: * Support: save rank information associated with this inbound packet. 
@@ -2762,27 +2990,29 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_rx_append_ue (struct fi_opx_ep_rx * cons } void fi_opx_ep_rx_append_ue_msg (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters) + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid) { fi_opx_ep_rx_append_ue(rx, &rx->queue[FI_OPX_KIND_MSG].ue, - hdr, payload, payload_bytes, rank, rank_inst); + hdr, payload, payload_bytes, rank, rank_inst, slid); FI_OPX_DEBUG_COUNTERS_MAX_OF(debug_counters->match.default_max_length, rx->queue[FI_OPX_KIND_MSG].ue.length); } void fi_opx_ep_rx_append_ue_tag (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const size_t payload_bytes, const uint32_t rank, const uint32_t rank_inst, const bool daos_enabled, - struct fi_opx_debug_counters *debug_counters) + struct fi_opx_debug_counters *debug_counters, + const uint64_t slid) { #ifndef FI_OPX_MATCH_HASH_DISABLE @@ -2791,31 +3021,32 @@ void fi_opx_ep_rx_append_ue_tag (struct fi_opx_ep_rx * const rx, rx->queue[FI_OPX_KIND_TAG].ue.length >= FI_OPX_MATCH_DEFAULT_UE_LIST_MAX_LENGTH)) { struct fi_opx_hfi1_ue_packet *uepkt = fi_opx_ep_rx_append_ue(rx, &rx->match_ue_tag_hash->ue, - hdr, payload, payload_bytes, 0, 0); + hdr, payload, payload_bytes, 0, 0, slid); fi_opx_match_ue_hash_append(uepkt, rx->match_ue_tag_hash, debug_counters); } else { fi_opx_ep_rx_append_ue(rx, &rx->queue[FI_OPX_KIND_TAG].ue, - hdr, payload, payload_bytes, rank, rank_inst); + hdr, payload, payload_bytes, rank, rank_inst, slid); } #else fi_opx_ep_rx_append_ue(rx, &rx->queue[FI_OPX_KIND_TAG].ue, - hdr, payload, payload_bytes, rank, 
rank_inst); + hdr, payload, payload_bytes, rank, rank_inst, slid); #endif FI_OPX_DEBUG_COUNTERS_MAX_OF(debug_counters->match.default_max_length, rx->queue[FI_OPX_KIND_TAG].ue.length); } void fi_opx_ep_rx_append_ue_egr (struct fi_opx_ep_rx * const rx, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, - const size_t payload_bytes) { + const size_t payload_bytes, + const uint64_t slid) { /* DAOS Persistent Address Support: * No need to retain rank related data for packets appended to the * MP Eager unexpected queue, because the mp_egr_id related data in * the packet is referenced instead. */ - fi_opx_ep_rx_append_ue(rx, &rx->mp_egr_queue.ue, hdr, payload, payload_bytes, 0, 0); + fi_opx_ep_rx_append_ue(rx, &rx->mp_egr_queue.ue, hdr, payload, payload_bytes, 0, 0, slid); } static void fi_opx_update_daos_av_rank(struct fi_opx_ep *opx_ep, fi_addr_t addr) @@ -2947,76 +3178,229 @@ ssize_t fi_opx_ep_tx_connect (struct fi_opx_ep *opx_ep, size_t count, } -FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY) +FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY,OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY,OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(OPX_LOCK, OPX_AV, OPX_EP_CAPS, OPX_RELIABILITY,OPX_HFI1_JKR) ssize_t fi_opx_send_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, desc, dest_addr, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, dest_addr, context); + } else if 
(OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, dest_addr, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_recv_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, desc, src_addr, context); + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, src_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, src_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, src_addr, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_inject_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, dest_addr); + /* Non-inlined functions should just use 
the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, dest_addr); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_recvmsg_FABRIC_DIRECT(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, msg, flags); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, msg, flags); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_senddata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, void *context) { - return 
FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, desc, data, dest_addr, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, data, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, data, dest_addr, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, data, dest_addr, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_injectdata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, - OPX_LOCK, - OPX_AV, - OPX_EP_CAPS, - OPX_RELIABILITY) - (ep, buf, len, data, dest_addr); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, data, dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, data, dest_addr); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, + OPX_LOCK, + OPX_AV, + OPX_EP_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + 
(ep, buf, len, data, dest_addr); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 8d617da1bb2..f8c0e77389c 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -910,7 +910,7 @@ int init_hfi1_rxe_state (struct fi_opx_hfi1_context * context, rxe_state->hdrq.head = 0; assert(!(context->runtime_flags & HFI1_CAP_DMA_RTAIL)); - rxe_state->hdrq.rhf_seq = OPX_RHF_SEQ_INIT_VAL; + rxe_state->hdrq.rhf_seq = OPX_RHF_SEQ_INIT_VAL(OPX_HFI1_TYPE); /* OPX relies on RHF.SeqNum, not the RcvHdrTail if (context->runtime_flags & HFI1_CAP_DMA_RTAIL) { rxe_state->hdrq.rhf_seq = 0; @@ -935,7 +935,7 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(peer); const uint16_t dlid_be16 = (uint16_t)(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); - if (fi_opx_hfi_is_intranode(dlid_be16)) { + if (opx_lid_is_intranode(dlid_be16)) { char buffer[128]; union fi_opx_addr addr; addr.raw64b = (uint64_t)peer; @@ -972,7 +972,6 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; - struct fi_opx_ep * opx_ep = params->opx_ep; const uint64_t lrh_dlid = params->lrh_dlid; const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; @@ -991,26 +990,26 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work return -FI_EAGAIN; } - union fi_opx_hfi1_packet_hdr * const tx_hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, opx_ep->daos_info.rank_inst, &rc); - if(!tx_hdr) return rc; + if(!hdr) return rc; /* Note 
that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), because this is intranode and since it's a CTS packet, lrh.pktlen isn't used/needed */ - tx_hdr->qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid; - tx_hdr->qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.cts.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | (params->niov << 48) | params->opcode; - tx_hdr->qw[5] = params->origin_byte_counter_vaddr; - tx_hdr->qw[6] = (uint64_t)params->rzv_comp; + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid; + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | (params->niov << 48) | params->opcode; + hdr->qw_9B[5] = params->origin_byte_counter_vaddr; + hdr->qw_9B[6] = (uint64_t)params->rzv_comp; union fi_opx_hfi1_packet_payload * const tx_payload = - (union fi_opx_hfi1_packet_payload *)(tx_hdr+1); + (union fi_opx_hfi1_packet_payload *)(hdr+1); uintptr_t vaddr_with_offset = params->dst_vaddr; /* receive buffer virtual address */ for(int i = 0; i < params->niov; i++) { @@ -1024,7 +1023,7 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work vaddr_with_offset += params->dput_iov[i].bytes; } - opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-SHM"); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1033,6 +1032,73 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work return FI_SUCCESS; } +int opx_hfi1_rx_rzv_rts_send_cts_intranode_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep * opx_ep = params->opx_ep; + const uint64_t lrh_dlid = 
params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + const uint64_t lrh_dlid_16B = htons(lrh_dlid >> 16); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, SHM -- RENDEZVOUS RTS (begin)\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS-SHM"); + uint64_t pos; + /* Possible SHM connections required for certain applications (i.e., DAOS) + * exceeds the max value of the legacy u8_rx field. Use u32_extended field. + */ + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, + params->u32_extended_rx, params->target_hfi_unit); + + if (OFI_UNLIKELY(rc)) { + return -FI_EAGAIN; + } + + union opx_hfi1_packet_hdr * const hdr = + opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, + opx_ep->daos_info.rank_inst, &rc); + + if(!hdr) return rc; + + /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), + because this is intranode and since it's a CTS packet, lrh.pktlen + isn't used/needed */ + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B)); + hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | (params->niov << 48) | params->opcode; + hdr->qw_16B[6] = params->origin_byte_counter_vaddr; + hdr->qw_16B[7] = (uint64_t)params->rzv_comp; + + union fi_opx_hfi1_packet_payload * const tx_payload = + (union fi_opx_hfi1_packet_payload *)(hdr+1); + + uintptr_t vaddr_with_offset = params->dst_vaddr; /* receive buffer virtual address */ + for(int i 
= 0; i < params->niov; i++) { + tx_payload->cts.iov[i].rbuf = vaddr_with_offset; + tx_payload->cts.iov[i].sbuf = (uintptr_t)params->dput_iov[i].sbuf; + tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes; + tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device; + tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device; + tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; + tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface; + vaddr_with_offset += params->dput_iov[i].bytes; + } + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-SHM"); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, SHM -- RENDEZVOUS RTS (end)\n"); + + return FI_SUCCESS; +} + int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; @@ -1073,6 +1139,7 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) &opx_ep->tx->force_credit_return, total_credits_needed); opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (total_credits_available < total_credits_needed) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN credits) (params=%p rzv_comp=%p context=%p)\n", @@ -1095,7 +1162,8 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) params->origin_rs, &psn_ptr, &replay, - params->reliability); + params->reliability, + OPX_HFI1_TYPE); if(OFI_UNLIKELY(psn == -1)) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN psn/replay) (params=%p rzv_comp=%p context=%p)\n", @@ -1110,19 +1178,20 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) // The "memcopy first" code is here as an alternative to the more complicated // direct write to pio followed by 
memory copy of the reliability buffer - replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | - OPX_PBC_LEN(pbc_dws) | + + replay->scb_9B.qw0 = opx_ep->rx->tx.cts_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; - replay->scb.hdr.qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid | + replay->scb_9B.hdr.qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t) lrh_dws << 32); - replay->scb.hdr.qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - replay->scb.hdr.qw[2] = opx_ep->rx->tx.cts.hdr.qw[2] | psn; - replay->scb.hdr.qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - replay->scb.hdr.qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | + replay->scb_9B.hdr.qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + replay->scb_9B.hdr.qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn; + replay->scb_9B.hdr.qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + replay->scb_9B.hdr.qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | ((uint64_t) params->tid_info.npairs << 32) | (params->niov << 48) | params->opcode; - replay->scb.hdr.qw[5] = params->origin_byte_counter_vaddr; - replay->scb.hdr.qw[6] = (uint64_t) params->rzv_comp; + replay->scb_9B.hdr.qw_9B[5] = params->origin_byte_counter_vaddr; + replay->scb_9B.hdr.qw_9B[6] = (uint64_t) params->rzv_comp; union fi_opx_hfi1_packet_payload *const tx_payload = (union fi_opx_hfi1_packet_payload *) replay->payload; @@ -1172,12 +1241,190 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) fi_opx_reliability_service_do_replay(&opx_ep->reliability->service,replay); fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - params->slid, params->origin_rs, params->origin_rx, psn_ptr, replay, - params->reliability); + params->reliability, + OPX_HFI1_TYPE); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (end) (params=%p rzv_comp=%p context=%p)\n", + 
params->tid_info.npairs ? "EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t lrh_dlid_16B = htons(params->lrh_dlid >> 16); + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, HFI -- RENDEZVOUS %s RTS (begin) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? "EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + assert (params->rzv_comp->context->byte_counter >= params->dput_iov[0].bytes); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); + const uint64_t tid_payload = params->tid_info.npairs + ? 
((params->tid_info.npairs + 4) * sizeof(params->tidpairs[0])) + : 0; + const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)) + tid_payload; + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "payload_bytes = %ld\n", payload_bytes); + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 7 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + ((payload_bytes + 3) >> 2) + + 2; // ICRC + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + const uint16_t total_credits_needed = 1 + /* packet header */ + ((payload_bytes + 63) >> 6); /* payload blocks needed */ + uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, + total_credits_needed); + + if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { + fi_opx_compiler_msync_writes(); + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, + total_credits_needed); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + if (total_credits_available < total_credits_needed) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN credits) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? 
"EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability, + OPX_HFI1_TYPE); + if(OFI_UNLIKELY(psn == -1)) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (EAGAIN psn/replay) (params=%p rzv_comp=%p context=%p)\n", + params->tid_info.npairs ? "EXPECTED TID" : "EAGER", + params, + params->rzv_comp, + params->rzv_comp->context); + return -FI_EAGAIN; + } + + assert(payload_bytes <= FI_OPX_HFI1_PACKET_MTU); + + // The "memcopy first" code is here as an alternative to the more complicated + // direct write to pio followed by memory copy of the reliability buffer + replay->scb_16B.qw0 = opx_ep->rx->tx.cts_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, OPX_HFI1_JKR); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "replay->scb_16B.qw0 = %#lx pbc_dws = %ld\n", replay->scb_16B.qw0, pbc_dws); + replay->scb_16B.hdr.qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t) lrh_qws << 20); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "lrh_qws = %d replay->scb_16B.hdr.lrh_16B.pktlen = %d\n", lrh_qws, replay->scb_16B.hdr.lrh_16B.pktlen); + replay->scb_16B.hdr.qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + + replay->scb_16B.hdr.qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + replay->scb_16B.hdr.qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn; + replay->scb_16B.hdr.qw_16B[4] = 
opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + replay->scb_16B.hdr.qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | + ((uint64_t) params->tid_info.npairs << 32) | + (params->niov << 48) | params->opcode; + replay->scb_16B.hdr.qw_16B[6] = params->origin_byte_counter_vaddr; + + replay->scb_16B.hdr.qw_16B[7] = (uint64_t) params->rzv_comp; + +#ifndef NDEBUG + if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + OPX_JKR_PRINT_16B_PBC(replay->scb_16B.qw0); + OPX_JKR_PRINT_16B_LRH(replay->scb_16B.hdr.qw_16B[0], replay->scb_16B.hdr.qw_16B[1]); + OPX_JKR_PRINT_16B_BTH(replay->scb_16B.hdr.qw_16B[2], replay->scb_16B.hdr.qw_16B[3]); + } else { + abort(); + fi_opx_hfi1_dump_packet_hdr(&(replay->scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); + } +#endif + + union fi_opx_hfi1_packet_payload *const tx_payload = + (union fi_opx_hfi1_packet_payload *) (replay->payload); + + assert(((uint8_t *)tx_payload) == ((uint8_t *)&(replay->data))); + + uintptr_t vaddr_with_offset = params->tid_info.npairs ? + ((uint64_t)params->dst_vaddr & -64) : + params->dst_vaddr; /* receive buffer virtual address */ + + for (int i = 0; i < params->niov; i++) { + tx_payload->cts.iov[i].rbuf = vaddr_with_offset; + tx_payload->cts.iov[i].sbuf = params->dput_iov[i].sbuf; + tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes; + tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device; + tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device; + tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface; + tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; + vaddr_with_offset += params->dput_iov[i].bytes; + } +#ifndef NDEBUG + if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + OPX_JKR_PRINT_16B_PBC(replay->scb_16B.qw0); + OPX_JKR_PRINT_16B_LRH(replay->scb_16B.hdr.qw_16B[0], replay->scb_16B.hdr.qw_16B[1]); + OPX_JKR_PRINT_16B_BTH(replay->scb_16B.hdr.qw_16B[2], replay->scb_16B.hdr.qw_16B[3]); + } else { + abort(); + fi_opx_hfi1_dump_packet_hdr(&(replay->scb_9B.hdr), OPX_HFI1_TYPE, __func__, 
__LINE__); + } +#endif + + /* copy tidpairs to packet */ + if (params->tid_info.npairs) { + assert(params->tid_info.npairs < FI_OPX_MAX_DPUT_TIDPAIRS); + assert(params->tidpairs[0] != 0); + assert(params->niov == 1); + assert(params->rzv_comp->context->byte_counter >= params->dput_iov[0].bytes); + + /* coverity[missing_lock] */ + tx_payload->tid_cts.tid_offset = params->tid_info.offset; + tx_payload->tid_cts.ntidpairs = params->tid_info.npairs; + tx_payload->tid_cts.origin_byte_counter_adjust = params->tid_info.origin_byte_counter_adj; + for (int i = 0; i < params->tid_info.npairs; ++i) { + tx_payload->tid_cts.tidpairs[i] = params->tidpairs[i]; + } + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "fi_opx_reliability_service_do_replay &opx_ep->reliability->service %p, replay %p\n",&opx_ep->reliability->service, replay); + fi_opx_reliability_service_do_replay(&opx_ep->reliability->service,replay); + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->origin_rs, + params->origin_rx, + psn_ptr, + replay, + params->reliability, + OPX_HFI1_TYPE); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS %s RTS (end) (params=%p rzv_comp=%p context=%p)\n", @@ -1188,6 +1435,7 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) return FI_SUCCESS; } + __OPX_FORCE_INLINE__ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, struct fi_opx_hfi1_rx_rzv_rts_params *params, @@ -1480,7 +1728,8 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) } void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, - const void * const hdr, const void * const payload, + const union opx_hfi1_packet_hdr * const hdr, + const void * const payload, const uint8_t u8_rx, const uint64_t niov, uintptr_t origin_byte_counter_vaddr, union fi_opx_context *const target_context, @@ 
-1493,12 +1742,11 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, uint8_t opcode, const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx) + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) { - const union fi_opx_hfi1_packet_hdr * const hfi1_hdr = - (const union fi_opx_hfi1_packet_hdr * const) hdr; - OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS-HFI:%ld",hfi1_hdr->qw[6]); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-RTS-HFI:%ld",hdr->qw_9B[6]); union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); assert(work != NULL); struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; @@ -1527,12 +1775,22 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, if (is_intranode) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u\n",is_intranode ); - params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode; + else + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode_16B; params->work_elem.work_type = OPX_WORK_TYPE_SHM; - if (hfi1_hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { + + uint32_t lid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + lid = hdr->lrh_9B.slid; + else + lid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + + if (lid == opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { - struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(hfi1_hdr->stl.lrh.slid); + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(lid); assert(hfi_lookup); params->target_hfi_unit = hfi_lookup->hfi_unit; } @@ -1541,19 +1799,31 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, "opx_ep->use_expected_tid_rzv=%u niov=%lu opcode=%u\n", opx_ep->use_expected_tid_rzv, niov, params->opcode); - 
params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_16B; + } params->work_elem.work_type = OPX_WORK_TYPE_PIO; params->target_hfi_unit = 0xFF; } params->work_elem.completion_action = NULL; params->work_elem.payload_copy = NULL; params->work_elem.complete = false; - params->lrh_dlid = (hfi1_hdr->stl.lrh.qw[0] & 0xFFFF000000000000ul) >> 32; - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); - params->slid = hfi1_hdr->stl.lrh.slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->slid = hdr->lrh_9B.slid; + if (hfi1_type & OPX_HFI1_WFR) + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + else + params->lrh_dlid = hdr->lrh_9B.slid << 16; + } else { + params->slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + params->lrh_dlid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid) << 16; // Send CTS to the SLID that sent RTS + } + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); - params->origin_rx = hfi1_hdr->rendezvous.origin_rx; - params->origin_rs = hfi1_hdr->rendezvous.origin_rs; + params->origin_rx = hdr->rendezvous.origin_rx; + params->origin_rs = hdr->rendezvous.origin_rs; params->u8_rx = u8_rx; params->u32_extended_rx = u32_extended_rx; params->niov = niov; @@ -1591,7 +1861,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, int rc = params->work_elem.work_fn(work); if(rc == FI_SUCCESS) { OPX_BUF_FREE(work); - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-HFI:%ld",hfi1_hdr->qw[6]); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS-HFI:%ld",hdr->qw_9B[6]); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_SUCCESS\n"); return; } @@ -1599,7 +1869,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, /* Try again later*/ assert(work->work_elem.slist_entry.next 
== NULL); slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); - OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "RECV-RZV-RTS-HFI:%ld",hfi1_hdr->qw[6]); + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "RECV-RZV-RTS-HFI:%ld",hdr->qw_9B[6]); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); } @@ -1625,31 +1895,32 @@ int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) return -FI_EAGAIN; } - union fi_opx_hfi1_packet_hdr *const tx_hdr = + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, opx_ep->daos_info.rank_inst, &rc); - if (tx_hdr == NULL) { + if (hdr == NULL) { return rc; } - tx_hdr->qw[0] = opx_ep->rx->tx.dput.hdr.qw[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.dput.hdr.qw[1] | params->bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.dput.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.dput.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.dput.hdr.qw[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE; - tx_hdr->qw[5] = (uint64_t)params->cc; - tx_hdr->qw[6] = params->bytes_to_fence; + hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | params->bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE; + hdr->qw_9B[5] = (uint64_t)params->cc; + hdr->qw_9B[6] = params->bytes_to_fence; - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); return FI_SUCCESS; } void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, - const union fi_opx_hfi1_packet_hdr *const hdr, + const union opx_hfi1_packet_hdr *const hdr, const uint8_t u8_rx, - const uint32_t u32_extended_rx) + const 
uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) { union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); assert(work != NULL); @@ -1661,17 +1932,27 @@ void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, params->work_elem.payload_copy = NULL; params->work_elem.complete = false; params->work_elem.work_type = OPX_WORK_TYPE_SHM; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + else + params->lrh_dlid = hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid; - params->lrh_dlid = (hdr->stl.lrh.qw[0] & 0xFFFF000000000000ul) >> 32; params->bth_rx = (uint64_t)u8_rx << 56; params->u8_rx = u8_rx; params->u32_extended_rx = u32_extended_rx; params->bytes_to_fence = hdr->dput.target.fence.bytes_to_fence; params->cc = (struct fi_opx_completion_counter *) hdr->dput.target.fence.completion_counter; - if (hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { + uint32_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + slid = hdr->lrh_9B.slid; + else + slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + + if (slid == opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { - struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(hdr->stl.lrh.slid); + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(slid); assert(hfi_lookup); params->target_hfi_unit = hfi_lookup->hfi_unit; } @@ -1709,6 +1990,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) * as the dlid for the lrh header of the outgoing packet */ const uint64_t lrh_dlid = params->lrh_dlid; const uint64_t bth_rx = ((uint64_t)u8_rx) << 56; + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; enum fi_hmem_iface cbuf_iface = params->compare_iov.iface; uint64_t cbuf_device = params->compare_iov.device; @@ -1761,34 +2043,56 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) uint64_t bytes_to_send = 
dput_iov[i].bytes - params->bytes_sent; while (bytes_to_send > 0) { - uint64_t bytes_to_send_this_packet = MIN(bytes_to_send + params->payload_bytes_for_iovec, + uint64_t bytes_to_send_this_packet, blocks_to_send_in_this_packet; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + bytes_to_send_this_packet = MIN(bytes_to_send + params->payload_bytes_for_iovec, max_bytes_per_packet); - uint64_t tail_bytes = bytes_to_send_this_packet & 0x3Ful; - uint64_t blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 1 : 0); + uint64_t tail_bytes = bytes_to_send_this_packet & 0x3Ful; + blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 1 : 0); + } else { + const uint64_t additional_hdr_tail_byte = 2 * 8; /* 1 QW for hdr that spills to 2nd cacheline + 1 QW for ICRC/tail */ + uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, + max_bytes_per_packet)); + uint64_t tail_bytes = payload_n_additional_hdr_tail_bytes & 0x3Ful; + blocks_to_send_in_this_packet = (payload_n_additional_hdr_tail_bytes >> 6) + (tail_bytes ? 
1 : 0); + bytes_to_send_this_packet = payload_n_additional_hdr_tail_bytes - additional_hdr_tail_byte; - const uint64_t pbc_dws = 2 + /* pbc */ + } + + uint64_t pbc_dws; + uint16_t lrh_dws; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + pbc_dws = 2 + /* pbc */ 2 + /* lrh */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (blocks_to_send_in_this_packet << 4); - - const uint16_t lrh_dws = htons(pbc_dws - 1); + lrh_dws = htons(pbc_dws - 1); + } else { + pbc_dws = 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 7 + /* kdeth */ + (blocks_to_send_in_this_packet << 4); // ICRC and the kdeth in the second cacheline are accounted for here + lrh_dws = (pbc_dws - 1) >> 1; + } uint64_t bytes_sent; if (is_intranode) { uint64_t pos; - union fi_opx_hfi1_packet_hdr * tx_hdr = + union opx_hfi1_packet_hdr * hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, u8_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, opx_ep->daos_info.rank_inst, &rc); - if(!tx_hdr) return rc; + if(!hdr) return rc; union fi_opx_hfi1_packet_payload * const tx_payload = - (union fi_opx_hfi1_packet_payload *)(tx_hdr+1); + (union fi_opx_hfi1_packet_payload *)(hdr+1); bytes_sent = opx_hfi1_dput_write_header_and_payload( - opx_ep, tx_hdr, tx_payload, + opx_ep, hdr, tx_payload, opcode, 0, lrh_dws, op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, @@ -1798,11 +2102,13 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) params->bytes_sent, &sbuf, sbuf_iface, sbuf_device, (uint8_t **) ¶ms->compare_vaddr, - cbuf_iface, cbuf_device, &rbuf); + cbuf_iface, cbuf_device, &rbuf, + hfi1_type); - opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); } else { union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + const uint16_t credits_needed = blocks_to_send_in_this_packet + 1 /* header */; uint32_t total_credits_available = @@ -1828,7 +2134,7 @@ int fi_opx_hfi1_do_dput 
(union fi_opx_hfi1_deferred_work * work) int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid, - u8_rx, params->origin_rs, &psn_ptr, &replay, reliability); + u8_rx, params->origin_rs, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -1838,13 +2144,20 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) (union fi_opx_hfi1_packet_payload *) replay->payload; assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - replay->scb.qw0 = opx_ep->rx->tx.dput.qw0 | - OPX_PBC_LEN(pbc_dws) | - OPX_PBC_CR(opx_ep->tx->force_credit_return) | - params->pbc_dlid; + if (hfi1_type & OPX_HFI1_JKR) { + replay->scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + params->pbc_dlid; + } else { + replay->scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + params->pbc_dlid; + } bytes_sent = opx_hfi1_dput_write_header_and_payload( - opx_ep, &replay->scb.hdr, replay_payload, + opx_ep, OPX_REPLAY_HDR(replay), replay_payload, opcode, psn, lrh_dws, op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, @@ -1854,7 +2167,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) params->bytes_sent, &sbuf, sbuf_iface, sbuf_device, (uint8_t **) ¶ms->compare_vaddr, - cbuf_iface, cbuf_device, &rbuf); + cbuf_iface, cbuf_device, &rbuf, hfi1_type); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -1870,8 +2183,8 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) fi_opx_compiler_msync_writes(); fi_opx_reliability_client_replay_register_no_update( - &opx_ep->reliability->state, params->slid, - params->origin_rs, u8_rx, psn_ptr, replay, reliability); + &opx_ep->reliability->state, + params->origin_rs, u8_rx, psn_ptr, replay, reliability, hfi1_type); } } 
@@ -1993,7 +2306,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) // We should never be in this function for intranode ops assert(!params->is_intranode); - assert(opx_ep->rx->tx.dput.hdr.stl.lrh.slid != params->slid); + assert(opx_ep->rx->tx.dput_9B.hdr.lrh_9B.slid != params->slid); assert(((opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH || opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH) && @@ -2154,12 +2467,19 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) const uint16_t lrh_dws = htons(pbc_dws - 1); assert(replay != NULL); - replay->scb.qw0 = opx_ep->rx->tx.dput.qw0 | OPX_PBC_LEN(pbc_dws) | - params->pbc_dlid; + + if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + replay->scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + params->pbc_dlid; + } else { + replay->scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + params->pbc_dlid; + } + uint64_t bytes_sent = opx_hfi1_dput_write_header_and_iov( - opx_ep, &replay->scb.hdr, + opx_ep, OPX_REPLAY_HDR(replay), replay->iov, opcode, lrh_dws, op64, dt64, lrh_dlid, bth_rx, packet_bytes, key, @@ -2168,7 +2488,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) params->rma_request_vaddr, params->bytes_sent, &sbuf_tmp, (uint8_t **) ¶ms->compare_vaddr, - &rbuf); + &rbuf, OPX_HFI1_TYPE); params->cc->byte_counter += params->payload_bytes_for_iovec; fi_opx_hfi1_sdma_add_packet(params->sdma_we, replay, packet_bytes); @@ -2188,6 +2508,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) return -FI_EAGAIN; } + opx_hfi1_sdma_flush(opx_ep, params->sdma_we, ¶ms->sdma_reqs, @@ -2226,8 +2547,6 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) assert((*params->origin_byte_counter) >= params->origin_bytes_sent); *params->origin_byte_counter -= params->origin_bytes_sent; params->origin_byte_counter = NULL; - } else { - assert(params->origin_bytes_sent <= 
*params->origin_byte_counter); } params->work_elem.work_type = OPX_WORK_TYPE_LAST; params->work_elem.work_fn = fi_opx_hfi1_dput_sdma_pending_completion; @@ -2268,7 +2587,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) // We should never be in this function for intranode ops assert(!params->is_intranode); - assert(opx_ep->rx->tx.dput.hdr.stl.lrh.slid != params->slid); + assert(opx_ep->rx->tx.dput_9B.hdr.lrh_9B.slid != params->slid); assert((opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID) && (params->payload_bytes_for_iovec == 0)); @@ -2578,8 +2897,16 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) const uint16_t lrh_dws = htons(pbc_dws - 1); - replay->scb.qw0 = opx_ep->rx->tx.dput.qw0 | OPX_PBC_LEN(pbc_dws) | - params->pbc_dlid; + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + + + if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + replay->scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + params->pbc_dlid; + } else { + replay->scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + params->pbc_dlid; + } /* The fetch_vaddr and cbuf arguments are only used for atomic fetch operations, which by their one- @@ -2587,14 +2914,14 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) hard-coded to 0/NULL respectively */ uint64_t bytes_sent = opx_hfi1_dput_write_header_and_iov( - opx_ep, &replay->scb.hdr, + opx_ep, OPX_REPLAY_HDR(replay), replay->iov, opcode, lrh_dws, op64, dt64, lrh_dlid, bth_rx, packet_bytes, key, 0ul, target_byte_counter_vaddr, params->rma_request_vaddr, params->bytes_sent, &sbuf_tmp, - NULL, &rbuf); + NULL, &rbuf, OPX_HFI1_TYPE); /* tid packets are page aligned and 4k/8k length except first TID and last (remnant) packet */ assert((tididx == 0) || (first_tid_last_packet) || @@ -2675,7 +3002,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ep, 
struct fi_opx_mr * opx_mr, - const void * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const void * const payload, size_t payload_bytes_to_copy, const uint8_t u8_rx, @@ -2691,9 +3018,8 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ void (*completion_action)(union fi_opx_hfi1_deferred_work * work_state), const unsigned is_intranode, const enum ofi_reliability_kind reliability, - const uint32_t u32_extended_rx) { - const union fi_opx_hfi1_packet_hdr * const hfi1_hdr = - (const union fi_opx_hfi1_packet_hdr * const) hdr; + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) { union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); struct fi_opx_hfi1_dput_params *params = &work->dput; @@ -2704,9 +3030,14 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ params->work_elem.complete = false; params->opx_ep = opx_ep; params->opx_mr = opx_mr; - params->lrh_dlid = (hfi1_hdr->stl.lrh.qw[0] & 0xFFFF000000000000ul) >> 32; - params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid); - params->slid = hfi1_hdr->stl.lrh.slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->slid = hdr->lrh_9B.slid; + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + } else { + params->slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + params->lrh_dlid = (htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid)) << 16; // Send dput to the SLID that sent CTS + } + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); params->origin_rs = origin_rs; params->u8_rx = u8_rx; params->u32_extended_rx = u32_extended_rx; @@ -2729,10 +3060,10 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ params->is_intranode = is_intranode; params->reliability = reliability; if (is_intranode) { - if (hfi1_hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { + if (params->slid == 
opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { - struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(hfi1_hdr->stl.lrh.slid); + struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(params->slid); assert(hfi_lookup); params->target_hfi_unit = hfi_lookup->hfi_unit; } @@ -2756,7 +3087,7 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ uint32_t *tidpairs = NULL; if (opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID) { - ntidpairs = hfi1_hdr->cts.target.vaddr.ntidpairs; + ntidpairs = hdr->cts.target.vaddr.ntidpairs; if (ntidpairs) { union fi_opx_hfi1_packet_payload *tid_payload = (union fi_opx_hfi1_packet_payload *) payload; @@ -2826,7 +3157,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz uint64_t *origin_byte_counter_value, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -2872,22 +3204,22 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-NONCONTIG-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( + union opx_hfi1_packet_hdr *const hdr = opx_shm_tx_next( &opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); if (!hdr) return rc; - hdr->qw[0] = opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw[1] = opx_ep->tx->rzv.hdr.qw[1] | bth_rx | + hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); - hdr->qw[2] = opx_ep->tx->rzv.hdr.qw[2]; - hdr->qw[3] = opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32); - hdr->qw[4] = opx_ep->tx->rzv.hdr.qw[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; - hdr->qw[5] = total_len; - hdr->qw[6] = tag; + hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; + hdr->qw_9B[5] = total_len; + hdr->qw_9B[6] = tag; union fi_opx_hfi1_packet_payload *const payload = (union fi_opx_hfi1_packet_payload *)(hdr + 1); @@ -2923,7 +3255,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz FI_DBG_TRACE( fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SENDV, SHM -- RENDEZVOUS RTS (end) context %p\n",context); - fi_opx_shm_poll_many(&opx_ep->ep_fid, 0); + fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); return FI_SUCCESS; } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -2931,6 +3263,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-HFI"); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + const uint16_t total_credits_needed = 1 + /* packet header */ payload_blocks_total; /* packet payload */ @@ -2941,6 +3274,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz &opx_ep->tx->force_credit_return, total_credits_needed); if (total_credits_available < total_credits_needed) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; } } @@ -2950,7 +3284,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, 
addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; @@ -2976,22 +3310,22 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv_noncontig); - assert(opx_ep->tx->rzv.qw0 == 0); - const uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return); + assert(opx_ep->tx->rzv_9B.qw0 == 0); + const uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; - fi_opx_set_scb(scb, tmp, - opx_ep->tx->rzv.qw0 | OPX_PBC_LEN(pbc_dws) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->rzv.hdr.qw[1] | bth_rx | + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), - opx_ep->tx->rzv.hdr.qw[2] | psn, - opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->rzv.hdr.qw[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, + opx_ep->tx->rzv_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, total_len, tag); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -3003,13 +3337,13 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz unsigned credits_consumed = 1; #endif - fi_opx_copy_cacheline(&replay->scb.qw0, tmp); - + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); /* write the payload */ uint64_t *iov_qws = (uint64_t *) &hmem_iov[0]; volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - fi_opx_set_scb(scb_payload, tmp, + uint64_t local_temp_payload[16] = {0}; + fi_opx_store_and_copy_qw(scb_payload, local_temp_payload, origin_byte_counter_vaddr, iov_qws[0], iov_qws[1], @@ -3029,7 +3363,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz uint64_t * replay_payload = replay->payload; assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - fi_opx_copy_cacheline(replay_payload, tmp); + fi_opx_copy_cacheline(replay_payload, local_temp_payload); replay_payload += 8; if (payload_blocks_total > 1) { @@ -3038,7 +3372,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, &pio_state, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, (uint64_t *) &hmem_iov[2], payload_blocks_total - 1, total_credits_available); @@ -3052,9 +3386,9 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz #endif 
fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - addr.uid.lid, addr.reliability_rx, dest_rx, - psn_ptr, replay, reliability); + psn_ptr, replay, reliability, + hfi1_type); /* update the hfi txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; @@ -3077,7 +3411,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum fi_hmem_iface src_iface, - const uint64_t src_device_id) + const uint64_t src_device_id, + const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -3163,7 +3498,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-SHM"); uint64_t pos; ssize_t rc; - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, &rc); @@ -3178,18 +3513,17 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv); - hdr->qw[0] = opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - - hdr->qw[1] = opx_ep->tx->rzv.hdr.qw[1] | bth_rx | + hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); - hdr->qw[2] = opx_ep->tx->rzv.hdr.qw[2]; - hdr->qw[3] = opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32); - hdr->qw[4] = opx_ep->tx->rzv.hdr.qw[4] | (1ull << 48); /* effectively 1 iov */ - hdr->qw[5] = len; - hdr->qw[6] = tag; + hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (1ull << 48); /* effectively 1 iov */ + hdr->qw_9B[5] = len; + hdr->qw_9B[6] = tag; union fi_opx_hfi1_packet_payload * const payload = (union fi_opx_hfi1_packet_payload *)(hdr+1); @@ -3277,7 +3611,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, - dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; @@ -3292,23 +3626,23 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, * which will consume a single pio credit. 
*/ - uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return); + uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; - fi_opx_set_scb(scb, tmp, - opx_ep->tx->rzv.qw0 | OPX_PBC_LEN(pbc_dws) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid), - opx_ep->tx->rzv.hdr.qw[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), - opx_ep->tx->rzv.hdr.qw[1] | bth_rx | + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), - opx_ep->tx->rzv.hdr.qw[2] | psn, - opx_ep->tx->rzv.hdr.qw[3] | (((uint64_t)data) << 32), - opx_ep->tx->rzv.hdr.qw[4] | (1ull << 48), + opx_ep->tx->rzv_9B.hdr.qw_9B[2] | psn, + opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (1ull << 48), len, tag); /* consume one credit for the packet header */ @@ -3319,15 +3653,15 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - fi_opx_copy_cacheline(&replay->scb.qw0, tmp); + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); /* * write the rendezvous payload "send control blocks" */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - - fi_opx_set_scb(scb_payload, tmp, + uint64_t temp[8]; + fi_opx_store_and_copy_qw(scb_payload, temp, (uintptr_t)buf + immediate_total, /* src_vaddr */ (len - immediate_total) >> 6, /* src_blocks */ src_device_id, @@ -3346,7 +3680,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, 
assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - fi_opx_copy_cacheline(replay_payload, tmp); + fi_opx_copy_cacheline(replay_payload, temp); replay_payload += 8; uint8_t *sbuf; @@ -3378,7 +3712,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); if (immediate_fragment) { - struct tmp_payload_t *tmp_payload = (void*)tmp; + struct tmp_payload_t *tmp_payload = (void*)temp; if (immediate_byte_count > 0) { memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); } @@ -3386,10 +3720,10 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, for (int i=0; iimmediate_qw[i] = sbuf_qw[i]; } - fi_opx_copy_scb(scb_payload, tmp); + fi_opx_store_scb_qw(scb_payload, temp); sbuf_qw += immediate_qw_count; - fi_opx_copy_cacheline(replay_payload, tmp); + fi_opx_copy_cacheline(replay_payload, temp); replay_payload += 8; /* consume one credit for the rendezvous payload immediate data */ @@ -3408,7 +3742,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, assert((credits_consumed + immediate_block_count) <= total_credits_needed); ssize_t credits = #endif - fi_opx_hfi1_tx_egr_write_full_payload_blocks(opx_ep, + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, sbuf_qw, immediate_block_count, @@ -3441,7 +3775,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, src_iface, src_device_id); scb_payload = (uint64_t *)FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - fi_opx_copy_scb(scb_payload, align_tmp.immediate_qw); + fi_opx_store_scb_qw(scb_payload, align_tmp.immediate_qw); fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); replay_payload += 8; @@ -3453,8 +3787,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, } fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - addr.uid.lid, addr.reliability_rx, - dest_rx, psn_ptr, replay, reliability); + addr.reliability_rx, 
+ dest_rx, psn_ptr, replay, reliability, hfi1_type); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); #ifndef NDEBUG @@ -3471,40 +3805,521 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, return FI_SUCCESS; } - -unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, - volatile uint64_t *rhe_ptr, - volatile uint32_t * rhf_ptr, - const uint32_t rhf_msb, - const uint32_t rhf_lsb, - const uint64_t rhf_seq, - const uint64_t hdrq_offset, - const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr) +ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, + const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void* context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, + const uintptr_t origin_byte_counter_vaddr, + uint64_t *origin_byte_counter_value, + const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum fi_hmem_iface src_iface, + const uint64_t src_device_id, + const enum opx_hfi1_type hfi1_type) { - /* We are assuming that we can process any error and consume this header, - let reliability detect and replay it as needed. 
*/ - FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "RECEIVE ERROR: rhf_msb = 0x%08x, rhf_lsb = 0x%08x, rhf_seq = 0x%lx\n", rhf_msb, rhf_lsb, rhf_seq); + // We should already have grabbed the lock prior to calling this function + assert(!lock_required); - /* Unexpected errors on WFR */ - (void)rhf_ptr; /* unused unless debug is turned on */ + //Need at least one full block of payload + assert(len >= FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES); - /* drop this packet and allow reliability protocol to retry */ -#ifdef OPX_RELIABILITY_DEBUG - const uint64_t hdrq_offset_dws = (rhf_msb >> 12) & 0x01FFu; + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); + const union fi_opx_addr addr = { .fi = dest_addr }; + +#ifndef NDEBUG + const uint64_t max_immediate_block_count = (FI_OPX_HFI1_PACKET_MTU >> 6)-2 ; +#endif + /* Expected tid needs to send a leading data block and a trailing + * data block for alignment. Limit this to SDMA (8K+) for now */ + + const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0; + const uint64_t immediate_end_block_count = immediate_block_count; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " + "*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n", + immediate_block_count, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr, + origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, + origin_byte_counter_vaddr ? 
*(uint64_t*)origin_byte_counter_vaddr : -1UL, len, len ); + + assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); + + const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; + const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); + const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + + const uint64_t immediate_byte_count = len & 0x0007ul; + uint64_t immediate_qw_count = (len >> 3) & 0x0007ul; + uint64_t immediate_fragment = (((len & 0x003Ful) + 63) >> 6); + + /* Need a full block for ICRC after the end block... */ + const uint64_t icrc_end_block = immediate_end_block_count; + + /* ... otherwise need a qw (or block) in the immediate fragment */ + const uint64_t icrc_fragment = icrc_end_block ? 0 : immediate_fragment; + + /* if there are already 7 qw's need a full block */ + const uint64_t icrc_fragment_block = icrc_fragment && (immediate_qw_count == 7) ? 1: 0 ; + + /* Summary: we can add the tail qw in... + * - rzv metadata if there is no other immediate data + * - an empty fragment qw if there are no other blocks (icrc_fragment & !icrc_fragment_block) + * - a full (additional) fragment block if there are no other blocks (icrc_fragment & icrc_fragment_block) + * - a full (additional) trailing block after the end (icrc_end_block) + */ + + + /* Immediate total does not include trailing block */ + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block_count * sizeof(union cacheline); + + union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + .byte_count = (uint8_t) immediate_byte_count, + .qw_count = (uint8_t) immediate_qw_count, + .block_count = (uint8_t) immediate_block_count, + .end_block_count = (uint8_t) immediate_end_block_count, + .unused = 0 + }; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "max_immediate_block_count %#lX, len %#lX >> 6 %#lX, immediate_total %#lX, " + "immediate_byte_count %#lX, immediate_qw_count 
%#lX, immediate_block_count %#lX, " + "origin_byte_counter %lu/%#lX, adjusted origin_byte_counter %lu/%#lX\n", + max_immediate_block_count, len, (len >> 6), immediate_total, immediate_byte_count, + immediate_qw_count, immediate_block_count, *origin_byte_counter_value, + *origin_byte_counter_value, len - immediate_total, len - immediate_total); + + assert(immediate_byte_count <= UINT8_MAX); + assert(immediate_qw_count <= UINT8_MAX); + assert(immediate_block_count <= UINT8_MAX); + assert(immediate_end_block_count <= UINT8_MAX); + assert(icrc_end_block + icrc_fragment_block < 2); /* not both */ + assert(immediate_end_block_count == immediate_block_count); + + assert(((len - immediate_total) & 0x003Fu) == 0); + + *origin_byte_counter_value = len - immediate_total; + + /* full blocks only. icrc_end_block/icrc_fragment_block count 1 qw only */ + const uint64_t payload_blocks_total = + 1 + /* rzv metadata */ + immediate_fragment + + immediate_block_count + + immediate_end_block_count; + + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lhr */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth + unused */ + (payload_blocks_total << 4) + + ((icrc_end_block | icrc_fragment_block) << 1); /* 1 QW of any added tail block */ + + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; // Does not include PBC and is in QW + + if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (begin) context %p\n",context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-SHM"); + uint64_t pos; + ssize_t rc; + union opx_hfi1_packet_hdr * const hdr = + opx_shm_tx_next(&opx_ep->tx->shm, addr.hfi1_unit, dest_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, opx_ep->daos_info.rank, + opx_ep->daos_info.rank_inst, &rc); + + if (!hdr) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"return %zd\n",rc); + return rc; + } + + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != 
FI_HMEM_SYSTEM, + opx_ep->debug_counters.hmem.intranode + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.rzv); + + hdr->qw_16B[0] = opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20); + + hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + + hdr->qw_16B[3] = opx_ep->tx->rzv_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (1ull << 48); /* effectively 1 iov */ + hdr->qw_16B[6] = len; + hdr->qw_16B[7] = tag; + + union fi_opx_hfi1_packet_payload_16B * const payload = + (union fi_opx_hfi1_packet_payload_16B *)(hdr+1); + + payload->rendezvous.contiguous.src_vaddr = (uintptr_t)buf + immediate_total; + payload->rendezvous.contiguous.src_blocks = (len - immediate_total) >> 6; + payload->rendezvous.contiguous.src_device_id = src_device_id; + payload->rendezvous.contiguous.src_iface = (uint64_t) src_iface; + payload->rendezvous.contiguous.immediate_info = immediate_info.qw0; + payload->rendezvous.contiguous.origin_byte_counter_vaddr = origin_byte_counter_vaddr; + payload->rendezvous.contiguous.unused[0] = 0; + + + if (immediate_total) { + uint8_t *sbuf; + if (src_iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr->hmem_dev_reg_handle, + opx_ep->hmem_copy_buf, buf, immediate_total, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + sbuf = opx_ep->hmem_copy_buf; + } else { + sbuf = (uint8_t *) buf; + } + + if (immediate_byte_count > 0) { + 
memcpy((void*)&payload->rendezvous.contiguous.immediate_byte, (const void*)sbuf, immediate_byte_count); + sbuf += immediate_byte_count; + } + + uint64_t * sbuf_qw = (uint64_t *)sbuf; + unsigned i=0; + for (i=0; irendezvous.contiguous.immediate_qw[i] = sbuf_qw[i]; + } + sbuf_qw += immediate_qw_count; + + memcpy((void*)(&payload->rendezvous.contiguous.cache_line_1 + immediate_fragment), + (const void *)sbuf_qw, immediate_block_count << 6); /* immediate_end_block_count */ + } + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-SHM"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (end) context %p\n",context); + + return FI_SUCCESS; + } + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (begin) context %p\n",context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-HFI:%ld", tag); + + /* + * While the bulk of the payload data will be sent via SDMA once we + * get the CTS from the receiver, the initial RTS packet is sent via PIO. 
+ */ + + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + const uint16_t total_credits_needed = (lrh_qws + 1 /* pbc */ + 7) >> 3 ; + + uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, + total_credits_needed); + if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + &opx_ep->tx->force_credit_return, total_credits_needed); + if (total_credits_available < total_credits_needed) { + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); + if(OFI_UNLIKELY(psn == -1)) { + return -FI_EAGAIN; + } + + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != FI_HMEM_SYSTEM, opx_ep->debug_counters.hmem.hfi + .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .send.rzv); + + /* + * Write the 'start of packet' (hw+sw header) 'send control block' + * which will consume a single pio credit. 
+ */ + + uint64_t force_credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); + volatile uint64_t * const scb = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + struct fi_opx_hfi1_txe_scb_16B tmp; + + fi_opx_store_and_copy_scb_16B(scb, &tmp, + opx_ep->tx->rzv_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_qws << 20), + opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? + (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), + opx_ep->tx->rzv_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (1ull << 48), + len); + + + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + unsigned credits_consumed = 1; +#endif + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + tmp.hdr.qw_16B[7] = tag; + fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, (uint64_t *)&tmp.qw0); + + /* + * write the rendezvous payload "send control blocks" + */ + + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + uint64_t temp[8]; + fi_opx_store_and_copy_qw(scb_payload, temp, + tag, /* end of header */ + /* start of receiver payload/cacheline */ + (uintptr_t)buf + immediate_total, /* rendezvous.contiguous.src_vaddr */ + (len - immediate_total) >> 6, /* rendezvous.contiguous.src_blocks */ + src_device_id, /* rendezvous.contiguous.src_device_id */ + (uint64_t) src_iface, /* rendezvous.contiguous.src_iface */ + 
immediate_info.qw0, /* rendezvous.contiguous.immediate_info */ + origin_byte_counter_vaddr, /* rendezvous.contiguous.origin_byte_counter_vaddr */ + -1UL /* unused */); /* rendezvous.contiguous.unused[0] */ + + /* consume one credit for the rendezvous payload metadata */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); +#ifndef NDEBUG + ++credits_consumed; +#endif + + uint64_t * replay_payload = replay->payload; + + assert(!replay->use_iov); + assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); + fi_opx_copy_cacheline(replay_payload, temp); + replay_payload += 8; + + uint8_t *sbuf; + if (src_iface != FI_HMEM_SYSTEM && immediate_total) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, desc_mr->hmem_dev_reg_handle, + opx_ep->hmem_copy_buf, buf, immediate_total, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); + sbuf = opx_ep->hmem_copy_buf; + } else { + sbuf = (uint8_t *) buf; + } + + /* immediate_byte and immediate_qw are "packed" in the current implementation */ + /* meaning the immediate bytes are filled, then followed by the rest of the data directly */ + /* adjacent to the packed bytes. 
It's probably more efficient to leave a pad and not go */ + /* through the confusion of finding these boundaries on both sides of the rendezvous */ + /* That is, just pack the immediate bytes, then pack the "rest" in the immediate qws */ + /* This would lead to more efficient packing on both sides at the expense of */ + /* wasting space of a common 0 byte immediate */ + /* tmp_payload_t represents the second cache line of the rts packet */ + /* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */ + struct tmp_payload_t { + uint8_t immediate_byte[8]; /* rendezvous.contiguous.immediate_byte */ + uint64_t immediate_qw[7]; /* rendezvous.contiguous.immediate_qw */ + } __attribute__((packed)); + + uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); + if (immediate_fragment) { + struct tmp_payload_t *tmp_payload = (void*)temp; + if (immediate_byte_count > 0) { + memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); + } + + for (int i=0; iimmediate_qw[i] = sbuf_qw[i]; + } + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, temp); + sbuf_qw += immediate_qw_count; + + fi_opx_copy_cacheline(replay_payload, temp); + replay_payload += 8; + + /* consume one credit for the rendezvous payload immediate data */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; +#endif + /* Need a full tail block */ + if (icrc_fragment_block) { + /* No other tail or immediate block after this */ + assert(!icrc_end_block && !immediate_block_count && !immediate_end_block_count); + + /* Write another block to accomodate the ICRC and tail */ + uint64_t temp_0[8] = {-2UL}; + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + fi_opx_store_scb_qw(scb_payload, temp_0); + fi_opx_copy_cacheline(replay_payload, temp_0); + replay_payload += 8; + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; 
+#endif + } +#ifndef NDEBUG + else if(icrc_fragment) { /* used an immediate qw for tail */ + /* No other tail or immediate block after this */ + assert(!icrc_end_block && !immediate_block_count && !immediate_end_block_count); + } else { + /* Must be tail and immediate blocks after this */ + assert(icrc_end_block && immediate_block_count && immediate_end_block_count); + } +#endif + + } + + if(immediate_block_count) { +#ifndef NDEBUG + /* Tail will be it's own block */ + assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment && immediate_end_block_count); + /* assert immediate_block_count can be used for both + * full_block_credits_needed and total_credits_available parameters + * on the call + */ + assert((credits_consumed + immediate_block_count) <= total_credits_needed); + ssize_t credits = +#endif + fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, + &pio_state, + sbuf_qw, + immediate_block_count, + immediate_block_count); + memcpy(replay_payload, sbuf_qw, (immediate_block_count << 6)); + /* replay_payload is pointer to uint64_t, not char */ + replay_payload += (immediate_block_count << 3); /* immediate_block_count << 6 / sizeof(uint64_t) */ + + +#ifndef NDEBUG + assert(credits == immediate_block_count); + credits_consumed+= (unsigned) credits; +#endif + + } + + if (immediate_end_block_count) { + /* Tail will be it's own block */ + assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment && immediate_block_count); + char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6); + union { + uint8_t immediate_byte[64]; + uint64_t immediate_qw[8]; + } align_tmp; + assert(immediate_end_block_count == 1); + + OPX_HMEM_COPY_FROM(align_tmp.immediate_byte, sbuf_end, (immediate_block_count << 6), + desc ? 
((struct fi_opx_mr *)desc)->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + OPX_HMEM_DEV_REG_SEND_THRESHOLD, + src_iface, src_device_id); + + scb_payload = (uint64_t *)FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, align_tmp.immediate_qw); + + fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); + replay_payload += 8; + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; +#endif + + /* Need a full block for ICRC after the end block... */ + assert(icrc_end_block); + + /* Write another block to accomodate the ICRC and tail */ + uint64_t temp_0[8] = {-3UL}; + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, temp_0); + fi_opx_copy_cacheline(replay_payload, temp_0); + replay_payload += 8; + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + ++credits_consumed; +#endif + + } + + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + addr.reliability_rx, + dest_rx, psn_ptr, replay, reliability, hfi1_type); +#ifndef NDEBUG + assert(credits_consumed == total_credits_needed); +#endif + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + /* update the hfi txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-HFI:%ld",tag); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (end) context %p\n",context); + + return FI_SUCCESS; +} + +unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, + volatile uint64_t *rhe_ptr, + volatile uint32_t * rhf_ptr, + const uint32_t rhf_msb, + const uint32_t rhf_lsb, + const uint64_t rhf_seq, + const uint64_t hdrq_offset, + const uint64_t rhf_rcvd, + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) +{ + /* We are assuming that we can process any error 
and consume this header, + let reliability detect and replay it as needed. */ + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "RECEIVE ERROR: rhf_msb = 0x%08x, rhf_lsb = 0x%08x, rhf_seq = 0x%lx\n", rhf_msb, rhf_lsb, rhf_seq); + + /* Unexpected errors on WFR */ + (void)rhf_ptr; /* unused unless debug is turned on */ + + /* drop this packet and allow reliability protocol to retry */ +#ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, - "%s:%s():%d drop this packet and allow reliability protocol to retry, psn = %u, RHF %#16.16lX, OPX_RHF_IS_USE_EGR_BUF %u, hdrq_offset_dws %lu\n", + "%s:%s():%d drop this packet and allow reliability protocol to retry, psn = %u, RHF %#16.16lX, OPX_RHF_IS_USE_EGR_BUF %u, hdrq_offset %lu\n", __FILE__, __func__, __LINE__, FI_OPX_HFI1_PACKET_PSN(hdr), - rhf_rcvd, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), hdrq_offset_dws); + rhf_rcvd, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd,hfi1_type), hdrq_offset); #endif - OPX_RHE_DEBUG(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr); + OPX_RHE_DEBUG(opx_ep, rhe_ptr, rhf_ptr, rhf_msb, rhf_lsb, rhf_seq, hdrq_offset, rhf_rcvd, hdr, hfi1_type); - if (OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd)) { + if (OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd,hfi1_type)) { /* "consume" this egrq element */ - const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf_rcvd); + const uint32_t egrbfr_index = OPX_RHF_EGR_INDEX(rhf_rcvd, hfi1_type); const uint32_t last_egrbfr_index = opx_ep->rx->egrq.last_egrbfr_index; if (OFI_UNLIKELY(last_egrbfr_index != egrbfr_index)) { @@ -3515,7 +4330,7 @@ unsigned fi_opx_hfi1_handle_poll_error(struct fi_opx_ep * opx_ep, } /* "consume" this hdrq element */ - opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq,hfi1_type); opx_ep->rx->state.hdrq.head = hdrq_offset + FI_OPX_HFI1_HDRQ_ENTRY_SIZE_DWS; fi_opx_hfi1_update_hdrq_head_register(opx_ep, hdrq_offset); diff --git a/prov/opx/src/fi_opx_hfi1_jkr.c b/prov/opx/src/fi_opx_hfi1_jkr.c index 
3b2f714435b..7f44341c09c 100644 --- a/prov/opx/src/fi_opx_hfi1_jkr.c +++ b/prov/opx/src/fi_opx_hfi1_jkr.c @@ -42,7 +42,8 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { uint32_t rhe_index = hdrq_offset >> FI_OPX_HFI1_HDRQ_INDEX_SHIFT; volatile uint64_t *rhe = rhe_ptr + rhe_index; /* 8 byte entries */ @@ -53,13 +54,13 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, #endif "RHF(%#16.16lX) RHE(%p)[%u]=%p RHE %#16.16lX is ERRORED %u, UseEgrBuf %u, EgrIndex %#X/%#X, EgrOffset %#X, %s%s%s %s %#16.16lX %s%s%s%s%s%s%s%s%s%s%s \n", rhf_rcvd, rhe_ptr, rhe_index, rhe, *rhe, - OPX_IS_ERRORED_RHF(rhf_rcvd) != 0UL, - OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), - (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd),opx_ep->rx->egrq.last_egrbfr_index, - (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd), - OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)? "EXPECTED_RCV" : "", - OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)? "EAGER_RCV" : "", - OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)? "OTHER RCV" : "", + OPX_IS_ERRORED_RHF(rhf_rcvd, hfi1_type) != 0UL, + OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd, hfi1_type), + (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd, hfi1_type),opx_ep->rx->egrq.last_egrbfr_index, + (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd, hfi1_type), + OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)? "EXPECTED_RCV" : "", + OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)? "EAGER_RCV" : "", + OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)? "OTHER RCV" : "", ((*rhe) & OPX_JKR_RHE_TAIL )? "OPX_JKR_RHE_TAIL " : "", ((*rhe) & OPX_JKR_RHE_TAIL), ((*rhe) & OPX_JKR_RHE_ICRCERR )? "OPX_JKR_RHE_ICRCERR " : "", ((*rhe) & OPX_JKR_RHE_TIDBYPASSERR)? 
"OPX_JKR_RHE_TIDBYPASSERR" : "", @@ -86,12 +87,12 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND((*rhe) & OPX_JKR_RHE_FLOWSEQERR ,opx_ep->debug_counters.rhf.flowseqerr); FI_OPX_DEBUG_COUNTERS_INC_COND((*rhe) & OPX_JKR_RHE_RCVTYPEERR ,opx_ep->debug_counters.rhf.rcvtypeerr); /* Count the packet type that had an error */ - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeexp); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeegr); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeoth); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeexp); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeegr); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeoth); #ifdef OPX_VERBOSE_TRIGGER // verbose output - fi_opx_hfi1_dump_packet_hdr (hdr, "OPX_IS_ERRORED_RHF", __LINE__); + fi_opx_hfi1_dump_packet_hdr(hdr, hfi1_type, "OPX_IS_ERRORED_RHF", __LINE__); #endif /* trigger on unexpected errors ) ignoring TIDERR */ @@ -102,10 +103,10 @@ void opx_jkr_rhe_debug(struct fi_opx_ep * opx_ep, } -int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_packet_hdr *const hdr) +int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { - const uint8_t opcode = hdr->stl.bth.opcode; - + const uint8_t opcode = hdr->bth.opcode; #ifdef OPX_VERBOSE_TRIGGER // verbose output fprintf(stderr, #else @@ -113,14 +114,96 @@ int opx_jkr_rhf_error_handler(const uint64_t rhf_rcvd, const union fi_opx_hfi1_p #endif "%s:%s():%d MISSING PAYLOAD opcode %#X, UseEgrBuf %u, pktlen %#X, type: %s%s%s\n", __FILE__, __func__, 
__LINE__, - opcode, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), ntohs(hdr->stl.lrh.pktlen), - OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)? "EXPECTED_RCV" : "", - OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)? "EAGER_RCV" : "", - OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)? "OTHER RCV" : ""); + opcode, OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd, hfi1_type), ntohs(hdr->lrh_9B.pktlen), + OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)? "EXPECTED_RCV" : "", + OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)? "EAGER_RCV" : "", + OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)? "OTHER RCV" : ""); #ifdef OPX_VERBOSE_TRIGGER // verbose ouput - fi_opx_hfi1_dump_packet_hdr (hdr, "MISSING PAYLOAD", __LINE__); + fi_opx_hfi1_dump_packet_hdr (hdr, OPX_HFI1_JKR, "MISSING PAYLOAD", __LINE__); #endif opx_sw_trigger(); return 1; } +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func) +{ + __attribute__((__unused__)) union opx_jkr_pbc pbc; + pbc.raw64b = pbc1; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc = %#16.16lX\n", func, pbc1); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.LengthDWs = %#x %zu\n", func, pbc.LengthDWs, pbc.LengthDWs * sizeof(uint32_t)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Vl = %#x\n", func, pbc.Vl); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.PortIdx = %#x\n", func, pbc.PortIdx); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Reserved_2 = %#x\n", func, pbc.Reserved_2); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.L2Compressed = %#x\n", func, pbc.L2Compressed); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.L2Type = %#x\n", func, pbc.L2Type); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Fecnd = %#x\n", func, pbc.Fecnd); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.TestBadLcrc = %#x\n", func, pbc.TestBadLcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.InsertNon9bIcrc = %#x\n", func, pbc.InsertNon9bIcrc); + FI_DBG_TRACE(fi_opx_global.prov, 
FI_LOG_EP_DATA, "%s: Pbc.CreditReturn = %#x\n", func, pbc.CreditReturn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.InsertHcrc = %#x\n", func, pbc.InsertHcrc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Reserved_1 = %#x\n", func, pbc.Reserved_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.TestEbp = %#x\n", func, pbc.TestEbp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Sc4 = %#x\n", func, pbc.Sc4); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Intr = %#x\n", func, pbc.Intr); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.Dlid = %#x %u\n", func, pbc.Dlid, pbc.Dlid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: Pbc.SendCtxt = %#x\n", func, pbc.SendCtxt); +} + + +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func) +{ + __attribute__((__unused__)) union opx_hfi1_packet_hdr hdr; + hdr.lrh_16B.qw[0] = lrh1; + hdr.lrh_16B.qw[1] = lrh2; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH = %#16.16lX\n", func, lrh1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH = %#16.16lX\n", func, lrh2); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.slid = %#x\n", func, hdr.lrh_16B.slid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.pktlen = %#x %zu\n", func, hdr.lrh_16B.pktlen, hdr.lrh_16B.pktlen * sizeof(uint64_t)); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.b = %#x\n", func, hdr.lrh_16B.b); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.dlid = %#x\n", func, hdr.lrh_16B.dlid); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.sc = %#x\n", func, hdr.lrh_16B.sc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.rc = %#x\n", func, hdr.lrh_16B.rc); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.f = %#x\n", func, hdr.lrh_16B.f); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.l2 = %#x\n", func, hdr.lrh_16B.l2); + 
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.lt = %#x\n", func, hdr.lrh_16B.lt); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.l4 = %#x\n", func, hdr.lrh_16B.l4); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.slid20 = %#x\n", func, hdr.lrh_16B.slid20); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.dlid20 = %#x\n", func, hdr.lrh_16B.dlid20); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.pkey = %#x\n", func, hdr.lrh_16B.pkey); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.entropy = %#x\n", func, hdr.lrh_16B.entropy); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.age = %#x\n", func, hdr.lrh_16B.age); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.cspec = %#x\n", func, hdr.lrh_16B.cspec); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.r = %#x\n", func, hdr.lrh_16B.r); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.SLID(full) = %#6.6x (BE format = %#6.6x)\n", func, hdr.lrh_16B.slid20 << 20 | hdr.lrh_16B.slid, htons(((hdr.lrh_16B.slid20 << 20) | hdr.lrh_16B.slid))); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: LRH.DLID(full) = %#6.6x (BE format = %#6.6x)\n", func, hdr.lrh_16B.dlid20 << 20 | hdr.lrh_16B.dlid, htons(((hdr.lrh_16B.dlid20 << 20) | hdr.lrh_16B.dlid))); + + +} + + +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func) +{ + __attribute__((__unused__)) union opx_hfi1_packet_hdr hdr; + hdr.qw_16B[2] = bth1; + hdr.qw_16B[3] = bth2; + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH = %#16.16lX\n", func, bth1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH = %#16.16lX\n", func, bth2); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.opcode = %#x\n", func, hdr.bth.opcode); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.bth_1 = %#x\n", func, hdr.bth.bth_1); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.pkey = %#x\n", func, 
hdr.bth.pkey); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.ecn = %#x\n", func, hdr.bth.ecn); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.qp = %#x\n", func, hdr.bth.qp); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.unused = %#x\n", func, hdr.bth.unused); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.rx = %#x\n", func, hdr.bth.rx); + + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "%s: BTH.psn = %#x\n", func, hdr.bth.psn); +} diff --git a/prov/opx/src/fi_opx_hfi1_sdma.c b/prov/opx/src/fi_opx_hfi1_sdma.c index 32367500684..6816ef9988b 100644 --- a/prov/opx/src/fi_opx_hfi1_sdma.c +++ b/prov/opx/src/fi_opx_hfi1_sdma.c @@ -214,7 +214,7 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, fprintf(stderr, "(%d) [%d] PBC: %#16.16lX\n", pid, req_num, header_vec->scb.qw0); - fi_opx_hfi1_dump_packet_hdr(&header_vec->scb.hdr, func, line); + fi_opx_hfi1_dump_packet_hdr(&header_vec->scb.hdr, OPX_HFI1_TYPE, func, line); fprintf(stderr, "(%d) [%d] req data iov=%p len=%lu\n", pid, req_num, iov_ptr[1].iov_base, iov_ptr[1].iov_len); @@ -246,7 +246,7 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, "(%d) [%d] ERROR: Request opcode is set to EXPECTED (TID), but TID IOV's length is < minimum!\n", pid, req_num); } - uint32_t kdeth = (uint32_t) (header_vec->scb.hdr.qw[2] >> 32); + uint32_t kdeth = (uint32_t) (header_vec->scb.hdr.qw_9B[2] >> 32); uint32_t tidctrl = (kdeth >> FI_OPX_HFI1_KDETH_TIDCTRL_SHIFT) & FI_OPX_HFI1_KDETH_TIDCTRL; uint32_t tididx = (kdeth >> FI_OPX_HFI1_KDETH_TID_SHIFT) & FI_OPX_HFI1_KDETH_TID; uint32_t tidOMshift = (kdeth >> KDETH_OM_SHIFT) & KDETH_OM_MASK; diff --git a/prov/opx/src/fi_opx_hfi1_wfr.c b/prov/opx/src/fi_opx_hfi1_wfr.c index fb9388703f0..7e9ffef1aa3 100644 --- a/prov/opx/src/fi_opx_hfi1_wfr.c +++ b/prov/opx/src/fi_opx_hfi1_wfr.c @@ -41,7 +41,8 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, const uint64_t rhf_seq, const uint64_t hdrq_offset, const uint64_t 
rhf_rcvd, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr, + const enum opx_hfi1_type hfi1_type) { #ifdef OPX_VERBOSE_TRIGGER // verbose output fprintf(stderr, @@ -50,13 +51,13 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, #endif "RHF(%#16.16lX) RHE %#8.8X is ERRORED %u, UseEgrBuf %u, EgrIndex %#X/%#X, EgrOffset %#X, %s%s%s %s%s%s%s%s%s%s%s \n", rhf_rcvd, rhf_msb & 0xBFE00000u, - OPX_IS_ERRORED_RHF(rhf_rcvd) != 0UL, - OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd), - (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd),opx_ep->rx->egrq.last_egrbfr_index, - (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd), - OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)? "EXPECTED_RCV" : "", - OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)? "EAGER_RCV" : "", - OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)? "OTHER RCV" : "", + OPX_IS_ERRORED_RHF(rhf_rcvd, hfi1_type) != 0UL, + OPX_RHF_IS_USE_EGR_BUF(rhf_rcvd, hfi1_type), + (uint32_t)OPX_RHF_EGR_INDEX(rhf_rcvd, hfi1_type),opx_ep->rx->egrq.last_egrbfr_index, + (uint32_t) OPX_RHF_EGR_OFFSET(rhf_rcvd, hfi1_type), + OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd, hfi1_type)? "EXPECTED_RCV" : "", + OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd, hfi1_type)? "EAGER_RCV" : "", + OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd, hfi1_type)? "OTHER RCV" : "", rhf_msb & OPX_WFR_RHF_ICRCERR? "OPX_WFR_RHF_ICRCERR" :"", rhf_msb & OPX_WFR_RHF_LENERR? "OPX_WFR_RHF_LENERR" :"", rhf_msb & OPX_WFR_RHF_ECCERR? 
"OPX_WFR_RHF_ECCERR" :"", @@ -76,12 +77,12 @@ void opx_wfr_rhe_debug(struct fi_opx_ep * opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND(rhf_msb & OPX_WFR_RHF_KHDRLENERR,opx_ep->debug_counters.rhf.khdrlenerr); FI_OPX_DEBUG_COUNTERS_INC_COND(rhf_msb & OPX_WFR_RHF_RCVTYPEERR,opx_ep->debug_counters.rhf.rcvtypeerr); /* Count the packet type that had an error */ - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeexp); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeegr); - FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd)),opx_ep->debug_counters.rhf.rcvtypeoth); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EXPECTED_RCV(rhf_rcvd,hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeexp); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_EAGER_RCV(rhf_rcvd,hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeegr); + FI_OPX_DEBUG_COUNTERS_INC_COND((OPX_RHF_RCV_TYPE_OTHER(rhf_rcvd,hfi1_type)),opx_ep->debug_counters.rhf.rcvtypeoth); #ifdef OPX_VERBOSE_TRIGGER // verbose output - fi_opx_hfi1_dump_packet_hdr (hdr, "OPX_IS_ERRORED_RHF", __LINE__); + fi_opx_hfi1_dump_packet_hdr(hdr, hfi1_type, "OPX_IS_ERRORED_RHF", __LINE__); #endif return; diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 21b27428442..0e340c735c8 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -59,7 +59,7 @@ union fi_opx_addr opx_default_addr = { .hfi1_rx = 0, .hfi1_unit = 0xff, .reliability_rx = 0, - .uid = { .lid = 0xffff, .endpoint_id = 0xffff }, + .uid = {.lid = 0xffff, .lid_3B = 0xff, .endpoint_id = 0xff }, .rx_index = 0, }; @@ -649,10 +649,10 @@ struct fi_provider fi_opx_provider = { */ static void do_static_assert_tests() { - // Verify that pio_state is exactly one cache-line long. */ + // Verify that pio_state is exactly one cache-line long. 
OPX_COMPILE_TIME_ASSERT((sizeof(union fi_opx_hfi1_pio_state) == 8), "fi_opx_hfi1_pio_state size error."); - // Verify that pointers are exactly one cache-line long. */ + // Verify that pointers are exactly one cache-line long. OPX_COMPILE_TIME_ASSERT((sizeof(union fi_opx_hfi1_pio_state*) == 8), "fi_opx_hfi1_pio_state pointer size error."); @@ -675,8 +675,6 @@ static void do_static_assert_tests() "sizeof(fi_opx_hmem_info) >> 3 != OPX_HMEM_SIZE_QWS") ; OPX_COMPILE_TIME_ASSERT(OPX_HFI1_TID_PAGESIZE == 4096, "OPX_HFI1_TID_PAGESIZE must be 4K!"); - OPX_COMPILE_TIME_ASSERT(OPX_MR != OFI_MR_UNSPEC, - "OPX_MR should be set to 'FI_MR_SCALABLE' or 'FI_MR_BASIC', not 'FI_MR_UNSPEC'"); } #pragma GCC diagnostic pop @@ -740,6 +738,7 @@ OPX_INI /* CN5000 only */ fi_param_define(&fi_opx_provider, "rate_control", FI_PARAM_INT,"Rate control (CN5000 only). Values can range from 0-7. 0-3 is used for in-order and 4-7 is used for out-of-order. Default is %d\n", OPX_BTH_RC2_DEFAULT); // fi_param_define(&fi_opx_provider, "varname", FI_PARAM_*, "help"); + fi_param_define(&fi_opx_provider, "mixed_network", FI_PARAM_INT, "Indicates a mixed network of OPA100 and CN5000. Needs to be set to 1 when mixed network is used. 
Default is 0.\n"); /* Track TID and HMEM domains so caches can be cleared on exit */ dlist_init(&fi_opx_global.tid_domain_list); diff --git a/prov/opx/src/fi_opx_msg.c b/prov/opx/src/fi_opx_msg.c index eafba58397b..6cc9c0a343c 100644 --- a/prov/opx/src/fi_opx_msg.c +++ b/prov/opx/src/fi_opx_msg.c @@ -42,8 +42,6 @@ ssize_t fi_opx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, uint64_t flags) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->threading; const enum fi_av_type av_type = opx_ep->av_type; @@ -61,7 +59,8 @@ ssize_t fi_opx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } return fi_opx_ep_tx_send(ep, msg->msg_iov, msg->iov_count, @@ -72,15 +71,14 @@ ssize_t fi_opx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } ssize_t fi_opx_sendv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->threading; const enum fi_av_type av_type = opx_ep->av_type; @@ -98,7 +96,8 @@ ssize_t fi_opx_sendv(struct fid_ep *ep, const struct iovec *iov, OPX_FLAGS_OVERRIDE_FALSE, 0, /* flags */ caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } return fi_opx_ep_tx_send(ep, iov, count, desc, dest_addr, 0, context, 0, @@ -108,92 +107,176 @@ ssize_t fi_opx_sendv(struct fid_ep *ep, const struct iovec *iov, OPX_FLAGS_OVERRIDE_FALSE, 0, /* flags */ caps | FI_MSG, - reliability); + reliability, + OPX_HFI1_TYPE); } ssize_t fi_opx_senddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, void 
*context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - errno = FI_ENOSYS; return -errno; } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, 
FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + 
+FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) -#define FI_OPX_MSG_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) -#define FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ops_msg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY +#define FI_OPX_MSG_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, 
RELIABILITY, HFI1_TYPE) -#define FI_OPX_MSG_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY) \ +#define FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_ops_msg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE + +#define FI_OPX_MSG_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ static struct fi_ops_msg \ - FI_OPX_MSG_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) __attribute__ ((unused)) = { \ + FI_OPX_MSG_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY, HFI1_TYPE) __attribute__ ((unused)) = { \ .size = sizeof(struct fi_ops_msg), \ - .recv = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, LOCK, AV, CAPS, RELIABILITY), \ + .recv = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .recvv = fi_no_msg_recvv, \ - .recvmsg = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, LOCK, AV, CAPS, RELIABILITY), \ - .send = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, LOCK, AV, CAPS, RELIABILITY), \ + .recvmsg = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .send = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .sendv = fi_opx_sendv, \ .sendmsg = fi_opx_sendmsg, \ - .inject = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, LOCK, AV, CAPS, RELIABILITY), \ - .senddata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, LOCK, AV, CAPS, RELIABILITY), \ - .injectdata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, LOCK, AV, CAPS, RELIABILITY),\ + .inject = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .senddata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .injectdata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE),\ } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 
0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); 
-FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - - +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); 
+FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_MSG_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); static struct fi_ops_msg fi_opx_no_msg_ops = { @@ -266,47 +349,141 @@ int fi_opx_enable_msg_ops(struct fid_ep *ep) return -FI_EINVAL; } - if (!lock_required) { - if (opx_ep->av->type == FI_AV_TABLE) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 
0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else if (opx_ep->av->type == FI_AV_MAP) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); + + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B){ + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = 
&FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so 
FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } } else { - if (opx_ep->av->type == FI_AV_TABLE) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } - } else if (opx_ep->av->type == FI_AV_MAP) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.msg = &FI_OPX_MSG_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_MSG_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } } diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index 3009876ce7b..44c8cfc59aa 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -421,15 +421,17 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, - const uint64_t opcode) + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1)) { + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1) { + if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; return -FI_EAGAIN; } @@ -437,31 +439,59 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, const uint64_t lrh_dlid = dlid << 16; const uint64_t bth_rx = reliability_rx << 56; - struct fi_opx_hfi1_txe_scb model = opx_ep->reliability->service.tx.hfi1.ping_model; - model.hdr.ud.opcode = opcode; volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - OPX_HFI1_BAR_STORE(&scb[0], (model.qw0 | OPX_PBC_CR(0x1) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid))); - OPX_HFI1_BAR_STORE(&scb[1], (model.hdr.qw[0] | lrh_dlid)); - OPX_HFI1_BAR_STORE(&scb[2], (model.hdr.qw[1] | bth_rx)); - OPX_HFI1_BAR_STORE(&scb[3], model.hdr.qw[2]); - OPX_HFI1_BAR_STORE(&scb[4], model.hdr.qw[3]); - OPX_HFI1_BAR_STORE(&scb[5], 
0UL); - OPX_HFI1_BAR_STORE(&scb[6], 0UL); - OPX_HFI1_BAR_STORE(&scb[7], key); - - - /* consume one credit for the packet header */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + if ((hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B))) { + struct fi_opx_hfi1_txe_scb_9B model_9B = opx_ep->reliability->service.tx.hfi1.ping_model_9B; + model_9B.hdr.ud.opcode = opcode; + OPX_HFI1_BAR_STORE(&scb[0], (model_9B.qw0 | OPX_PBC_CR(0x1, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_9B.hdr.qw_9B[0] | lrh_dlid)); + OPX_HFI1_BAR_STORE(&scb[2], (model_9B.hdr.qw_9B[1] | bth_rx)); + OPX_HFI1_BAR_STORE(&scb[3], model_9B.hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], model_9B.hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], 0UL); + OPX_HFI1_BAR_STORE(&scb[6], 0UL); + OPX_HFI1_BAR_STORE(&scb[7], key); + + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } else { + struct fi_opx_hfi1_txe_scb_16B model_16B = opx_ep->reliability->service.tx.hfi1.ping_model_16B; + model_16B.hdr.ud.opcode = opcode; + OPX_HFI1_BAR_STORE(&scb[0], (model_16B.qw0 | OPX_PBC_CR(1, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_16B.hdr.qw_16B[0] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[2], (model_16B.hdr.qw_16B[1] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[3], model_16B.hdr.qw_16B[2] | bth_rx); + OPX_HFI1_BAR_STORE(&scb[4], model_16B.hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], model_16B.hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], 0UL); + OPX_HFI1_BAR_STORE(&scb[7], 0UL); + FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + volatile uint64_t * const scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); 
+ + OPX_HFI1_BAR_STORE(&scb_payload[0], key); + OPX_HFI1_BAR_STORE(&scb_payload[1], 0); + OPX_HFI1_BAR_STORE(&scb_payload[2], 0); + OPX_HFI1_BAR_STORE(&scb_payload[3], 0); + OPX_HFI1_BAR_STORE(&scb_payload[4], 0); + OPX_HFI1_BAR_STORE(&scb_payload[5], 0); + OPX_HFI1_BAR_STORE(&scb_payload[6], 0); + OPX_HFI1_BAR_STORE(&scb_payload[7], 0); + FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); + } FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); /* save the updated txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - return FI_SUCCESS; } @@ -471,7 +501,8 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, void opx_reliability_handshake_init(struct fid_ep *ep, union fi_opx_reliability_service_flow_key key, - const uint64_t target_reliability_rx) + const uint64_t target_reliability_rx, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -481,7 +512,8 @@ void opx_reliability_handshake_init(struct fid_ep *ep, fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.dlid, target_reliability_rx, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT, + hfi1_type); uint64_t value = 1; rbtInsert(opx_ep->reliability->service.handshake_init, (void*)key.value, (void*)value); @@ -495,7 +527,8 @@ void opx_reliability_handshake_init(struct fid_ep *ep, fi_opx_hfi1_tx_reliability_inject_ud_init(ep, key.value, key.dlid, target_reliability_rx, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT, + hfi1_type); } (*count_ptr)++; @@ -505,11 +538,12 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_init(struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, - const uint64_t opcode) + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type) { assert(opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT || opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT_ACK); - ssize_t rc = 
fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode); + ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode, hfi1_type); if (OFI_UNLIKELY(rc)) { #ifdef OPX_RELIABILITY_DEBUG @@ -542,7 +576,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, { assert(opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_RESYNCH || opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_RESYNCH_ACK); - ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode); + ssize_t rc = fi_opx_hfi1_tx_reliability_inject_ud_opcode(ep, key, dlid, reliability_rx, opcode, OPX_HFI1_TYPE); if (OFI_UNLIKELY(rc)) { #ifdef OPX_RELIABILITY_DEBUG @@ -570,7 +604,8 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t key, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t psn_count, - const uint64_t opcode) + const uint64_t opcode, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -580,9 +615,10 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t psn_start_24 = psn_start & MAX_PSN; const uint64_t psn_count_24 = MIN(psn_count, MAX_PSN-psn_start_24 + 1); - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1)) { + const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 
1 : 2; + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < 1) { + if (FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state) < credits_needed) { /* * no credits available @@ -602,6 +638,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, } #endif opx_ep->tx->pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; } } @@ -639,44 +676,86 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, const uint64_t lrh_dlid = dlid << 16; const uint64_t bth_rx = reliability_rx << 56; - const struct fi_opx_hfi1_txe_scb * const model = + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const struct fi_opx_hfi1_txe_scb_9B * const model = opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING ? - &opx_ep->reliability->service.tx.hfi1.ping_model : + &opx_ep->reliability->service.tx.hfi1.ping_model_9B : ( opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK ? 
- &opx_ep->reliability->service.tx.hfi1.ack_model : - &opx_ep->reliability->service.tx.hfi1.nack_model ); + &opx_ep->reliability->service.tx.hfi1.ack_model_9B : + &opx_ep->reliability->service.tx.hfi1.nack_model_9B ); - OPX_HFI1_BAR_STORE(&scb[0], (model->qw0 | OPX_PBC_CR(0x1) | - OPX_PBC_JKR_LRH_DLID_TO_PBC_DLID(lrh_dlid))); - OPX_HFI1_BAR_STORE(&scb[1], (model->hdr.qw[0] | lrh_dlid)); - OPX_HFI1_BAR_STORE(&scb[2], (model->hdr.qw[1] | bth_rx)); - OPX_HFI1_BAR_STORE(&scb[3], model->hdr.qw[2]); - OPX_HFI1_BAR_STORE(&scb[4], model->hdr.qw[3]); - OPX_HFI1_BAR_STORE(&scb[5], psn_count_24); - OPX_HFI1_BAR_STORE(&scb[6], psn_start_24); - OPX_HFI1_BAR_STORE(&scb[7], key); /* service.key */ + OPX_HFI1_BAR_STORE(&scb[0], (model->qw0 | OPX_PBC_CR(0x1, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model->hdr.qw_9B[0] | lrh_dlid)); + OPX_HFI1_BAR_STORE(&scb[2], (model->hdr.qw_9B[1] | bth_rx)); + OPX_HFI1_BAR_STORE(&scb[3], model->hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], model->hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], psn_count_24); + OPX_HFI1_BAR_STORE(&scb[6], psn_start_24); + OPX_HFI1_BAR_STORE(&scb[7], key); /* service.key */ - //fi_opx_hfi1_dump_stl_packet_hdr((struct fi_opx_hfi1_stl_packet_hdr *)&tmp[1], __func__, __LINE__); + //fi_opx_hfi1_dump_stl_packet_hdr((struct fi_opx_hfi1_stl_packet_hdr_9B *)&tmp[1], __func__, __LINE__); - /* consume one credit for the packet header */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + } else { + const struct fi_opx_hfi1_txe_scb_16B * const model_16B = + opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING ? + &opx_ep->reliability->service.tx.hfi1.ping_model_16B : + ( opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK ? 
+ &opx_ep->reliability->service.tx.hfi1.ack_model_16B : + &opx_ep->reliability->service.tx.hfi1.nack_model_16B ); + OPX_HFI1_BAR_STORE(&scb[0], (model_16B->qw0 | OPX_PBC_CR(1, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type))); + OPX_HFI1_BAR_STORE(&scb[1], (model_16B->hdr.qw_16B[0] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[2], (model_16B->hdr.qw_16B[1] | + ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B))); + OPX_HFI1_BAR_STORE(&scb[3], model_16B->hdr.qw_16B[2] | bth_rx); + OPX_HFI1_BAR_STORE(&scb[4], model_16B->hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], model_16B->hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], psn_count_24); + OPX_HFI1_BAR_STORE(&scb[7], psn_start_24); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_INFO(fi_opx_global.prov, FI_LOG_EP_DATA, "Completed cacheline 1\n"); + + volatile uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + OPX_HFI1_BAR_STORE(&scb2[0], key); + + //fi_opx_compiler_msync_writes(); + + OPX_HFI1_BAR_STORE(&scb2[1], 0); + OPX_HFI1_BAR_STORE(&scb2[2], 0); + OPX_HFI1_BAR_STORE(&scb2[3], 0); + OPX_HFI1_BAR_STORE(&scb2[4], 0); + OPX_HFI1_BAR_STORE(&scb2[5], 0); + OPX_HFI1_BAR_STORE(&scb2[6], 0); + OPX_HFI1_BAR_STORE(&scb2[7], 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + } /* save the updated txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; - return FI_SUCCESS; } void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t dlid, const uint64_t reliability_rx, const uint64_t psn_start, const uint64_t psn_count, - const union fi_opx_hfi1_packet_hdr *const hdr, - const uint8_t origin_rx) + const union opx_hfi1_packet_hdr *const hdr, + const uint8_t origin_rx, + const uint32_t slid, + const enum opx_hfi1_type hfi1_type) { 
OPX_TRACER_TRACE_RELI(OPX_TRACER_BEGIN, "RX_RELI_SEND_PRE_ACKS"); - const uint64_t slid = hdr->stl.lrh.slid; const union fi_opx_reliability_service_flow_key key = { .slid = slid, @@ -688,7 +767,8 @@ void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t rc = fi_opx_hfi1_tx_reliability_inject(ep, (uint64_t)key.value, slid, origin_rx, psn_start, psn_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_ACKS_SENT, key.value, psn_start, psn_count); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_SUCCESS, "RX_RELI_SEND_PRE_ACKS"); } @@ -720,7 +800,7 @@ ssize_t fi_opx_hfi1_rx_reliability_ping_response (struct fid_ep *ep, key, slid, rx, psn_start_24, psn_count_24, - opcode); + opcode, OPX_HFI1_TYPE); INC_PING_STAT_COND(rc == FI_SUCCESS, opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK ? ACKS_SENT : NACKS_SENT, key, psn_start_24, psn_count_24); @@ -764,7 +844,8 @@ void fi_opx_hfi1_rx_reliability_ping (struct fid_ep *ep, key, slid, rx, 0, /* psn_start */ 1, /* psn_count */ - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + OPX_HFI1_TYPE); INC_PING_STAT_COND(rc == FI_SUCCESS, NACKS_SENT, key, 0, 1); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_ERROR, "RX_RELI_PING"); return; @@ -987,14 +1068,14 @@ void fi_opx_hfi1_reliability_iov_payload_check( "orig_payload[%d]=%016lX current[@%p]=%016lX\n", getpid(), file, func, line, key, - FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr), - FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay)), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay)), replay->sdma_we, bounce_buf, use_bounce_buf, pending_bounce_buf, we_cc, we_cc_byte_counter, replay->cc_ptr, replay->cc_ptr->byte_counter, replay->cc_dec, cc_next, cc_next_byte_counter, - replay->scb.hdr.stl.bth.opcode, - replay->scb.hdr.dput.target.opcode, + OPX_REPLAY_HDR(replay)->bth.opcode, + OPX_REPLAY_HDR(replay)->dput.target.opcode, 
replay->iov->iov_base, replay->iov->iov_len, error_msg, i, replay->orig_payload[i], @@ -1063,8 +1144,9 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, * q doesn't contain a rollover (i.e, the tail's PSN >= the head's PSN) * we can just retire all elements in the queue */ - uint32_t head_psn = FI_OPX_HFI1_PACKET_PSN(&head->scb.hdr); - uint32_t tail_psn = FI_OPX_HFI1_PACKET_PSN(&tail->scb.hdr); + + uint32_t head_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(head)); + uint32_t tail_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tail)); if ((head_psn >= psn_start) && (tail_psn <= psn_stop) && (tail_psn >= head_psn)) { #ifdef OPX_RELIABILITY_DEBUG @@ -1089,7 +1171,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, last_ack_index += snprintf(&last_ack[last_ack_index], LAST_ACK_LEN-last_ack_index, "(tx) packet %016lx %08x retired (fast path).\n", - key, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif next = tmp->next; @@ -1108,8 +1190,17 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } } - const uint16_t lrh_pktlen_le = ntohs(tmp->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(tmp->scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + } else { + lrh_pktlen_le = tmp->scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + } tmp->psn_ptr->psn.bytes_outstanding -= total_bytes; assert((int32_t)tmp->psn_ptr->psn.bytes_outstanding >= 0); @@ -1120,7 +1211,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } else { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u ACK'd but pinned, 
marking as ACK'd and skipping free of replay.\n", - key, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif tmp->acked = true; } @@ -1144,7 +1235,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, uint32_t start_psn = head_psn; while ((start_psn < psn_start) && (start != tail)) { start = start->next; - start_psn = FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr); + start_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)); } if (OFI_UNLIKELY(start_psn < psn_start)) { @@ -1170,12 +1261,12 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, struct fi_opx_reliability_tx_replay * stop = start; uint32_t stop_psn = start_psn; - uint32_t stop_next_psn = FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr); + uint32_t stop_next_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next)); while ((stop->next != head) && (stop_next_psn <= psn_stop) && (stop_next_psn > psn_start)) { stop_psn = stop_next_psn; stop = stop->next; - stop_next_psn = FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr); + stop_next_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next)); } if (OFI_UNLIKELY(stop_psn > psn_stop)) { @@ -1212,9 +1303,9 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, if (last_ack_index < LAST_ACK_LEN) last_ack_index+=snprintf(&last_ack[last_ack_index],LAST_ACK_LEN-last_ack_index, "(tx) Start = %x, Stop = %x, Halt = %x\n", - FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr), - FI_OPX_HFI1_PACKET_PSN(&stop->scb.hdr), - FI_OPX_HFI1_PACKET_PSN(&halt->scb.hdr)); + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop)), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(halt))); #endif /* remove the psn range to ack from the queue */ @@ -1232,7 +1323,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, if (last_ack_index < LAST_ACK_LEN) last_ack_index+=snprintf(&last_ack[last_ack_index],LAST_ACK_LEN-last_ack_index, "(tx) packet %016lx %08x retired (slow path).\n", key, - 
FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif struct fi_opx_reliability_tx_replay * next = tmp->next; @@ -1251,8 +1342,16 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } } - const uint16_t lrh_pktlen_le = ntohs(tmp->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes; + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(tmp->scb_9B.hdr.lrh_9B.pktlen); + total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + } else { + lrh_pktlen_le = tmp->scb_16B.hdr.lrh_16B.pktlen; + total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ + } tmp->psn_ptr->psn.bytes_outstanding -= total_bytes; assert((int32_t)tmp->psn_ptr->psn.bytes_outstanding >= 0); @@ -1263,7 +1362,7 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, } else { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u ACK'd but pinned, marking as ACK'd and skipping free of replay.\n", - key, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif tmp->acked = true; } @@ -1290,12 +1389,12 @@ ssize_t fi_opx_reliability_sdma_replay_complete (union fi_opx_reliability_deferr if (OFI_UNLIKELY(we->comp_state == OPX_SDMA_COMP_ERROR)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Failed sending replay with PSN %u (%X) via SDMA\n", - (uint32_t)FI_OPX_HFI1_PACKET_PSN(&we->replay->scb.hdr), - (uint32_t)FI_OPX_HFI1_PACKET_PSN(&we->replay->scb.hdr)); + (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(we->replay)), + (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(we->replay))); #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) replay packet %016lx %08u failed sending via SDMA.\n", params->flow_key, - 
FI_OPX_HFI1_PACKET_PSN(&we->replay->scb.hdr)); + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(we->replay))); #endif } @@ -1310,7 +1409,7 @@ ssize_t fi_opx_reliability_sdma_replay_complete (union fi_opx_reliability_deferr fprintf(stderr, "(tx) packet %016lx %08u replay over SDMA complete and ACK'd, freeing replay\n", params->flow_key, - (uint32_t)we->replay->scb.hdr.reliability.psn); + (uint32_t)(OPX_REPLAY_HDR(we->replay)->reliability.psn)); #endif fi_opx_reliability_client_replay_deallocate(&opx_ep->reliability->state, we->replay); @@ -1319,7 +1418,7 @@ ssize_t fi_opx_reliability_sdma_replay_complete (union fi_opx_reliability_deferr fprintf(stderr, "(tx) packet %016lx %08u replay over SDMA complete, un-pinning replay\n", params->flow_key, - (uint32_t)we->replay->scb.hdr.reliability.psn); + (uint32_t)(OPX_REPLAY_HDR(we->replay)->reliability.psn)); #endif } slist_remove_head(¶ms->sdma_reqs); @@ -1360,13 +1459,22 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, params->opx_ep = opx_ep; slist_init(¶ms->sdma_reqs); + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; - key.slid = (uint32_t)start_replay->scb.hdr.stl.lrh.slid; - key.tx = (uint32_t)start_replay->scb.hdr.reliability.origin_tx; - key.dlid = (uint32_t)start_replay->scb.hdr.stl.lrh.dlid; - key.rx = (uint32_t)start_replay->scb.hdr.stl.bth.rx; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + key.slid = (uint32_t)start_replay->scb_9B.hdr.lrh_9B.slid; + key.dlid = (uint32_t)start_replay->scb_9B.hdr.lrh_9B.dlid; + } + else { + key.slid = htons(start_replay->scb_16B.hdr.lrh_16B.slid20 << 20 | start_replay->scb_16B.hdr.lrh_16B.slid); + key.dlid = htons(start_replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | start_replay->scb_16B.hdr.lrh_16B.dlid); + } + key.tx = (uint32_t)(OPX_REPLAY_HDR(start_replay)->reliability.origin_tx); + key.rx = (uint32_t)(OPX_REPLAY_HDR(start_replay)->bth.rx); #endif + uint32_t 
replayed = 0; #ifdef OPX_RELIABILITY_DEBUG @@ -1397,6 +1505,8 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, sdma_we->replay = replay; sdma_we->comp_state = OPX_SDMA_COMP_PENDING_WRITEV; + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + uint64_t payload_size = fi_opx_reliability_replay_get_payload_size(replay); #ifndef NDEBUG @@ -1413,7 +1523,7 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u size %ld bytes replay injected over SDMA\n", - key.value, (uint32_t) replay->scb.hdr.reliability.psn, + key.value, (uint32_t) (OPX_REPLAY_HDR(replay)->reliability.psn), payload_size); #endif replay->pinned = true; @@ -1445,25 +1555,50 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; - key.slid = (uint32_t)replay->scb.hdr.stl.lrh.slid; - key.tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(&replay->scb.hdr); - key.dlid = (uint32_t)replay->scb.hdr.stl.lrh.dlid; - key.rx = (uint32_t)replay->scb.hdr.stl.bth.rx; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + key.slid = (uint32_t)replay->scb_9B.hdr.lrh_9B.slid; + key.dlid = (uint32_t)replay->scb_9B.hdr.lrh_9B.dlid; + } else { + key.slid = htons(replay->scb_16B.hdr.lrh_16B.slid20 << 20 | replay->scb_16B.hdr.lrh_16B.slid); + key.dlid = htons(replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb_16B.hdr.lrh_16B.dlid); + } + key.tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR(replay)); + key.rx = (uint32_t)(OPX_REPLAY_HDR(replay)->bth.rx); #endif - /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(replay->scb.hdr.stl.lrh.pktlen); - const size_t total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = total_bytes_to_copy - sizeof(union 
fi_opx_hfi1_packet_hdr); - uint16_t payload_credits_needed = - (payload_bytes_to_copy >> 6) + /* number of full 64-byte blocks of payload */ - ((payload_bytes_to_copy & 0x000000000000003Ful) != 0); /* number of partial 64-byte blocks of payload */ + /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; + + /* runtime checks for non-inlined functions */ + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; + uint16_t payload_credits_needed; + int payload_qw_to_copy_with_header = 0; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload */ + } else { + lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + total_bytes_to_copy = (lrh_pktlen_le) * 8; /* including trailing icrc */ + payload_bytes_to_copy = (total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B)); + payload_qw_to_copy_with_header = MIN((7*8), payload_bytes_to_copy)>>3; /* up to 7 qwords */ + assert(payload_bytes_to_copy >= payload_qw_to_copy_with_header*8); + payload_bytes_to_copy -= payload_qw_to_copy_with_header<<3; + /* ICRC/tail qword is already accounted for in the lrh */ + payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload */ + } union fi_opx_hfi1_pio_state pio_state = *service->tx.hfi1.pio_state; - FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr); - const uint16_t total_credits_needed = payload_credits_needed + 1; + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + const uint16_t credits_needed = (hfi1_type & 
(OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + const uint16_t total_credits_needed = credits_needed + /* header */ + payload_credits_needed + /* full payload blocks */ + ((payload_bytes_to_copy & 0x3Ful) ? 1 : 0); /* last partial block */ uint16_t total_credits_available = FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state); if (total_credits_available < total_credits_needed) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr); @@ -1471,38 +1606,17 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service if (total_credits_available < total_credits_needed) { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u Couldn't do replay (no credits)\n", - key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr)); + key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay))); #endif service->tx.hfi1.pio_state->qw0 = pio_state.qw0; + return -FI_EAGAIN; } } #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u replay injected\n", - key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(&replay->scb.hdr)); -#endif - - volatile uint64_t * const scb = - FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_sop_first, pio_state); - - OPX_HFI1_BAR_STORE(&scb[0], replay->scb.qw0); - OPX_HFI1_BAR_STORE(&scb[1], replay->scb.hdr.qw[0]); - OPX_HFI1_BAR_STORE(&scb[2], replay->scb.hdr.qw[1]); - OPX_HFI1_BAR_STORE(&scb[3], replay->scb.hdr.qw[2]); - OPX_HFI1_BAR_STORE(&scb[4], replay->scb.hdr.qw[3]); - OPX_HFI1_BAR_STORE(&scb[5], replay->scb.hdr.qw[4]); - OPX_HFI1_BAR_STORE(&scb[6], replay->scb.hdr.qw[5]); - OPX_HFI1_BAR_STORE(&scb[7], replay->scb.hdr.qw[6]); - - - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); - - /* consume one credit for the packet header */ - --total_credits_available; - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); -#ifndef NDEBUG - unsigned consumed_credits = 1; + key.value, (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(replay))); #endif uint64_t * buf_qws; @@ 
-1532,12 +1646,83 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service } fi_opx_hfi1_reliability_iov_payload_check(replay, key.value, "Replaying packet (PIO) where source buffer has changed!", __FILE__, __func__, __LINE__); #endif + /* TODO if using user iov we can't go past their buffer for the tail */ buf_qws = replay->iov[0].iov_base; } else { buf_qws = replay->payload; } +#ifndef NDEBUG + unsigned consumed_credits = 0; +#endif + + volatile uint64_t * const scb = + FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_sop_first, pio_state); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + OPX_HFI1_BAR_STORE(&scb[0], replay->scb_9B.qw0); + OPX_HFI1_BAR_STORE(&scb[1], replay->scb_9B.hdr.qw_9B[0]); + OPX_HFI1_BAR_STORE(&scb[2], replay->scb_9B.hdr.qw_9B[1]); + OPX_HFI1_BAR_STORE(&scb[3], replay->scb_9B.hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], replay->scb_9B.hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], replay->scb_9B.hdr.qw_9B[4]); + OPX_HFI1_BAR_STORE(&scb[6], replay->scb_9B.hdr.qw_9B[5]); + OPX_HFI1_BAR_STORE(&scb[7], replay->scb_9B.hdr.qw_9B[6]); + + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); + + /* consume one credit for the packet header */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + consumed_credits = 1; +#endif + } else { + OPX_HFI1_BAR_STORE(&scb[0], replay->scb_16B.qw0); + OPX_HFI1_BAR_STORE(&scb[1], replay->scb_16B.hdr.qw_16B[0]); + OPX_HFI1_BAR_STORE(&scb[2], replay->scb_16B.hdr.qw_16B[1]); + OPX_HFI1_BAR_STORE(&scb[3], replay->scb_16B.hdr.qw_16B[2]); + OPX_HFI1_BAR_STORE(&scb[4], replay->scb_16B.hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], replay->scb_16B.hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], replay->scb_16B.hdr.qw_16B[5]); + OPX_HFI1_BAR_STORE(&scb[7], replay->scb_16B.hdr.qw_16B[6]); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); + + /* consume one credit for the packet header */ + 
--total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); + + // spill from 1st cacheline (SOP) + OPX_HFI1_BAR_STORE(&scb_payload[0], replay->scb_16B.hdr.qw_16B[7]); // header + + int i; + + for (i = 1; i <= payload_qw_to_copy_with_header ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], *buf_qws); + buf_qws += 1; + } + for (i = payload_qw_to_copy_with_header+1; i <= 7 ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], 0UL); + } + + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); + + /* consume one credit for the packet header+payload */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + consumed_credits = 2; +#endif + } - while (payload_credits_needed > 0) { + /* Skip last block if there a partial 64-byte block of payload */ + const uint16_t last_partial_block = (payload_bytes_to_copy & 0x3Ful) ? 1 : 0; + while (payload_credits_needed) { + /* TODO if using user iov we can't go past their buffer for the tail */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); @@ -1572,6 +1757,75 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service FI_OPX_HFI1_CONSUME_CREDITS(pio_state, contiguous_full_blocks_to_write); #ifndef NDEBUG consumed_credits += contiguous_full_blocks_to_write; +#endif + } + /* Store last partial 64-byte block of payload */ + if(last_partial_block != 0) { + /* TODO if using user iov we can't go past their buffer for the tail */ + + int16_t tail_bytes = (payload_bytes_to_copy & 0x3Ful) ; + + /* We have a credit so we don't have to worry about this wrapping on one block */ + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); + + uint16_t i = 0; + for ( ; tail_bytes >= 8; tail_bytes-=8) { + + OPX_HFI1_BAR_STORE(scb_payload, *buf_qws); + 
scb_payload += 1; + buf_qws += 1; + i++; + } + + /* LRH packets are dword (9B) or qword (16b) aligned */ + assert((tail_bytes == 4) || (tail_bytes == 0)); + if (hfi1_type != OPX_HFI1_JKR) { + if (tail_bytes) { + OPX_HFI1_BAR_STORE(scb_payload, ((*buf_qws))); + scb_payload += 1; + i++; + } + } else { + /* QWORD aligned for 16B */ + assert(tail_bytes == 0); + /* TODO if using user iov we can't go past their buffer for the tail */ + /* assert(i<-8); // left a pad for tail */ + } + /* Pad out the cacheline/block */ + for (; i <8; i++) { + OPX_HFI1_BAR_STORE(scb_payload, 0UL); + scb_payload += 1; + } + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); + + /* consume one credit for the tail partial block payload */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + consumed_credits += 1; +#endif + } else if (0 /* TODO */ ) /* (hfi1_type & OPX_HFI1_JKR) */ { + /* TODO if using user iov we can't go past their buffer for the tail */ + + /* The padding counted as a tail above but if we wrote + * all full blocks of payload, we need to write another + * block just to send a tail qword + */ + volatile uint64_t * scb_payload = + FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); + for (int i = 0; i <8; i++) { + OPX_HFI1_BAR_STORE(scb_payload, 0UL); + scb_payload += 1; + } + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); + + /* consume one credit for the tail partial block payload */ + --total_credits_available; + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); +#ifndef NDEBUG + consumed_credits += 1; #endif } @@ -1579,10 +1833,11 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service assert(consumed_credits == total_credits_needed); #endif - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(service->tx.hfi1.pio_credits_addr); + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr); /* save the updated txe state */ 
service->tx.hfi1.pio_state->qw0 = pio_state.qw0; + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(service->tx.hfi1.pio_credits_addr); return FI_SUCCESS; } @@ -1602,7 +1857,7 @@ ssize_t fi_opx_reliability_pio_replay (union fi_opx_reliability_deferred_work *w if (params->replays[i]->acked) { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) packet %016lx %08u replay already ACK'd, skipping deferred replay\n", - params->flow_key, FI_OPX_HFI1_PACKET_PSN(¶ms->replays[i]->scb.hdr)); + params->flow_key, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(params->replays[i]))); #endif fi_opx_reliability_client_replay_deallocate(&opx_ep->reliability->state, params->replays[i]); params->replays[i] = NULL; @@ -1685,16 +1940,16 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, */ struct fi_opx_reliability_tx_replay * start = head; - uint32_t start_psn = FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr); + uint32_t start_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)); while ((start_psn < psn_start || start->pinned) && (start != tail)) { #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) flow__ %016lx rcv nack %lu..%lu Looking for start replay, current start->psn == %u, start->pinned == %d\n", key, psn_start, psn_stop, - FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr), + FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)), start->pinned); #endif start = start->next; - start_psn = FI_OPX_HFI1_PACKET_PSN(&start->scb.hdr); + start_psn = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(start)); } if (OFI_UNLIKELY(start_psn < psn_start || start_psn > psn_stop || start->pinned)) { @@ -1733,8 +1988,8 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, struct fi_opx_reliability_tx_replay * stop = start; const uint64_t max = (uint64_t) MIN(OPX_RELIABILITY_TX_MAX_REPLAYS,OPX_RELIABILITY_RX_MAX_NACK); while ((stop->next != head) && - (FI_OPX_HFI1_PACKET_PSN(&stop->scb.hdr) < FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr)) && - (FI_OPX_HFI1_PACKET_PSN(&stop->next->scb.hdr) <= psn_stop) && + 
(FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop)) < FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next))) && + (FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop->next)) <= psn_stop) && (replay_count < max)) { // We won't retransmit pinned replays, so don't count those @@ -1753,7 +2008,7 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(tx) flow__ %016lx rcv nack %08lu..%08lu Replaying PSNs %08u - %08u\n", key, psn_start, psn_stop, start_psn, - (uint32_t)FI_OPX_HFI1_PACKET_PSN(&stop->scb.hdr)); + (uint32_t)FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(stop))); #endif // Turn on throttling for this flow while we catch up on replays start->psn_ptr->psn.nack_count = 1; @@ -1775,11 +2030,11 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, if (!queing_replays) { #ifdef OPX_DEBUG_COUNTERS_RELIABILITY struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if(replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { + if(OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_cts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rzv); } #endif @@ -1819,11 +2074,11 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, } #ifdef OPX_DEBUG_COUNTERS_RELIABILITY struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - 
if(replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { + if(OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_cts); - } else if (replay->scb.hdr.stl.bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { + } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rzv); } #endif @@ -1865,12 +2120,20 @@ ssize_t fi_opx_reliability_send_ping(struct fid_ep *ep, return OPX_RELIABILITY_PING_NO_REPLAYS; } - const uint64_t dlid = (uint64_t)head->scb.hdr.stl.lrh.dlid; + uint64_t dlid; + /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + dlid = (uint64_t) head->scb_9B.hdr.lrh_9B.dlid; + } else { + dlid = (uint64_t) htons(head->scb_16B.hdr.lrh_16B.dlid20 << 20 | head->scb_16B.hdr.lrh_16B.dlid); + } + const uint64_t rx = (uint64_t)head->target_reliability_rx; // psn_start will always be 24-bit max number here - uint64_t psn_start = FI_OPX_HFI1_PACKET_PSN(&head->scb.hdr); - uint64_t psn_stop = FI_OPX_HFI1_PACKET_PSN(&head->prev->scb.hdr); + uint64_t psn_start = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(head)); + uint64_t psn_stop = FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(head->prev)); + // if the PSN of the tail is less than the PSN of the head, the // PSN has rolled over. 
In that case, truncate the ping range @@ -1882,7 +2145,8 @@ ssize_t fi_opx_reliability_send_ping(struct fid_ep *ep, key_value, dlid, rx, psn_start, psn_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING, + OPX_HFI1_TYPE); INC_PING_STAT_COND(rc == FI_SUCCESS, PINGS_SENT, key_value, psn_start, psn_count); @@ -2172,6 +2436,98 @@ void * pthread_start_routine (void * arg) { } #endif + +void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * service, + struct fi_opx_hfi1_context * hfi1) +{ + /* Ping model */ + { + /* PBC */ + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 3 + /* kdeth */ + 4 + /* software kdeth + unused */ + 2 + /* ICRC and tail */ + 2 ; /* second cacheline */ + + + /* Setup the 16B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = OPX_HFI1_JKR; + + service->tx.hfi1.ping_model_16B.qw0 = OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_16B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type) | + OPX_PBC_JKR_INSERT_NON9B_ICRC; + + /* LRH */ + const uint32_t packetLength = (pbc_dws - 2) * 4; + const uint32_t lrh_qws = (packetLength >> 3) + + ((packetLength & 0x07u) != 0); + + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.qw[0] = 0UL; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.qw[1] = 0UL; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pktlen = lrh_qws; /* does not include pbc, but does include icrc */ + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.sc = hfi1->sc; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.entropy = 0; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.lt = 0; // need to add env variable to change + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.l2 = OPX_PBC_JKR_L2TYPE_16B; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.l4 = 9; + 
service->tx.hfi1.ping_model_16B.hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pkey = hfi1->pkey; + + + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.slid = hfi1->lid & 0xFFFFF; + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.slid20 = (hfi1->lid) >> 20; + + /* BTH */ + service->tx.hfi1.ping_model_16B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_UD; + service->tx.hfi1.ping_model_16B.hdr.bth.bth_1 = 0; + service->tx.hfi1.ping_model_16B.hdr.bth.pkey = hfi1->pkey; + service->tx.hfi1.ping_model_16B.hdr.bth.ecn = (uint8_t)(OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + service->tx.hfi1.ping_model_16B.hdr.bth.qp = hfi1->bthqp; + service->tx.hfi1.ping_model_16B.hdr.bth.unused = 0; + service->tx.hfi1.ping_model_16B.hdr.bth.rx = 0; /* set at runtime */ + service->tx.hfi1.ping_model_16B.hdr.bth.psn = 0; + + /* KDETH */ + service->tx.hfi1.ping_model_16B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; + service->tx.hfi1.ping_model_16B.hdr.kdeth.jkey = hfi1->jkey; + service->tx.hfi1.ping_model_16B.hdr.kdeth.hcrc = 0; + service->tx.hfi1.ping_model_16B.hdr.kdeth.unused = 0; + + /* reliability service */ + union opx_hfi1_packet_hdr * hdr = &service->tx.hfi1.ping_model_16B.hdr; + + hdr->ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING; + + hdr->service.origin_reliability_rx = hfi1->info.rxe.id; + hdr->service.range_count = 0; + hdr->service.unused = 0; + hdr->service.psn_count = 0; + hdr->service.psn_start = 0; + hdr->service.key = 0; + } + + /* 'ack' pio send model */ + { + service->tx.hfi1.ack_model_16B = service->tx.hfi1.ping_model_16B; + service->tx.hfi1.ack_model_16B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK; + } + + /* 'nack' pio send model */ + { + service->tx.hfi1.nack_model_16B = service->tx.hfi1.ping_model_16B; + 
service->tx.hfi1.nack_model_16B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK; + } +} + uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * service, uuid_t unique_job_key, struct fi_opx_hfi1_context * hfi1, @@ -2271,42 +2627,44 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser 3 + /* bth */ 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - service->tx.hfi1.ping_model.qw0 = OPX_PBC_LEN(pbc_dws) | - OPX_PBC_VL(hfi1->vl) | - OPX_PBC_SC(hfi1->sc) | - OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B) | - OPX_PBC_L2COMPRESSED(0) | - OPX_PBC_PORTIDX(hfi1->hfi_port) | - OPX_PBC_SCTXT(hfi1->send_ctxt); + /* Setup the 9B models whether or not they'll be used */ + enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = (OPX_HFI1_TYPE & OPX_HFI1_WFR) ? OPX_HFI1_WFR : OPX_HFI1_JKR_9B; + + service->tx.hfi1.ping_model_9B.qw0 = OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_VL(hfi1->vl, hfi1_type) | + OPX_PBC_SC(hfi1->sc, hfi1_type) | + OPX_PBC_L2TYPE(OPX_PBC_JKR_L2TYPE_9B, hfi1_type) | + OPX_PBC_L2COMPRESSED(0, hfi1_type) | + OPX_PBC_PORTIDX(hfi1->hfi_port, hfi1_type) | + OPX_PBC_SCTXT(hfi1->send_ctxt, hfi1_type); /* LRH */ - service->tx.hfi1.ping_model.hdr.stl.lrh.flags = + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.flags = htons(FI_OPX_HFI1_LRH_BTH | ((hfi1->sl & FI_OPX_HFI1_LRH_SL_MASK) << FI_OPX_HFI1_LRH_SL_SHIFT) | ((hfi1->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); - service->tx.hfi1.ping_model.hdr.stl.lrh.dlid = 0; /* set at runtime */ - service->tx.hfi1.ping_model.hdr.stl.lrh.pktlen = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - service->tx.hfi1.ping_model.hdr.stl.lrh.slid = htons(hfi1->lid); + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.dlid = 0; /* set at runtime */ + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.pktlen = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.slid = 
htons(hfi1->lid); /* BTH */ - service->tx.hfi1.ping_model.hdr.stl.bth.opcode = FI_OPX_HFI_BTH_OPCODE_UD; - service->tx.hfi1.ping_model.hdr.stl.bth.bth_1 = 0; - service->tx.hfi1.ping_model.hdr.stl.bth.pkey = htons(hfi1->pkey); - service->tx.hfi1.ping_model.hdr.stl.bth.ecn = (uint8_t) (OPX_BTH_RC2(OPX_BTH_RC2_VAL) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT)); - service->tx.hfi1.ping_model.hdr.stl.bth.qp = hfi1->bthqp; - service->tx.hfi1.ping_model.hdr.stl.bth.unused = 0; - service->tx.hfi1.ping_model.hdr.stl.bth.rx = 0; /* set at runtime */ + service->tx.hfi1.ping_model_9B.hdr.bth.opcode = FI_OPX_HFI_BTH_OPCODE_UD; + service->tx.hfi1.ping_model_9B.hdr.bth.bth_1 = 0; + service->tx.hfi1.ping_model_9B.hdr.bth.pkey = htons(hfi1->pkey); + service->tx.hfi1.ping_model_9B.hdr.bth.ecn = (uint8_t) (OPX_BTH_RC2((OPX_BTH_RC2_VAL(hfi1_type)), hfi1_type) | OPX_BTH_CSPEC(OPX_BTH_CSPEC_DEFAULT, hfi1_type)); + service->tx.hfi1.ping_model_9B.hdr.bth.qp = hfi1->bthqp; + service->tx.hfi1.ping_model_9B.hdr.bth.unused = 0; + service->tx.hfi1.ping_model_9B.hdr.bth.rx = 0; /* set at runtime */ /* KDETH */ - service->tx.hfi1.ping_model.hdr.stl.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; - service->tx.hfi1.ping_model.hdr.stl.kdeth.jkey = hfi1->jkey; - service->tx.hfi1.ping_model.hdr.stl.kdeth.hcrc = 0; - service->tx.hfi1.ping_model.hdr.stl.kdeth.unused = 0; + service->tx.hfi1.ping_model_9B.hdr.kdeth.offset_ver_tid = FI_OPX_HFI1_KDETH_VERSION << FI_OPX_HFI1_KDETH_VERSION_SHIFT; + service->tx.hfi1.ping_model_9B.hdr.kdeth.jkey = hfi1->jkey; + service->tx.hfi1.ping_model_9B.hdr.kdeth.hcrc = 0; + service->tx.hfi1.ping_model_9B.hdr.kdeth.unused = 0; /* reliability service */ - union fi_opx_hfi1_packet_hdr * hdr = - (union fi_opx_hfi1_packet_hdr *)&service->tx.hfi1.ping_model.hdr; + union opx_hfi1_packet_hdr * hdr = &service->tx.hfi1.ping_model_9B.hdr; hdr->ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_PING; @@ -2320,14 +2678,14 @@ uint8_t 
fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser /* 'ack' pio send model */ { - service->tx.hfi1.ack_model = service->tx.hfi1.ping_model; - service->tx.hfi1.ack_model.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK; + service->tx.hfi1.ack_model_9B = service->tx.hfi1.ping_model_9B; + service->tx.hfi1.ack_model_9B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK; } /* 'nack' pio send model */ { - service->tx.hfi1.nack_model = service->tx.hfi1.ping_model; - service->tx.hfi1.nack_model.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK; + service->tx.hfi1.nack_model_9B = service->tx.hfi1.ping_model_9B; + service->tx.hfi1.nack_model_9B.hdr.ud.opcode = FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK; } @@ -2702,7 +3060,7 @@ void fi_opx_reliability_client_init (struct fi_opx_reliability_client_state * st const uint8_t rx, const uint8_t tx, void (*process_fn)(struct fid_ep *ep, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const uint8_t origin_reliability_rx)) { @@ -2823,14 +3181,21 @@ void fi_opx_reliability_client_fini (struct fi_opx_reliability_client_state * st __OPX_FORCE_INLINE__ struct fi_opx_reliability_rx_uepkt *fi_opx_reliability_allocate_uepkt(struct fi_opx_reliability_service *service, - const union fi_opx_hfi1_packet_hdr * const hdr, + const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, const size_t payload_bytes_to_copy) { struct fi_opx_reliability_rx_uepkt * tmp = ofi_buf_alloc(service->uepkt_pool); assert(tmp); - memcpy((void*)&tmp->hdr, hdr, sizeof(union fi_opx_hfi1_packet_hdr)); + /* tmp->hdr.unused_pad_9B = hdr->unused_pad_9B; */ + tmp->hdr.qw_9B[0] = hdr->qw_9B[0]; + tmp->hdr.qw_9B[1] = hdr->qw_9B[1]; + tmp->hdr.qw_9B[2] = hdr->qw_9B[2]; + tmp->hdr.qw_9B[3] = hdr->qw_9B[3]; + tmp->hdr.qw_9B[4] = hdr->qw_9B[4]; + tmp->hdr.qw_9B[5] = hdr->qw_9B[5]; + tmp->hdr.qw_9B[6] = hdr->qw_9B[6]; if (payload && payload_bytes_to_copy > 0) 
memcpy((void*)&tmp->payload[0], (const void *)payload, payload_bytes_to_copy); @@ -2840,12 +3205,22 @@ struct fi_opx_reliability_rx_uepkt *fi_opx_reliability_allocate_uepkt(struct fi_ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * state, uint64_t slid, uint64_t origin_tx, uint32_t psn, - struct fid_ep *ep, const union fi_opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload) + struct fid_ep *ep, const union opx_hfi1_packet_hdr * const hdr, const uint8_t * const payload, + const uint16_t pktlen, const enum opx_hfi1_type hfi1_type) { /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ - const uint16_t lrh_pktlen_le = ntohs(hdr->stl.lrh.pktlen); - const size_t total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - const size_t payload_bytes_to_copy = total_bytes_to_copy - sizeof(union fi_opx_hfi1_packet_hdr); + uint16_t lrh_pktlen_le; + size_t total_bytes_to_copy, payload_bytes_to_copy; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); + total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); + } else { + lrh_pktlen_le = pktlen; + total_bytes_to_copy = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing tail/icrc QW*/ + payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); + } union fi_opx_reliability_service_flow_key key; key.slid = slid; @@ -2886,7 +3261,7 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s state->rx, psn - state->service->preemptive_ack_rate + 1, /* psn_start */ state->service->preemptive_ack_rate, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); } next_psn += 1; @@ -2915,7 +3290,7 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s state->rx, psn 
- state->service->preemptive_ack_rate + 1, /* psn_start */ state->service->preemptive_ack_rate, /* psn_count */ - hdr, origin_rx); + hdr, origin_rx, slid, hfi1_type); } ++next_psn; @@ -2971,7 +3346,8 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s origin_rx, psn, /* psn_start */ 1, /* psn_count */ - FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_ACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_ACKS_SENT, key.value, psn, 1); return; @@ -3009,7 +3385,8 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s origin_rx, next_psn, nack_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_NACKS_SENT, key.value, next_psn, nack_count); #endif #ifdef OPX_RELIABILITY_DEBUG @@ -3070,7 +3447,8 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s origin_rx, nack_start_psn, nack_count, - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK); + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + hfi1_type); INC_PING_STAT_COND(rc == FI_SUCCESS, PRE_NACKS_SENT, key.value, next_psn, nack_count); } @@ -3262,7 +3640,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_shm (struct fid_ep *ep, * The rank_inst field has been depricated and will be phased out. * The value is always zero. 
*/ - union fi_opx_hfi1_packet_hdr * const hdr = + union opx_hfi1_packet_hdr * const hdr = opx_shm_tx_next(&opx_ep->tx->shm, hfi1_unit, u8_reliability_rx, &pos, true, u32_reliability_rx, 0, &rc); @@ -3271,22 +3649,32 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_shm (struct fid_ep *ep, const uint64_t lrh_dlid = dlid << 16; const uint64_t bth_rx = u8_reliability_rx << 56; - struct fi_opx_hfi1_txe_scb model = opx_ep->reliability->service.tx.hfi1.ping_model; - model.hdr.ud.opcode = opcode; - - hdr->qw[0] = model.hdr.qw[0] | lrh_dlid; - - hdr->qw[1] = model.hdr.qw[1] | bth_rx; - - hdr->qw[2] = model.hdr.qw[2]; - - hdr->qw[3] = model.hdr.qw[3]; - - hdr->qw[4] = model.hdr.qw[4]; - - hdr->qw[5] = model.hdr.qw[5]; - // hdr->qw[6] - hdr->service.key = key; + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + + struct fi_opx_hfi1_txe_scb_9B model = opx_ep->reliability->service.tx.hfi1.ping_model_9B; + model.hdr.ud.opcode = opcode; + hdr->qw_9B[0] = model.hdr.qw_9B[0] | lrh_dlid; + hdr->qw_9B[1] = model.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = model.hdr.qw_9B[2]; + hdr->qw_9B[3] = model.hdr.qw_9B[3]; + hdr->qw_9B[4] = model.hdr.qw_9B[4]; + hdr->qw_9B[5] = model.hdr.qw_9B[5]; + // hdr->qw[6] + hdr->service.key = key; + } else { + struct fi_opx_hfi1_txe_scb_16B model = opx_ep->reliability->service.tx.hfi1.ping_model_16B; + model.hdr.ud.opcode = opcode; + + hdr->qw_16B[0] = model.hdr.qw_16B[0] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B); + hdr->qw_16B[1] = model.hdr.qw_16B[1] | ((uint64_t)(ntohs(dlid) & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B); + hdr->qw_16B[2] = model.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = model.hdr.qw_16B[3]; + hdr->qw_16B[4] = model.hdr.qw_16B[4]; + hdr->qw_16B[5] = model.hdr.qw_16B[5]; + hdr->qw_16B[6] = model.hdr.qw_16B[6]; + hdr->service.key = key; /* qw[7] */ + } 
opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -3387,7 +3775,7 @@ void fi_opx_reliability_resynch_tx_flow_reset (struct fi_opx_ep *opx_ep, do { #ifdef OPX_RELIABILITY_DEBUG - fprintf(stderr, "(tx) packet %016lx %08u retired.\n", tx_key.value, FI_OPX_HFI1_PACKET_PSN(&tmp->scb.hdr)); + fprintf(stderr, "(tx) packet %016lx %08u retired.\n", tx_key.value, FI_OPX_HFI1_PACKET_PSN(OPX_REPLAY_HDR(tmp))); #endif next = tmp->next; @@ -3414,7 +3802,7 @@ void fi_opx_reliability_resynch_tx_flow_reset (struct fi_opx_ep *opx_ep, void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, uint32_t origin_reliability_rx, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); struct fi_opx_reliability_client_state * state = &opx_ep->reliability->state; @@ -3438,7 +3826,7 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, * Reset all SHM related reliability protocol data retained by this * Server EP about the remote Client EP. 
*/ - if (fi_opx_hfi_is_intranode(rx_key.slid)) { + if (opx_lid_is_intranode(rx_key.slid)) { /* Record completion of the resynch request for the remote Client EP */ opx_ep->rx->shm.resynch_connection[origin_reliability_rx].completed = true; opx_ep->rx->shm.resynch_connection[origin_reliability_rx].counter++; @@ -3569,14 +3957,15 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, void fi_opx_hfi1_rx_reliability_ack_resynch (struct fid_ep *ep, struct fi_opx_reliability_service * service, - const union fi_opx_hfi1_packet_hdr *const hdr) + const union opx_hfi1_packet_hdr *const hdr) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); union fi_opx_reliability_service_flow_key rx_key = { .value = hdr->service.key }; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FLOW KEY slid %x/%x dlid %x/%x, key.value %#lx\n",rx_key.slid,ntohs(rx_key.slid),rx_key.dlid,ntohs(rx_key.dlid), rx_key.value); #ifdef OPX_RELIABILITY_DEBUG fprintf(stderr, "(rx) %s Client flow__ %016lx rcv resynch ack\n", - (fi_opx_hfi_is_intranode(rx_key.dlid)) ? "SHM -" : "", + (opx_lid_is_intranode(rx_key.dlid)) ? "SHM -" : "", rx_key.value); #endif @@ -3597,7 +3986,7 @@ void fi_opx_hfi1_rx_reliability_ack_resynch (struct fid_ep *ep, #ifdef OPX_RELIABILITY_DEBUG else { fprintf(stderr, "Warning, (rx) %s Client flow__ %016lx rcv resynch ack; not found.\n", - (fi_opx_hfi_is_intranode(rx_key.dlid)) ? "SHM -" : "", + (opx_lid_is_intranode(rx_key.dlid)) ? 
"SHM -" : "", rx_key.value); } #endif @@ -3612,9 +4001,17 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); ssize_t rc = FI_SUCCESS; bool inject_done = false; + uint32_t slid; + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + slid = opx_ep->tx->send_9B.hdr.lrh_9B.slid; + } else { + slid = ntohs(opx_ep->tx->send_9B.hdr.lrh_16B.slid20 << 20 | opx_ep->tx->send_9B.hdr.lrh_16B.slid); + } union fi_opx_reliability_service_flow_key tx_key = { - .slid = opx_ep->tx->send.hdr.stl.lrh.slid, - .tx = opx_ep->tx->send.hdr.reliability.origin_tx, + .slid = slid, + .tx = opx_ep->tx->send_9B.hdr.reliability.origin_tx, .dlid = dest_addr.uid.lid, .rx = dest_addr.hfi1_rx }; @@ -3644,7 +4041,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->hfi->daos_info.rank, - opx_ep->tx->send.hdr.stl.lrh.slid, + slid, dest_addr.uid.lid); } else { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3655,7 +4052,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, "(tx) SHM - Extended address not available\n"); } - if ((opx_ep->tx->send.hdr.stl.lrh.slid == dest_addr.uid.lid) && + if ((slid == dest_addr.uid.lid) && opx_ep->daos_info.rank == opx_ep->hfi->daos_info.rank && opx_ep->daos_info.rank_inst == opx_ep->hfi->daos_info.rank_inst) { /* Nothing to do */ @@ -3821,7 +4218,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, fi_opx_timer_next_event_usec(timer, &start, FI_OPX_TIMER_NEXT_EVENT_USEC_DEFAULT); while (compare < next) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, OPX_HFI1_TYPE); compare = fi_opx_timer_now(timestamp, timer); if 
(resynch_flow->remote_ep_resynch_completed) { diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 1dd35647cbb..55294a84be8 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -109,31 +109,31 @@ int fi_opx_do_readv_internal_intranode(union fi_opx_hfi1_deferred_work *work) uint64_t pos; /* DAOS support - rank_inst field has been depricated and will be phased out. * The value is always zero.*/ - union fi_opx_hfi1_packet_hdr * tx_hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->opx_target_addr.hfi1_unit, + union opx_hfi1_packet_hdr * hdr = opx_shm_tx_next(&opx_ep->tx->shm, params->opx_target_addr.hfi1_unit, params->dest_rx, &pos, opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, 0, &rc); - if (OFI_UNLIKELY(tx_hdr == NULL)) { + if (OFI_UNLIKELY(hdr == NULL)) { return rc; } uint64_t niov = params->niov << 48; uint64_t op64 = params->op << 40; uint64_t dt64 = params->dt << 32; assert(FI_OPX_HFI_DPUT_OPCODE_GET == params->opcode); // double check packet type - assert(params->dt == (FI_VOID - 1) || params->dt < OFI_DATATYPE_LAST); - tx_hdr->qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | params->lrh_dlid | (params->lrh_dws << 32); - tx_hdr->qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | params->bth_rx; - tx_hdr->qw[2] = opx_ep->rx->tx.cts.hdr.qw[2]; - tx_hdr->qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - tx_hdr->qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | params->opcode | dt64 | op64 | niov; - tx_hdr->qw[5] = (uintptr_t)params->rma_request; - tx_hdr->qw[6] = params->key; + assert(params->dt == (FI_VOID - 1) || params->dt < FI_DATATYPE_LAST); + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov; + hdr->qw_9B[5] = 
(uintptr_t)params->rma_request; + hdr->qw_9B[6] = params->key; union fi_opx_hfi1_packet_payload *const tx_payload = - (union fi_opx_hfi1_packet_payload *)(tx_hdr + 1); + (union fi_opx_hfi1_packet_payload *)(hdr + 1); tx_payload->cts.iov[0] = params->dput_iov; - opx_shm_tx_advance(&opx_ep->tx->shm, (void *)tx_hdr, pos); + opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); return FI_SUCCESS; } @@ -144,6 +144,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) struct fi_opx_ep *opx_ep = params->opx_ep; union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + ssize_t credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, 2); if (OFI_UNLIKELY(credits_available < 2)) { return -FI_EAGAIN; @@ -156,7 +157,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) const union fi_opx_addr addr = params->opx_target_addr; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, addr.hfi1_rx, - addr.reliability_rx, &psn_ptr, &replay, params->reliability); + addr.reliability_rx, &psn_ptr, &replay, params->reliability, OPX_HFI1_TYPE); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; @@ -164,20 +165,20 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t tmp[8]; + uint64_t local_temp[16] = {0}; uint64_t niov = params->niov << 48; uint64_t op64 = params->op << 40; uint64_t dt64 = params->dt << 32; - uint64_t credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return); + uint64_t credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); assert(FI_OPX_HFI_DPUT_OPCODE_GET == params->opcode); // double check packet type - fi_opx_set_scb(scb, tmp, - opx_ep->rx->tx.cts.qw0 | OPX_PBC_LEN(params->pbc_dws) | credit_return | + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | 
credit_return | params->pbc_dlid, - opx_ep->rx->tx.cts.hdr.qw[0] | params->lrh_dlid | (params->lrh_dws << 32), - opx_ep->rx->tx.cts.hdr.qw[1] | params->bth_rx, - opx_ep->rx->tx.cts.hdr.qw[2] | psn, - opx_ep->rx->tx.cts.hdr.qw[3], - opx_ep->rx->tx.cts.hdr.qw[4] | params->opcode | dt64 | op64 | niov, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32), + opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[3], + opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov, (uintptr_t)params->rma_request, params->key); // key @@ -186,19 +187,14 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - replay->scb.qw0 = tmp[0]; - replay->scb.hdr.qw[0] = tmp[1]; - replay->scb.hdr.qw[1] = tmp[2]; - replay->scb.hdr.qw[2] = tmp[3]; - replay->scb.hdr.qw[3] = tmp[4]; - replay->scb.hdr.qw[4] = tmp[5]; - replay->scb.hdr.qw[5] = tmp[6]; - replay->scb.hdr.qw[6] = tmp[7]; + OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); /* write the CTS payload "send control block" */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - - fi_opx_set_scb(scb_payload, tmp, + uint64_t temp[8]; + fi_opx_store_and_copy_qw(scb_payload, temp, params->dput_iov.qw[0], params->dput_iov.qw[1], params->dput_iov.qw[2], @@ -208,20 +204,20 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) 0, 0); FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - replay->payload[0] = tmp[0]; - replay->payload[1] = tmp[1]; - replay->payload[2] = tmp[2]; - replay->payload[3] = tmp[3]; - replay->payload[4] = tmp[4]; - replay->payload[5] = tmp[5]; - replay->payload[6] = tmp[6]; - replay->payload[7] = tmp[7]; + replay->payload[0] = temp[0]; + replay->payload[1] = temp[1]; + replay->payload[2] = temp[2]; + replay->payload[3] = temp[3]; + 
replay->payload[4] = temp[4]; + replay->payload[5] = temp[5]; + replay->payload[6] = temp[6]; + replay->payload[7] = temp[7]; fi_opx_reliability_client_replay_register_no_update( &opx_ep->reliability->state, - params->opx_target_addr.uid.lid, params->opx_target_addr.reliability_rx, - params->dest_rx, psn_ptr, replay, params->reliability); + params->dest_rx, psn_ptr, replay, params->reliability, + OPX_HFI1_TYPE); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; @@ -234,7 +230,8 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -260,7 +257,7 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t opx_dst_addr.hfi1_rx, opx_dst_addr.reliability_rx, reliability))) { - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME); + fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); return -FI_EAGAIN; } @@ -279,7 +276,7 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr_offset, key, NULL, cc, FI_VOID, FI_NOOP, opx_ep->tx->op_flags | FI_INJECT, - is_hmem, lock_required, caps, reliability); + is_hmem, lock_required, caps, reliability, hfi1_type); return 0; } @@ -288,13 +285,14 @@ inline ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, s fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, 
+ const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -305,7 +303,8 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -344,7 +343,7 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr_offset, key, (union fi_opx_context *)context, cc, FI_VOID, FI_NOOP, opx_ep->tx->op_flags, is_hmem, - lock_required, caps, reliability); + lock_required, caps, reliability, hfi1_type); return 0; } @@ -353,12 +352,13 @@ inline ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t l fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, 
lock_required); return rc; @@ -369,7 +369,8 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void size_t count, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -419,7 +420,7 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void addr_offset, key, (union fi_opx_context *)context, cc, FI_VOID, FI_NOOP, 0, is_hmem, - lock_required, caps, reliability); + lock_required, caps, reliability, hfi1_type); addr_offset += iov[index].iov_len; ++mr_ptr_array; @@ -433,12 +434,13 @@ inline ssize_t fi_opx_writev_generic(struct fid_ep *ep, const struct iovec *iov, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_writev_internal(ep, iov, desc, count, dst_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -488,7 +490,8 @@ __OPX_FORCE_INLINE__ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; 
opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -551,7 +554,7 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, rma_iov_addr, rma_iov_key, NULL, cc, FI_VOID, FI_NOOP, 0, is_hmem, lock_required, caps, - reliability); + reliability, hfi1_type); msg_iov_bytes -= len; msg_iov_vaddr += len; @@ -582,12 +585,13 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg inline ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -597,7 +601,8 @@ __OPX_FORCE_INLINE__ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability) + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -650,7 +655,7 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, - caps, reliability); + caps, reliability, hfi1_type); return FI_SUCCESS; } @@ -658,12 +663,13 @@ ssize_t fi_opx_read_internal(struct fid_ep 
*ep, void *buf, size_t len, void *des inline ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, const enum ofi_reliability_kind reliability) + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_read_internal(ep, buf, len, desc, src_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -674,7 +680,8 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -737,7 +744,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, fi_opx_readv_internal(opx_ep, hmem_iovs, 8, opx_addr, addr_v, key_v, NULL, 0, NULL, NULL, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, - caps, reliability); + caps, reliability, hfi1_type); } /* if 'partial_ndesc' is zero, the fi_opx_readv_internal() will fence */ @@ -754,7 +761,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, key_v, opx_context, tx_op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, - FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, 
reliability); + FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); return 0; } @@ -763,12 +770,13 @@ inline ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, size_t count, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_readv(ep, iov, desc, count, src_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability); + FI_OPX_LOCK_NOT_REQUIRED, av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -778,7 +786,8 @@ __OPX_FORCE_INLINE__ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -885,7 +894,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, opx_context, flags, cq, opx_ep->read_cntr, /* enable_cq, enable_cntr */ cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, - reliability); + reliability, hfi1_type); return 0; @@ -935,11 +944,12 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, NULL, 0, NULL, NULL, /* disable_cq, disable_cntr */ cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, - caps, reliability); + caps, reliability, hfi1_type); } /* end while */ /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); 
abort(); return 0; @@ -948,12 +958,13 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, inline ssize_t fi_opx_readmsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability) + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); ssize_t rc = fi_opx_readmsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - av_type, caps, reliability); + av_type, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -966,10 +977,26 @@ static inline ssize_t fi_opx_rma_read(struct fid_ep *ep, void *buf, size_t len, struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_read_generic(ep, buf, len, desc, src_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, 
OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -980,9 +1007,22 @@ static inline ssize_t fi_opx_rma_readmsg(struct fid_ep *ep, const struct fi_msg_ struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_readmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -994,10 +1034,25 @@ static inline ssize_t fi_opx_rma_inject_write(struct fid_ep *ep, const void *buf struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, 
lock_required); - ssize_t rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_inject_write_internal(ep, buf, len, dst_addr, addr_offset, key, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1009,10 +1064,25 @@ static inline ssize_t fi_opx_rma_write(struct fid_ep *ep, const void *buf, size_ struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_write(ep, buf, len, desc, 
dst_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_write(ep, buf, len, desc, dst_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1024,10 +1094,26 @@ static inline ssize_t fi_opx_rma_writev(struct fid_ep *ep, const struct iovec *i struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, - FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_writev_internal(ep, iov, desc, count, dest_addr, addr_offset, key, context, + FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + 
fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1038,10 +1124,25 @@ static inline ssize_t fi_opx_rma_writemsg(struct fid_ep *ep, const struct fi_msg struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const int lock_required = fi_opx_threading_lock_required(opx_ep->threading, fi_opx_global.progress); const uint64_t caps = opx_ep->tx->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM); + ssize_t rc; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, - OPX_AV, caps, OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_writemsg_internal(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, caps, OPX_RELIABILITY, OPX_HFI1_JKR); + } else { + /* should never get here */ + rc = -FI_EPERM; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -1073,32 +1174,44 @@ int fi_opx_init_rma_ops(struct fid_ep *ep, struct fi_info *info) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-function" -FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY) -FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, 
OPX_HFI1_WFR) -#define FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY) \ - FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B) -#define FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY) \ - fi_opx_ops_rma_##LOCK##_##AV##_##CAPS##_##RELIABILITY +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR) +FI_OPX_RMA_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR) -#define FI_OPX_RMA_OPS_STRUCT(LOCK, AV, CAPS, RELIABILITY) \ - static struct fi_ops_rma FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY) = { \ +#define FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) + +#define FI_OPX_RMA_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + fi_opx_ops_rma_##LOCK##_##AV##_##CAPS##_##RELIABILITY##_##HFI1_TYPE + +#define FI_OPX_RMA_OPS_STRUCT(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ + static struct fi_ops_rma FI_OPX_RMA_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) = { \ .size = sizeof(struct fi_ops_rma), \ - .read = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, LOCK, AV, CAPS, RELIABILITY), \ + .read = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .readv = fi_no_rma_readv, \ - .readmsg = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, LOCK, AV, CAPS, RELIABILITY), \ - .write = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, LOCK, AV, CAPS, RELIABILITY), \ + .readmsg = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .write = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .inject = 
FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, LOCK, AV, CAPS, \ - RELIABILITY), \ - .writev = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writev, LOCK, AV, CAPS, RELIABILITY), \ + RELIABILITY, HFI1_TYPE), \ + .writev = FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writev, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .writemsg = \ - FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writemsg, LOCK, AV, CAPS, RELIABILITY), \ + FI_OPX_RMA_SPECIALIZED_FUNC_NAME(writemsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .writedata = fi_no_rma_writedata, \ } -FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY); -FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR); + +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B); + +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); +FI_OPX_RMA_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, OPX_AV, 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR); #pragma GCC diagnostic pop @@ -1129,17 +1242,49 @@ int fi_opx_enable_rma_ops(struct fid_ep *ep) } const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); - if (!lock_required) { - opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, - OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (!lock_required) { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, 
+ OPX_HFI1_WFR); + } else { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_WFR); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (!lock_required) { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } else { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B); + } + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + if (!lock_required) { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } else { + opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, + OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, + OPX_HFI1_JKR); + } } else { - opx_ep->ep_fid.rma = &FI_OPX_RMA_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED, - OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY); - + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); } - + return 0; err: return -errno; @@ -1154,31 +1299,92 @@ ssize_t fi_opx_write_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t le fi_addr_t dest_addr, uint64_t addr_offset, uint64_t key, void *context) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)(ep, buf, len, desc, dest_addr, - addr_offset, key, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, buf, len, desc, dest_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 
0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, buf, len, desc, dest_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(write, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, buf, len, desc, dest_addr, + addr_offset, key, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_inject_write_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr_offset, uint64_t key) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, - 0x0018000000000000ull, OPX_RELIABILITY)( - ep, buf, len, dest_addr, addr_offset, key); -} + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_WFR)( + ep, buf, len, dest_addr, addr_offset, key); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR_9B)( + ep, buf, len, dest_addr, addr_offset, key); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(inject_write, OPX_LOCK, OPX_AV, + 0x0018000000000000ull, OPX_RELIABILITY, OPX_HFI1_JKR)( + ep, buf, len, dest_addr, addr_offset, key); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM;} ssize_t fi_opx_read_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - 
OPX_RELIABILITY)(ep, buf, len, desc, src_addr, - addr_offset, key, context); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, buf, len, desc, src_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, buf, len, desc, src_addr, + addr_offset, key, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(read, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, buf, len, desc, src_addr, + addr_offset, key, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_readmsg_FABRIC_DIRECT(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { - return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, - OPX_RELIABILITY)(ep, msg, flags); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR)(ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B)(ep, msg, flags); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_RMA_SPECIALIZED_FUNC_NAME(readmsg, OPX_LOCK, OPX_AV, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR)(ep, msg, flags); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + 
abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/fi_opx_tagged.c b/prov/opx/src/fi_opx_tagged.c index 8159bacfbbf..815ec13ed13 100644 --- a/prov/opx/src/fi_opx_tagged.c +++ b/prov/opx/src/fi_opx_tagged.c @@ -56,7 +56,8 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const enum ofi_reliability_kind reliability, - const enum fi_progress progress) + const enum fi_progress progress, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); union fi_opx_context * opx_context = NULL; @@ -90,7 +91,8 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } #ifdef OPX_HMEM @@ -155,7 +157,8 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_TRUE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } #endif if (msg->iov_count == 1) { @@ -178,7 +181,8 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } assert((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM); /* TODO - why not? 
*/ @@ -208,24 +212,37 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_FALSE, lock_required, av_type, - reliability); + reliability, + hfi1_type); } ssize_t fi_opx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, uint64_t flags) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->domain->threading; const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); const enum fi_av_type av_type = opx_ep->av_type; + ssize_t rc = 0; fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, - opx_ep->reliability->state.kind, - opx_ep->domain->data_progress); + + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + opx_ep->reliability->state.kind, + opx_ep->domain->data_progress, OPX_HFI1_WFR); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + opx_ep->reliability->state.kind, + opx_ep->domain->data_progress, OPX_HFI1_JKR_9B); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + rc = fi_opx_trecvmsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + opx_ep->reliability->state.kind, + opx_ep->domain->data_progress, OPX_HFI1_JKR); + } + fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; } @@ -233,8 +250,6 @@ ssize_t fi_opx_trecvmsg(struct fid_ep *ep, ssize_t fi_opx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, uint64_t flags) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const enum fi_threading threading = opx_ep->threading; 
const enum fi_av_type av_type = opx_ep->av_type; @@ -255,7 +270,8 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, FI_OPX_LOCK_NOT_REQUIRED, av_type, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } else { rc = fi_opx_ep_tx_send_internal(ep, 0, 0, msg->desc, msg->addr, msg->tag, msg->context, msg->data, @@ -265,7 +281,8 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } } else if (niov == 1) { rc = fi_opx_ep_tx_send_internal(ep, msg->msg_iov->iov_base, @@ -277,7 +294,8 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } else { rc = fi_opx_ep_tx_send_internal(ep, msg->msg_iov, msg->iov_count, msg->desc, msg->addr, msg->tag, msg->context, msg->data, @@ -287,7 +305,8 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, OPX_FLAGS_OVERRIDE_TRUE, flags, caps | FI_TAGGED, - opx_ep->reliability->state.kind); + opx_ep->reliability->state.kind, + OPX_HFI1_TYPE); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); return rc; @@ -295,76 +314,157 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) 
-FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) 
-FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) -FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD) - - - -#define FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) \ - FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY) - -#define FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY) \ - fi_opx_ops_tagged_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY - -#define FI_OPX_TAGGED_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY) \ -static struct fi_ops_tagged \ - FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY) __attribute__ ((unused)) = { \ - .size = sizeof(struct fi_ops_tagged), \ - .recv = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, LOCK, AV, CAPS, RELIABILITY), \ - .recvv = fi_no_tagged_recvv, \ - .recvmsg = fi_opx_trecvmsg, \ - .send = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, LOCK, AV, CAPS, RELIABILITY), \ - .sendv = fi_no_tagged_sendv, \ - .sendmsg = fi_opx_tsendmsg, \ - .inject = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, LOCK, AV, CAPS, RELIABILITY), \ - .senddata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, LOCK, AV, CAPS, RELIABILITY), \ - .injectdata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, LOCK, AV, CAPS, RELIABILITY),\ +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) 
+FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B) + +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) +FI_OPX_TAGGED_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR) + +#define FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) + +#define FI_OPX_TAGGED_OPS_STRUCT_NAME_(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ + fi_opx_ops_tagged_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE + +#define FI_OPX_TAGGED_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ +static struct fi_ops_tagged \ + FI_OPX_TAGGED_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) __attribute__ ((unused)) = { \ + .size = sizeof(struct fi_ops_tagged), \ + .recv = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .recvv = fi_no_tagged_recvv, \ + .recvmsg = fi_opx_trecvmsg, \ + .send = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .sendv = fi_no_tagged_sendv, \ + .sendmsg = fi_opx_tsendmsg, \ + .inject = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .senddata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ + .injectdata = FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); 
+FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0018000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); /* FI_LOCAL_COMM = 0x0008000000000000ull */ -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0008000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); /* FI_REMOTE_COMM = 0x0010000000000000ull */ -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, 
OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); -FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD); - +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + 
+FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_NOT_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_MAP, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_TABLE, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); +FI_OPX_TAGGED_OPS_STRUCT(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000000000ull, OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); ssize_t fi_opx_tsearch(struct fid_ep *ep, uint64_t *tag, uint64_t ignore, uint64_t flags, @@ -445,47 +545,138 @@ int fi_opx_enable_tagged_ops(struct fid_ep *ep) const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); - if (!lock_required) { - if (opx_ep->av->type == FI_AV_TABLE) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } - } else if (opx_ep->av->type == FI_AV_MAP) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.tagged = 
&FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_WFR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } } - } else { - if (opx_ep->av->type == FI_AV_TABLE) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + 
opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } - } else if (opx_ep->av->type == FI_AV_MAP) { - if (comm_caps == FI_LOCAL_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else if (comm_caps == FI_REMOTE_COMM) { - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); - } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ - opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD); + } else { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = 
&FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR_9B); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } + } + } else { + if (!lock_required) { + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_NOT_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not 
a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); } } else { - /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. It is used in FABRIC_DIRECT */ - assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + if (opx_ep->av->type == FI_AV_TABLE) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_TABLE,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else if (opx_ep->av->type == FI_AV_MAP) { + if (comm_caps == FI_LOCAL_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0008000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else if (comm_caps == FI_REMOTE_COMM) { + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0010000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } else { /* comm_caps == (FI_LOCAL_COMM | FI_REMOTE_COMM) */ + opx_ep->ep_fid.tagged = &FI_OPX_TAGGED_OPS_STRUCT_NAME(FI_OPX_LOCK_REQUIRED,FI_AV_MAP,0x0018000000000000ull,OFI_RELIABILITY_KIND_ONLOAD, OPX_HFI1_JKR); + } + } else { + /* FI_AV_UNSPEC is not a runtime value in the address vector so FI_OPX_TAGGED_OPS_STRUCT_NAME is not used here. 
It is used in FABRIC_DIRECT */ + assert((opx_ep->av->type==FI_AV_TABLE)||(opx_ep->av->type==FI_AV_MAP)); + } } } @@ -509,68 +700,183 @@ ssize_t fi_opx_tinject_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinject, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_JKR) (ep, buf, len, dest_addr, tag); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_tsend_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_WFR) (ep, buf, len, desc, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + 
OPX_HFI1_JKR_9B) + (ep, buf, len, desc, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsend, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, dest_addr, tag, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_tinjectdata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t tag) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_WFR) (ep, buf, len, data, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, data, dest_addr, tag); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tinjectdata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, data, dest_addr, tag); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_tsenddata_FABRIC_DIRECT(struct fid_ep *ep, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if 
(OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_WFR) + (ep, buf, len, desc, data, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) (ep, buf, len, desc, data, dest_addr, tag, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(tsenddata, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, data, dest_addr, tag, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } ssize_t fi_opx_trecv_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); - - return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ + if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, OPX_LOCK, OPX_AV, OPX_TAGGED_CAPS, - OPX_RELIABILITY) + OPX_RELIABILITY, + OPX_HFI1_WFR) (ep, buf, len, desc, src_addr, tag, ignore, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR_9B) + (ep, buf, len, desc, src_addr, tag, ignore, context); + } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { + return FI_OPX_TAGGED_SPECIALIZED_FUNC_NAME(trecv, + OPX_LOCK, + OPX_AV, + OPX_TAGGED_CAPS, + OPX_RELIABILITY, + OPX_HFI1_JKR) + (ep, buf, len, desc, src_addr, tag, ignore, context); + } else { + /* should never get here */ + FI_WARN(fi_opx_global.prov, 
FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); + abort(); + } + return (ssize_t) -FI_EPERM; } diff --git a/prov/opx/src/opa_proto.c b/prov/opx/src/opa_proto.c index 235d2c37e57..742eb43721f 100644 --- a/prov/opx/src/opa_proto.c +++ b/prov/opx/src/opa_proto.c @@ -149,22 +149,12 @@ static int map_hfi_mem(int fd, struct _hfi_ctrl *ctrl, size_t subctxt_cnt) /* 7. Map RXE per-context CSRs */ /* JKR sz is 8K. WFR sz is 4K. */ - if(OPX_HFI1_WFR == opx_hfi1_check_hwversion(binfo->hw_version)){ + if(OPX_HFI1_WFR == opx_hfi1_check_hwversion(binfo->hw_version)) { sz = HFI_MMAP_PGSIZE; -#ifndef OPX_WFR - fprintf(stderr, "Runtime HFI type (%u) found on non-WFR build\n", - opx_hfi1_check_hwversion(binfo->hw_version)); - abort(); -#endif } else { /* JKR prefers 8K page alignment for possible future work with 8K virtual memory pages */ sz = 2*HFI_MMAP_PGSIZE; -#ifndef OPX_JKR - fprintf(stderr, "Runtime HFI type (%u) found on non-JKR build\n", - opx_hfi1_check_hwversion(binfo->hw_version)); - abort(); -#endif } HFI_MMAP_ERRCHECK(fd, binfo, user_regbase, sz, PROT_WRITE|PROT_READ); arrsz[USER_REGBASE] = sz; From 12b594a4fa4ae6249193f61d6375852a56876752 Mon Sep 17 00:00:00 2001 From: Charles Shereda Date: Wed, 14 Aug 2024 13:46:34 -0400 Subject: [PATCH 016/393] prov/opx: Added GDRCopy logging and failure path Signed-off-by: Thomas Huber Signed-off-by: Charles Shereda Co-authored-by: Thomas Huber --- prov/opx/src/fi_opx_ep.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index ddbb07cfd90..8ee217acca7 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -2521,6 +2521,30 @@ int fi_opx_endpoint_rx_tx (struct fid_domain *dom, struct fi_info *info, } #endif +#if defined(OPX_HMEM) && HAVE_CUDA + int use_gdrcopy; + int gdrcopy_enabled = cuda_is_gdrcopy_enabled(); + + if (fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy", &use_gdrcopy) != FI_SUCCESS) { + FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, 
"FI_HMEM_CUDA_USE_GDRCOPY either not specified or invalid. Using default value of 1\n"); + use_gdrcopy = 1; /* Set to the libfabric default of FI_HMEM_CUDA_USE_GDRCOPY=1 */ + } + + if (gdrcopy_enabled == 1) { + if (use_gdrcopy == 1) { + FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "GDRCopy has been requested and is available. If you wish to explicity disable GDRCopy, set FI_HMEM_CUDA_USE_GDRCOPY=0\n"); + } + } else if (use_gdrcopy == 1) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "GDRCopy has been requested but is not available on this system, set FI_HMEM_CUDA_USE_GDRCOPY=0 and try again.\n"); + fprintf(stderr, "%s:%s():%d GDRCopy cannot be used, set FI_HMEM_CUDA_USE_GDRCOPY=0 and try again. Returning FI_EOPNOTSUPP. \n", __FILE__, __func__, __LINE__); + errno = FI_EOPNOTSUPP; + goto err; + } else { + /* gdrcopy_enabled = 0 and use_gdrcopy = 0 */ + FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "If GDRCopy is installed on this system, change FI_HMEM_CUDA_USE_GDRCOPY=0 to FI_HMEM_CUDA_USE_GDRCOPY=1 to enable GDRCopy. \n"); + } +#endif + *ep = &opx_ep->ep_fid; FI_OPX_DEBUG_COUNTERS_INIT(opx_ep->debug_counters); From cabcbe085ae06bf560183f1d637498259eed972f Mon Sep 17 00:00:00 2001 From: Archana Venkatesha Date: Thu, 15 Aug 2024 09:45:52 -0400 Subject: [PATCH 017/393] prov/opx: CN5000/JKR: Changes needed to get RMA working in 16B This commit addresses all the changes necessary to get RMA working with JKR 16B. 
Signed-off-by: Archana Venkatesha --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 49 ++--- .../rdma/opx/fi_opx_fabric_transport.h | 5 +- .../include/rdma/opx/fi_opx_hfi1_transport.h | 90 +++++++-- prov/opx/include/rdma/opx/fi_opx_rma.h | 26 ++- prov/opx/src/fi_opx_hfi1.c | 186 +++++++++++++----- prov/opx/src/fi_opx_rma.c | 159 ++++++++++----- 6 files changed, 357 insertions(+), 158 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 7d54792a133..e4222dc3abd 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -4235,42 +4235,23 @@ ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, } do { - if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - if (is_contiguous) { - rc = FI_OPX_FABRIC_TX_SEND_RZV( - ep, buf, len, desc, addr.fi, tag, context, data, - lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device, hfi1_type); - } else { - rc = FI_OPX_FABRIC_TX_SENDV_RZV( - ep, local_iov, niov, total_len, desc, addr.fi, tag, - context, data, lock_required, override_flags, tx_op_flags, - addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device, hfi1_type); - } + if (is_contiguous) { + rc = FI_OPX_FABRIC_TX_SEND_RZV( + ep, buf, len, desc, addr.fi, tag, context, data, + lock_required, override_flags, tx_op_flags, addr.hfi1_rx, + byte_counter_ptr, + byte_counter, + caps, reliability, hmem_iface, hmem_device, hfi1_type); } else { - if (is_contiguous) { - rc = FI_OPX_FABRIC_TX_SEND_RZV_16B( - ep, buf, len, desc, addr.fi, tag, context, data, - lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device, hfi1_type); - } else { - /*rc = FI_OPX_FABRIC_TX_SENDV_RZV( - ep, local_iov, niov, total_len, desc, addr.fi, tag, - context, data, lock_required, override_flags, 
tx_op_flags, - addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hfi1_type); */ - abort(); - } + rc = FI_OPX_FABRIC_TX_SENDV_RZV( + ep, local_iov, niov, total_len, desc, addr.fi, tag, + context, data, lock_required, override_flags, tx_op_flags, + addr.hfi1_rx, + byte_counter_ptr, + byte_counter, + caps, reliability, hmem_iface, hmem_device, hfi1_type); } + if (OFI_UNLIKELY(rc == -EAGAIN)) { fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } diff --git a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h index 2de99f8b765..c821caebc15 100644 --- a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h @@ -41,13 +41,12 @@ #define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject #define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr_select #define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr_select -#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv +#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv_select #define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv #define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts #define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts -#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput +#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput -#define FI_OPX_FABRIC_TX_SEND_RZV_16B fi_opx_hfi1_tx_send_rzv_16B #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index a4530ca0961..ae513973669 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -1797,8 +1797,8 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, lrh_qws, lrh_dlid_16B, bth_rx, - payload_qws_total, total_len, + payload_qws_total, xfer_bytes_tail, desc, &addr, @@ -2863,8 +2863,6 @@ ssize_t 
fi_opx_hfi1_tx_send_egr_select(struct fid_ep *ep, return (ssize_t)-1L; } - - /* * Write the initial packet header of a multi-packet eager send. This will include the size of * the entire multi-packet eager payload. @@ -3705,14 +3703,9 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, const uint64_t lrh_dlid, struct fi_opx_completion_counter *cc, const uint64_t bytes_to_sync, - const uint32_t dest_extended_rx) + const uint32_t dest_extended_rx, + enum opx_hfi1_type hfi1_type) { - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (0 << 4); - const uint16_t lrh_dws = htons(pbc_dws - 1); const uint64_t bth_rx = dest_rx << 56; uint64_t pos; ssize_t rc; @@ -3729,14 +3722,40 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, dest_extended_rx, 0, &rc); } - hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; - hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; - hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; - hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); - hdr->qw_9B[5] = (uintptr_t)cc; - hdr->qw_9B[6] = bytes_to_sync; - + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const uint64_t pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (0 << 4); + const uint16_t lrh_dws = htons(pbc_dws - 1); + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_9B[5] = (uintptr_t)cc; + hdr->qw_9B[6] = bytes_to_sync; + } else { + const 
uint64_t pbc_dws = 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth */ + 2; /* ICRC */ + const uint16_t lrh_dws = (pbc_dws - 1) >> 1; + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_16B[6] = (uintptr_t)cc; + hdr->qw_16B[7] = bytes_to_sync; + } opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); } @@ -3773,4 +3792,39 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t l const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); +__OPX_FORCE_INLINE__ +ssize_t fi_opx_hfi1_tx_send_rzv_select(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *context, + const uint32_t data, int lock_required, + const unsigned override_flags, uint64_t tx_op_flags, + const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, + uint64_t *origin_byte_counter_value, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_device, + const enum opx_hfi1_type hfi1_type) +{ + if (hfi1_type & OPX_HFI1_WFR) { + return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, + lock_required, override_flags, tx_op_flags, dest_rx, + origin_byte_counter_vaddr, + origin_byte_counter_value, + caps, reliability, hmem_iface, hmem_device, OPX_HFI1_WFR); + } 
else if (hfi1_type & OPX_HFI1_JKR) { + return fi_opx_hfi1_tx_send_rzv_16B(ep, buf, len, desc, dest_addr, tag, context, data, + lock_required, override_flags, tx_op_flags, dest_rx, + origin_byte_counter_vaddr, + origin_byte_counter_value, + caps, reliability, hmem_iface, hmem_device, OPX_HFI1_JKR); + } else if (hfi1_type & OPX_HFI1_JKR_9B) { + return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, + lock_required, override_flags, tx_op_flags, dest_rx, + origin_byte_counter_vaddr, + origin_byte_counter_value, + caps, reliability, hmem_iface, hmem_device, OPX_HFI1_JKR_9B); + } + abort(); + return (ssize_t)-1L; +} + #endif /* _FI_PROV_OPX_HFI1_TRANSPORT_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index 8e53211e3c2..d04bf19fad7 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -88,14 +88,24 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, params->bth_rx = params->dest_rx << 56; params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(opx_target_addr.fi); params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); - params->pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ - /* lrh does not include pbc (8 bytes/2 dws), but does include icrc (4 bytes/1 dws), - so subtract 1 dws */ - params->lrh_dws = htons(params->pbc_dws - 1); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ + /* lrh does not include pbc (8 bytes/2 dws), but does include icrc (4 bytes/1 dws), + so subtract 1 dws */ + params->lrh_dws = htons(params->pbc_dws - 1); + } else { + params->pbc_dws = 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 9 + /* 
kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 16 + /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ + 2; /* ICRC */ + params->lrh_dws = (params->pbc_dws - 2) >> 1; + } params->is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, opx_target_addr, caps); params->reliability = reliability; params->opcode = opcode; diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index f8c0e77389c..d15a9cd9d3b 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1875,12 +1875,6 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) { - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - const uint16_t lrh_dws = htons(pbc_dws - 1); - struct fi_opx_hfi1_rx_dput_fence_params *params = &work->fence; struct fi_opx_ep * opx_ep = params->opx_ep; @@ -1903,13 +1897,39 @@ int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) return rc; } - hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | params->bth_rx; - hdr->qw_9B[2] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[2]; - hdr->qw_9B[3] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[3]; - hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE; - hdr->qw_9B[5] = (uint64_t)params->cc; - hdr->qw_9B[6] = params->bytes_to_fence; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + const uint64_t pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + const uint16_t lrh_dws = htons(pbc_dws - 1); + + hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | params->bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[3]; + 
hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_FENCE; + hdr->qw_9B[5] = (uint64_t)params->cc; + hdr->qw_9B[6] = params->bytes_to_fence; + } else { + const uint64_t pbc_dws = 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth */ + 2; /* ICRC */ + const uint16_t lrh_dws = (pbc_dws - 1) >> 1; + hdr->qw_16B[0] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[0] | + ((uint64_t)(params->lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[1] | + ((uint64_t)((params->lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[2] | params->bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[5] | FI_OPX_HFI_DPUT_OPCODE_FENCE | (0ULL << 32); + hdr->qw_16B[6] = (uintptr_t)params->cc; + hdr->qw_16B[7] = params->bytes_to_fence; + } opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); @@ -2200,7 +2220,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) if (opcode == FI_OPX_HFI_DPUT_OPCODE_PUT && is_intranode) { // RMA-type put, so send a ping/fence to better latency fi_opx_shm_write_fence(opx_ep, params->target_hfi_unit, u8_rx, lrh_dlid, cc, params->bytes_sent, - params->u32_extended_rx); + params->u32_extended_rx, hfi1_type); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-DPUT-%s", is_intranode ? "SHM" : "HFI"); @@ -3184,17 +3204,30 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz // Calculate space for each IOV, then add in the origin_byte_counter_vaddr, // and round to the next 64-byte block. + const uint64_t icrc_and_tail_block = ((hfi1_type == OPX_HFI1_JKR) ? 
1 : 0); const uint64_t payload_blocks_total = ((niov * sizeof(struct fi_opx_hmem_iov)) + - sizeof(uintptr_t) + 63) >> 6; + sizeof(uintptr_t) + icrc_and_tail_block + 63) >> 6; assert(payload_blocks_total > 0 && payload_blocks_total < (FI_OPX_HFI1_PACKET_MTU >> 6)); - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lhr */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (payload_blocks_total << 4); + uint64_t pbc_dws; + uint16_t lrh_dws; - const uint16_t lrh_dws = htons(pbc_dws - 1); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (payload_blocks_total << 4); + + lrh_dws = htons(pbc_dws - 1); + } else { + pbc_dws = 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (payload_blocks_total << 4); + lrh_dws = (pbc_dws - 1) >> 1; + } if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE( @@ -3210,16 +3243,33 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz if (!hdr) return rc; - hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | + ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); - hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; - hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); - hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; - hdr->qw_9B[5] = total_len; - hdr->qw_9B[6] = tag; + hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; + hdr->qw_9B[5] = total_len; + hdr->qw_9B[6] = tag; + } else { + const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + hdr->qw_16B[0] = opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + hdr->qw_16B[3] = opx_ep->tx->rzv_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + hdr->qw_16B[5] = opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; + hdr->qw_16B[6] = total_len; + hdr->qw_16B[7] = tag; + } union fi_opx_hfi1_packet_payload *const payload = (union fi_opx_hfi1_packet_payload *)(hdr + 1); @@ -3316,9 +3366,10 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); uint64_t local_temp[16] = {0}; - fi_opx_store_and_copy_qw(scb, local_temp, + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb, local_temp, opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : @@ -3327,6 +3378,25 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, total_len, tag); + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + } else { + const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->tx->rzv_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)lrh_dws << 20), + opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | + ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : + (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), + opx_ep->tx->rzv_16B.hdr.qw_16B[3] | psn, + opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), + opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, + total_len); + } FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -3337,21 +3407,34 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz unsigned credits_consumed = 1; #endif - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); /* write the payload */ uint64_t *iov_qws = (uint64_t *) &hmem_iov[0]; volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); uint64_t local_temp_payload[16] = {0}; - fi_opx_store_and_copy_qw(scb_payload, local_temp_payload, - origin_byte_counter_vaddr, - iov_qws[0], - iov_qws[1], - iov_qws[2], - iov_qws[3], - iov_qws[4], - iov_qws[5], - iov_qws[6]); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_store_and_copy_qw(scb_payload, local_temp_payload, + origin_byte_counter_vaddr, + iov_qws[0], + iov_qws[1], + iov_qws[2], + iov_qws[3], + iov_qws[4], + iov_qws[5], + iov_qws[6]); + iov_qws += 7; + } else { + fi_opx_store_and_copy_qw(scb_payload, local_temp_payload, + tag, + origin_byte_counter_vaddr, + iov_qws[0], + iov_qws[1], + iov_qws[2], + iov_qws[3], + iov_qws[4], + iov_qws[5]); + iov_qws += 6; + } /* consume one credit for the rendezvous payload metadata */ --total_credits_available; @@ -3363,8 +3446,19 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz uint64_t * replay_payload = replay->payload; assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - fi_opx_copy_cacheline(replay_payload, local_temp_payload); - replay_payload += 8; + uint64_t rem_payload_size; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + fi_opx_copy_cacheline(replay_payload, local_temp_payload); + replay_payload += 8; 
+ rem_payload_size = sizeof(struct fi_opx_hmem_iov) * (niov - 2); + } else { + local_temp[7] = local_temp_payload[0]; + fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); + fi_opx_copy_cacheline(replay_payload, &local_temp_payload[1]); + replay_payload += 7; + rem_payload_size = (sizeof(struct fi_opx_hmem_iov) * (niov - 2) + 8); // overflow 8 bytes from 2nd cacheline + } + if (payload_blocks_total > 1) { assert(niov > 2); @@ -3373,11 +3467,11 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz credits_consumed += #endif fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, &pio_state, - (uint64_t *) &hmem_iov[2], + iov_qws, payload_blocks_total - 1, total_credits_available); - memcpy(replay_payload, &hmem_iov[2], sizeof(struct fi_opx_hmem_iov) * (niov - 2)); + memcpy(replay_payload, iov_qws, rem_payload_size); } FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 55294a84be8..914e012da67 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -38,6 +38,7 @@ #include "rdma/opx/fi_opx_eq.h" #include "rdma/opx/fi_opx.h" #include "rdma/opx/fi_opx_internal.h" +#include "rdma/opx/fi_opx_hfi1_version.h" #include #include @@ -120,13 +121,28 @@ int fi_opx_do_readv_internal_intranode(union fi_opx_hfi1_deferred_work *work) uint64_t dt64 = params->dt << 32; assert(FI_OPX_HFI_DPUT_OPCODE_GET == params->opcode); // double check packet type assert(params->dt == (FI_VOID - 1) || params->dt < FI_DATATYPE_LAST); - hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32); - hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx; - hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; - hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; - hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov; - hdr->qw_9B[5] = (uintptr_t)params->rma_request; - hdr->qw_9B[6] 
= params->key; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32); + hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx; + hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov; + hdr->qw_9B[5] = (uintptr_t)params->rma_request; + hdr->qw_9B[6] = params->key; + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(params->lrh_dlid)); + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)params->lrh_dws << 20); + hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | params->bth_rx; + hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode | dt64 | op64 | niov; + hdr->qw_16B[6] = (uintptr_t)params->rma_request; + hdr->qw_16B[7] = params->key; + } union fi_opx_hfi1_packet_payload *const tx_payload = (union fi_opx_hfi1_packet_payload *)(hdr + 1); @@ -142,6 +158,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_readv_params *params = &work->readv; struct fi_opx_ep *opx_ep = params->opx_ep; + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; @@ -157,7 +174,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) const union fi_opx_addr addr = params->opx_target_addr; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, 
addr.hfi1_rx, - addr.reliability_rx, &psn_ptr, &replay, params->reliability, OPX_HFI1_TYPE); + addr.reliability_rx, &psn_ptr, &replay, params->reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; @@ -171,47 +188,91 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) uint64_t dt64 = params->dt << 32; uint64_t credit_return = OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type); assert(FI_OPX_HFI_DPUT_OPCODE_GET == params->opcode); // double check packet type - fi_opx_store_and_copy_qw(scb, local_temp, - opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | credit_return | - params->pbc_dlid, - opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32), - opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx, - opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, - opx_ep->rx->tx.cts_9B.hdr.qw_9B[3], - opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov, - (uintptr_t)params->rma_request, - params->key); // key - - /* consume one credit for the packet header */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - - FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); - - /* write the CTS payload "send control block" */ - volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - uint64_t temp[8]; - fi_opx_store_and_copy_qw(scb_payload, temp, - params->dput_iov.qw[0], - params->dput_iov.qw[1], - params->dput_iov.qw[2], - params->dput_iov.qw[3], - params->dput_iov.qw[4], - params->dput_iov.qw[5], - 0, 0); - - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); - replay->payload[0] = temp[0]; - replay->payload[1] = temp[1]; - replay->payload[2] = temp[2]; - replay->payload[3] = temp[3]; - replay->payload[4] = temp[4]; - replay->payload[5] = temp[5]; - replay->payload[6] = temp[6]; - replay->payload[7] = temp[7]; + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) 
{ + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | credit_return | + params->pbc_dlid, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32), + opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[3], + opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode | dt64 | op64 | niov, + (uintptr_t)params->rma_request, + params->key); // key + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + + /* write the CTS payload "send control block" */ + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + uint64_t temp[8]; + fi_opx_store_and_copy_qw(scb_payload, temp, + params->dput_iov.qw[0], + params->dput_iov.qw[1], + params->dput_iov.qw[2], + params->dput_iov.qw[3], + params->dput_iov.qw[4], + params->dput_iov.qw[5], + 0, 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + replay->payload[0] = temp[0]; + replay->payload[1] = temp[1]; + replay->payload[2] = temp[2]; + replay->payload[3] = temp[3]; + replay->payload[4] = temp[4]; + replay->payload[5] = temp[5]; + replay->payload[6] = temp[6]; + replay->payload[7] = temp[7]; + } else { + uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(params->lrh_dlid)); + fi_opx_store_and_copy_qw(scb, local_temp, + opx_ep->rx->tx.cts_16B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | + credit_return | params->pbc_dlid, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)params->lrh_dws << 20), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + 
opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | params->bth_rx, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[4], + opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode | dt64 | op64 | niov, + (uintptr_t)params->rma_request); + /* consume one credit for the packet header */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + uint64_t temp[16] = {0}; + fi_opx_store_and_copy_qw(scb_payload, temp, + params->key, + params->dput_iov.qw[0], + params->dput_iov.qw[1], + params->dput_iov.qw[2], + params->dput_iov.qw[3], + params->dput_iov.qw[4], + params->dput_iov.qw[5], + 0UL); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + local_temp[8] = temp[0]; + fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); + + replay->payload[0] = temp[1]; + replay->payload[1] = temp[2]; + replay->payload[2] = temp[3]; + replay->payload[3] = temp[4]; + replay->payload[4] = temp[5]; + replay->payload[5] = temp[6]; + replay->payload[6] = temp[7]; + } fi_opx_reliability_client_replay_register_no_update( &opx_ep->reliability->state, From 9e74765c965dd56e1e66569590de892ef3f36aa4 Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Thu, 15 Aug 2024 13:40:31 -0400 Subject: [PATCH 018/393] prov/opx: Add OPX Tracer EP lock and Recv entries Signed-off-by: Lindsay Reiser Co-authored-by: Charles Shereda --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 25 ++++--- prov/opx/include/rdma/opx/fi_opx_internal.h | 16 ++++- prov/opx/include/rdma/opx/opx_tracer.h | 72 +++++++++++++-------- 3 files changed, 75 insertions(+), 38 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index e4222dc3abd..7030936f242 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -204,7 +204,7 @@ enum opx_work_type { }; 
OPX_COMPILE_TIME_ASSERT(OPX_WORK_TYPE_SDMA == 0, - "OPX_WORK_TYPE_SDMA needs to be 0/first value in the enum!"); + "OPX_WORK_TYPE_SDMA needs to be 0/first value in the enum!"); static const char * const OPX_WORK_TYPE_STR[] = { [OPX_WORK_TYPE_SDMA] = "SDMA", @@ -1719,7 +1719,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-MP-EAGER-FIRST"); const uint64_t ofi_data = hdr->match.ofi_data; - + uint64_t payload_qws_total; if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { payload_qws_total = (((uint64_t) ntohs(hdr->lrh_9B.pktlen)) - 15) >> 1; @@ -1951,14 +1951,14 @@ void complete_receive_operation_internal (struct fid_ep *ep, "===================================== RECV -- MULTI PACKET EAGER NTH byte counter %lu (end)\n",context->byte_counter); } else if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { /* 9B rendezvous packet */ union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *) payload; - + const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; uintptr_t origin_byte_counter_vaddr = (is_noncontig == 1) ? 
p->rendezvous.noncontiguous.origin_byte_counter_vaddr : p->rendezvous.contiguous.origin_byte_counter_vaddr; struct fi_opx_hmem_iov *iov = &p->rendezvous.noncontiguous.iov[0]; - + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { .qw0 = p->rendezvous.contiguous.immediate_info }; @@ -2294,7 +2294,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, uint16_t lrh_pktlen_le; size_t total_bytes_to_copy; uint16_t bytes; - + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { lrh_pktlen_le = ntohs(hdr->lrh_9B.pktlen); total_bytes_to_copy = @@ -2727,7 +2727,7 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, slid = (uint64_t)(uepkt->hdr.lrh_9B.slid); } else { slid = htons(((uepkt->hdr.lrh_16B.slid20 << 20) | (uepkt->hdr.lrh_16B.slid))); - } + } if (fi_opx_mp_egr_id_from_nth_packet(&uepkt->hdr, slid) == mp_egr_id.id) { @@ -2970,7 +2970,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, static_flags, opcode, origin_rs, is_intranode, - lock_required, reliability, + lock_required, reliability, hfi1_type, slid); return; @@ -3435,7 +3435,7 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, .unused = 0 }; - fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, + fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_intranode, lock_required, reliability, hfi1_type); if (context->byte_counter) { @@ -3601,6 +3601,7 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECV: context = %p\n", context); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "POST-RECV"); const uint64_t rx_op_flags = opx_ep->rx->op_flags; uint64_t rx_caps = opx_ep->rx->caps; @@ -3644,6 +3645,7 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, if (OFI_UNLIKELY(ext == NULL)) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECV RETURN 
FI_ENOMEM\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECV"); return -FI_ENOMEM; } struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &ext->hmem_info_qws[0]; @@ -3683,6 +3685,7 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, hfi1_type); } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECV"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECV RETURN\n"); return 0; @@ -3707,6 +3710,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, const enum opx_hfi1_type hfi1_type) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG\n"); + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "POST-RECVMSG"); FI_OPX_DEBUG_COUNTERS_INC_COND(!(flags & FI_MULTI_RECV), opx_ep->debug_counters.recv.posted_recv_msg); FI_OPX_DEBUG_COUNTERS_INC_COND((flags & FI_MULTI_RECV), opx_ep->debug_counters.recv.posted_multi_recv); assert(!lock_required); @@ -3798,6 +3802,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, if (OFI_UNLIKELY(ext == NULL)) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG (HMEM) RETURN FI_ENOMEM\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECVMSG"); return -FI_ENOMEM; } @@ -3863,6 +3868,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); if (OFI_UNLIKELY(ext == NULL)) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG RETURN FI_ENOMEM\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECVMSG"); return -FI_ENOMEM; } @@ -3885,6 +3891,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); 
@@ -4579,7 +4586,7 @@ ssize_t fi_opx_recvmsg_generic(struct fid_ep *ep, struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); - ssize_t rc = fi_opx_ep_rx_recvmsg_internal(opx_ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, + ssize_t rc = fi_opx_ep_rx_recvmsg_internal(opx_ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, av_type, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); diff --git a/prov/opx/include/rdma/opx/fi_opx_internal.h b/prov/opx/include/rdma/opx/fi_opx_internal.h index ebd0001c040..c53c3dd168d 100644 --- a/prov/opx/include/rdma/opx/fi_opx_internal.h +++ b/prov/opx/include/rdma/opx/fi_opx_internal.h @@ -41,6 +41,8 @@ #include #include +#include "rdma/opx/opx_tracer.h" + #define FI_OPX_CACHE_LINE_SIZE (64) #define FI_OPX_CQ_CONTEXT_EXT (0x8000000000000000ull) @@ -204,7 +206,12 @@ static inline int fi_opx_threading_lock_required(const enum fi_threading threadi static inline void fi_opx_lock_if_required (ofi_spin_t *lock, const int required) { - if (required) ofi_spin_lock(lock); + if (required) { + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_BEGIN, "LOCK"); + ofi_spin_lock(lock); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_END_SUCCESS, "LOCK"); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_BEGIN, "LOCK-HELD"); + } } static inline void fi_opx_lock (ofi_spin_t *lock) @@ -214,7 +221,12 @@ static inline void fi_opx_lock (ofi_spin_t *lock) static inline void fi_opx_unlock_if_required (ofi_spin_t *lock, const int required) { - if (required) ofi_spin_unlock(lock); + if (required) { + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_END_SUCCESS, "LOCK-HELD"); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_BEGIN, "UNLOCK"); + ofi_spin_unlock(lock); + OPX_TRACER_TRACE_LOCK_IF_REQUIRED(OPX_TRACER_END_SUCCESS, "UNLOCK"); + } } static inline void fi_opx_unlock (ofi_spin_t *lock) diff --git a/prov/opx/include/rdma/opx/opx_tracer.h 
b/prov/opx/include/rdma/opx/opx_tracer.h index b07fa80a2e3..a3ce127c918 100644 --- a/prov/opx/include/rdma/opx/opx_tracer.h +++ b/prov/opx/include/rdma/opx/opx_tracer.h @@ -118,7 +118,7 @@ int opx_tracer_enabled() } __OPX_FORCE_INLINE__ -void opx_tracer_trace(enum opx_tracer_status status, +void opx_tracer_trace(enum opx_tracer_status status, const char *func, int line, const char *msg) { struct timespec ts; @@ -132,20 +132,20 @@ void opx_tracer_trace(enum opx_tracer_status status, timestamp, opx_tracer.pid, func, line, OPX_TRACER_STATUS_STR[status], msg); } -#if defined(OPX_TRACER) || defined(OPX_TRACER_SDMA) || defined(OPX_TRACER_RELI) +#if defined(OPX_TRACER) || defined(OPX_TRACER_SDMA) || defined(OPX_TRACER_RELI) || defined(OPX_TRACER_LOCK_IF_REQUIRED) #define OPX_TRACER_INIT() opx_tracer_init() -#define OPX_TRACER_TRACE(status, fmt, ...) \ - do { \ - if (opx_tracer_enabled()) { \ - int saved_errno = errno; \ - char msg[1024]; \ +#define OPX_TRACER_TRACE(status, fmt, ...) \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ - opx_tracer_trace(status, \ - __func__, __LINE__, msg); \ - errno = saved_errno; \ - } \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ } while (0) #define OPX_TRACER_EXIT() opx_tracer_exit() @@ -159,15 +159,15 @@ void opx_tracer_trace(enum opx_tracer_status status, #if defined(OPX_TRACER_SDMA) #define OPX_TRACER_TRACE_SDMA(status, fmt, ...) 
\ - do { \ - if (opx_tracer_enabled()) { \ - int saved_errno = errno; \ - char msg[1024]; \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ - opx_tracer_trace(status, \ - __func__, __LINE__, msg); \ - errno = saved_errno; \ - } \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ } while (0) #else @@ -177,19 +177,37 @@ void opx_tracer_trace(enum opx_tracer_status status, #if defined(OPX_TRACER_RELI) #define OPX_TRACER_TRACE_RELI(status, fmt, ...) \ - do { \ - if (opx_tracer_enabled()) { \ - int saved_errno = errno; \ - char msg[1024]; \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ - opx_tracer_trace(status, \ - __func__, __LINE__, msg); \ - errno = saved_errno; \ - } \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ } while (0) #else #define OPX_TRACER_TRACE_RELI(status, ...) #endif +#if defined(OPX_TRACER_LOCK_IF_REQUIRED) + +#define OPX_TRACER_TRACE_LOCK_IF_REQUIRED(status, fmt, ...) \ + do { \ + if (opx_tracer_enabled()) { \ + int saved_errno = errno; \ + char msg[1024]; \ + snprintf(msg, sizeof(msg), fmt, ##__VA_ARGS__); \ + opx_tracer_trace(status, \ + __func__, __LINE__, msg); \ + errno = saved_errno; \ + } \ + } while (0) + +#else +#define OPX_TRACER_TRACE_LOCK_IF_REQUIRED(status, ...) 
+#endif + #endif From 699620058f65676f5c100a78d0f28f1a016a0eb5 Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Thu, 15 Aug 2024 15:18:08 -0400 Subject: [PATCH 019/393] opx/prov: Remove assert from find pkt by tag Signed-off-by: Lindsay Reiser --- prov/opx/include/rdma/opx/fi_opx_match.h | 1 - 1 file changed, 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_match.h b/prov/opx/include/rdma/opx/fi_opx_match.h index 2b0176fedce..0df7929e64e 100644 --- a/prov/opx/include/rdma/opx/fi_opx_match.h +++ b/prov/opx/include/rdma/opx/fi_opx_match.h @@ -223,7 +223,6 @@ struct fi_opx_hfi1_ue_packet *fi_opx_match_find_uepkt_by_tag(struct fi_opx_match struct fi_opx_debug_counters *debug_counters) { struct fi_opx_hfi1_ue_packet *uepkt = ue_hash->tag_ht[hash_index].head; - assert(uepkt); FI_OPX_DEBUG_COUNTERS_INC(debug_counters->match.ue_hash_tag_searches); From 355fe835c9f3c0d16bd9e5965496b61888102a7e Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Thu, 15 Aug 2024 17:08:36 -0500 Subject: [PATCH 020/393] prov/opx: Replace fi_opx_context_slist with slist This change replaces the use of fi_opx_context_slist and its related functions with a standard slist and slist functions. 
Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 80 ++++++++++--------- prov/opx/include/rdma/opx/fi_opx_eq.h | 52 ++++-------- .../include/rdma/opx/fi_opx_hfi1_transport.h | 48 +++++------ prov/opx/include/rdma/opx/fi_opx_internal.h | 55 ------------- prov/opx/src/fi_opx_cq.c | 28 +++---- prov/opx/src/fi_opx_ep.c | 42 +++++----- prov/opx/src/fi_opx_rma.c | 8 +- 7 files changed, 122 insertions(+), 191 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 7030936f242..2421c12596a 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -274,7 +274,7 @@ struct fi_opx_ep_tx { volatile uint64_t * pio_credits_addr; /* const; only used to infrequently "refresh" credit information */ volatile uint64_t * pio_scb_first; /* const; only eager and rendezvous */ uint64_t cq_bind_flags; - struct fi_opx_context_slist * cq_completed_ptr; + struct slist * cq_completed_ptr; uint32_t do_cq_completion; uint16_t unused_cacheline1; uint8_t force_credit_return; @@ -305,9 +305,9 @@ struct fi_opx_ep_tx { uint64_t op_flags; uint64_t caps; uint64_t mode; - struct fi_opx_context_slist * cq_err_ptr; + struct slist * cq_err_ptr; struct fi_opx_cq * cq; - struct fi_opx_context_slist * cq_pending_ptr; /* only rendezvous (typically) */ + struct slist * cq_pending_ptr; /* only rendezvous (typically) */ /* == CACHE LINE 14 == */ @@ -388,19 +388,19 @@ struct fi_opx_ep_rx { struct { struct fi_opx_hfi1_ue_packet_slist ue; /* 3 qws */ - struct fi_opx_context_slist mq; /* 2 qws */ + struct slist mq; /* 2 qws */ } queue[2]; /* 0 = FI_TAGGED, 1 = FI_MSG */ struct { struct fi_opx_hfi1_ue_packet_slist ue; /* 3 qws */ - struct fi_opx_context_slist mq; /* 2 qws */ + struct slist mq; /* 2 qws */ } mp_egr_queue; struct fi_opx_match_ue_hash * match_ue_tag_hash; /* == CACHE LINE 3 == */ - struct fi_opx_context_slist * cq_pending_ptr; - struct fi_opx_context_slist * 
cq_completed_ptr; + struct slist * cq_pending_ptr; + struct slist * cq_completed_ptr; struct ofi_bufpool * ue_packet_pool; struct ofi_bufpool * ctx_ext_pool; @@ -450,8 +450,8 @@ struct fi_opx_ep_rx { uint64_t mode; union fi_opx_addr self; - struct fi_opx_context_slist *cq_err_ptr; - struct fi_opx_cq * cq; + struct slist *cq_err_ptr; + struct fi_opx_cq *cq; struct opx_shm_rx shm; void *mem; @@ -966,7 +966,7 @@ uint32_t fi_opx_ep_get_u32_extended_rx (struct fi_opx_ep * opx_ep, } __OPX_FORCE_INLINE__ -void fi_opx_enqueue_completed(struct fi_opx_context_slist *queue, +void fi_opx_enqueue_completed(struct slist *queue, void *context, const uint64_t is_context_ext, const int lock_required) @@ -986,7 +986,7 @@ void fi_opx_enqueue_completed(struct fi_opx_context_slist *queue, real_context = (union fi_opx_context *) context; } - fi_opx_context_slist_insert_tail(real_context, queue); + slist_insert_tail((struct slist_entry *) real_context, queue); } __OPX_FORCE_INLINE__ @@ -1141,7 +1141,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); assert(context->next == NULL); if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr); } else if (OFI_LIKELY(xfer_len <= recv_len)) { @@ -1278,7 +1278,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, /* post a pending completion event for the individual receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr); } else { /* truncation - unlikely */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1317,7 +1317,7 @@ void 
fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, hfi1_type); if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr); /* Post a E_TRUNC to our local RX error queue because a client called receive with too small a buffer. Tell them about it via the error cq */ @@ -1353,7 +1353,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, /* post an 'error' completion event */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS"); @@ -1440,7 +1440,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, /* post a completion event for the individual receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_completed_ptr); } else if (OFI_LIKELY(send_len <= recv_len)) { if (is_hmem && send_len) { @@ -1548,7 +1548,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, /* post an 'error' completion event for the receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-INJECT"); @@ -1609,7 +1609,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, assert(context->next == NULL); /* post a completion event for the individual receive */ if (lock_required) { fprintf(stderr, 
"%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(context, rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_completed_ptr); } else if (OFI_LIKELY(send_len <= recv_len)) { @@ -1704,7 +1704,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, /* post an 'error' completion event for the receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-EAGER"); @@ -2780,7 +2780,7 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; assert((kind == FI_OPX_KIND_TAG && opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) || (kind == FI_OPX_KIND_MSG && opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST)); - union fi_opx_context * context = opx_ep->rx->queue[kind].mq.head; + union fi_opx_context * context = (union fi_opx_context *) opx_ep->rx->queue[kind].mq.head; union fi_opx_context * prev = NULL; while ( @@ -2817,7 +2817,9 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, } /* Found a match. 
Remove from the match queue */ - fi_opx_context_slist_remove_item(context, prev, &opx_ep->rx->queue[kind].mq); + slist_remove(&opx_ep->rx->queue[kind].mq, + (struct slist_entry *) context, + (struct slist_entry *) prev); uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; uint64_t is_hmem = context->flags & FI_OPX_CQ_CONTEXT_HMEM; @@ -2845,13 +2847,13 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, /* Only add this to the multi-packet egr queue if we still expect additional packets to come in */ if (context->byte_counter) { context->mp_egr_id = mp_egr_id; - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->mp_egr_queue.mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->mp_egr_queue.mq); } else { context->next = NULL; if (OFI_UNLIKELY(is_context_ext && ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, is_context_ext, lock_required); @@ -2881,7 +2883,7 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, /* Search mp-eager queue for the context w/ matching mp-eager ID */ const uint64_t mp_egr_id = fi_opx_mp_egr_id_from_nth_packet(hdr, slid); - union fi_opx_context *context = opx_ep->rx->mp_egr_queue.mq.head; + union fi_opx_context *context = (union fi_opx_context *) opx_ep->rx->mp_egr_queue.mq.head; union fi_opx_context *prev = NULL; FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(length); @@ -2926,11 +2928,13 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, if (!context->byte_counter) { /* Remove from the mp-eager queue */ - fi_opx_context_slist_remove_item(context, prev, &opx_ep->rx->mp_egr_queue.mq); + slist_remove(&opx_ep->rx->mp_egr_queue.mq, + (struct slist_entry *) context, + (struct slist_entry *) prev); if (OFI_UNLIKELY(is_context_ext && 
((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, is_context_ext, lock_required); @@ -2990,7 +2994,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, assert(static_flags & (FI_TAGGED | FI_MSG)); const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; - union fi_opx_context * context = opx_ep->rx->queue[kind].mq.head; + union fi_opx_context * context = (union fi_opx_context *) opx_ep->rx->queue[kind].mq.head; union fi_opx_context * prev = NULL; while (OFI_LIKELY(context != NULL) && @@ -3035,13 +3039,13 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, if (prev) prev->next = context->next; else { - assert(opx_ep->rx->queue[kind].mq.head == context); - opx_ep->rx->queue[kind].mq.head = context->next; + assert(opx_ep->rx->queue[kind].mq.head == (struct slist_entry *) context); + opx_ep->rx->queue[kind].mq.head = (struct slist_entry *) context->next; } if (context->next == NULL){ - assert(opx_ep->rx->queue[kind].mq.tail == context); - opx_ep->rx->queue[kind].mq.tail = prev; + assert(opx_ep->rx->queue[kind].mq.tail == (struct slist_entry *) context); + opx_ep->rx->queue[kind].mq.tail = (struct slist_entry *) prev; } context->next = NULL; @@ -3091,7 +3095,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, if (prev) prev->next = context->next; else - opx_ep->rx->queue[kind].mq.head = context->next; + opx_ep->rx->queue[kind].mq.head = (struct slist_entry *) context->next; if (context->next == NULL) opx_ep->rx->queue[kind].mq.tail = NULL; @@ -3103,7 +3107,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, // to ensure that any pending ops are completed (eg rendezvous multi-receive) if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } 
if(context->byte_counter == 0) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_completed_ptr); } } } else { @@ -3358,7 +3362,7 @@ int fi_opx_ep_cancel_context(struct fi_opx_ep * opx_ep, /* post an 'error' completion event for the canceled receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, opx_ep->rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) ext, opx_ep->rx->cq_err_ptr); return FI_ECANCELED; } @@ -3440,14 +3444,14 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, if (context->byte_counter) { context->mp_egr_id = mp_egr_id; - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->mp_egr_queue.mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->mp_egr_queue.mq); } else { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_process_context); context->next = NULL; if (OFI_UNLIKELY(is_context_ext && ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, is_context_ext, lock_required); @@ -3489,7 +3493,7 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, * (context) to the appropriate match queue */ context->next = NULL; - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->queue[kind].mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->queue[kind].mq); return 0; } @@ -3956,7 +3960,7 @@ void fi_opx_ep_tx_cq_completion_rzv(struct fid_ep *ep, opx_context->next = NULL; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail(opx_context, 
opx_ep->tx->cq_pending_ptr); + slist_insert_tail((struct slist_entry *) opx_context, opx_ep->tx->cq_pending_ptr); } __OPX_FORCE_INLINE__ diff --git a/prov/opx/include/rdma/opx/fi_opx_eq.h b/prov/opx/include/rdma/opx/fi_opx_eq.h index 597f8031389..a3c353f654b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_eq.h +++ b/prov/opx/include/rdma/opx/fi_opx_eq.h @@ -128,9 +128,9 @@ struct fi_opx_cq { /* == CACHE LINE == */ - struct fi_opx_context_slist pending; - struct fi_opx_context_slist completed; - struct fi_opx_context_slist err; /* 'struct fi_opx_context_ext' element linked list */ + struct slist pending; + struct slist completed; + struct slist err; /* 'struct fi_opx_context_ext' element linked list */ struct { uint64_t ep_count; @@ -218,14 +218,7 @@ int fi_opx_cq_enqueue_pending (struct fi_opx_cq * opx_cq, if (lock_required) { FI_WARN(fi_opx_global.prov, FI_LOG_CQ, "unimplemented\n"); abort(); } - union fi_opx_context * tail = opx_cq->pending.tail; - context->next = NULL; - if (tail) { - tail->next = context; - } else { - opx_cq->pending.head = context; - } - opx_cq->pending.tail = context; + slist_insert_tail((struct slist_entry *) context, &opx_cq->pending); return 0; } @@ -243,19 +236,7 @@ int fi_opx_cq_enqueue_completed (struct fi_opx_cq * opx_cq, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== MANUAL PROGRESS COMPLETION CQ ENQUEUED\n"); - union fi_opx_context * tail = opx_cq->completed.tail; - context->next = NULL; - if (tail) { - - assert(NULL != opx_cq->completed.head); - tail->next = context; - opx_cq->completed.tail = context; - - } else { - assert(NULL == opx_cq->completed.head); - opx_cq->completed.head = context; - opx_cq->completed.tail = context; - } + slist_insert_tail((struct slist_entry *) context, &opx_cq->completed); return 0; } @@ -320,8 +301,8 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, /* examine each context in the pending completion queue and, if the * operation is complete, initialize the cq 
entry in the application * buffer and remove the context from the queue. */ - union fi_opx_context * pending_head = opx_cq->pending.head; - union fi_opx_context * pending_tail = opx_cq->pending.tail; + union fi_opx_context * pending_head = (union fi_opx_context *) opx_cq->pending.head; + union fi_opx_context * pending_tail = (union fi_opx_context *) opx_cq->pending.tail; if (NULL != pending_head) { union fi_opx_context * context = pending_head; @@ -346,7 +327,8 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, multi_recv_context->byte_counter == 0) { /* Signal the user to repost their buffers */ assert(multi_recv_context->next == NULL); - fi_opx_context_slist_insert_tail(multi_recv_context, opx_ep->rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) multi_recv_context, + opx_ep->rx->cq_completed_ptr); } } else if (context->flags & FI_OPX_CQ_CONTEXT_EXT) { struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) context; @@ -374,12 +356,12 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, } /* save the updated pending head and pending tail pointers */ - opx_cq->pending.head = pending_head; - opx_cq->pending.tail = pending_tail; + opx_cq->pending.head = (struct slist_entry *) pending_head; + opx_cq->pending.tail = (struct slist_entry *) pending_tail; } - union fi_opx_context * head = opx_cq->completed.head; + union fi_opx_context * head = (union fi_opx_context *) opx_cq->completed.head; if (head) { union fi_opx_context * context = head; while ((count - num_entries) > 0 && context != NULL) { @@ -387,7 +369,7 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, ++ num_entries; context = context->next; } - opx_cq->completed.head = context; + opx_cq->completed.head = (struct slist_entry *) context; if (!context) opx_cq->completed.tail = NULL; } @@ -438,8 +420,8 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, fi_opx_lock(&opx_cq->progress.ep[i]->lock); 
fi_opx_ep_rx_poll(&opx_cq->progress.ep[i]->ep_fid, caps, reliability, FI_OPX_HDRQ_MASK_8192, hfi1_type); fi_opx_unlock(&opx_cq->progress.ep[i]->lock); - } - + } + } else { for (i=0; iprogress.ep[i]->lock); @@ -488,7 +470,7 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, ++ num_entries; context = context->next; } - opx_cq->completed.head = context; + opx_cq->completed.head = (struct slist_entry *) context; if (!context) opx_cq->completed.tail = NULL; return num_entries; @@ -514,7 +496,7 @@ ssize_t fi_opx_cq_read_generic_non_locking (struct fid_cq *cq, void *buf, size_t const uint64_t caps, const enum opx_hfi1_type hfi1_type) { - return fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); + return fi_opx_cq_poll_inline(cq, buf, count, NULL, format, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, caps, hfi1_type); } __OPX_FORCE_INLINE__ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index ae513973669..318433d9195 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -104,7 +104,7 @@ void fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== TX CQ COMPLETION QUEUED\n"); - fi_opx_context_slist_insert_tail(opx_context, opx_ep->tx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) opx_context, opx_ep->tx->cq_completed_ptr); } // faster than memcpy() for this amount of data. 
@@ -627,7 +627,7 @@ void fi_opx_store_inject_and_copy_scb2_16B(volatile uint64_t scb[8], uint64_t *local, uint64_t d8) { // 2nd cacheline PIO (only) padded out - + OPX_HFI1_BAR_STORE(&scb[0], d8); // tag OPX_HFI1_BAR_STORE(&scb[1], 0); OPX_HFI1_BAR_STORE(&scb[2], 0); @@ -1069,9 +1069,9 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, hdr->qw_9B[6] = tag; } else { - hdr->qw_16B[0] = opx_ep->tx->inject_16B.hdr.qw_16B[0] | + hdr->qw_16B[0] = opx_ep->tx->inject_16B.hdr.qw_16B[0] | ((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B); - hdr->qw_16B[1] = opx_ep->tx->inject_16B.hdr.qw_16B[1] | + hdr->qw_16B[1] = opx_ep->tx->inject_16B.hdr.qw_16B[1] | (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | ((caps & FI_MSG) ? /* compile-time constant expression */ @@ -1103,7 +1103,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 
1 : 2; - if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < credits_needed)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; @@ -1147,7 +1147,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); uint64_t local_temp[16] = {0}; - + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { fi_opx_store_inject_and_copy_scb_9B(scb, local_temp, opx_ep->tx->inject_9B.qw0 | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | @@ -1204,7 +1204,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); } - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, dest_rx, psn_ptr, replay, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-INJECT-HFI"); @@ -1274,7 +1274,7 @@ bool fi_opx_hfi1_fill_from_iov8(const struct iovec *iov, /* In: iovec array * return false; } -static inline void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, +static inline void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, const enum opx_hfi1_type hfi1_type); __OPX_FORCE_INLINE__ @@ -2252,16 +2252,16 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48), *((uint64_t *)buf), tag); - + } else { uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); fi_opx_store_and_copy_qw(scb, local_storage, opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | 
OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, - opx_ep->tx->send_16B.hdr.qw_16B[0] | + opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_packet_length << 20), - opx_ep->tx->send_16B.hdr.qw_16B[1] | + opx_ep->tx->send_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | @@ -2316,9 +2316,9 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep, - union fi_opx_hfi1_pio_state *pio_state, + union fi_opx_hfi1_pio_state *pio_state, uint64_t *local_storage, - uint64_t *buf_qws, + uint64_t *buf_qws, const size_t hdr_and_payload_qws, const uint64_t tag) { @@ -2328,7 +2328,7 @@ ssize_t fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep union fi_opx_hfi1_pio_state pio_local = *pio_state; volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_local); - + // spill from 1st cacheline (SOP) OPX_HFI1_BAR_STORE(&scb_payload[0], tag); // header local_storage[8] = tag; /* todo: pretty sure it's already there */ @@ -2345,7 +2345,7 @@ ssize_t fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep local_storage[8 + i] = -1UL; } } - + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); FI_OPX_HFI1_CONSUME_CREDITS(pio_local, 1); @@ -2470,7 +2470,7 @@ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { - + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_source); else @@ -3007,7 +3007,7 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(struct fi_opx_ep *opx_ep, 
((uint64_t)lrh_dws << 20), opx_ep->tx->send_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), - + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | (uint64_t)FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, @@ -3331,18 +3331,18 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth_16B (struct fi_opx_ep *opx_ep, const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); - + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND 16B, HFI -- MULTI-PACKET EAGER NTH (begin)\n"); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-MP-EAGER-NTH-HFI"); - + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; ssize_t total_credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, FI_OPX_MP_EGR_CHUNK_CREDITS); if (OFI_UNLIKELY(total_credits_available < 0)) { OPX_TRACER_TRACE(OPX_TRACER_END_ENOBUFS, "SEND-MP-EAGER-NTH-HFI"); return -FI_ENOBUFS; } - + struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int32_t psn; @@ -3365,13 +3365,13 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth_16B (struct fi_opx_ep *opx_ep, psn, mp_egr_uid, hfi1_type); - + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL); /* header and payload */ #ifndef NDEBUG - credits_consumed += + credits_consumed += #endif fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, (((uint64_t) mp_egr_uid) << 32) | payload_offset, buf_qws); @@ -3655,7 +3655,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, buf_qws = (uint64_t*)((uintptr_t)buf + 56); if (full_block_credits_needed) full_block_credits_needed--; - + if (OFI_LIKELY(full_block_credits_needed)) { #ifndef NDEBUG credits_consumed += @@ -3665,7 +3665,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, 
full_block_credits_needed, total_credits_available - 2); } - + if (OFI_LIKELY(tail_partial_block_qws)) { #ifndef NDEBUG credits_consumed += diff --git a/prov/opx/include/rdma/opx/fi_opx_internal.h b/prov/opx/include/rdma/opx/fi_opx_internal.h index c53c3dd168d..b8ea3eefd94 100644 --- a/prov/opx/include/rdma/opx/fi_opx_internal.h +++ b/prov/opx/include/rdma/opx/fi_opx_internal.h @@ -91,61 +91,6 @@ union fi_opx_context { }; }; -struct fi_opx_context_slist { - union fi_opx_context * head; - union fi_opx_context * tail; -}; - -static inline void fi_opx_context_slist_init (struct fi_opx_context_slist* list) -{ - list->head = list->tail = NULL; -} - -static inline int fi_opx_context_slist_empty (struct fi_opx_context_slist* list) -{ - return !list->head; -} - -static inline void fi_opx_context_slist_insert_head (union fi_opx_context *item, - struct fi_opx_context_slist* list) -{ - assert(item->next == NULL); - if (fi_opx_context_slist_empty(list)) - list->tail = item; - else - item->next = list->head; - - list->head = item; -} - -static inline void fi_opx_context_slist_insert_tail (union fi_opx_context *item, - struct fi_opx_context_slist* list) -{ - assert(item->next == NULL); - if (fi_opx_context_slist_empty(list)) - list->head = item; - else - list->tail->next = item; - - list->tail = item; -} - -static inline void fi_opx_context_slist_remove_item (union fi_opx_context *item, - union fi_opx_context *prev, struct fi_opx_context_slist *list) -{ - if (prev) { - prev->next = item->next; - } else { - list->head = item->next; - } - - if (item->next == NULL) { - list->tail = prev; - } - - item->next = NULL; -} - struct fi_opx_context_ext { union fi_opx_context opx_context; struct fi_cq_err_entry err_entry; diff --git a/prov/opx/src/fi_opx_cq.c b/prov/opx/src/fi_opx_cq.c index f2712436572..f1e5e098d5f 100644 --- a/prov/opx/src/fi_opx_cq.c +++ b/prov/opx/src/fi_opx_cq.c @@ -58,7 +58,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { len -= n; if 
(opx_cq->completed.head != NULL) { - context = opx_cq->completed.head; + context = (union fi_opx_context *) opx_cq->completed.head; n = snprintf(s, len, " = { %p", context); s += n; len -= n; context = context->next; @@ -73,7 +73,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { n = 0; len = 2047; s = str; *s = 0; n = snprintf(s, len, "%s():%d [%p] pending(%p,%p)", func, line, opx_cq, opx_cq->pending.head, opx_cq->pending.tail); s += n; len -= n; if (opx_cq->pending.head != NULL) { - context = opx_cq->pending.head; + context = (union fi_opx_context *) opx_cq->pending.head; n = snprintf(s, len, " = { %p(%lu,0x%016lx)", context, context->byte_counter, context->byte_counter); s += n; len -= n; context = context->next; @@ -89,7 +89,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { n = 0; len = 2047; s = str; *s = 0; n = snprintf(s, len, "%s():%d [%p] err(%p,%p)", func, line, opx_cq, opx_cq->err.head, opx_cq->err.tail); s += n; len -= n; if (opx_cq->err.head != NULL) { - context = opx_cq->err.head; + context = (union fi_opx_context *) opx_cq->err.head; n = snprintf(s, len, " = { %p(%lu)", context, context->byte_counter); s += n; len -= n; context = context->next; @@ -142,7 +142,7 @@ static int fi_opx_close_cq(fid_t fid) free(opx_cq); opx_cq = NULL; - //opx_cq (the object passed in as fid) is now unusable + //opx_cq (the object passed in as fid) is now unusable FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_CQ, "cq closed\n"); return 0; @@ -193,7 +193,7 @@ int fi_opx_cq_enqueue_err (struct fi_opx_cq * opx_cq, assert(!lock_required); ext->opx_context.next = NULL; - fi_opx_context_slist_insert_tail((union fi_opx_context *)ext, &opx_cq->err); + slist_insert_tail((struct slist_entry *) ext, &opx_cq->err); return 0; } @@ -227,7 +227,7 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, if (hfi1_type & OPX_HFI1_WFR) { - switch(rcvhdrcnt) { + switch(rcvhdrcnt) { case 2048: return lock_required ? 
fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 0) : fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 0); @@ -235,12 +235,12 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 0) : fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 0); default: - FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 0) : fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 0); } } else if (hfi1_type & OPX_HFI1_JKR_9B) { - switch(rcvhdrcnt) { + switch(rcvhdrcnt) { case 2048: return lock_required ? fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 1) : fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 1); @@ -248,12 +248,12 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 1) : fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 1); default: - FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 1) : fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 1); } } else if (hfi1_type & OPX_HFI1_JKR) { - switch(rcvhdrcnt) { + switch(rcvhdrcnt) { case 2048: return lock_required ? 
fi_opx_cq_select_locking_2048_ops(format, reliability, comm_caps, 2) : fi_opx_cq_select_non_locking_2048_ops(format, reliability, comm_caps, 2); @@ -261,7 +261,7 @@ struct fi_ops_cq * fi_opx_cq_select_ops(const enum fi_cq_format format, return lock_required ? fi_opx_cq_select_locking_8192_ops(format, reliability, comm_caps, 2) : fi_opx_cq_select_non_locking_8192_ops(format, reliability, comm_caps, 2); default: - FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); + FI_INFO(fi_opx_global.prov, FI_LOG_CQ, "WARNING: non-optimal setting specified for hfi1 rcvhdrcnt. Optimal values are 2048 and 8192\n"); return lock_required ? fi_opx_cq_select_locking_runtime_ops(format, reliability, comm_caps, 2) : fi_opx_cq_select_non_locking_runtime_ops(format, reliability, comm_caps, 2); } @@ -307,9 +307,9 @@ int fi_opx_cq_open(struct fid_domain *dom, opx_cq->format = attr->format ? attr->format : FI_CQ_FORMAT_CONTEXT; - fi_opx_context_slist_init(&opx_cq->pending); - fi_opx_context_slist_init(&opx_cq->completed); - fi_opx_context_slist_init(&opx_cq->err); + slist_init(&opx_cq->pending); + slist_init(&opx_cq->completed); + slist_init(&opx_cq->err); opx_cq->ep_bind_count = 0; opx_cq->progress.ep_count = 0; diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 8ee217acca7..5474e6cd9d6 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -388,7 +388,7 @@ void fi_opx_ep_tx_model_init_16B (struct fi_opx_hfi1_context * hfi, 4 + /* software kdeth + unused */ 2 + /* ICRC and tail */ 2 ; /* second cacheline */ - + inject_16B->qw0 = OPX_PBC_LEN(pbc_dws,hfi1_type) /* length_dws */ | OPX_PBC_VL(hfi->vl,hfi1_type) | OPX_PBC_SC(hfi->sc,hfi1_type) | @@ -463,7 +463,7 @@ int fi_opx_stx_init (struct fi_opx_domain *opx_domain, struct fi_tx_attr *attr, &opx_stx->tx.inject, &opx_stx->tx.send, &opx_stx->tx.rzv); - + fi_opx_ep_tx_model_init_16B(opx_stx->hfi, opx_stx->reliability_rx, 
&opx_stx->tx.inject_16B, @@ -956,7 +956,7 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, &opx_ep->tx->inject_9B, &opx_ep->tx->send_9B, &opx_ep->tx->rzv_9B); - + fi_opx_ep_tx_model_init_16B(hfi, opx_ep->reliability->rx, &opx_ep->tx->inject_16B, @@ -1562,7 +1562,7 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) return -errno; } fi_opx_ref_inc(&opx_ep->hfi->ref_cnt, "HFI context"); - + fi_opx_global.hfi_local_info.type = opx_ep->hfi->hfi_hfi1_type; int mixed_network = 0; @@ -1709,9 +1709,9 @@ static int fi_opx_open_command_queues(struct fi_opx_ep *opx_ep) opx_ep->rx->mp_egr_queue.ue.tail = NULL; /* Context match queues (queue[0] == FI_TAGGED, queue[1] == FI_MSG) */ - fi_opx_context_slist_init(&opx_ep->rx->queue[0].mq); - fi_opx_context_slist_init(&opx_ep->rx->queue[1].mq); - fi_opx_context_slist_init(&opx_ep->rx->mp_egr_queue.mq); + slist_init(&opx_ep->rx->queue[0].mq); + slist_init(&opx_ep->rx->queue[1].mq); + slist_init(&opx_ep->rx->mp_egr_queue.mq); opx_ep->tx->cq = NULL; opx_ep->tx->cq_pending_ptr = NULL; @@ -1974,11 +1974,11 @@ static int fi_opx_setopt_ep(fid_t fid, int level, int optname, break; case FI_OPT_CUDA_API_PERMITTED: if (!hmem_ops[FI_HMEM_CUDA].initialized) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_CTRL, - "Cannot set CUDA API permitted when" - "CUDA library or CUDA device is not available\n"); - return -FI_EINVAL; - } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_CTRL, + "Cannot set CUDA API permitted when" + "CUDA library or CUDA device is not available\n"); + return -FI_EINVAL; + } /* our HMEM support does not make calls to CUDA API, * therefore we can accept any option for FI_OPT_CUDA_API_PERMITTED. 
*/ @@ -1990,7 +1990,6 @@ static int fi_opx_setopt_ep(fid_t fid, int level, int optname, return 0; } - int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, const uint64_t static_flags, const union fi_opx_context * cancel_context, @@ -2006,7 +2005,7 @@ int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, */ union fi_opx_context * prev = NULL; - union fi_opx_context * item = rx->queue[kind].mq.head; + union fi_opx_context * item = (union fi_opx_context *) rx->queue[kind].mq.head; while (item) { const uint64_t is_context_ext = item->flags & FI_OPX_CQ_CONTEXT_EXT; @@ -2018,10 +2017,10 @@ int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, if (prev) prev->next = item->next; else - rx->queue[kind].mq.head = item->next; + rx->queue[kind].mq.head = (struct slist_entry *) item->next; if (!item->next) - rx->queue[kind].mq.tail = prev; + rx->queue[kind].mq.tail = (struct slist_entry *) prev; struct fi_opx_context_ext * ext = NULL; if (cancel_context->flags & FI_OPX_CQ_CONTEXT_EXT) { @@ -2052,7 +2051,7 @@ int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, ext->err_entry.err_data_size = 0; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - fi_opx_context_slist_insert_tail((union fi_opx_context*)ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(end) canceled\n"); return FI_ECANCELED; @@ -2845,7 +2844,8 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, // to ensure that any pending ops are completed (eg rendezvous multi-receive) if(context->byte_counter == 0) { assert(context->next == NULL); - fi_opx_context_slist_insert_tail(context, opx_ep->rx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, + opx_ep->rx->cq_completed_ptr); } return; @@ -2863,7 +2863,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, * no unexpected headers were matched; add this match * information to the 
appropriate match queue */ - fi_opx_context_slist_insert_tail(context, &opx_ep->rx->queue[kind].mq); + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->queue[kind].mq); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(end)\n"); @@ -2932,7 +2932,7 @@ void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ uint16_t lrh_pktlen_le; - size_t total_bytes; + size_t total_bytes; size_t payload_bytes; uint32_t slid; @@ -3247,7 +3247,7 @@ ssize_t fi_opx_recv_FABRIC_DIRECT(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - + /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { return FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 914e012da67..a2355cef4b4 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -65,7 +65,7 @@ void fi_opx_hit_zero(struct fi_opx_completion_counter *cc) opx_context->tag = 0; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== CQ ENQUEUE COMPLETION\n"); - fi_opx_cq_enqueue_completed(cc->cq, cc->context, 0); + fi_opx_cq_enqueue_completed(cc->cq, cc->context, FI_OPX_LOCK_NOT_REQUIRED); } else { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== NO CQ COMPLETION\n"); } @@ -279,7 +279,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) params->opx_target_addr.reliability_rx, params->dest_rx, psn_ptr, replay, params->reliability, OPX_HFI1_TYPE); - + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; @@ -1174,7 +1174,7 @@ static inline ssize_t fi_opx_rma_writev(struct fid_ep *ep, const struct iovec *i FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); abort(); } - + fi_opx_unlock_if_required(&opx_ep->lock, 
lock_required); return rc; } @@ -1345,7 +1345,7 @@ int fi_opx_enable_rma_ops(struct fid_ep *ep) FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); abort(); } - + return 0; err: return -errno; From 609695533afeb818a235362e7fde16a2e735743f Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Thu, 22 Aug 2024 13:06:20 -0400 Subject: [PATCH 021/393] github/actions: Add concurrency group to Cornelis Networks workflow Add a concurrency group to enable preemption. Limit the triggering event to pull_request defaults. Signed-off-by: Jack Morrison --- .github/workflows/cn.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml index f28b00ca3f8..78166efbf47 100644 --- a/.github/workflows/cn.yml +++ b/.github/workflows/cn.yml @@ -3,17 +3,16 @@ name: 'Cornelis' on: workflow_dispatch: pull_request: - types: - - labeled - - opened - - reopened - - synchronize branches: - main paths-ignore: - 'man/**' - 'docs/**' +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: opx-ci: name: OPX CI From 16e34eedd4e72140947e9c651e41f8c14f37fa32 Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Fri, 23 Aug 2024 13:22:05 -0400 Subject: [PATCH 022/393] github/actions: Add an on-merge job to Cornelis Networks workflow Add a second job to the internal Cornelis Networks workflows. This is triggered by merge events in the cornelisnetworks/libfabric-internal repository, and skipped otherwise. 
Signed-off-by: Jack Morrison --- .github/workflows/cn.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml index 78166efbf47..b6ca2fabe0a 100644 --- a/.github/workflows/cn.yml +++ b/.github/workflows/cn.yml @@ -3,6 +3,11 @@ name: 'Cornelis' on: workflow_dispatch: pull_request: + types: + - opened + - reopened + - synchronize + - closed branches: - main paths-ignore: @@ -16,5 +21,13 @@ concurrency: jobs: opx-ci: name: OPX CI - if: github.repository == 'cornelisnetworks/libfabric-internal' + if: | + github.repository == 'cornelisnetworks/libfabric-internal' && + github.event.pull_request.merged != true uses: cornelisnetworks/libfabric-devel/.github/workflows/cn.yml@master + on-merge: + name: On-Merge + if: | + github.repository == 'cornelisnetworks/libfabric-internal' && + github.event.pull_request.merged == true + uses: cornelisnetworks/libfabric-devel/.github/workflows/merge.yml@master From e9bc6169cefd68815107af30aab116ed4d1d58ae Mon Sep 17 00:00:00 2001 From: Elias Kozah Date: Tue, 27 Aug 2024 21:37:43 -0400 Subject: [PATCH 023/393] prov/opx: Resolve coverity scan defects uncovered after upstream Signed-off-by: Elias Kozah --- prov/opx/src/fi_opx_domain.c | 7 +++- prov/opx/src/fi_opx_rma.c | 55 +++++++++++++++++++++----------- prov/opx/src/fi_opx_tid_domain.c | 6 +++- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/prov/opx/src/fi_opx_domain.c b/prov/opx/src/fi_opx_domain.c index 6a0f7cbd291..cffe97b40b6 100644 --- a/prov/opx/src/fi_opx_domain.c +++ b/prov/opx/src/fi_opx_domain.c @@ -492,7 +492,7 @@ int fi_opx_domain(struct fid_fabric *fabric, strncpy(opx_domain->unique_job_key_str, env_var_uuid, OPX_JOB_KEY_STR_SIZE-1); opx_domain->unique_job_key_str[OPX_JOB_KEY_STR_SIZE-1] = '\0'; - sscanf(opx_domain->unique_job_key_str, "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx", + int elements_read = 
sscanf(opx_domain->unique_job_key_str, "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx", &opx_domain->unique_job_key[0], &opx_domain->unique_job_key[1], &opx_domain->unique_job_key[2], @@ -509,6 +509,11 @@ int fi_opx_domain(struct fid_fabric *fabric, &opx_domain->unique_job_key[13], &opx_domain->unique_job_key[14], &opx_domain->unique_job_key[15]); + if (elements_read == EOF) { + FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, "Error: sscanf encountered an input failure (EOF), unable to parse the unique job key string.\n"); + errno = FI_EINVAL; + goto err; + } FI_INFO(fi_opx_global.prov, FI_LOG_DOMAIN, "Domain unique job key set to %s\n", opx_domain->unique_job_key_str); //TODO: Print out a summary of all domain settings wtih FI_INFO diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index a2355cef4b4..0dae5a3bbf0 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -469,9 +469,14 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void cc->hit_zero = fi_opx_hit_zero; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; for (index = 0; index < count; ++index) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } struct fi_opx_hmem_iov hmem_iov; const uint64_t is_hmem = fi_opx_hmem_iov_init(iov[index].iov_base, iov[index].iov_len, @@ -484,8 +489,6 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void lock_required, caps, reliability, hfi1_type); addr_offset += iov[index].iov_len; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? 
*mr_ptr_array : NULL; } return 0; @@ -603,12 +606,17 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg uintptr_t msg_iov_vaddr = (uintptr_t)msg->msg_iov[msg_iov_index].iov_base; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)msg->desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; struct fi_opx_hmem_iov iov; - uint64_t is_hmem = fi_opx_hmem_iov_init((void *)msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); while (msg_iov_bytes != 0 && rma_iov_bytes != 0) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } + uint64_t is_hmem = fi_opx_hmem_iov_init((void *)msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); size_t len = (msg_iov_bytes <= rma_iov_bytes) ? msg_iov_bytes : rma_iov_bytes; iov.buf = msg_iov_vaddr; iov.len = len; @@ -624,8 +632,6 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg ++msg_iov_index; msg_iov_bytes = msg->msg_iov[msg_iov_index].iov_len; msg_iov_vaddr = (uintptr_t)msg->msg_iov[msg_iov_index].iov_base; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; is_hmem = fi_opx_hmem_iov_init((void *)msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); } @@ -789,18 +795,21 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, /* max 8 descriptors (iovecs) per readv_internal */ struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? 
*mr_ptr_array : NULL; const size_t full_count = count >> 3; for (index = 0; index < full_count; index += 8) { for (int i = 0; i < 8; ++i) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } hmem_iface = fi_opx_hmem_get_iface(iov[index + i].iov_base, mr_ptr, &hmem_device); hmem_iovs[i].buf = (uintptr_t) iov[index + i].iov_base; hmem_iovs[i].len = iov[index + i].iov_len; hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = hmem_device; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; } fi_opx_readv_internal(opx_ep, hmem_iovs, 8, opx_addr, addr_v, key_v, NULL, 0, NULL, NULL, cc, FI_VOID, FI_NOOP, @@ -811,13 +820,18 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, /* if 'partial_ndesc' is zero, the fi_opx_readv_internal() will fence */ const size_t partial_ndesc = count & 0x07ull; for (int i = 0; i < partial_ndesc; ++i) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } hmem_iface = fi_opx_hmem_get_iface(iov[index + i].iov_base, mr_ptr, &hmem_device); hmem_iovs[i].buf = (uintptr_t) iov[index + i].iov_base; hmem_iovs[i].len = iov[index + i].iov_len; hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = hmem_device; - ++mr_ptr_array; - mr_ptr = mr_ptr_present ? *mr_ptr_array : NULL; } fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, key_v, opx_context, tx_op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, @@ -920,10 +934,15 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, cc->hit_zero = fi_opx_hit_zero; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)msg->desc; - const uint64_t mr_ptr_present = (mr_ptr_array != NULL); - struct fi_opx_mr *mr_ptr = mr_ptr_present ? 
*mr_ptr_array : NULL; while (src_iov_index < src_iov_count) { for (niov = 0; niov < 8; ++niov) { + struct fi_opx_mr *mr_ptr; + if (mr_ptr_array != NULL) { + mr_ptr = *mr_ptr_array; + ++mr_ptr_array; + } else { + mr_ptr = NULL; + } const size_t len = (dst_iov_bytes <= src_iov_bytes) ? dst_iov_bytes : src_iov_bytes; fi_opx_hmem_iov_init(dst_iov_vaddr, len, mr_ptr, &iov[niov]); @@ -987,8 +1006,6 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, ++dst_iov_index; dst_iov_bytes = msg->msg_iov[dst_iov_index].iov_len; dst_iov_vaddr = msg->msg_iov[dst_iov_index].iov_base; - ++mr_ptr_array; - mr_ptr = (mr_ptr_present) ? *mr_ptr_array : NULL; } } else { dst_iov_vaddr = (void *)((uintptr_t)dst_iov_vaddr + len); diff --git a/prov/opx/src/fi_opx_tid_domain.c b/prov/opx/src/fi_opx_tid_domain.c index 03024c82f50..946edbab2d2 100644 --- a/prov/opx/src/fi_opx_tid_domain.c +++ b/prov/opx/src/fi_opx_tid_domain.c @@ -149,7 +149,11 @@ int opx_close_tid_domain(struct opx_tid_domain *tid_domain, int locked) } dlist_remove(&tid_domain->list_entry); - ofi_domain_close(&tid_domain->util_domain); + int ret = ofi_domain_close(&tid_domain->util_domain); + if (ret != 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, "Error closing domain: %d\n", ret); + } + free(tid_domain); return 0; From 316656e360ac824db95d6a388fd56c366d5e90d6 Mon Sep 17 00:00:00 2001 From: Cody Mann Date: Wed, 28 Aug 2024 11:54:29 -0400 Subject: [PATCH 024/393] prov/opx: Fixing bug for credit check in inject code path. 
Signed-off-by: Cody Mann --- prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 318433d9195..63077754d6e 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -1108,7 +1108,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; - if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < 1) { + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, credits_needed) < credits_needed) { return -FI_EAGAIN; } } From ca9f385ff1e5c0bc8001c3fbfb3518b0029948b1 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Thu, 12 Sep 2024 15:08:16 +0000 Subject: [PATCH 025/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_opx.7 | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index df7e5966147..8d9d27d42ff 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_opx" "7" "2024\-07\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_opx" "7" "2024\-09\-12" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -312,6 +312,12 @@ registered copy when receiving data into GPU. The default threshold is 8192. This has no meaning if Libfabric was not configured with GDRCopy or ROCR support. +.TP +\f[I]FI_OPX_MIXED_NETWORK\f[R] +Integer. +Indicates that the network is a mix of OPA100 and CN5000. +Needs to be set to 1 in case of mixed network. +Default is 0. 
.SH SEE ALSO .PP \f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](7), From caa802652fdba841e4db3db1e0f8748943ac4640 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 3 Sep 2024 17:21:22 -0700 Subject: [PATCH 026/393] fabtests/fi_dgram_waitset: remove test (deprecated feature) fi_wait_open and fi_wait are deprecated in 2.0 Signed-off-by: Alexia Ingerson --- fabtests/Makefile.am | 6 - fabtests/Makefile.win | 4 +- fabtests/fabtests.vcxproj | 1 - fabtests/fabtests.vcxproj.filters | 3 - fabtests/functional/dgram_waitset.c | 192 ------------------- fabtests/man/fabtests.7.md | 3 - fabtests/pytest/default/test_dgram.py | 6 - fabtests/scripts/runfabtests.sh | 1 - fabtests/test_configs/efa/efa-neuron.exclude | 3 - fabtests/test_configs/efa/efa.exclude | 3 - fabtests/test_configs/psm3/psm3.exclude | 1 - 11 files changed, 1 insertion(+), 222 deletions(-) delete mode 100644 fabtests/functional/dgram_waitset.c diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index ac193830365..3e86d30dd79 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -22,7 +22,6 @@ bin_PROGRAMS = \ functional/fi_rdm_deferred_wq \ functional/fi_dgram \ functional/fi_mcast \ - functional/fi_dgram_waitset \ functional/fi_rdm_tagged_peek \ functional/fi_cq_data \ functional/fi_poll \ @@ -297,10 +296,6 @@ functional_fi_mcast_SOURCES = \ functional/mcast.c functional_fi_mcast_LDADD = libfabtests.la -functional_fi_dgram_waitset_SOURCES = \ - functional/dgram_waitset.c -functional_fi_dgram_waitset_LDADD = libfabtests.la - functional_fi_rdm_tagged_peek_SOURCES = \ functional/rdm_tagged_peek.c functional_fi_rdm_tagged_peek_LDADD = libfabtests.la @@ -624,7 +619,6 @@ dummy_man_pages = \ man/man1/fi_cm_data.1 \ man/man1/fi_cq_data.1 \ man/man1/fi_dgram.1 \ - man/man1/fi_dgram_waitset.1 \ man/man1/fi_inj_complete.1 \ man/man1/fi_mcast.1 \ man/man1/fi_msg.1 \ diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index e6b4f8a76b4..2eead3de38a 100644 --- 
a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -78,7 +78,7 @@ benchmarks: $(outdir)\dgram_pingpong.exe $(outdir)\msg_bw.exe \ $(outdir)\rdm_bw.exe $(outdir)\rdm_tagged_pingpong.exe $(outdir)\rma_bw.exe functional: $(outdir)\av_xfer.exe $(outdir)\bw.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ - $(outdir)\dgram.exe $(outdir)\dgram_waitset.exe $(outdir)\msg.exe $(outdir)\msg_epoll.exe \ + $(outdir)\dgram.exe $(outdir)\msg.exe $(outdir)\msg_epoll.exe \ $(outdir)\inject_test.exe $(outdir)\msg_sockets.exe $(outdir)\multi_mr.exe \ $(outdir)\multi_ep.exe $(outdir)\multi_recv.exe $(outdir)\poll.exe $(outdir)\rdm.exe \ $(outdir)\rdm_atomic.exe $(outdir)\rdm_multi_client.exe $(outdir)\rdm_rma_event.exe \ @@ -128,8 +128,6 @@ $(outdir)\cq_data.exe: {functional}cq_data.c $(basedeps) $(outdir)\dgram.exe: {functional}dgram.c $(basedeps) -$(outdir)\dgram_waitset.exe: {functional}dgram_waitset.c $(basedeps) - $(outdir)\msg.exe: {functional}msg.c $(basedeps) $(outdir)\msg_epoll.exe: {functional}msg_epoll.c $(basedeps) diff --git a/fabtests/fabtests.vcxproj b/fabtests/fabtests.vcxproj index 684fba50966..67c1d9f5876 100644 --- a/fabtests/fabtests.vcxproj +++ b/fabtests/fabtests.vcxproj @@ -227,7 +227,6 @@ - diff --git a/fabtests/fabtests.vcxproj.filters b/fabtests/fabtests.vcxproj.filters index ec7718e9c51..b4085a0561e 100644 --- a/fabtests/fabtests.vcxproj.filters +++ b/fabtests/fabtests.vcxproj.filters @@ -81,9 +81,6 @@ Source Files\functional - - Source Files\functional - Source Files\functional diff --git a/fabtests/functional/dgram_waitset.c b/fabtests/functional/dgram_waitset.c deleted file mode 100644 index 8b72d76e254..00000000000 --- a/fabtests/functional/dgram_waitset.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation. All rights reserved. - * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. 
- * - * This software is available to you under the BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include - -#include - -#include - -static int alloc_ep_res(struct fi_info *fi) -{ - struct fi_wait_attr wait_attr; - int ret; - - memset(&wait_attr, 0, sizeof wait_attr); - wait_attr.wait_obj = FI_WAIT_UNSPEC; - ret = fi_wait_open(fabric, &wait_attr, &waitset); - if (ret) { - FT_PRINTERR("fi_wait_open", ret); - return ret; - } - - ret = ft_alloc_active_res(fi); - if (ret) - return ret; - - return 0; -} - -static int init_fabric(void) -{ - int ret; - - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - - ret = ft_open_fabric_res(); - if (ret) - return ret; - - ret = alloc_ep_res(fi); - if (ret) - return ret; - - ret = ft_enable_ep_recv(); - if (ret) - return ret; - - return 0; -} - -static int send_recv() -{ - struct fi_cq_entry comp; - int ret; - - ret = fi_recv(ep, rx_buf, rx_size + ft_rx_prefix_size(), - mr_desc, 0, &rx_ctx); - if (ret) - return ret; - - ft_sync(); - - fprintf(stdout, "Posting a send...\n"); - ret = ft_post_tx(ep, remote_fi_addr, tx_size, NO_CQ_DATA, &tx_ctx); - if (ret) - return ret; - - while ((tx_cq_cntr < tx_seq) || (rx_cq_cntr < rx_seq)) { - /* Wait for completion events on CQs */ - ret = fi_wait(waitset, -1); - if (ret < 0) { - FT_PRINTERR("fi_wait", ret); - return ret; - } - - /* Read the send completion entry */ - ret = fi_cq_read(txcq, &comp, 1); - if (ret > 0) { - tx_cq_cntr++; - fprintf(stdout, "Received send completion event!\n"); - } else if (ret < 0 && ret != -FI_EAGAIN) { - if (ret == -FI_EAVAIL) { - ret = ft_cq_readerr(txcq); - } else { - FT_PRINTERR("fi_cq_read", ret); - } - return ret; - } - - /* Read the recv completion entry */ - ret = fi_cq_read(rxcq, &comp, 1); - if (ret > 0) { - rx_cq_cntr++; - fprintf(stdout, "Received recv completion event!\n"); - } else if (ret < 0 && ret != -FI_EAGAIN) { - if (ret == -FI_EAVAIL) { - ret = ft_cq_readerr(rxcq); - } else { - FT_PRINTERR("fi_cq_read", ret); - } - return ret; - } - } - - return 0; -} - -static int run(void) -{ - int ret; - - ret = 
init_fabric(); - if (ret) - return ret; - - ret = ft_init_av(); - if (ret) - return ret; - - return send_recv(); -} - -int main(int argc, char **argv) -{ - int op, ret = 0; - - opts = INIT_OPTS; - opts.options |= FT_OPT_SIZE; - opts.comp_method = FT_COMP_WAITSET; - - hints = fi_allocinfo(); - if (!hints) - return EXIT_FAILURE; - - while ((op = getopt(argc, argv, "h" ADDR_OPTS INFO_OPTS)) != -1) { - switch (op) { - default: - ft_parse_addr_opts(op, optarg, &opts); - ft_parseinfo(op, optarg, hints, &opts); - break; - case '?': - case 'h': - ft_usage(argv[0], "A DGRAM client-server example that uses waitset.\n"); - return EXIT_FAILURE; - } - } - - if (optind < argc) - opts.dst_addr = argv[optind]; - - hints->ep_attr->type = FI_EP_DGRAM; - hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; - hints->domain_attr->mr_mode = opts.mr_mode; - hints->addr_format = opts.address_format; - - ret = run(); - - ft_free_res(); - return ft_exit_code(ret); -} diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index 20b200e3123..e2d0321438b 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -50,9 +50,6 @@ features of libfabric. *fi_dgram* : A basic datagram endpoint example. -*fi_dgram_waitset* -: Transfers datagrams using waitsets for completion notification. - *fi_inj_complete* : Sends messages using the FI_INJECT_COMPLETE operation flag. 
diff --git a/fabtests/pytest/default/test_dgram.py b/fabtests/pytest/default/test_dgram.py index af118f6fcab..3cffa601532 100644 --- a/fabtests/pytest/default/test_dgram.py +++ b/fabtests/pytest/default/test_dgram.py @@ -12,12 +12,6 @@ def test_dgram(cmdline_args): test = ClientServerTest(cmdline_args, "fi_dgram") test.run() -@pytest.mark.functional -def test_dgram_waitset(cmdline_args): - from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_dgram_waitset") - test.run() - @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 92f85482d9b..977f9fdf84b 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -108,7 +108,6 @@ functional_tests=( "fi_cq_data -e rdm -o writedata" "fi_cq_data -e dgram -o writedata" "fi_dgram" - "fi_dgram_waitset" "fi_msg" "fi_msg_epoll" "fi_msg_sockets" diff --git a/fabtests/test_configs/efa/efa-neuron.exclude b/fabtests/test_configs/efa/efa-neuron.exclude index c5a8fd706c3..49aa4408e33 100644 --- a/fabtests/test_configs/efa/efa-neuron.exclude +++ b/fabtests/test_configs/efa/efa-neuron.exclude @@ -69,9 +69,6 @@ cmatose # shared AV isn't supported shared_av -# wait isn't supported -dgram_waitset - # Remove this once ubertest supports setting MR modes ubertest diff --git a/fabtests/test_configs/efa/efa.exclude b/fabtests/test_configs/efa/efa.exclude index 6743d1d3f77..6798f678936 100644 --- a/fabtests/test_configs/efa/efa.exclude +++ b/fabtests/test_configs/efa/efa.exclude @@ -74,9 +74,6 @@ cmatose # shared AV isn't supported shared_av -# wait isn't supported -dgram_waitset - # Remove this once ubertest supports setting MR modes ubertest diff --git a/fabtests/test_configs/psm3/psm3.exclude b/fabtests/test_configs/psm3/psm3.exclude index 418ba8a1b5d..b2288415605 100644 --- 
a/fabtests/test_configs/psm3/psm3.exclude +++ b/fabtests/test_configs/psm3/psm3.exclude @@ -15,6 +15,5 @@ scalable_ep shared_av rdm_cntr_pingpong multi_recv -dgram_waitset multinode rdm_tagged_peek From 513dba0dd63a8614a51531cbd2a3d2888db9845c Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 3 Sep 2024 17:23:49 -0700 Subject: [PATCH 027/393] fabtests/fi_poll: remove test (deprecated feature) fi_poll is deprecated in 2.0 Signed-off-by: Alexia Ingerson --- fabtests/Makefile.am | 7 - fabtests/Makefile.win | 4 +- fabtests/fabtests.vcxproj | 1 - fabtests/fabtests.vcxproj.filters | 3 - fabtests/functional/poll.c | 290 --------------------------- fabtests/man/fabtests.7.md | 4 - fabtests/pytest/default/test_poll.py | 9 - fabtests/scripts/runfabtests.cmd | 2 - fabtests/scripts/runfabtests.sh | 2 - 9 files changed, 1 insertion(+), 321 deletions(-) delete mode 100644 fabtests/functional/poll.c delete mode 100644 fabtests/pytest/default/test_poll.py diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index 3e86d30dd79..b97ca169e5f 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -24,7 +24,6 @@ bin_PROGRAMS = \ functional/fi_mcast \ functional/fi_rdm_tagged_peek \ functional/fi_cq_data \ - functional/fi_poll \ functional/fi_scalable_ep \ functional/fi_shared_ctx \ functional/fi_msg_epoll \ @@ -160,7 +159,6 @@ nobase_dist_config_DATA = \ pytest/default/test_msg.py \ pytest/default/test_multinode.py \ pytest/default/test_multi_recv.py \ - pytest/default/test_poll.py \ pytest/default/test_rdm.py \ pytest/default/test_recv_cancel.py \ pytest/default/test_rma_bw.py \ @@ -316,10 +314,6 @@ functional_fi_shared_ctx_SOURCES = \ functional/shared_ctx.c functional_fi_shared_ctx_LDADD = libfabtests.la -functional_fi_poll_SOURCES = \ - functional/poll.c -functional_fi_poll_LDADD = libfabtests.la - functional_fi_multi_ep_SOURCES = \ functional/multi_ep.c functional_fi_multi_ep_LDADD = libfabtests.la @@ -626,7 +620,6 @@ dummy_man_pages = \ 
man/man1/fi_msg_sockets.1 \ man/man1/fi_multi_ep.1 \ man/man1/fi_multi_mr.1 \ - man/man1/fi_poll.1 \ man/man1/fi_rdm.1 \ man/man1/fi_rdm_atomic.1 \ man/man1/fi_rdm_deferred_wq.1 \ diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index 2eead3de38a..dc3a28fe0b6 100644 --- a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -80,7 +80,7 @@ benchmarks: $(outdir)\dgram_pingpong.exe $(outdir)\msg_bw.exe \ functional: $(outdir)\av_xfer.exe $(outdir)\bw.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ $(outdir)\dgram.exe $(outdir)\msg.exe $(outdir)\msg_epoll.exe \ $(outdir)\inject_test.exe $(outdir)\msg_sockets.exe $(outdir)\multi_mr.exe \ - $(outdir)\multi_ep.exe $(outdir)\multi_recv.exe $(outdir)\poll.exe $(outdir)\rdm.exe \ + $(outdir)\multi_ep.exe $(outdir)\multi_recv.exe $(outdir)\rdm.exe \ $(outdir)\rdm_atomic.exe $(outdir)\rdm_multi_client.exe $(outdir)\rdm_rma_event.exe \ $(outdir)\rdm_rma_trigger.exe $(outdir)\rdm_shared_av.exe $(outdir)\rdm_tagged_peek.exe \ $(outdir)\recv_cancel.exe $(outdir)\scalable_ep.exe $(outdir)\shared_ctx.exe \ @@ -142,8 +142,6 @@ $(outdir)\multi_ep.exe: {functional}multi_ep.c $(basedeps) $(outdir)\multi_recv.exe: {functional}multi_recv.c $(basedeps) -$(outdir)\poll.exe: {functional}poll.c $(basedeps) - $(outdir)\rdm.exe: {functional}rdm.c $(basedeps) $(outdir)\rdm_atomic.exe: {functional}rdm_atomic.c $(basedeps) diff --git a/fabtests/fabtests.vcxproj b/fabtests/fabtests.vcxproj index 67c1d9f5876..cba59f15c1a 100644 --- a/fabtests/fabtests.vcxproj +++ b/fabtests/fabtests.vcxproj @@ -231,7 +231,6 @@ - diff --git a/fabtests/fabtests.vcxproj.filters b/fabtests/fabtests.vcxproj.filters index b4085a0561e..d3f495b81e2 100644 --- a/fabtests/fabtests.vcxproj.filters +++ b/fabtests/fabtests.vcxproj.filters @@ -90,9 +90,6 @@ Source Files\functional - - Source Files\functional - Source Files\functional diff --git a/fabtests/functional/poll.c b/fabtests/functional/poll.c deleted file mode 100644 index f9a2079c94f..00000000000 --- 
a/fabtests/functional/poll.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation. All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under the BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define MAX_POLL_CNT 10 - -static int alloc_ep_res(struct fi_info *fi) -{ - struct fi_poll_attr poll_attr; - int ret; - - ret = ft_alloc_active_res(fi); - if (ret) - return ret; - - memset(&poll_attr, 0, sizeof poll_attr); - ret = fi_poll_open(domain, &poll_attr, &pollset); - if (ret) { - FT_PRINTERR("fi_poll_open", ret); - return ret; - } - - if (txcq) { - ret = fi_poll_add(pollset, &txcq->fid, 0); - if (ret) - goto err; - } - - if (rxcq) { - ret = fi_poll_add(pollset, &rxcq->fid, 0); - if (ret) - goto err; - } - - if (txcntr) { - ret = fi_poll_add(pollset, &txcntr->fid, 0); - if (ret) - goto err; - } - - if (rxcntr) { - ret = fi_poll_add(pollset, &rxcntr->fid, 0); - if (ret) - goto err; - } - - return 0; -err: - FT_PRINTERR("fi_poll_add", ret); - return ret; -} - -static int free_poll_res(void) -{ - int ret; - - if (!pollset) - return 0; - - if (txcq) { - ret = fi_poll_del(pollset, &txcq->fid, 0); - if (ret) - goto err; - } - - if (rxcq) { - ret = fi_poll_del(pollset, &rxcq->fid, 0); - if (ret) - goto err; - } - - if (txcntr) { - ret = fi_poll_del(pollset, &txcntr->fid, 0); - if (ret) - goto err; - } - - if (rxcntr) { - ret = fi_poll_del(pollset, &rxcntr->fid, 0); - if (ret) - goto err; - } - return 0; -err: - FT_PRINTERR("fi_poll_del", ret); - return ret; -} - -static int init_fabric(void) -{ - int ret; - - ret = ft_init(); - if (ret) - return ret; - - ret = ft_init_oob(); - if (ret) - return ret; - - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - - ret = ft_open_fabric_res(); - if (ret) - return ret; - - ret = alloc_ep_res(fi); - if (ret) - return ret; - - ret = ft_enable_ep_recv(); - if (ret) - return ret; - return 0; -} - -static int send_recv() -{ - struct fid_cq *cq; - void *context[MAX_POLL_CNT]; - struct fi_cq_entry comp; - int ret; - int ret_count = 0; - int i, tx_cntr_val = 0, rx_cntr_val = 0; - - fprintf(stdout, "Posting a send...\n"); - ret = 
ft_post_tx(ep, remote_fi_addr, tx_size, NO_CQ_DATA, &tx_ctx); - if (ret) - return ret; - - while (((opts.options & FT_OPT_TX_CQ) && (tx_cq_cntr < tx_seq)) || - ((opts.options & FT_OPT_TX_CNTR) && (tx_cntr_val < tx_seq)) || - ((opts.options & FT_OPT_RX_CQ) && (rx_cq_cntr < rx_seq)) || - ((opts.options & FT_OPT_RX_CNTR) && (rx_cntr_val < rx_seq))) { - - /* Poll send and recv CQs/Cntrs */ - do { - ret_count = fi_poll(pollset, context, MAX_POLL_CNT); - if (ret_count < 0) { - FT_PRINTERR("fi_poll", ret_count); - return ret_count; - } - } while (!ret_count); - - fprintf(stdout, "Retrieved %d event(s)\n", ret_count); - - for (i = 0; i < ret_count; i++) { - if (context[i] == &txcq) { - printf("Send completion received\n"); - cq = txcq; - tx_cq_cntr++; - } else if (context[i] == &rxcq) { - printf("Recv completion received\n"); - cq = rxcq; - rx_cq_cntr++; - } else if (context[i] == &txcntr) { - printf("Send counter poll-event\n"); - tx_cntr_val = fi_cntr_read(txcntr); - if (tx_cntr_val > tx_seq) { - FT_ERR("Invalid tx counter event\n"); - FT_ERR("expected: %" PRIu64 ", found: " - "%d\n", tx_seq, tx_cntr_val); - return -1; - } - continue; - } else if (context[i] == &rxcntr) { - printf("Recv counter poll-event\n"); - rx_cntr_val = fi_cntr_read(rxcntr); - if (rx_cntr_val > rx_seq) { - FT_ERR("Invalid rx counter event\n"); - FT_ERR("expected: %" PRIu64 ", found: " - "%d\n", rx_seq, rx_cntr_val); - return -1; - } - continue; - } else { - FT_ERR("Unknown completion received\n"); - return -1; - } - - /* Read the completion entry */ - ret = fi_cq_read(cq, &comp, 1); - if (ret < 0) { - if (ret == -FI_EAVAIL) { - ret = ft_cq_readerr(cq); - } else { - FT_PRINTERR("fi_cq_read", ret); - } - return ret; - } - } - } - - return 0; -} - -static int run(void) -{ - int ret; - - ret = init_fabric(); - if (ret) - return ret; - - ret = ft_init_av(); - if (ret) - return ret; - - return send_recv(); -} - -int main(int argc, char **argv) -{ - int op, ret = 0; - - opts = INIT_OPTS; - opts.options |= 
FT_OPT_SIZE; - - hints = fi_allocinfo(); - if (!hints) - return EXIT_FAILURE; - - while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS)) != -1) { - switch (op) { - default: - ft_parse_addr_opts(op, optarg, &opts); - ft_parseinfo(op, optarg, hints, &opts); - ft_parsecsopts(op, optarg, &opts); - break; - case '?': - case 'h': - ft_usage(argv[0], "A client-server example that uses poll.\n"); - FT_PRINT_OPTS_USAGE("-t ", "completion type [queue, counter]"); - return EXIT_FAILURE; - } - } - - if (optind < argc) - opts.dst_addr = argv[optind]; - - hints->ep_attr->type = FI_EP_RDM; - hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; - hints->domain_attr->mr_mode = opts.mr_mode; - hints->addr_format = opts.address_format; - - ret = run(); - - free_poll_res(); - ft_free_res(); - return ft_exit_code(ret); -} diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index e2d0321438b..4f31360e1cc 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -77,10 +77,6 @@ features of libfabric. completion counters of inbound writes as the notification mechanism. -*fi_poll* -: Exchanges data over RDM endpoints using poll sets to drive - completion notifications. - *fi_rdm* : A basic RDM endpoint example. 
diff --git a/fabtests/pytest/default/test_poll.py b/fabtests/pytest/default/test_poll.py deleted file mode 100644 index a3aa6ec35fc..00000000000 --- a/fabtests/pytest/default/test_poll.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -@pytest.mark.functional -@pytest.mark.parametrize("poll_type", ["queue", "counter"]) -def test_poll(cmdline_args, poll_type): - from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_poll -t " + poll_type) - test.run() - diff --git a/fabtests/scripts/runfabtests.cmd b/fabtests/scripts/runfabtests.cmd index 2086ac42340..d362f905cd1 100644 --- a/fabtests/scripts/runfabtests.cmd +++ b/fabtests/scripts/runfabtests.cmd @@ -66,8 +66,6 @@ set functional_tests=^ "msg"^ "msg_epoll"^ "msg_sockets"^ - "poll -t queue"^ - "poll -t counter"^ "rdm"^ "rdm -U"^ "rdm_tagged_peek"^ diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 977f9fdf84b..e6ad879d4e5 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -111,8 +111,6 @@ functional_tests=( "fi_msg" "fi_msg_epoll" "fi_msg_sockets" - "fi_poll -t queue" - "fi_poll -t counter" "fi_rdm" "fi_rdm -U" "fi_rdm_rma_event" From c3a2c671512091389d5056ce5026b4b2c1c41ca4 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 9 Sep 2024 20:40:08 -0700 Subject: [PATCH 028/393] fabtests: remove deprecated MR modes FI_MR_BASIC, FI_MR_SCALABLE, and FI_LOCAL_MR are deprecated in 2.0 Remove references to and tests targeting deprecated MR modes Add OFI_MR_DEPRECATED to resolve some compile warnings Signed-off-by: Alexia Ingerson --- fabtests/common/check_hmem.c | 2 +- fabtests/common/shared.c | 16 ++------ fabtests/functional/rdm_atomic.c | 4 +- fabtests/include/shared.h | 5 +++ fabtests/unit/av_test.c | 2 +- fabtests/unit/cntr_test.c | 2 +- fabtests/unit/cq_test.c | 2 +- fabtests/unit/dom_test.c | 2 +- fabtests/unit/eq_test.c | 2 +- fabtests/unit/getinfo_test.c | 65 +------------------------------- 
fabtests/unit/mr_cache_evict.c | 2 +- fabtests/unit/mr_test.c | 2 +- fabtests/unit/setopt_test.c | 2 +- 13 files changed, 19 insertions(+), 89 deletions(-) diff --git a/fabtests/common/check_hmem.c b/fabtests/common/check_hmem.c index b7a97bc8e79..5e319473c4e 100644 --- a/fabtests/common/check_hmem.c +++ b/fabtests/common/check_hmem.c @@ -44,7 +44,7 @@ int main(int argc, char** argv) return EXIT_FAILURE; hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; while ((op = getopt(argc, argv, "p:h")) != -1) { switch (op) { case 'p': diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index e4d45c46e42..c8fe2696156 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -276,16 +276,10 @@ static inline int ft_rma_write_target_allowed(uint64_t caps) return 0; } -static inline int ft_check_mr_local_flag(struct fi_info *info) -{ - return ((info->mode & FI_LOCAL_MR) || - (info->domain_attr->mr_mode & FI_MR_LOCAL)); -} - uint64_t ft_info_to_mr_access(struct fi_info *info) { uint64_t mr_access = 0; - if (ft_check_mr_local_flag(info)) { + if (info->domain_attr->mr_mode & FI_MR_LOCAL) { if (info->caps & (FI_MSG | FI_TAGGED)) { if (info->caps & FT_MSG_MR_ACCESS) { mr_access |= info->caps & FT_MSG_MR_ACCESS; @@ -1734,12 +1728,8 @@ int ft_exchange_keys(struct fi_rma_iov *peer_iov) len = sizeof(*rma_iov); } - if ((fi->domain_attr->mr_mode == FI_MR_BASIC) || - (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) { - rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size(); - } else { - rma_iov->addr = 0; - } + rma_iov->addr = fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? 
+ (uintptr_t) rx_buf + ft_rx_prefix_size() : 0; if (fi->domain_attr->mr_mode & FI_MR_RAW) { ret = fi_mr_raw_attr(mr, &addr, (uint8_t *) &rma_iov->key, diff --git a/fabtests/functional/rdm_atomic.c b/fabtests/functional/rdm_atomic.c index 638b9e1148b..cef31ecb229 100644 --- a/fabtests/functional/rdm_atomic.c +++ b/fabtests/functional/rdm_atomic.c @@ -376,9 +376,7 @@ static uint64_t get_mr_key() { static uint64_t user_key = FT_MR_KEY + 1; - return ((fi->domain_attr->mr_mode == FI_MR_BASIC) || - (fi->domain_attr->mr_mode & FI_MR_PROV_KEY)) ? - 0 : user_key++; + return fi->domain_attr->mr_mode & FI_MR_PROV_KEY ? 0 : user_key++; } static int alloc_ep_res(struct fi_info *fi) diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index ae35106b5b8..eb4e3238f10 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -69,6 +69,11 @@ extern "C" { ((type *) ((char *)ptr - offsetof(type, field))) #endif +/* + * Internal version of deprecated APIs. + * These are used internally to avoid compiler warnings. 
+ */ +#define OFI_MR_DEPRECATED (0x3) /* FI_MR_BASIC | FI_MR_SCALABLE */ #define OFI_MR_BASIC_MAP (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) /* exit codes must be 0-255 */ diff --git a/fabtests/unit/av_test.c b/fabtests/unit/av_test.c index 3c4f06ce773..72da2313a39 100644 --- a/fabtests/unit/av_test.c +++ b/fabtests/unit/av_test.c @@ -718,7 +718,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->addr_format = FI_SOCKADDR; ret = fi_getinfo(FT_FIVERSION, opts.src_addr, 0, FI_SOURCE, hints, &fi); diff --git a/fabtests/unit/cntr_test.c b/fabtests/unit/cntr_test.c index 45306040f7b..f881ec2a452 100644 --- a/fabtests/unit/cntr_test.c +++ b/fabtests/unit/cntr_test.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/cq_test.c b/fabtests/unit/cq_test.c index a80fd16a415..32188e304d3 100644 --- a/fabtests/unit/cq_test.c +++ b/fabtests/unit/cq_test.c @@ -249,7 +249,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/dom_test.c b/fabtests/unit/dom_test.c index 7116b78e282..8f82abc5571 100644 --- a/fabtests/unit/dom_test.c +++ b/fabtests/unit/dom_test.c @@ -90,7 +90,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, 
&fi); if (ret) { diff --git a/fabtests/unit/eq_test.c b/fabtests/unit/eq_test.c index 80cfeb4a720..d5cbbedc114 100644 --- a/fabtests/unit/eq_test.c +++ b/fabtests/unit/eq_test.c @@ -611,7 +611,7 @@ int main(int argc, char **argv) hints->mode = FI_CONTEXT | FI_CONTEXT2 | FI_MSG_PREFIX | FI_ASYNC_IOV | FI_RX_CQ_DATA; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); if (ret) { diff --git a/fabtests/unit/getinfo_test.c b/fabtests/unit/getinfo_test.c index b888a942619..3c3060b4810 100644 --- a/fabtests/unit/getinfo_test.c +++ b/fabtests/unit/getinfo_test.c @@ -538,53 +538,6 @@ static int init_invalid_rma_WAW_ordering_size(struct fi_info *hints) /* * MR mode checks */ -static int init_mr_basic(struct fi_info *hints) -{ - hints->caps |= FI_RMA; - hints->domain_attr->mr_mode = FI_MR_BASIC; - return 0; -} - -static int check_mr_basic(struct fi_info *info) -{ - return (info->domain_attr->mr_mode != FI_MR_BASIC) ? - EXIT_FAILURE : 0; -} - -static int init_mr_scalable(struct fi_info *hints) -{ - hints->caps |= FI_RMA; - hints->domain_attr->mr_mode = FI_MR_SCALABLE; - return 0; -} - -static int check_mr_scalable(struct fi_info *info) -{ - return (info->domain_attr->mr_mode != FI_MR_SCALABLE) ? - EXIT_FAILURE : 0; -} - -static int init_mr_unspec(struct fi_info *hints) -{ - hints->caps |= FI_RMA; - hints->domain_attr->mr_mode = FI_MR_UNSPEC; - return 0; -} - -static int test_mr_v1_0(char *node, char *service, uint64_t flags, - struct fi_info *test_hints, struct fi_info **info) -{ - return fi_getinfo(FI_VERSION(1, 0), node, service, flags, - test_hints, info); -} - -static int check_mr_unspec(struct fi_info *info) -{ - return (info->domain_attr->mr_mode != FI_MR_BASIC && - info->domain_attr->mr_mode != FI_MR_SCALABLE) ? 
- EXIT_FAILURE : 0; -} - static int init_mr_mode(struct fi_info *hints, uint64_t mode) { hints->domain_attr->mr_mode = (uint32_t) mode; @@ -906,18 +859,7 @@ getinfo_test(bad_waw_ordering, 1, "Test invalid rma WAW ordering size", NULL, NULL, -FI_ENODATA) /* MR mode tests */ -getinfo_test(mr_mode, 1, "Test FI_MR_BASIC", NULL, NULL, 0, - hints, init_mr_basic, NULL, check_mr_basic, -FI_ENODATA) -getinfo_test(mr_mode, 2, "Test FI_MR_SCALABLE", NULL, NULL, 0, - hints, init_mr_scalable, NULL, check_mr_scalable, -FI_ENODATA) -getinfo_test(mr_mode, 3, "Test FI_MR_UNSPEC (v1.0)", NULL, NULL, 0, - hints, init_mr_unspec, test_mr_v1_0, check_mr_unspec, -FI_ENODATA) -getinfo_test(mr_mode, 4, "Test FI_MR_BASIC (v1.0)", NULL, NULL, 0, - hints, init_mr_basic, test_mr_v1_0, check_mr_basic, -FI_ENODATA) -getinfo_test(mr_mode, 5, "Test FI_MR_SCALABLE (v1.0)", NULL, NULL, 0, - hints, init_mr_scalable, test_mr_v1_0, check_mr_scalable, - -FI_ENODATA) -getinfo_test(mr_mode, 6, "Test mr_mode bits", NULL, NULL, 0, +getinfo_test(mr_mode, 1, "Test mr_mode bits", NULL, NULL, 0, hints, NULL, validate_mr_modes, NULL, 0) /* Progress tests */ @@ -1008,11 +950,6 @@ int main(int argc, char **argv) TEST_ENTRY_GETINFO(bad_waw_ordering1), TEST_ENTRY_GETINFO(neg1), TEST_ENTRY_GETINFO(mr_mode1), - TEST_ENTRY_GETINFO(mr_mode2), - TEST_ENTRY_GETINFO(mr_mode3), - TEST_ENTRY_GETINFO(mr_mode4), - TEST_ENTRY_GETINFO(mr_mode5), - TEST_ENTRY_GETINFO(mr_mode6), TEST_ENTRY_GETINFO(progress1), TEST_ENTRY_GETINFO(progress2), TEST_ENTRY_GETINFO(caps1), diff --git a/fabtests/unit/mr_cache_evict.c b/fabtests/unit/mr_cache_evict.c index a12f9c31372..4a3c16ac9a0 100644 --- a/fabtests/unit/mr_cache_evict.c +++ b/fabtests/unit/mr_cache_evict.c @@ -806,7 +806,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->caps |= FI_MSG | FI_RMA; if (opts.options & 
FT_OPT_ENABLE_HMEM) diff --git a/fabtests/unit/mr_test.c b/fabtests/unit/mr_test.c index d071a8d74f7..df4caf66992 100644 --- a/fabtests/unit/mr_test.c +++ b/fabtests/unit/mr_test.c @@ -324,7 +324,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE | FI_MR_LOCAL); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->caps |= FI_MSG | FI_RMA; if (opts.options & FT_OPT_ENABLE_HMEM) diff --git a/fabtests/unit/setopt_test.c b/fabtests/unit/setopt_test.c index 40487aaa3de..5f7b2ddc5be 100644 --- a/fabtests/unit/setopt_test.c +++ b/fabtests/unit/setopt_test.c @@ -152,7 +152,7 @@ int main(int argc, char **argv) hints->mode = ~0; hints->domain_attr->mode = ~0; - hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->domain_attr->mr_mode = ~OFI_MR_DEPRECATED; hints->caps |= FI_MSG; failed = run_tests(test_array, err_buf); From f8a5b2089d319c4315e036fcc91dfac020cd5bdb Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Fri, 13 Sep 2024 03:30:05 +0000 Subject: [PATCH 029/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- fabtests/man/man7/fabtests.7 | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fabtests/man/man7/fabtests.7 b/fabtests/man/man7/fabtests.7 index e71d9c1f1ee..98f90ca9099 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fabtests" "7" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fabtests" "7" "2024\-09\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -48,9 +48,6 @@ Tranfers messages with CQ data. \f[I]fi_dgram\f[R] A basic datagram endpoint example. .TP -\f[I]fi_dgram_waitset\f[R] -Transfers datagrams using waitsets for completion notification. -.TP \f[I]fi_inj_complete\f[R] Sends messages using the FI_INJECT_COMPLETE operation flag. 
.TP @@ -79,10 +76,6 @@ Performs data transfers over multiple endpoints in parallel. Issues RMA write operations to multiple memory regions, using completion counters of inbound writes as the notification mechanism. .TP -\f[I]fi_poll\f[R] -Exchanges data over RDM endpoints using poll sets to drive completion -notifications. -.TP \f[I]fi_rdm\f[R] A basic RDM endpoint example. .TP From 27b977d5e5aa130793a39e7b90494b63c688f747 Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Mon, 28 Nov 2022 09:47:44 -0600 Subject: [PATCH 030/393] prov/util: Add uffd user mode flag for kernels Linux kernels 5.11 and later introduced a UFFD_USER_MODE_ONLY. When set, the userfaultfd object will only be able to handle page faults originated from the user space on the registered regions. If this is not set on kernels 5.11 or later, uffd will not work. Signed-off-by: Ian Ziemba --- prov/util/src/util_mem_monitor.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index d1c980bf94b..9b4c0bc954d 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -43,6 +43,9 @@ #include #include +#ifndef UFFD_USER_MODE_ONLY +#define UFFD_USER_MODE_ONLY 0 +#endif pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t mm_state_lock = PTHREAD_MUTEX_INITIALIZER; @@ -701,7 +704,8 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) if (!num_page_sizes) return -FI_ENODATA; - uffd.fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + uffd.fd = syscall(__NR_userfaultfd, + O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd.fd < 0) { FI_WARN(&core_prov, FI_LOG_MR, "syscall/userfaultfd %s\n", strerror(errno)); From bcd211c06fdc7cba69adcf7c15f8418cd50c31e7 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Wed, 11 Sep 2024 13:32:28 -0700 Subject: [PATCH 031/393] util/av: Log AV insert with AV's specified address format This prevents `(null)` from being logged when 
addr isn't compatible with `struct sockaddr` Signed-off-by: Darryl Abbate --- include/ofi_net.h | 4 ++-- include/ofi_util.h | 4 ++++ prov/util/src/util_av.c | 10 ++++------ src/common.c | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/ofi_net.h b/include/ofi_net.h index 1eeaea980d7..c9f4df00774 100644 --- a/include/ofi_net.h +++ b/include/ofi_net.h @@ -903,14 +903,14 @@ uint32_t ofi_addr_format(const char *str); int ofi_str_toaddr(const char *str, uint32_t *addr_format, void **addr, size_t *len); -void ofi_straddr_log_internal(const char *func, int line, +void ofi_straddr_log_internal(const char *func, int line, uint32_t addr_format, const struct fi_provider *prov, enum fi_log_level level, enum fi_log_subsys subsys, char *log_str, const void *addr); #define ofi_straddr_log(...) \ - ofi_straddr_log_internal(__func__, __LINE__, __VA_ARGS__) + ofi_straddr_log_internal(__func__, __LINE__, FI_FORMAT_UNSPEC, __VA_ARGS__) #if ENABLE_DEBUG #define ofi_straddr_dbg(prov, subsystem, ...) \ diff --git a/include/ofi_util.h b/include/ofi_util.h index e90b09ea058..911a69893ba 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -835,6 +835,10 @@ static inline void ofi_ep_peer_rx_cntr_incerr(struct util_ep *ep, uint8_t op) * AV / addressing */ +#define ofi_av_straddr_log(av, level, ...) 
\ + ofi_straddr_log_internal(__func__, __LINE__, av->domain->addr_format, \ + av->prov, level, FI_LOG_AV, __VA_ARGS__) + struct util_av; struct util_av_set; struct util_peer_addr; diff --git a/prov/util/src/util_av.c b/prov/util/src/util_av.c index 16ebb595ce0..5594dd3debc 100644 --- a/prov/util/src/util_av.c +++ b/prov/util/src/util_av.c @@ -276,14 +276,13 @@ int ofi_av_insert_addr_at(struct util_av *av, const void *addr, fi_addr_t fi_add struct util_av_entry *entry = NULL; assert(ofi_mutex_held(&av->lock)); - ofi_straddr_log(av->prov, FI_LOG_INFO, FI_LOG_AV, "inserting addr", addr); + ofi_av_straddr_log(av, FI_LOG_INFO, "inserting addr", addr); HASH_FIND(hh, av->hash, addr, av->addrlen, entry); if (entry) { if (fi_addr == ofi_buf_index(entry)) return FI_SUCCESS; - ofi_straddr_log(av->prov, FI_LOG_WARN, FI_LOG_AV, - "addr already in AV", addr); + ofi_av_straddr_log(av, FI_LOG_WARN, "addr already in AV", addr); return -FI_EALREADY; } @@ -304,14 +303,13 @@ int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr) struct util_av_entry *entry = NULL; assert(ofi_mutex_held(&av->lock)); - ofi_straddr_log(av->prov, FI_LOG_INFO, FI_LOG_AV, "inserting addr", addr); + ofi_av_straddr_log(av, FI_LOG_INFO, "inserting addr", addr); HASH_FIND(hh, av->hash, addr, av->addrlen, entry); if (entry) { if (fi_addr) *fi_addr = ofi_buf_index(entry); if (ofi_atomic_inc32(&entry->use_cnt) > 1) { - ofi_straddr_log(av->prov, FI_LOG_WARN, FI_LOG_AV, - "addr already in AV", addr); + ofi_av_straddr_log(av, FI_LOG_WARN, "addr already in AV", addr); } } else { entry = ofi_ibuf_alloc(av->av_entry_pool); diff --git a/src/common.c b/src/common.c index 1c29350fe28..a5f5ba5a22e 100644 --- a/src/common.c +++ b/src/common.c @@ -1053,19 +1053,19 @@ size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr, return len; } -void ofi_straddr_log_internal(const char *func, int line, +void ofi_straddr_log_internal(const char *func, int line, uint32_t addr_format, 
const struct fi_provider *prov, enum fi_log_level level, enum fi_log_subsys subsys, char *log_str, const void *addr) { char buf[OFI_ADDRSTRLEN]; - uint32_t addr_format; size_t len = sizeof(buf); if (fi_log_enabled(prov, level, subsys)) { if (addr) { - addr_format = ofi_translate_addr_format(ofi_sa_family(addr)); + if (addr_format == FI_FORMAT_UNSPEC) + addr_format = ofi_translate_addr_format(ofi_sa_family(addr)); fi_log(prov, level, subsys, func, line, "%s: %s\n", log_str, ofi_straddr(buf, &len, addr_format, addr)); } else { From fc93d99615f1db50bdcd31f46262f82d4ed259de Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 12 Sep 2024 23:38:13 +0000 Subject: [PATCH 032/393] fabtests/lpp: Fix compiler warning about unused variables suppress compiler warning for non-debug build Signed-off-by: Shi Jin --- fabtests/prov/lpp/src/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fabtests/prov/lpp/src/main.c b/fabtests/prov/lpp/src/main.c index 56eec613958..22971ad716a 100644 --- a/fabtests/prov/lpp/src/main.c +++ b/fabtests/prov/lpp/src/main.c @@ -320,6 +320,7 @@ static void run_tests(int parallel) // iteration. ret = pthread_barrier_init(&_barrier, NULL, nthreads + 1); assert(ret == 0); + (void) ret; /* suppress compiler warning for non-debug build */ pthread_t *threads = calloc(nthreads, sizeof(pthread_t)); assert(threads); From 0c42e542b7ea41ed6b1da561a4ace4350b1af20f Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 11 Sep 2024 19:03:20 +0000 Subject: [PATCH 033/393] fabtests/efa: Avoid testing duplicate mixed memory type workload Currently, we always test mixed memory type workload in two directions explicitly: like cuda-to-host, host-to-cuda. However. pingpong and rma read/write test are all bi-directional, which means there is no difference for cuda-to-host and host-to-cuda. Running them twice in this situation is not necessary and causing longer test duration. This patch improves this by having a reduced memory type list for bi-directional tests. 
Signed-off-by: Shi Jin --- fabtests/pytest/efa/conftest.py | 37 +++++++++++++++++++----- fabtests/pytest/efa/test_rdm.py | 20 ++++++------- fabtests/pytest/efa/test_rma_bw.py | 20 +++++-------- fabtests/pytest/efa/test_rma_pingpong.py | 12 ++++---- 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/fabtests/pytest/efa/conftest.py b/fabtests/pytest/efa/conftest.py index 8874b53a0a7..5c8928bdef5 100644 --- a/fabtests/pytest/efa/conftest.py +++ b/fabtests/pytest/efa/conftest.py @@ -1,16 +1,39 @@ import pytest +# The memory types for bi-directional tests. +memory_type_list_bi_dir = [ + pytest.param("host_to_host"), + pytest.param("host_to_cuda", marks=pytest.mark.cuda_memory), + pytest.param("cuda_to_cuda", marks=pytest.mark.cuda_memory), + pytest.param("host_to_neuron", marks=pytest.mark.neuron_memory), + pytest.param("neuron_to_neuron", marks=pytest.mark.neuron_memory), +] -@pytest.fixture(scope="module", params=["host_to_host", - pytest.param("host_to_cuda", marks=pytest.mark.cuda_memory), - pytest.param("cuda_to_host", marks=pytest.mark.cuda_memory), - pytest.param("cuda_to_cuda", marks=pytest.mark.cuda_memory), - pytest.param("neuron_to_neuron", marks=pytest.mark.neuron_memory), - pytest.param("neuron_to_host", marks=pytest.mark.neuron_memory), - pytest.param("host_to_neuron", marks=pytest.mark.neuron_memory)]) +# Add more memory types that are useful for uni-directional tests. 
+memory_type_list_all = memory_type_list_bi_dir + [ + pytest.param("cuda_to_host", marks=pytest.mark.cuda_memory), + pytest.param("neuron_to_host", marks=pytest.mark.neuron_memory), +] + +@pytest.fixture(scope="module", params=memory_type_list_all) def memory_type(request): return request.param +@pytest.fixture(scope="module", params=memory_type_list_bi_dir) +def memory_type_bi_dir(request): + return request.param + +@pytest.fixture(scope="module", params=["read", "writedata", "write"]) +def rma_operation_type(request): + return request.param + +@pytest.fixture(scope="module") +def check_rma_bw_memory_type(memory_type, rma_operation_type): + is_test_bi_dir = False if rma_operation_type == "writedata" else True + if is_test_bi_dir and (memory_type not in [_.values[0] for _ in memory_type_list_bi_dir]): + pytest.skip("Duplicated memory type for bi-directional test") + + @pytest.fixture(scope="module", params=["r:0,4,64", "r:4048,4,4148", "r:8000,4,9000", diff --git a/fabtests/pytest/efa/test_rdm.py b/fabtests/pytest/efa/test_rdm.py index d1a553abca7..ec1f3044c34 100644 --- a/fabtests/pytest/efa/test_rdm.py +++ b/fabtests/pytest/efa/test_rdm.py @@ -9,10 +9,10 @@ @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): +def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type_bi_dir, completion_type): command = "fi_rdm_pingpong" + " " + perf_progress_model_cli efa_run_client_server_test(cmdline_args, command, iteration_type, - completion_semantic, memory_type, "all", completion_type=completion_type) + completion_semantic, memory_type_bi_dir, "all", completion_type=completion_type) @pytest.mark.functional @pytest.mark.serial @@ -21,9 +21,9 @@ def test_mr_exhaustion_rdm_pingpong(cmdline_args): "transmit_complete", "host_to_host", "all", timeout=1000) 
@pytest.mark.functional -def test_rdm_pingpong_range(cmdline_args, completion_semantic, memory_type, message_size): +def test_rdm_pingpong_range(cmdline_args, completion_semantic, memory_type_bi_dir, message_size): efa_run_client_server_test(cmdline_args, "fi_rdm_pingpong", "short", - completion_semantic, memory_type, message_size) + completion_semantic, memory_type_bi_dir, message_size) @pytest.mark.functional def test_rdm_pingpong_no_inject_range(cmdline_args, completion_semantic, inject_message_size): @@ -33,15 +33,15 @@ def test_rdm_pingpong_no_inject_range(cmdline_args, completion_semantic, inject_ @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): +def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type_bi_dir, completion_type): command = "fi_rdm_tagged_pingpong" + " " + perf_progress_model_cli efa_run_client_server_test(cmdline_args, command, iteration_type, - completion_semantic, memory_type, "all", completion_type=completion_type) + completion_semantic, memory_type_bi_dir, "all", completion_type=completion_type) @pytest.mark.functional -def test_rdm_tagged_pingpong_range(cmdline_args, completion_semantic, memory_type, message_size): +def test_rdm_tagged_pingpong_range(cmdline_args, completion_semantic, memory_type_bi_dir, message_size): efa_run_client_server_test(cmdline_args, "fi_rdm_tagged_pingpong", "short", - completion_semantic, memory_type, message_size) + completion_semantic, memory_type_bi_dir, message_size) @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), @@ -116,13 +116,13 @@ def test_rdm_pingpong_1G(cmdline_args, completion_semantic): memory_type="host_to_host", warmup_iteration_type=0) @pytest.mark.functional -def test_rdm_pingpong_zcpy_recv(cmdline_args, 
memory_type, zcpy_recv_max_msg_size, zcpy_recv_message_size): +def test_rdm_pingpong_zcpy_recv(cmdline_args, memory_type_bi_dir, zcpy_recv_max_msg_size, zcpy_recv_message_size): if cmdline_args.server_id == cmdline_args.client_id: pytest.skip("no zero copy recv for intra-node communication") cmdline_args_copy = copy.copy(cmdline_args) cmdline_args_copy.append_environ("FI_EFA_ENABLE_SHM_TRANSFER=0") efa_run_client_server_test(cmdline_args_copy, f"fi_rdm_pingpong --max-msg-size {zcpy_recv_max_msg_size}", - "short", "transmit_complete", memory_type, zcpy_recv_message_size) + "short", "transmit_complete", memory_type_bi_dir, zcpy_recv_message_size) @pytest.mark.functional def test_rdm_bw_zcpy_recv(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcpy_recv_message_size): diff --git a/fabtests/pytest/efa/test_rma_bw.py b/fabtests/pytest/efa/test_rma_bw.py index 58f26367c7f..3710db0075b 100644 --- a/fabtests/pytest/efa/test_rma_bw.py +++ b/fabtests/pytest/efa/test_rma_bw.py @@ -4,45 +4,41 @@ import copy -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_bw(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): +def test_rma_bw(cmdline_args, iteration_type, rma_operation_type, completion_semantic, memory_type, check_rma_bw_memory_type): command = "fi_rma_bw -e rdm" - command = command + " -o " + operation_type + " " + perf_progress_model_cli + command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) @pytest.mark.parametrize("env_vars", 
[["FI_EFA_TX_SIZE=64"], ["FI_EFA_RX_SIZE=64"], ["FI_EFA_TX_SIZE=64", "FI_EFA_RX_SIZE=64"]]) -def test_rma_bw_small_tx_rx(cmdline_args, operation_type, completion_semantic, memory_type, env_vars): +def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, completion_semantic, memory_type, env_vars, check_rma_bw_memory_type): cmdline_args_copy = copy.copy(cmdline_args) for env_var in env_vars: cmdline_args_copy.append_environ(env_var) # Use a window size larger than tx/rx size command = "fi_rma_bw -e rdm -W 128" - command = command + " -o " + operation_type + " " + perf_progress_model_cli + command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args_copy.timeout) efa_run_client_server_test(cmdline_args_copy, command, "short", completion_semantic, memory_type, "all", timeout=timeout) @pytest.mark.functional -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) -def test_rma_bw_range(cmdline_args, operation_type, completion_semantic, message_size, memory_type): +def test_rma_bw_range(cmdline_args, rma_operation_type, completion_semantic, message_size, memory_type, check_rma_bw_memory_type): command = "fi_rma_bw -e rdm" - command = command + " -o " + operation_type + command = command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, message_size, timeout=timeout) @pytest.mark.functional -@pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) -def test_rma_bw_range_no_inject(cmdline_args, operation_type, completion_semantic, inject_message_size): +def test_rma_bw_range_no_inject(cmdline_args, rma_operation_type, completion_semantic, inject_message_size): command = "fi_rma_bw -e rdm -j 0" - command = command + " -o " + operation_type + command = 
command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, "host_to_host", inject_message_size, timeout=timeout) diff --git a/fabtests/pytest/efa/test_rma_pingpong.py b/fabtests/pytest/efa/test_rma_pingpong.py index 29afcf4e062..b3fdf9c1408 100644 --- a/fabtests/pytest/efa/test_rma_pingpong.py +++ b/fabtests/pytest/efa/test_rma_pingpong.py @@ -14,23 +14,23 @@ def rma_pingpong_message_size(request): @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): +def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm" command = command + " -o " + operation_type + " " + perf_progress_model_cli - efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all") + efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type_bi_dir, "all") @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata"]) -def test_rma_pingpong_range(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type): +def test_rma_pingpong_range(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm" command = command + " -o " + operation_type - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, rma_pingpong_message_size) + efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) @pytest.mark.functional @pytest.mark.parametrize("operation_type", 
["writedata"]) -def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type): +def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm -j 0" command = command + " -o " + operation_type - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, rma_pingpong_message_size) + efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) From af2dba1073f84e821ca5aa4d748bef819fab7238 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 12 Sep 2024 22:02:25 +0000 Subject: [PATCH 034/393] prov/efa: Remove unused fi_errno in efa_rdm_write_error_msg Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ope.c | 4 ++-- prov/efa/src/rdm/efa_rdm_util.c | 3 +-- prov/efa/src/rdm/efa_rdm_util.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index a44f2debacc..5b57d91847a 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -602,7 +602,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) err_entry.buf = rxe->cq_entry.buf; err_entry.data = rxe->cq_entry.data; err_entry.tag = rxe->cq_entry.tag; - if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, err, prov_errno, + if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, prov_errno, &err_entry.err_data, &err_entry.err_data_size))) { err_entry.err_data_size = 0; } @@ -694,7 +694,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) err_entry.buf = txe->cq_entry.buf; err_entry.data = txe->cq_entry.data; err_entry.tag = txe->cq_entry.tag; - if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, err, prov_errno, + if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, prov_errno, 
&err_entry.err_data, &err_entry.err_data_size))) { err_entry.err_data_size = 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 868509162b5..02880c09dfd 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -101,13 +101,12 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) * @brief Write the error message and return its byte length * @param[in] ep EFA RDM endpoint * @param[in] addr Remote peer fi_addr_t - * @param[in] err FI_* error code(must be positive) * @param[in] prov_errno EFA provider * error code(must be positive) * @param[out] buf Pointer to the address of error data written by this function * @param[out] buflen Pointer to the returned error data size * @return A status code. 0 if the error data was written successfully, otherwise a negative FI error code. */ -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int err, int prov_errno, void **buf, size_t *buflen) +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) { char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index 1b2fc1da0a2..a2ba0083295 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -19,7 +19,7 @@ bool efa_rdm_get_use_device_rdma(uint32_t fabric_api_version); void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int err, int prov_errno, void **buf, size_t *buflen); +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); #ifdef ENABLE_EFA_POISONING static inline void efa_rdm_poison_mem_region(void *ptr, size_t size) From ee4d5789efe19f14cc2a43cd94b16e4c04348f5e Mon Sep 17 
00:00:00 2001 From: Shi Jin Date: Thu, 12 Sep 2024 22:03:31 +0000 Subject: [PATCH 035/393] prov/efa: Improve the zero-copy recv error message. Extend the EFA_PROV_ERRNOS and efa_show_help to process the error message when the receiver has zcpy recv turned on but get a rtm pkt that it cannot handle. The extended error message includes possible root causes and the potential mitigations. Signed-off-by: Shi Jin --- prov/efa/src/efa_errno.h | 3 ++- prov/efa/src/efa_strerror.c | 5 +++++ prov/efa/src/rdm/efa_rdm_cq.c | 11 ++++++----- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 2b61b2f0464..1a147f0fbdf 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -104,7 +104,8 @@ _(4121, DGRAM_CQ_READ, Error reading from DGRAM CQ) \ _(4122, SHM_INTERNAL_ERROR, SHM internal error) \ _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ - _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) + _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \ + _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) /** @} */ diff --git a/prov/efa/src/efa_strerror.c b/prov/efa/src/efa_strerror.c index 11197816efd..35710501d0e 100644 --- a/prov/efa/src/efa_strerror.c +++ b/prov/efa/src/efa_strerror.c @@ -83,6 +83,11 @@ void efa_show_help(enum efa_errno err) { "which indicates the error is likely due to the peer process no " "longer being present."; break; + case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX: + help = "This error is detected locally. 
" + "Please consider matching the local and remote libfabric versions, or turning off " + "the zero-copy recv feature by setting FI_EFA_USE_ZCPY_RX=0 in the environment"; + break; default: return; } diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 3d34293e7e7..17a540c4da7 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -371,12 +371,13 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * QP and we cannot cancel that. */ if (OFI_UNLIKELY(ep->use_zcpy_rx && efa_rdm_pkt_type_is_rtm(pkt_type))) { - EFA_WARN(FI_LOG_CQ, - "Invalid pkt type %d! Peer %d doesn't respect the request from this EP that" - " RTM packets must be sent to the user recv QP.\n", - base_hdr->type, (int)pkt_entry->addr); + void *errbuf; + size_t errbuf_len; - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE); + /* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */ + if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, &errbuf, &errbuf_len)) + EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX); efa_rdm_pke_release_rx(pkt_entry); return; } From 94922d6b2bd5f61572e9e533127e7cc85c2cba73 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:45:58 +0000 Subject: [PATCH 036/393] build(deps): bump github/codeql-action from 3.26.6 to 3.26.7 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.6 to 3.26.7. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/4dd16135b69a43b6c8efb853346f8437d92d3c93...8214744c546c1e5c8f03dde8fab3a7353211988d) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 21260a33bc0..f6c384a539b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/init@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/autobuild@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 # â„šī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/analyze@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index aee05e7af95..e1a251618a0 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 + uses: github/codeql-action/upload-sarif@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 with: sarif_file: results.sarif From 4b1d6ba71d4a1b8fd13f5a6db7d39051c136f775 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 27 Aug 2024 09:19:04 -0700 Subject: [PATCH 037/393] fabtests/multi_ep: separate MR resources per EP Instead of using one allocation and MR, separate into separate regions to test multiple MRs with multiple EPs Use common hmem alloc interfaces to properly use device support Signed-off-by: Alexia Ingerson --- fabtests/functional/multi_ep.c | 86 ++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index 4e9b25edc76..5fc2ab56dcf 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -46,17 +46,16 @@ #include #include "shared.h" +#include "hmem.h" static struct fid_ep **eps; -static char *data_bufs; -static char **send_bufs; -static char **recv_bufs; +static char **send_bufs, **recv_bufs; +static struct fid_mr **send_mrs, **recv_mrs; +static void **send_descs, **recv_descs; static 
struct fi_context *recv_ctx; static struct fi_context *send_ctx; static struct fid_cq **txcqs, **rxcqs; static struct fid_av **avs; -static struct fid_mr *data_mr = NULL; -static void *data_desc = NULL; static fi_addr_t *remote_addr; static bool shared_cq = false; static bool shared_av = false; @@ -71,9 +70,13 @@ static void free_ep_res() { int i; - FT_CLOSE_FID(data_mr); for (i = 0; i < num_eps; i++) { + FT_CLOSE_FID(send_mrs[i]); + FT_CLOSE_FID(recv_mrs[i]); FT_CLOSE_FID(eps[i]); + + (void) ft_hmem_free(opts.iface, (void *) send_bufs[i]); + (void) ft_hmem_free(opts.iface, (void *) recv_bufs[i]); } for (i = 0; i < num_eps; i++) { @@ -84,9 +87,12 @@ static void free_ep_res() free(txcqs); free(rxcqs); - free(data_bufs); free(send_bufs); free(recv_bufs); + free(send_mrs); + free(recv_mrs); + free(send_descs); + free(recv_descs); free(send_ctx); free(recv_ctx); free(remote_addr); @@ -94,38 +100,64 @@ static void free_ep_res() free(avs); } +static int reg_mrs(void) +{ + int i, ret; + + for (i = 0; i < num_eps; i++) { + ret = ft_reg_mr(fi, send_bufs[i], opts.transfer_size, + ft_info_to_mr_access(fi), + (FT_MR_KEY + 1) * (i + 1), opts.iface, + opts.device, &send_mrs[i], &send_descs[i]); + if (ret) + return ret; + + ret = ft_reg_mr(fi, recv_bufs[i], opts.transfer_size, + ft_info_to_mr_access(fi), + (FT_MR_KEY + 2) * (i + 2), opts.iface, + opts.device, &recv_mrs[i], &recv_descs[i]); + if (ret) + return ret; + } + + return FI_SUCCESS; +} + static int alloc_multi_ep_res() { - char *rx_buf_ptr; int i, ret; eps = calloc(num_eps, sizeof(*eps)); remote_addr = calloc(num_eps, sizeof(*remote_addr)); - send_bufs = calloc(num_eps, sizeof(*send_bufs)); - recv_bufs = calloc(num_eps, sizeof(*recv_bufs)); + send_mrs = calloc(num_eps, sizeof(*send_mrs)); + recv_mrs = calloc(num_eps, sizeof(*recv_mrs)); + send_descs = calloc(num_eps, sizeof(*send_descs)); + recv_descs = calloc(num_eps, sizeof(*recv_descs)); send_ctx = calloc(num_eps, sizeof(*send_ctx)); recv_ctx = calloc(num_eps, 
sizeof(*recv_ctx)); - data_bufs = calloc(num_eps * 2, opts.transfer_size); + send_bufs = calloc(num_eps, opts.transfer_size); + recv_bufs = calloc(num_eps, opts.transfer_size); + txcqs = calloc(num_eps, sizeof(*txcqs)); rxcqs = calloc(num_eps, sizeof(*rxcqs)); avs = calloc(num_eps, sizeof(*avs)); if (!eps || !remote_addr || !send_bufs || !recv_bufs || - !send_ctx || !recv_ctx || !data_bufs || !txcqs || !rxcqs) + !send_ctx || !recv_ctx || !send_bufs || !recv_bufs || + !send_mrs || !recv_mrs || !send_descs || !recv_descs || + !txcqs || !rxcqs) return -FI_ENOMEM; - rx_buf_ptr = data_bufs + opts.transfer_size * num_eps; for (i = 0; i < num_eps; i++) { - send_bufs[i] = data_bufs + opts.transfer_size * i; - recv_bufs[i] = rx_buf_ptr + opts.transfer_size * i; - } + ret = ft_hmem_alloc(opts.iface, opts.device, + (void **) &send_bufs[i], opts.transfer_size); + if (ret) + return ret; - ret = ft_reg_mr(fi, data_bufs, num_eps * 2 * opts.transfer_size, - ft_info_to_mr_access(fi), FT_MR_KEY + 1, opts.iface, - opts.device, &data_mr, &data_desc); - if (ret) { - free_ep_res(); - return ret; + ret = ft_hmem_alloc(opts.iface, opts.device, + (void **) &recv_bufs[i], opts.transfer_size); + if (ret) + return ret; } return 0; @@ -140,7 +172,8 @@ static int ep_post_rx(int idx) do { ret = fi_recv(eps[idx], recv_bufs[idx], opts.transfer_size, - data_desc, FI_ADDR_UNSPEC, &recv_ctx[idx]); + recv_descs[idx], FI_ADDR_UNSPEC, + &recv_ctx[idx]); if (ret == -FI_EAGAIN) (void) fi_cq_read(rxcqs[cq_read_idx], NULL, 0); @@ -164,7 +197,8 @@ static int ep_post_tx(int idx) do { ret = fi_send(eps[idx], send_bufs[idx], opts.transfer_size, - data_desc, remote_addr[idx], &send_ctx[idx]); + send_descs[idx], remote_addr[idx], + &send_ctx[idx]); if (ret == -FI_EAGAIN) (void) fi_cq_read(txcqs[cq_read_idx], NULL, 0); @@ -393,6 +427,10 @@ static int run_test(void) } } + ret = reg_mrs(); + if (ret) + goto out; + for (i = 0; i < num_eps; i++) { if (hints->ep_attr->type != FI_EP_MSG) { ret = enable_ep(i); From 
d4189bc3313e1cfddfee03a097ce115a4553da0f Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 27 Aug 2024 09:20:01 -0700 Subject: [PATCH 038/393] fabtests/common: create common raw key functions Pull some of the mr key/addr exchange into separate functions that fill in and convert rma info before being exchanged so that separate tests can call this function and support FI_MR_RAW and FI_MR_VIRT_ADDR more easily Signed-off-by: Alexia Ingerson --- fabtests/common/shared.c | 71 +++++++++++++++++++++++++-------------- fabtests/include/shared.h | 5 +++ 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index c8fe2696156..320954ecc2c 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -1706,40 +1706,67 @@ int ft_init_av_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr, return 0; } -int ft_exchange_keys(struct fi_rma_iov *peer_iov) +int ft_fill_rma_info(struct fid_mr *mr, void *mr_buf, + struct fi_rma_iov *rma_iov, size_t *key_size, + size_t *rma_iov_len) { - char temp[FT_MAX_CTRL_MSG]; - struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp; - size_t key_size = 0, len; uint64_t addr; + size_t buf_len = *rma_iov_len; int ret; if (fi->domain_attr->mr_mode & FI_MR_RAW) { - ret = fi_mr_raw_attr(mr, &addr, NULL, &key_size, 0); + *key_size = 0; + ret = fi_mr_raw_attr(mr, &addr, NULL, key_size, 0); if (ret != -FI_ETOOSMALL) return ret; - len = sizeof(*rma_iov) + key_size - sizeof(rma_iov->key); - if (len > FT_MAX_CTRL_MSG) { + *rma_iov_len = sizeof(*rma_iov) + *key_size - sizeof(rma_iov->key); + if (*rma_iov_len > buf_len) { FT_PRINTERR("Raw key too large for ctrl message", -FI_ETOOSMALL); return -FI_ETOOSMALL; } - } else { - len = sizeof(*rma_iov); - } - rma_iov->addr = fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? 
- (uintptr_t) rx_buf + ft_rx_prefix_size() : 0; - - if (fi->domain_attr->mr_mode & FI_MR_RAW) { ret = fi_mr_raw_attr(mr, &addr, (uint8_t *) &rma_iov->key, - &key_size, 0); + key_size, 0); if (ret) return ret; } else { rma_iov->key = fi_mr_key(mr); + *key_size = sizeof(rma_iov->key); + *rma_iov_len = sizeof(*rma_iov); } + rma_iov->addr = fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? + (uintptr_t) mr_buf : 0; + + return FI_SUCCESS; +} + +int ft_get_rma_info(struct fi_rma_iov *rma_iov, + struct fi_rma_iov *peer_iov, size_t key_size) +{ + if (fi->domain_attr->mr_mode & FI_MR_RAW) { + peer_iov->addr = rma_iov->addr; + peer_iov->len = rma_iov->len; + return fi_mr_map_raw(domain, rma_iov->addr, + (uint8_t *) &rma_iov->key, key_size, + &peer_iov->key, 0); + } + *peer_iov = *rma_iov; + return FI_SUCCESS; +} + +int ft_exchange_keys(struct fi_rma_iov *peer_iov) +{ + char temp[FT_MAX_CTRL_MSG]; + struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp; + size_t key_size, len = FT_MAX_CTRL_MSG; + int ret; + + ret = ft_fill_rma_info(mr, rx_buf, rma_iov, &key_size, &len); + if (ret) + return ret; + ret = ft_hmem_copy_to(opts.iface, opts.device, tx_buf + ft_tx_prefix_size(), temp, len); if (ret) @@ -1758,17 +1785,9 @@ int ft_exchange_keys(struct fi_rma_iov *peer_iov) if (ret) return ret; - if (fi->domain_attr->mr_mode & FI_MR_RAW) { - peer_iov->addr = rma_iov->addr; - peer_iov->len = rma_iov->len; - ret = fi_mr_map_raw(domain, rma_iov->addr, - (uint8_t *) &rma_iov->key, key_size, - &peer_iov->key, 0); - if (ret) - return ret; - } else { - *peer_iov = *rma_iov; - } + ret = ft_get_rma_info(rma_iov, peer_iov, key_size); + if (ret) + return ret; ret = ft_post_rx(ep, rx_size, &rx_ctx); if (ret) diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index eb4e3238f10..55c807bbe31 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -466,6 +466,11 @@ int ft_init_av_dst_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr, fi_addr_t *remote_addr); int 
ft_init_av_addr(struct fid_av *av, struct fid_ep *ep, fi_addr_t *addr); +int ft_fill_rma_info(struct fid_mr *mr, void *mr_buf, + struct fi_rma_iov *rma_iov, size_t *key_size, + size_t *rma_iov_len); +int ft_get_rma_info(struct fi_rma_iov *rma_iov, + struct fi_rma_iov *peer_iov, size_t key_size); int ft_exchange_keys(struct fi_rma_iov *peer_iov); void ft_fill_mr_attr(struct iovec *iov, struct fi_mr_dmabuf *dmabuf, int iov_count, uint64_t access, From 30abcbd7eddca73c000663c3a0422f447de9bb00 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 27 Aug 2024 09:29:58 -0700 Subject: [PATCH 039/393] fabtests/multi_ep: add RMA validation To test RMA in addition to FI_MSG, make the following changes: - Register all MRs for RMA use - Use existing message test to exchange RMA information (key, address) - Add RMA test with data validation after the message exchange test Also includes renaming remote_addr to remote_fiaddr to distinguish between fiaddr and RMA addr Signed-off-by: Alexia Ingerson --- fabtests/functional/multi_ep.c | 205 +++++++++++++++++++++++++++------ 1 file changed, 169 insertions(+), 36 deletions(-) diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index 5fc2ab56dcf..d5ea2613236 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -52,11 +52,12 @@ static struct fid_ep **eps; static char **send_bufs, **recv_bufs; static struct fid_mr **send_mrs, **recv_mrs; static void **send_descs, **recv_descs; +static struct fi_rma_iov *peer_iovs; static struct fi_context *recv_ctx; static struct fi_context *send_ctx; static struct fid_cq **txcqs, **rxcqs; static struct fid_av **avs; -static fi_addr_t *remote_addr; +static fi_addr_t *remote_fiaddr; static bool shared_cq = false; static bool shared_av = false; int num_eps = 3; @@ -71,6 +72,9 @@ static void free_ep_res() int i; for (i = 0; i < num_eps; i++) { + if (fi->domain_attr->mr_mode & FI_MR_RAW) + (void) fi_mr_unmap_key(domain, peer_iovs[i].key); + 
FT_CLOSE_FID(send_mrs[i]); FT_CLOSE_FID(recv_mrs[i]); FT_CLOSE_FID(eps[i]); @@ -91,11 +95,12 @@ static void free_ep_res() free(recv_bufs); free(send_mrs); free(recv_mrs); + free(peer_iovs); free(send_descs); free(recv_descs); free(send_ctx); free(recv_ctx); - free(remote_addr); + free(remote_fiaddr); free(eps); free(avs); } @@ -128,11 +133,12 @@ static int alloc_multi_ep_res() int i, ret; eps = calloc(num_eps, sizeof(*eps)); - remote_addr = calloc(num_eps, sizeof(*remote_addr)); + remote_fiaddr = calloc(num_eps, sizeof(*remote_fiaddr)); send_mrs = calloc(num_eps, sizeof(*send_mrs)); recv_mrs = calloc(num_eps, sizeof(*recv_mrs)); send_descs = calloc(num_eps, sizeof(*send_descs)); recv_descs = calloc(num_eps, sizeof(*recv_descs)); + peer_iovs = calloc(num_eps, sizeof(*peer_iovs)); send_ctx = calloc(num_eps, sizeof(*send_ctx)); recv_ctx = calloc(num_eps, sizeof(*recv_ctx)); send_bufs = calloc(num_eps, opts.transfer_size); @@ -142,10 +148,10 @@ static int alloc_multi_ep_res() rxcqs = calloc(num_eps, sizeof(*rxcqs)); avs = calloc(num_eps, sizeof(*avs)); - if (!eps || !remote_addr || !send_bufs || !recv_bufs || + if (!eps || !remote_fiaddr || !send_bufs || !recv_bufs || !send_ctx || !recv_ctx || !send_bufs || !recv_bufs || !send_mrs || !recv_mrs || !send_descs || !recv_descs || - !txcqs || !rxcqs) + !txcqs || !rxcqs || !peer_iovs) return -FI_ENOMEM; for (i = 0; i < num_eps; i++) { @@ -182,22 +188,16 @@ static int ep_post_rx(int idx) return ret; } -static int ep_post_tx(int idx) +static int ep_post_tx(int idx, size_t len) { int ret, cq_read_idx = idx; if (shared_cq) cq_read_idx = 0; - if (ft_check_opts(FT_OPT_VERIFY_DATA)) { - ret = ft_fill_buf(send_bufs[idx], opts.transfer_size); - if (ret) - return ret; - } - do { - ret = fi_send(eps[idx], send_bufs[idx], opts.transfer_size, - send_descs[idx], remote_addr[idx], + ret = fi_send(eps[idx], send_bufs[idx], len, + send_descs[idx], remote_fiaddr[idx], &send_ctx[idx]); if (ret == -FI_EAGAIN) (void) 
fi_cq_read(txcqs[cq_read_idx], NULL, 0); @@ -207,10 +207,54 @@ static int ep_post_tx(int idx) return ret; } -static int do_transfers(void) +static int ep_post_write(int idx) +{ + int ret, cq_read_idx = idx; + + if (shared_cq) + cq_read_idx = 0; + + do { + ret = fi_write(eps[idx], send_bufs[idx], opts.transfer_size, + send_descs[idx], remote_fiaddr[idx], + peer_iovs[idx].addr, peer_iovs[idx].key, + &send_ctx[idx]); + if (ret == -FI_EAGAIN) + (void) fi_cq_read(txcqs[cq_read_idx], NULL, 0); + + } while (ret == -FI_EAGAIN); + + return ret; +} + +static int get_one_comp(struct fid_cq *cq) +{ + struct fi_cq_err_entry comp; + int ret, i; + + do { + ret = fi_cq_read(cq, &comp, 1); + if (ret > 0) + break; + + if (ret < 0 && ret != -FI_EAGAIN) + return ret; + + if (!shared_cq) { + /* Drive progress on all EPs in case peer is waiting on + * different EP pair + */ + for (i = 0; i < num_eps; i++) + (void) fi_cq_read(rxcqs[i], NULL, 0); + } + } while (1); + + return FI_SUCCESS; +} + +static int sync_all(void) { int i, ret, cq_read_idx; - uint64_t cur; for (i = 0; i < num_eps; i++) { ret = ep_post_rx(i); @@ -218,34 +262,120 @@ static int do_transfers(void) FT_PRINTERR("fi_recv", ret); return ret; } + + ret = ep_post_tx(i, 0); + if (ret) { + FT_PRINTERR("fi_send", ret); + return ret; + } + + cq_read_idx = shared_cq ? 
0 : i; + + ret = get_one_comp(txcqs[cq_read_idx]); + if (ret) + return ret; + + ret = get_one_comp(rxcqs[cq_read_idx]); + if (ret) + return ret; } + return FI_SUCCESS; +} + +static int do_sends(void) +{ + char temp[FT_MAX_CTRL_MSG]; + struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp; + int i, ret, cq_read_idx; + size_t key_size, len; - printf("Send to all %d remote EPs\n", num_eps); for (i = 0; i < num_eps; i++) { - ret = ep_post_tx(i); + ret = ep_post_rx(i); + if (ret) { + FT_PRINTERR("fi_recv", ret); + return ret; + } + } + + memset(peer_iovs, 0, sizeof(*peer_iovs) * num_eps); + + printf("Send RMA info to all %d remote EPs\n", num_eps); + for (i = 0; i < num_eps; i++) { + len = opts.transfer_size; + ret = ft_fill_rma_info(recv_mrs[i], recv_bufs[i], rma_iov, + &key_size, &len); + if (ret) + return ret; + + ret = ft_hmem_copy_to(opts.iface, opts.device, send_bufs[i], + rma_iov, len); + if (ret) + return ret; + + ret = ep_post_tx(i, len); if (ret) { FT_PRINTERR("fi_send", ret); return ret; } + + cq_read_idx = shared_cq ? 
0 : i; + + ret = get_one_comp(rxcqs[cq_read_idx]); + if (ret) + return ret; + + ret = get_one_comp(txcqs[cq_read_idx]); + if (ret) + return ret; } printf("Wait for all messages from peer\n"); for (i = 0; i < num_eps; i++) { - if (shared_cq) - cq_read_idx = 0; - else - cq_read_idx = i; - cur = 0; - ret = ft_get_cq_comp(txcqs[cq_read_idx], &cur, 1, -1); - if (ret < 0) + ret = ft_hmem_copy_from(opts.iface, opts.device, rma_iov, + recv_bufs[i], len); + if (ret) + return ret; + + ret = ft_get_rma_info(rma_iov, &peer_iovs[i], key_size); + if (ret) + return ret; + } + + ret = sync_all(); + if (ret) + return ret; + + printf("PASSED multi ep sends\n"); + return 0; +} + +static int do_rma(void) +{ + int i, ret, cq_read_idx; + + for (i = 0; i < num_eps; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = ft_fill_buf(send_bufs[i], opts.transfer_size); + if (ret) + return ret; + } + ret = ep_post_write(i); + if (ret) return ret; + } - cur = 0; - ret = ft_get_cq_comp(rxcqs[cq_read_idx], &cur, 1, -1); - if (ret < 0) + printf("Wait for all writes from peer\n"); + for (i = 0; i < num_eps; i++) { + cq_read_idx = shared_cq ? 
0 : i; + ret = get_one_comp(txcqs[cq_read_idx]); + if (ret) return ret; } + ret = sync_all(); + if (ret) + return ret; + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { for (i = 0; i < num_eps; i++) { ret = ft_check_buf(recv_bufs[i], opts.transfer_size); @@ -255,11 +385,7 @@ static int do_transfers(void) printf("Data check OK\n"); } - ret = ft_finalize_ep(ep); - if (ret) - return ret; - - printf("PASSED multi ep\n"); + printf("PASSED multi ep writes\n"); return 0; } @@ -381,7 +507,7 @@ static int enable_ep(int idx) if (ret) return ret; - ret = ft_init_av_addr(avs[av_bind_idx], eps[idx], &remote_addr[idx]); + ret = ft_init_av_addr(avs[av_bind_idx], eps[idx], &remote_fiaddr[idx]); if (ret) return ret; @@ -439,8 +565,15 @@ static int run_test(void) } } - ret = do_transfers(); + ret = do_sends(); + if (ret) + goto out; + + ret = do_rma(); + if (ret) + goto out; + ret = ft_finalize_ep(ep); out: free_ep_res(); return ret; @@ -504,7 +637,7 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = argv[optind]; - hints->caps = FI_MSG; + hints->caps = FI_MSG | FI_RMA; hints->mode = FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; From c29c70eb3af8c3ccbdb20a22c2e8aa0b8dbc7fa8 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 3 Jul 2024 10:37:10 -0700 Subject: [PATCH 040/393] fabtests/multi_ep: add closing and reopening of MRs Add extra stress testing on multiple EPs/MRs by closing all the MRs, re-registering them, and re-running the whole test sequence Signed-off-by: Alexia Ingerson --- fabtests/functional/multi_ep.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index d5ea2613236..d4a1b333ad0 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -565,6 +565,30 @@ static int run_test(void) } } + ret = do_sends(); + if (ret) + goto out; + + ret = do_rma(); + if (ret) + goto out; + + 
printf("Testing closing and re-registering all MRs and retesting\n"); + for (i = 0; i < num_eps; i++) { + if (fi->domain_attr->mr_mode & FI_MR_RAW) { + ret = fi_mr_unmap_key(domain, peer_iovs[i].key); + if (ret) + goto out; + } + + FT_CLOSE_FID(send_mrs[i]); + FT_CLOSE_FID(recv_mrs[i]); + } + + ret = reg_mrs(); + if (ret) + goto out; + ret = do_sends(); if (ret) goto out; From cfde54e5e3c2ecfd71b79d568594d13f46efe3d1 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 21 Aug 2024 18:24:05 -0700 Subject: [PATCH 041/393] fabtests/multi_ep: use common long ops, switch shared-av and cq opts Change the --shared-av and --shared-cq into short opts -A and -Q, respectively. This allows the multi_ep test to make sure of the common long opts Signed-off-by: Alexia Ingerson --- fabtests/functional/multi_ep.c | 17 +++++------------ fabtests/functional/recv_cancel.c | 1 + fabtests/pytest/efa/test_multi_ep.py | 2 +- fabtests/scripts/runfabtests.sh | 4 ++-- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index d4a1b333ad0..ed9fab93ec0 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -616,14 +616,7 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - int lopt_idx = 0; - struct option long_opts[] = { - {"shared-av", no_argument, NULL, LONG_OPT_SHARED_AV}, - {"shared-cq", no_argument, NULL, LONG_OPT_SHARED_CQ}, - {0, 0, 0, 0} - }; - - while ((op = getopt_long(argc, argv, "c:vh" ADDR_OPTS INFO_OPTS, + while ((op = getopt_long(argc, argv, "c:vhAQ" ADDR_OPTS INFO_OPTS, long_opts, &lopt_idx)) != -1) { switch (op) { default: @@ -636,10 +629,10 @@ int main(int argc, char **argv) case 'v': opts.options |= FT_OPT_VERIFY_DATA; break; - case LONG_OPT_SHARED_AV: + case 'A': shared_av = true; break; - case LONG_OPT_SHARED_CQ: + case 'Q': shared_cq = true; break; case '?': @@ -648,10 +641,10 @@ int main(int argc, char **argv) FT_PRINT_OPTS_USAGE("-c ", "number of 
endpoints to create and test (def 3)"); FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); - FT_PRINT_OPTS_USAGE("--shared-cq", + FT_PRINT_OPTS_USAGE("-Q", "Share tx/rx cq among endpoints. \n" "By default each ep has its own tx/rx cq"); - FT_PRINT_OPTS_USAGE("--shared-av", + FT_PRINT_OPTS_USAGE("-A", "Share the av among endpoints. \n" "By default each ep has its own av"); return EXIT_FAILURE; diff --git a/fabtests/functional/recv_cancel.c b/fabtests/functional/recv_cancel.c index dc7cf3d072a..d943f7d8259 100644 --- a/fabtests/functional/recv_cancel.c +++ b/fabtests/functional/recv_cancel.c @@ -200,6 +200,7 @@ static int recv_cancel_host(void) static int run_test(void) { int ret; + if (hints->ep_attr->type == FI_EP_MSG) ret = ft_init_fabric_cm(); else diff --git a/fabtests/pytest/efa/test_multi_ep.py b/fabtests/pytest/efa/test_multi_ep.py index 561919f1446..634529f0067 100644 --- a/fabtests/pytest/efa/test_multi_ep.py +++ b/fabtests/pytest/efa/test_multi_ep.py @@ -6,6 +6,6 @@ def test_multi_ep(cmdline_args, shared_cq): from common import ClientServerTest cmd = "fi_multi_ep -e rdm" if shared_cq: - cmd += " --shared-cq" + cmd += " -Q" test = ClientServerTest(cmdline_args, cmd) test.run() diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index e6ad879d4e5..fef5e111a51 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -129,8 +129,8 @@ functional_tests=( "fi_rdm_shared_av" "fi_multi_mr -e msg -V" "fi_multi_mr -e rdm -V" - "fi_multi_ep -e msg -v --shared-av" - "fi_multi_ep -e rdm -v --shared-av" + "fi_multi_ep -e msg -v -A" + "fi_multi_ep -e rdm -v -A" "fi_recv_cancel -e rdm -V" "fi_unexpected_msg -e msg -I 10 -v" "fi_unexpected_msg -e rdm -I 10 -v" From 669ca15913407701d742ee267dfb1a9a7055875e Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Jul 2024 13:14:56 -0700 Subject: [PATCH 042/393] fabtests/common: add common threading option Add long option --threading for tests to use to allow 
caller to set threading level This removes default setting of domain_attr->threading in various tests since the default is now universally set to FI_THREAD_DOMAIN Signed-off-by: Alexia Ingerson --- fabtests/benchmarks/dgram_pingpong.c | 1 - fabtests/benchmarks/msg_bw.c | 1 - fabtests/benchmarks/rdm_bw.c | 1 - fabtests/benchmarks/rdm_pingpong.c | 1 - fabtests/benchmarks/rdm_tagged_bw.c | 1 - fabtests/benchmarks/rdm_tagged_pingpong.c | 1 - fabtests/benchmarks/rma_pingpong.c | 1 - fabtests/common/shared.c | 22 +++++++++++++++++++ fabtests/functional/multi_ep.c | 2 ++ fabtests/include/shared.h | 5 ++++- .../efa/src/efa_exhaust_mr_reg_rdm_pingpong.c | 1 - 11 files changed, 28 insertions(+), 9 deletions(-) diff --git a/fabtests/benchmarks/dgram_pingpong.c b/fabtests/benchmarks/dgram_pingpong.c index cab09f206bf..c65460851ab 100644 --- a/fabtests/benchmarks/dgram_pingpong.c +++ b/fabtests/benchmarks/dgram_pingpong.c @@ -129,7 +129,6 @@ int main(int argc, char **argv) hints->caps = FI_MSG; hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/msg_bw.c b/fabtests/benchmarks/msg_bw.c index 751fd324bc6..9d613a0aa11 100644 --- a/fabtests/benchmarks/msg_bw.c +++ b/fabtests/benchmarks/msg_bw.c @@ -116,7 +116,6 @@ int main(int argc, char **argv) } hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; hints->tx_attr->tclass = FI_TC_BULK_DATA; diff --git a/fabtests/benchmarks/rdm_bw.c b/fabtests/benchmarks/rdm_bw.c index b355f21f5f1..d229cc7c1fc 100644 --- a/fabtests/benchmarks/rdm_bw.c +++ b/fabtests/benchmarks/rdm_bw.c @@ -82,7 +82,6 @@ int main(int argc, char **argv) hints->caps = FI_MSG; hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; - 
hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_BULK_DATA; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_pingpong.c b/fabtests/benchmarks/rdm_pingpong.c index d521fc2eee0..9eb5a0e0de5 100644 --- a/fabtests/benchmarks/rdm_pingpong.c +++ b/fabtests/benchmarks/rdm_pingpong.c @@ -103,7 +103,6 @@ int main(int argc, char **argv) hints->caps = FI_MSG; hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_tagged_bw.c b/fabtests/benchmarks/rdm_tagged_bw.c index 239f0fef866..c2d795edb3a 100644 --- a/fabtests/benchmarks/rdm_tagged_bw.c +++ b/fabtests/benchmarks/rdm_tagged_bw.c @@ -107,7 +107,6 @@ int main(int argc, char **argv) hints->caps = FI_TAGGED; hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_BULK_DATA; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_tagged_pingpong.c b/fabtests/benchmarks/rdm_tagged_pingpong.c index a54725e5186..a0288ad7ee1 100644 --- a/fabtests/benchmarks/rdm_tagged_pingpong.c +++ b/fabtests/benchmarks/rdm_tagged_pingpong.c @@ -105,7 +105,6 @@ int main(int argc, char **argv) hints->caps = FI_TAGGED; hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rma_pingpong.c b/fabtests/benchmarks/rma_pingpong.c index be4cd71302c..76742f27d3a 100644 --- a/fabtests/benchmarks/rma_pingpong.c +++ b/fabtests/benchmarks/rma_pingpong.c @@ -94,7 +94,6 @@ int main(int argc, char **argv) hints->caps = FI_MSG | FI_RMA | FI_WRITE | FI_REMOTE_WRITE; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; 
hints->mode = FI_CONTEXT; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; while ((op = getopt_long(argc, argv, "Uh" CS_OPTS INFO_OPTS API_OPTS diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 320954ecc2c..11b7a638fa2 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -1040,6 +1040,8 @@ int ft_getinfo(struct fi_info *hints, struct fi_info **info) hints->domain_attr->mr_mode |= FI_MR_HMEM; } + hints->domain_attr->threading = opts.threading; + ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); if (ret) { FT_PRINTERR("fi_getinfo", ret); @@ -4240,6 +4242,8 @@ void ft_longopts_usage() "maximum untagged message size"); FT_PRINT_OPTS_USAGE("--use-fi-more", "Run tests with FI_MORE"); + FT_PRINT_OPTS_USAGE("--threading", + "threading model: safe|completion|domain (default:domain)"); } int debug_assert; @@ -4253,6 +4257,7 @@ struct option long_opts[] = { {"control-progress", required_argument, NULL, LONG_OPT_CONTROL_PROGRESS}, {"max-msg-size", required_argument, NULL, LONG_OPT_MAX_MSG_SIZE}, {"use-fi-more", no_argument, NULL, LONG_OPT_USE_FI_MORE}, + {"threading", required_argument, NULL, LONG_OPT_THREADING}, {NULL, 0, NULL, 0}, }; @@ -4270,6 +4275,20 @@ int ft_parse_progress_model_string(char* progress_str) return ret; } +static int ft_parse_threading_string(char* threading_str) +{ + int ret = -1; + + if (!strcasecmp("safe", threading_str)) + ret = FI_THREAD_SAFE; + else if (!strcasecmp("completion", threading_str)) + ret = FI_THREAD_COMPLETION; + else if (!strcasecmp("domain", threading_str)) + ret = FI_THREAD_DOMAIN; + + return ret; +} + int ft_parse_long_opts(int op, char *optarg) { switch (op) { @@ -4296,6 +4315,9 @@ int ft_parse_long_opts(int op, char *optarg) case LONG_OPT_USE_FI_MORE: opts.use_fi_more = 1; return 0; + case LONG_OPT_THREADING: + opts.threading = ft_parse_threading_string(optarg); + return 0; default: return EXIT_FAILURE; } diff --git 
a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index ed9fab93ec0..2bddb8b9b1c 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -620,6 +620,8 @@ int main(int argc, char **argv) long_opts, &lopt_idx)) != -1) { switch (op) { default: + if (!ft_parse_long_opts(op, optarg)) + continue; ft_parse_addr_opts(op, optarg, &opts); ft_parseinfo(op, optarg, hints, &opts); break; diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 55c807bbe31..4bed8cb572b 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -215,6 +215,7 @@ struct ft_opts { int force_prefix; enum fi_hmem_iface iface; uint64_t device; + enum fi_threading threading; char **argv; }; @@ -320,7 +321,8 @@ extern char default_port[8]; .iface = FI_HMEM_SYSTEM, \ .device = 0, \ .argc = argc, .argv = argv, \ - .address_format = FI_FORMAT_UNSPEC \ + .address_format = FI_FORMAT_UNSPEC, \ + .threading = FI_THREAD_DOMAIN \ } #define FT_STR_LEN 32 @@ -661,6 +663,7 @@ enum { LONG_OPT_CONTROL_PROGRESS, LONG_OPT_MAX_MSG_SIZE, LONG_OPT_USE_FI_MORE, + LONG_OPT_THREADING, }; extern int debug_assert; diff --git a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c index 9cde8bc43a3..ca38195e48d 100644 --- a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c +++ b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c @@ -81,7 +81,6 @@ int main(int argc, char **argv) hints->caps = FI_MSG; hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; - hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; ret = ft_init_fabric(); From 9112766ee7ae97c647eaa2cb57674070c4dace1c Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Jul 2024 13:30:26 -0700 Subject: [PATCH 043/393] scripts/runfabtests.sh: add more multi_ep tests As the most OFI resource intensive test, this test is a good test for testing different combinations 
of resource binding, especially with FI_THREAD_COMPLETION turned on. Applications that utilize threads will likely use a combination of separate domains, EPs, CQs, and AVs. Even though this test does not use threads and cannot test the protection against these resources, it can test different optimization paths within providers that may be triggered based on the threading level requested. Signed-off-by: Alexia Ingerson --- fabtests/scripts/runfabtests.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index fef5e111a51..8b402aa38e1 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -131,6 +131,18 @@ functional_tests=( "fi_multi_mr -e rdm -V" "fi_multi_ep -e msg -v -A" "fi_multi_ep -e rdm -v -A" + "fi_multi_ep -e msg -v -Q" + "fi_multi_ep -e rdm -v -Q" + "fi_multi_ep -e msg -v -A -Q" + "fi_multi_ep -e rdm -v -A -Q" + "fi_multi_ep -e msg -v --threading completion" + "fi_multi_ep -e rdm -v --threading completion" + "fi_multi_ep -e msg -v -A --threading completion" + "fi_multi_ep -e rdm -v -A --threading completion" + "fi_multi_ep -e msg -v -Q --threading completion" + "fi_multi_ep -e rdm -v -Q --threading completion" + "fi_multi_ep -e msg -v -A -Q --threading completion" + "fi_multi_ep -e rdm -v -A -Q --threading completion" "fi_recv_cancel -e rdm -V" "fi_unexpected_msg -e msg -I 10 -v" "fi_unexpected_msg -e rdm -I 10 -v" From 3d7bc021c8b4d7b7adfca7c17bb71e8cfa6463db Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Jul 2024 13:38:08 -0700 Subject: [PATCH 044/393] test_configs/tcp: remove multi_ep from tcp exclude Signed-off-by: Alexia Ingerson --- fabtests/test_configs/tcp/tcp.exclude | 2 -- 1 file changed, 2 deletions(-) diff --git a/fabtests/test_configs/tcp/tcp.exclude b/fabtests/test_configs/tcp/tcp.exclude index ec4b67d507c..63f13f99a24 100644 --- a/fabtests/test_configs/tcp/tcp.exclude +++ b/fabtests/test_configs/tcp/tcp.exclude @@ 
-6,8 +6,6 @@ atomic # dgram endpoints not supported dgram -multi_ep - # tests use counters, but counters not supported multi_mr rma_event From 5c9640559524136d2fb3b6bbb4275fc0a32a6c84 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 16 Sep 2024 13:32:06 -0700 Subject: [PATCH 045/393] core: 1.8 ABI compat ABI version is updated to 1.8 to accommodate fi_fabric2() API. Signed-off-by: Jessie Yang --- include/ofi_abi.h | 2 +- man/fabric.7.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/ofi_abi.h b/include/ofi_abi.h index 66f76e93909..c7f55b68d7a 100644 --- a/include/ofi_abi.h +++ b/include/ofi_abi.h @@ -111,7 +111,7 @@ extern "C" { * name appended with the ABI version that it is compatible with. */ -#define CURRENT_ABI "FABRIC_1.7" +#define CURRENT_ABI "FABRIC_1.8" #if HAVE_ALIAS_ATTRIBUTE == 1 #define DEFAULT_SYMVER_PRE(a) a##_ diff --git a/man/fabric.7.md b/man/fabric.7.md index 789dd1ef2f7..a25190a46b0 100644 --- a/man/fabric.7.md +++ b/man/fabric.7.md @@ -447,6 +447,14 @@ attributes: *fi_domain_attr* : Added max_ep_auth_key +## ABI 1.8 + +ABI version starting with libfabric 2.0. Added new fi_fabric2 API call. 
+Added new fields to the following attributes: + +*fi_domain_attr* +: Added max_group_id + # SEE ALSO [`fi_info`(1)](fi_info.1.html), From bd0f21602478151bb84c373b836d9ef94298e4b7 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Tue, 17 Sep 2024 06:10:54 +0000 Subject: [PATCH 046/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fabric.7 | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index 0d8f5686769..91659d5f7b3 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fabric" "7" "2024\-03\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fabric" "7" "2024\-09\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -499,6 +499,14 @@ Added new fields to the following attributes: .TP \f[I]fi_domain_attr\f[R] Added max_ep_auth_key +.SS ABI 1.8 +.PP +ABI version starting with libfabric 2.0. +Added new fi_fabric2 API call. +Added new fields to the following attributes: +.TP +\f[I]fi_domain_attr\f[R] +Added max_group_id .SH SEE ALSO .PP \f[C]fi_info\f[R](1), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3), From a34b25c34bf795bb344fcfc67b1242a007675cf3 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 13 Sep 2024 23:50:07 +0000 Subject: [PATCH 047/393] prov/efa: remove unused fields in efa_rdm_peer These dlists are now migrated to srx context. 
Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_peer.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index f6058883aed..8c2703fc140 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -60,8 +60,6 @@ struct efa_rdm_peer { int rnr_queued_pkt_cnt; /**< queued RNR packet count */ struct dlist_entry rnr_backoff_entry; /**< linked to efa_domain->peer_backoff_list */ struct dlist_entry handshake_queued_entry; /**< linked with efa_domain->handshake_queued_peer_list */ - struct dlist_entry rx_unexp_list; /**< a list of unexpected untagged rxe for this peer */ - struct dlist_entry rx_unexp_tagged_list; /**< a list of unexpected tagged rxe for this peer */ struct dlist_entry txe_list; /**< a list of txe related to this peer */ struct dlist_entry rxe_list; /**< a list of rxe relased to this peer */ struct dlist_entry overflow_pke_list; /**< a list of out-of-order pke that overflow the current recvwin */ From 4050272cf6979b01b135339d61496e8eac3f6913 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 13 Sep 2024 23:55:20 +0000 Subject: [PATCH 048/393] prov/efa: Shrink the size of extra_info array Currently, extra_info is a 256*8 bytes array in efa_rdm_ep and efa_rdm_peer, this is way too much as it can cover 256*64 bits of extra features/requests that RDM endpoint is likely to have. This patch reduces it to 4*8 bytes array that can cover 256 bits which should be enough for the foreseeable future. 
Signed-off-by: Shi Jin --- prov/efa/docs/efa_rdm_protocol_v4.md | 2 +- prov/efa/src/rdm/efa_rdm_protocol.h | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 968087ca89b..9016ec00958 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -414,7 +414,7 @@ Note, the field `extra_info` was named `features` when protocol v4 was initially only planned for extra features. Later, we discovered that the handshake subprotocol can also be used to pass additional request information, thus introduced the concept of "extra request" and renamed this field `extra_info`. -`nextra_p3` is number of `extra_info` flags of the endpoint plus 3. The "plus 3" is for historical reasons. +`nextra_p3` is number of 64-bit `extra_info` elements of the endpoint plus 3. The "plus 3" is for historical reasons. When protocol v4 was initially introduced, this field is named `maxproto`. The original plan was that protocol v4 can only have 64 extra features/requests. 
If the number of extra feature/request ever exceeds 64, the next feature/request will be defined as version 5 feature/request, (version 6 if the number exceeds 128, so on so diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index abcec6d091e..1b94b5338d1 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -41,7 +41,12 @@ struct efa_ep_addr { #define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6) #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7) #define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8 -#define EFA_RDM_MAX_NUM_EXINFO (256) +/* + * The length of 64-bit extra_info array used in efa_rdm_ep + * and efa_rdm_peer + * 4 means 64*4=256 bits of extra features or requests + */ +#define EFA_RDM_MAX_NUM_EXINFO (4) /* * Packet type ID of each packet type (section 1.3) From 8f45c7a2513be7a86a948ae44d951f2dcd262f38 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 16 Sep 2024 19:20:48 +0000 Subject: [PATCH 049/393] prov/efa: Remove unused fields in efa_rdm_ep Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep.h | 6 ------ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 +--- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 21b8f271647..248129ee3ea 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -91,12 +91,6 @@ struct efa_rdm_ep { /* Applicaiton's message prefix size. 
*/ size_t msg_prefix_size; - /* EFA RDM protocol's max header size */ - size_t max_proto_hdr_size; - - /* tx iov limit of EFA device */ - size_t efa_device_iov_limit; - /* threshold to release multi_recv buffer */ size_t min_multi_recv_size; diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index c706d784a5d..e6db119ec1b 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -570,17 +570,15 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->inject_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; - efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); efa_rdm_ep->shm_permitted = true; efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size; efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size; - efa_rdm_ep->max_proto_hdr_size = efa_rdm_pkt_type_get_max_hdr_size(); efa_rdm_ep->mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size; efa_rdm_ep->max_data_payload_size = efa_rdm_ep->mtu_size - sizeof(struct efa_rdm_ctsdata_hdr) - sizeof(struct efa_rdm_ctsdata_opt_connid_hdr); - efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_ep->max_proto_hdr_size; + efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_pkt_type_get_max_hdr_size(); if (efa_env.tx_queue_size > 0 && efa_env.tx_queue_size < efa_rdm_ep->efa_max_outstanding_tx_ops) From a1f06edcbff21ec4667f4ae13bdb3f30562831d8 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 17 Sep 2024 21:46:58 +0000 Subject: [PATCH 050/393] prov/efa/test: Use correct qp num in the mock Currently, the test uses hard-coded 0 as the return value of 
efa_mock_ibv_read_qp_num_return_mock, which is wrong as the qpn can be non-zero when there are other qp that share the device. This patch fixes this issue by using the correct qp num associated with the ep. Signed-off-by: Shi Jin --- prov/efa/test/efa_unit_test_cq.c | 6 +++--- prov/efa/test/efa_unit_test_ep.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index eb8ebe1ae5a..974a1f1ad8e 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -153,7 +153,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, will_return(efa_mock_ibv_end_poll_check_mock, NULL); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND); will_return(efa_mock_ibv_read_vendor_err_return_mock, vendor_error); - will_return(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); ret = fi_cq_read(resource->cq, &cq_entry, 1); /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */ assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); @@ -317,7 +317,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) * therefore use will_return_always() */ will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); - will_return_always(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return_always(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); /* the recv error will not populate to application cq because it's an EFA internal error and * and not related to any application recv. Currently we can only read the error from eq. 
@@ -612,7 +612,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc will_return(efa_mock_ibv_read_slid_return_mock, 0xffff); // slid=0xffff(-1) indicates an unknown AH will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size); will_return_maybe(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); - will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0); will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index de7454567c1..03d922e0fff 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -192,7 +192,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin will_return(efa_mock_ibv_next_poll_check_function_called_and_return_mock, ENOENT); will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); - will_return(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return(efa_mock_ibv_read_wc_flags_return_mock, 0); will_return(efa_mock_ibv_read_slid_return_mock, efa_rdm_ep_get_peer_ahn(efa_rdm_ep, peer_addr)); will_return(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); @@ -204,7 +204,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin */ will_return(efa_mock_ibv_end_poll_check_mock, NULL); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND); - will_return(efa_mock_ibv_read_qp_num_return_mock, 0); + will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); will_return(efa_mock_ibv_read_vendor_err_return_mock, FI_EFA_ERR_OTHER); will_return(efa_mock_ibv_start_poll_return_mock, 
IBV_WC_SUCCESS); From 93cd9ff3e0af7de3b9e1ed8888e6ee2312c3d499 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 17 Sep 2024 21:03:35 +0000 Subject: [PATCH 051/393] prov/efa: Remove duplicated user_info in efa_rdm_ep user_info is just a duplication of fi_info passed by fi_endpoint(), which is duplicated to base_ep.info already. Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep.h | 7 +++--- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 33 +++++++++++------------------ prov/efa/src/rdm/efa_rdm_ope.c | 2 +- prov/efa/test/efa_unit_test_ep.c | 4 ++-- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 248129ee3ea..a8a7fa8b149 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -187,7 +187,6 @@ struct efa_rdm_ep { */ bool use_device_rdma; - struct fi_info *user_info; /**< fi_info passed by user when calling fi_endpoint */ bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */ bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */ char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. fi_cq_readerr */ @@ -240,7 +239,7 @@ static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep) static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep) { - return ((ep->user_info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->user_info->rx_attr->msg_order & FI_ORDER_SAS)); + return ((ep->base_ep.info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->base_ep.info->rx_attr->msg_order & FI_ORDER_SAS)); } @@ -365,7 +364,7 @@ bool efa_rdm_ep_support_rdma_write(struct efa_rdm_ep *ep) * @return -FI_EOPNOTSUPP if FI_RMA wasn't requested, 0 if it was. 
*/ static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) { - if ((ep->user_info->caps & FI_RMA) == FI_RMA) + if ((ep->base_ep.info->caps & FI_RMA) == FI_RMA) return 0; EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_RMA capability, which was not requested.\n"); return -FI_EOPNOTSUPP; @@ -376,7 +375,7 @@ static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) { * @return -FI_EOPNOTSUPP if FI_ATOMIC wasn't requested, 0 if it was. */ static inline int efa_rdm_ep_cap_check_atomic(struct efa_rdm_ep *ep) { - if ((ep->user_info->caps & FI_ATOMIC) == FI_ATOMIC) + if ((ep->base_ep.info->caps & FI_ATOMIC) == FI_ATOMIC) return 0; EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_ATOMIC capability, which was not requested.\n"); return -FI_EOPNOTSUPP; diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index e6db119ec1b..956c2804f82 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -454,9 +454,9 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) } /* Max msg size is too large, turn off zcpy recv */ - if (ep->max_msg_size > ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size) { + if (ep->max_msg_size > ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size) { EFA_INFO(FI_LOG_EP_CTRL, "max_msg_size (%zu) is greater than the mtu size limit: %zu. 
Zero-copy receive protocol will be disabled.\n", - ep->max_msg_size, ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size); + ep->max_msg_size, ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size); ep->use_zcpy_rx = false; goto out; } @@ -552,12 +552,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->shm_ep = NULL; } - efa_rdm_ep->user_info = fi_dupinfo(info); - if (!efa_rdm_ep->user_info) { - ret = -FI_ENOMEM; - goto err_free_ep; - } - efa_rdm_ep->host_id = efa_get_host_id(efa_env.host_id_file); if (efa_rdm_ep->host_id) { EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id); @@ -999,9 +993,6 @@ static int efa_rdm_ep_close(struct fid *fid) if (efa_rdm_ep->pke_vec) free(efa_rdm_ep->pke_vec); - if (efa_rdm_ep->user_info) - fi_freeinfo(efa_rdm_ep->user_info); - free(efa_rdm_ep); return retv; } @@ -1137,7 +1128,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) use_shm = true; - assert(ep->user_info); + assert(ep->base_ep.info); /* * shm provider must make cuda calls to transfer cuda memory. @@ -1147,7 +1138,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) * AWS Neuron and Habana Synapse, have no SHM provider * support anyways, so disabling SHM will not impact them. 
*/ - if (((ep->user_info->caps & FI_HMEM) + if (((ep->base_ep.info->caps & FI_HMEM) && hmem_ops[FI_HMEM_CUDA].initialized && !ep->cuda_api_permitted) || !ep->shm_permitted) { @@ -1468,11 +1459,11 @@ static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool sh */ static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_size) { - if (max_msg_size > ep->user_info->ep_attr->max_msg_size) { + if (max_msg_size > ep->base_ep.info->ep_attr->max_msg_size) { EFA_WARN(FI_LOG_EP_CTRL, "Requested size of %zu for FI_OPT_MAX_MSG_SIZE " "exceeds the maximum (%zu)\n", - max_msg_size, ep->user_info->ep_attr->max_msg_size); + max_msg_size, ep->base_ep.info->ep_attr->max_msg_size); return -FI_EINVAL; } ep->max_msg_size = max_msg_size; @@ -1494,11 +1485,11 @@ static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_siz */ static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_size) { - if (max_rma_size > ep->user_info->ep_attr->max_msg_size) { + if (max_rma_size > ep->base_ep.info->ep_attr->max_msg_size) { EFA_WARN(FI_LOG_EP_CTRL, "Requested size of %zu for FI_OPT_MAX_RMA_SIZE " "exceeds the maximum (%zu)\n", - max_rma_size, ep->user_info->ep_attr->max_msg_size); + max_rma_size, ep->base_ep.info->ep_attr->max_msg_size); return -FI_EINVAL; } ep->max_rma_size = max_rma_size; @@ -1520,11 +1511,11 @@ static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_siz */ static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_msg_size) { - if (inject_msg_size > ep->user_info->tx_attr->inject_size) { + if (inject_msg_size > ep->base_ep.info->tx_attr->inject_size) { EFA_WARN(FI_LOG_EP_CTRL, "Requested size of %zu for FI_OPT_INJECT_MSG_SIZE " "exceeds the maximum (%zu)\n", - inject_msg_size, ep->user_info->tx_attr->inject_size); + inject_msg_size, ep->base_ep.info->tx_attr->inject_size); return -FI_EINVAL; } ep->inject_size = inject_msg_size; @@ -1546,11 +1537,11 @@ static int 
efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_m */ static int efa_rdm_ep_set_inject_rma_size(struct efa_rdm_ep *ep, size_t inject_rma_size) { - if (inject_rma_size > ep->user_info->tx_attr->inject_size) { + if (inject_rma_size > ep->base_ep.info->tx_attr->inject_size) { EFA_WARN(FI_LOG_EP_CTRL, "Requested size of %zu for FI_OPT_INJECT_RMA_SIZE " "exceeds the maximum (%zu)\n", - inject_rma_size, ep->user_info->tx_attr->inject_size); + inject_rma_size, ep->base_ep.info->tx_attr->inject_size); return -FI_EINVAL; } ep->inject_size = inject_rma_size; diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 5b57d91847a..80332ef70b9 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -58,7 +58,7 @@ void efa_rdm_txe_construct(struct efa_rdm_ope *txe, txe->cq_entry.len = ofi_total_iov_len(txe->iov, txe->iov_count); txe->cq_entry.buf = OFI_LIKELY(txe->cq_entry.len > 0) ? txe->iov[0].iov_base : NULL; - if (ep->user_info->mode & FI_MSG_PREFIX) { + if (ep->base_ep.info->mode & FI_MSG_PREFIX) { ofi_consume_iov_desc(txe->iov, txe->desc, &txe->iov_count, ep->msg_prefix_size); } txe->total_len = ofi_total_iov_len(txe->iov, txe->iov_count); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 03d922e0fff..1a54092c040 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -742,7 +742,7 @@ void test_efa_rdm_ep_rma_without_caps(struct efa_resource **state) /* ensure we don't have RMA capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal( efa_rdm_ep->user_info->caps & FI_RMA, 0); + assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_RMA, 0); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -793,7 +793,7 @@ void test_efa_rdm_ep_atomic_without_caps(struct efa_resource **state) /* ensure we don't have ATOMIC capability. 
*/ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal( efa_rdm_ep->user_info->caps & FI_ATOMIC, 0); + assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_ATOMIC, 0); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); From 143cf2ee02998ab8fbab80a37a105a5a13690e97 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 16 Sep 2024 16:14:45 -0700 Subject: [PATCH 052/393] core: Remove CURRENT_SYMVER() macro CURRENT_SYMVER() is just a shortcut for DEFAULT_SYMVER() with version set to CURRENT_ABI. Using DEFAULT_SYMVER() directly with explict version string makes the version info more visible from the code and eliminates the need to change from CURRENT_SYMVER() to DEFAULT_SYMVER() when the ABI version is updated in the future. Remove the definition of CURRENT_SYMVER() and CURRENT_ABI from the header. Update the documentation accordingly. Signed-off-by: Jianxin Xiong --- include/ofi_abi.h | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/include/ofi_abi.h b/include/ofi_abi.h index c7f55b68d7a..e16a900f55a 100644 --- a/include/ofi_abi.h +++ b/include/ofi_abi.h @@ -46,31 +46,16 @@ extern "C" { /* * ABI version support definitions. * - * CURRENT_ABI: - * This defines the current ABI version. The ABI version is separate from - * the packaging or interface versions. Whenever a change is - * added to the interfaces that breaks the ABI, this definition should be - * updated. If you don't know if a change breaks the ABI, then you shouldn't - * be modifying the header files under include/rdma! :P - * * DEFAULT_SYMVER_PRE: * This macro appends an underscore to a function name. It should be used * around any function that is exported from the library as the default call * that applications invoke. * - * CURRENT_SYMVER: - * This macro is placed after a function definition. 
It should be used with - * any function that is exported by the library and was added as part of the - * current ABI (identified by CURRENT_ABI) version. It results in the function - * being exported at the current ABI version. This is the macro to use when - * exporting new functions. - * * DEFAULT_SYMVER: - * This macro is similar to CURRENT_SYMVER, but is used to specify that a - * function, while the default interface that applications call, was added - * in a previous version of the ABI. Any function that was not impacted by - * an ABI change should use this macro. This often means converting functions - * marked as CURRENT_SYMVER to DEFAULT_SYMVER as part of the ABI update. + * This macro is placed after a function definition. It should be used to + * specify that a function is the default interface that applications call + * and is/was added in the specified ABI version. Any function that is new + * or is not impacted by an ABI change should use this macro. * * COMPAT_SYMVER: * The compatibility symbols are used to mark interfaces which were exported @@ -83,21 +68,19 @@ extern "C" { * ABI version 1.1 modified the behavior for function foo(). * This scenario would result in the following definitions. * - * CURRENT_ABI "MYLIB_1.1" - * * This function is the main entry point for function bar. * int DEFAULT_SYMVER_PRE(bar)(void) * { * ... * } - * DEFAULT_SYMVER(bar_, bar, MYLIB_1.0); + * DEFAULT_SYMVER(bar_, bar, FABRIC_1.0); * * This function is the main entry point for function foo. * int DEFAULT_SYMVER_PRE(foo)(void) * { * ... * } - * CURRENT_SYMVER(foo_, foo); + * DEFAULT_SYMVER(foo_, foo, FABRIC_1.1); * * This function is the old entry point for function foo, provided for * backwards compatibility. @@ -105,14 +88,12 @@ extern "C" { * { * ... 
* } - * COMPAT_SYMVER(foo_1_0, foo, MYLIB_1.0); + * COMPAT_SYMVER(foo_1_0, foo, FABRIC_1.0); * * By convention, the name of compatibility functions is the exported function * name appended with the ABI version that it is compatible with. */ -#define CURRENT_ABI "FABRIC_1.8" - #if HAVE_ALIAS_ATTRIBUTE == 1 #define DEFAULT_SYMVER_PRE(a) a##_ #else @@ -126,8 +107,6 @@ extern "C" { asm(".symver " #name "," #api "@" #ver "\n") #define DEFAULT_SYMVER(name, api, ver) \ asm(".symver " #name "," #api "@@" #ver "\n") -#define CURRENT_SYMVER(name, api) \ - asm(".symver " #name "," #api "@@" CURRENT_ABI "\n") #else @@ -136,11 +115,8 @@ extern "C" { #if HAVE_ALIAS_ATTRIBUTE == 1 #define DEFAULT_SYMVER(name, api, ver) \ extern typeof (name) api __attribute__((alias(#name))); -#define CURRENT_SYMVER(name, api) \ - extern typeof (name) api __attribute__((alias(#name))); #else #define DEFAULT_SYMVER(name, api, ver) -#define CURRENT_SYMVER(name, api) #endif /* HAVE_ALIAS_ATTRIBUTE == 1*/ #endif /* HAVE_SYMVER_SUPPORT */ From b0af713a57821a7c29939049c8b59c4b9ba47b65 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 17 Sep 2024 16:34:22 -0700 Subject: [PATCH 053/393] prov/efa: always use p2p for system memory P2P is always available for host memory. Unregistered buffer will be regarded as host memory as EFA provider requires FI_MR_HMEM. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_ep.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index a8a7fa8b149..8970c90ba58 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -303,13 +303,11 @@ bool efa_rdm_ep_should_write_rnr_completion(struct efa_rdm_ep *ep) static inline int efa_rdm_ep_use_p2p(struct efa_rdm_ep *efa_rdm_ep, struct efa_mr *efa_mr) { - if (!efa_mr) - return 0; - /* - * always send from host buffers if we have a descriptor + * P2P is always available for host memory (Unregistered buffer will be + * regarded as host memory as EFA provider requires FI_MR_HMEM) */ - if (efa_mr->peer.iface == FI_HMEM_SYSTEM) + if (!efa_mr || efa_mr->peer.iface == FI_HMEM_SYSTEM) return 1; if (efa_rdm_ep_domain(efa_rdm_ep)->hmem_info[efa_mr->peer.iface].p2p_supported_by_device) From 677bdebdae2bca9dfd8bc8a67b436feb58f66808 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 17 Sep 2024 16:35:25 -0700 Subject: [PATCH 054/393] prov/efa: rename p2p_available to mr_p2p_available Rename to mr_p2p_available to indicate that we check the mr is not NULL. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_pke_utils.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.c b/prov/efa/src/rdm/efa_rdm_pke_utils.c index 49600a01707..a19b275d9e1 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.c +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.c @@ -41,7 +41,7 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, size_t data_size) { int tx_iov_index, ret; - bool p2p_available; + bool mr_p2p_available; bool use_inline_buf; size_t tx_iov_offset, copied; struct efa_mr *iov_mr; @@ -62,14 +62,14 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, assert(tx_iov_index < ope->iov_count); assert(tx_iov_offset < ope->iov[tx_iov_index].iov_len); iov_mr = ope->desc[tx_iov_index]; - p2p_available = false; + mr_p2p_available = false; use_inline_buf = false; if (iov_mr) { ret = efa_rdm_ep_use_p2p(pke->ep, iov_mr); if (ret < 0) return ret; - p2p_available = ret; + mr_p2p_available = ret; } else if (!efa_mr_is_hmem(iov_mr) && payload_offset + data_size <= efa_rdm_ep_domain(pke->ep)->device->efa_attr.inline_buf_size) { use_inline_buf = true; @@ -85,7 +85,7 @@ ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, * a copy from the user buffer to the internal bounce buffer is needed. 
*/ if (tx_iov_offset + data_size <= ope->iov[tx_iov_index].iov_len && - (use_inline_buf || (p2p_available && !(ope->fi_flags & FI_INJECT)))) { + (use_inline_buf || (mr_p2p_available && !(ope->fi_flags & FI_INJECT)))) { pke->payload = (char *)ope->iov[tx_iov_index].iov_base + tx_iov_offset; pke->payload_size = data_size; pke->payload_mr = ope->desc[tx_iov_index]; @@ -250,15 +250,16 @@ int efa_rdm_pke_get_available_copy_methods(struct efa_rdm_ep *ep, bool *restrict gdrcopy_available) { int ret; - bool p2p_available; + bool mr_p2p_available; + assert(efa_mr); ret = efa_rdm_ep_use_p2p(ep, efa_mr); if (ret < 0) { return ret; } - p2p_available = ret; - *local_read_available = p2p_available && efa_rdm_ep_support_rdma_read(ep); + mr_p2p_available = ret; + *local_read_available = mr_p2p_available && efa_rdm_ep_support_rdma_read(ep); *cuda_memcpy_available = ep->cuda_api_permitted; *gdrcopy_available = efa_mr->peer.flags & OFI_HMEM_DATA_DEV_REG_HANDLE; From 8cf2f34ee3f27cdf496354f368f3b824c118272d Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 19 Sep 2024 20:28:28 +0000 Subject: [PATCH 055/393] fabtests/pytest/efa: Skip memory registration that hit device limit The EFA limit for single MR that enables remote write is 1M pages aka 4GB for regular pages. Currently test_unexpected_msg can hit this limit for certain msg_size & msg_count combo. This patch skips such combos that can hit the limit to avoid errors. 
Signed-off-by: Shi Jin --- fabtests/pytest/efa/test_unexpected_msg.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fabtests/pytest/efa/test_unexpected_msg.py b/fabtests/pytest/efa/test_unexpected_msg.py index f183a0a7566..dc1f93e3c3c 100644 --- a/fabtests/pytest/efa/test_unexpected_msg.py +++ b/fabtests/pytest/efa/test_unexpected_msg.py @@ -20,6 +20,12 @@ def test_unexpected_msg(cmdline_args, msg_size, msg_count, memory_type, completi neuron_maximal_buffer_size = 2**32 if "neuron" in memory_type and allocated_memory >= neuron_maximal_buffer_size: pytest.skip("Cannot hit neuron allocation limit") + + # The EFA limit for single MR that enables remote write is 1M pages aka 4GB for regular pages + maximal_mr_size = 2**32 + if allocated_memory >= maximal_mr_size: + pytest.skip("Cannot hit EFA MR limit") + efa_run_client_server_test(cmdline_args, f"fi_unexpected_msg -e rdm -M {msg_count}", iteration_type="short", completion_semantic=completion_semantic, memory_type=memory_type, message_size=msg_size, completion_type="queue", timeout=1800) From e3c12ec34559149d6697c76d39c8c8b887c3b7a3 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 23 Sep 2024 00:30:20 +0000 Subject: [PATCH 056/393] prov/efa/test: Disable shm via fi_setopt Currently, all unit tests disable shm transfer by closing the shm ep directly, which is a hacky shortcut. This hack can cause trouble when more procedure is needed for shm closure. This patch fixes this issue by disabling shm via the public fi_setopt API with FI_OPT_SHARED_MEMORY_PERMITTED as false. A common utility function efa_unit_test_resource_construct_rdm_shm_disabled() is introduced and all test functions that need to disable shm should use this common function instead. 
Signed-off-by: Shi Jin --- prov/efa/test/efa_unit_test_common.c | 33 +++++++++++++++ prov/efa/test/efa_unit_test_cq.c | 9 +--- prov/efa/test/efa_unit_test_ep.c | 62 +++++++--------------------- prov/efa/test/efa_unit_test_rnr.c | 10 +---- prov/efa/test/efa_unit_tests.h | 2 + 5 files changed, 55 insertions(+), 61 deletions(-) diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index 930a686aa5c..772bd0608c9 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -165,6 +165,39 @@ void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(struct efa_resour fail(); } +/** + * @brief Construct RDM ep type resources with shm disabled + */ +void efa_unit_test_resource_construct_rdm_shm_disabled(struct efa_resource *resource) +{ + int ret; + bool shm_permitted = false; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + if (!resource->hints) + goto err; + + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), + resource->hints, false, true); + + ret = fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, + FI_OPT_SHARED_MEMORY_PERMITTED, &shm_permitted, + sizeof(shm_permitted)); + if (ret) + goto err; + + ret = fi_enable(resource->ep); + if (ret) + goto err; + + return; +err: + efa_unit_test_resource_destruct(resource); + + /* Fail test early if the resource struct fails to initialize */ + fail(); +} + /** * @brief Clean up test resources. * Note: Resources should be destroyed in order. 
diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 974a1f1ad8e..7c521f8dfec 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -102,7 +102,8 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, struct efa_rdm_peer *peer; struct efa_rdm_cq *efa_rdm_cq; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -111,12 +112,6 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); ibv_cqx = efa_rdm_cq->ibv_cq.ibv_cq_ex; - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 1a54092c040..75f100a80f0 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -109,7 +109,6 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_pke *pkt_entry; uint64_t actual_peer_host_id = UINT64_MAX; - int ret; struct efa_rdm_cq *efa_rdm_cq; g_efa_unit_test_mocks.local_host_id = local_host_id; @@ -117,19 +116,13 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin assert_false(actual_peer_host_id == g_efa_unit_test_mocks.peer_host_id); - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + 
efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); efa_rdm_ep->host_id = g_efa_unit_test_mocks.local_host_id; - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(ret, 0); - efa_rdm_ep->shm_ep = NULL; - } - /* Create and register a fake peer */ assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); @@ -411,7 +404,8 @@ void test_efa_rdm_ep_dc_atomic_queue_before_handshake(struct efa_resource **stat int buf[1] = {0}, err, numaddr; struct efa_rdm_ope *txe; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -434,12 +428,7 @@ void test_efa_rdm_ep_dc_atomic_queue_before_handshake(struct efa_resource **stat msg.op = FI_SUM; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC @@ -480,7 +469,8 @@ void test_efa_rdm_ep_dc_send_queue_before_handshake(struct efa_resource **state) int err, numaddr; struct efa_rdm_ope *txe; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ 
err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -498,12 +488,7 @@ void test_efa_rdm_ep_dc_send_queue_before_handshake(struct efa_resource **state) msg.desc = NULL; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC @@ -545,7 +530,8 @@ void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(struct efa_resource ** int err, numaddr; int i; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -563,12 +549,7 @@ void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(struct efa_resource ** msg.desc = NULL; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC @@ -867,20 +848,13 @@ void test_efa_rdm_ep_setopt_shared_memory_permitted(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_rdm_ep *ep; - bool optval = false; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + /* disable shm to 
force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, - FI_OPT_SHARED_MEMORY_PERMITTED, &optval, - sizeof(optval)), - 0); - - assert_int_equal(fi_enable(resource->ep), 0); - assert_null(ep->shm_ep); } @@ -1153,7 +1127,8 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res struct efa_rdm_pke **pkt_entry_vec; int i; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -1164,12 +1139,7 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res assert_int_equal(numaddr, 1); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - err = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(err, 0); - efa_rdm_ep->shm_ep = NULL; - } + /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) * handshake has not been received, so we do not know whether the peer support DC diff --git a/prov/efa/test/efa_unit_test_rnr.c b/prov/efa/test/efa_unit_test_rnr.c index 411cc030dd2..bca4dd627b8 100644 --- a/prov/efa/test/efa_unit_test_rnr.c +++ b/prov/efa/test/efa_unit_test_rnr.c @@ -21,7 +21,8 @@ void test_efa_rnr_queue_and_resend(struct efa_resource **state) fi_addr_t peer_addr; int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + /* disable shm to force using efa device to send */ + efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size 
*/); /* Create and register a fake peer */ ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -40,13 +41,6 @@ void test_efa_rnr_queue_and_resend(struct efa_resource **state) efa_rdm_ep->base_ep.qp->ibv_qp_ex->wr_complete = &efa_mock_ibv_wr_complete_no_op; assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - /* close shm_ep to force efa_rdm_ep to use efa device to send */ - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - assert_int_equal(ret, 0); - efa_rdm_ep->shm_ep = NULL; - } - ret = fi_send(resource->ep, send_buff.buff, send_buff.size, fi_mr_desc(send_buff.mr), peer_addr, NULL /* context */); assert_int_equal(ret, 0); assert_false(dlist_empty(&efa_rdm_ep->txe_list)); diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 0182f135569..a6c0773ad38 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -45,6 +45,8 @@ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, uint32_t fi_version, struct fi_info *hints, bool enable_ep, bool open_cq); +void efa_unit_test_resource_construct_rdm_shm_disabled(struct efa_resource *resource); + void efa_unit_test_resource_destruct(struct efa_resource *resource); void efa_unit_test_construct_msg(struct fi_msg *msg, struct iovec *iov, From 83f1dd4bf5674fac78cb68ab9d79e71cdae955bb Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 23 Sep 2024 18:06:58 +0000 Subject: [PATCH 057/393] prov/efa: Add dependency header file in fi_ext_efa.h fi_ext_efa.h uses fid_mr from fi_domain.h. Import the dependency header for self-containing. 
Signed-off-by: Shi Jin --- prov/efa/src/fi_ext_efa.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prov/efa/src/fi_ext_efa.h b/prov/efa/src/fi_ext_efa.h index 9d3c41575a4..a4d3465e455 100644 --- a/prov/efa/src/fi_ext_efa.h +++ b/prov/efa/src/fi_ext_efa.h @@ -4,6 +4,8 @@ #ifndef _FI_EXT_EFA_H_ #define _FI_EXT_EFA_H_ +#include <rdma/fi_domain.h> + #define FI_EFA_DOMAIN_OPS "efa domain ops" struct fi_efa_mr_attr { From 00714154e4e96511595eececb54e08bd9a1e437c Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 18 Sep 2024 16:02:15 -0700 Subject: [PATCH 058/393] fabtests/prov/lpp: fix compile warnings Fix compiler warnings about incorrect placement of const and inline identifiers Signed-off-by: Alexia Ingerson --- fabtests/prov/lpp/src/atomic.c | 2 +- fabtests/prov/lpp/src/main.c | 4 ++-- fabtests/prov/lpp/src/msg.c | 2 +- fabtests/prov/lpp/src/rcq_data.c | 2 +- fabtests/prov/lpp/src/rma.c | 2 +- fabtests/prov/lpp/src/test_util.h | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fabtests/prov/lpp/src/atomic.c b/fabtests/prov/lpp/src/atomic.c index 0603d045ab2..ba8b2966402 100644 --- a/fabtests/prov/lpp/src/atomic.c +++ b/fabtests/prov/lpp/src/atomic.c @@ -32,7 +32,7 @@ #include "test_util.h" -const static uint64_t context = 0xabce; +static const uint64_t context = 0xabce; int run_simple_atomic_write(struct rank_info *ri) { diff --git a/fabtests/prov/lpp/src/main.c b/fabtests/prov/lpp/src/main.c index 22971ad716a..f27cce46234 100644 --- a/fabtests/prov/lpp/src/main.c +++ b/fabtests/prov/lpp/src/main.c @@ -78,7 +78,7 @@ enum node_id my_node; // Note: the two large RMA tests are intentionally far apart to reduce the // chances they run simultaneously. On configs with small IOVAs spaces, this // can be a problem. This only matters when running with -p > 1, of course.
-const static struct test testlist[] = { +static const struct test testlist[] = { { run_simple_rma_write, "simple_rma_write" }, { run_offset_rma_write, "offset_rma_write" }, { run_inject_rma_write, "inject_rma_write" }, @@ -273,7 +273,7 @@ static void *worker_thread(void *arg) return (void*)1; } -static void inline populate_filtered_testlist(const struct test* tlist, +static inline void populate_filtered_testlist(const struct test* tlist, size_t num_tests) { for (int i = 0; i < num_tests; i++) { diff --git a/fabtests/prov/lpp/src/msg.c b/fabtests/prov/lpp/src/msg.c index f1e01b79ae2..a9fadead52d 100644 --- a/fabtests/prov/lpp/src/msg.c +++ b/fabtests/prov/lpp/src/msg.c @@ -34,7 +34,7 @@ #include "test_util.h" -const static uint64_t context = 0xabcd; +static const uint64_t context = 0xabcd; int run_simple_msg(struct rank_info *ri) { diff --git a/fabtests/prov/lpp/src/rcq_data.c b/fabtests/prov/lpp/src/rcq_data.c index 9a6c14c1472..3aeb20df3f6 100644 --- a/fabtests/prov/lpp/src/rcq_data.c +++ b/fabtests/prov/lpp/src/rcq_data.c @@ -1,6 +1,6 @@ #include "test_util.h" -const static uint64_t context = 0xabcd; +static const uint64_t context = 0xabcd; int run_fi_tsenddata(struct rank_info *ri){ struct wait_tx_cq_params wait_tx_cq_params = { 0 }; diff --git a/fabtests/prov/lpp/src/rma.c b/fabtests/prov/lpp/src/rma.c index 3ef52f54998..e7332155fbf 100644 --- a/fabtests/prov/lpp/src/rma.c +++ b/fabtests/prov/lpp/src/rma.c @@ -32,7 +32,7 @@ #include "test_util.h" -const static uint64_t context = 0xabcd; +static const uint64_t context = 0xabcd; static int simple_rma_write_common(struct rank_info *ri, size_t buffer_len) { diff --git a/fabtests/prov/lpp/src/test_util.h b/fabtests/prov/lpp/src/test_util.h index 7946acc8596..6f4cb4143d0 100644 --- a/fabtests/prov/lpp/src/test_util.h +++ b/fabtests/prov/lpp/src/test_util.h @@ -167,8 +167,8 @@ static inline struct fi_context *get_ctx_simple(struct rank_info *ri, } void free_ctx_tree(struct rank_info *ri); -const static unsigned int 
seed_node_a = 1234; -const static unsigned int seed_node_b = 9876; +static const unsigned int seed_node_a = 1234; +static const unsigned int seed_node_b = 9876; #ifdef USE_HMEM void hmem_init(void); From 18b3c0d8d838c6cff801801ffe07fe578faa989f Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Wed, 18 Sep 2024 15:07:26 +0200 Subject: [PATCH 059/393] fabtests/prov/lpp: update version and protocol in fi_getinfo Update version in fi_getinfo and protocol in ep attribs. Signed-off-by: Tadeusz Struk --- fabtests/prov/lpp/src/test_util.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fabtests/prov/lpp/src/test_util.c b/fabtests/prov/lpp/src/test_util.c index af54db7327f..7757899bb82 100644 --- a/fabtests/prov/lpp/src/test_util.c +++ b/fabtests/prov/lpp/src/test_util.c @@ -124,6 +124,7 @@ void util_init(struct rank_info *ri) hints.domain_attr = &domain_attr; hints.ep_attr->type = FI_EP_RDM; + hints.ep_attr->protocol = FI_PROTO_LPP; // TODO: Run some tests with more surgical application of caps (e.g., // only FI_MSG and FI_SEND for the sending side endpoint). 
hints.caps = FI_ATOMIC | FI_RMA | FI_MSG | FI_TAGGED | FI_READ | @@ -133,13 +134,14 @@ void util_init(struct rank_info *ri) hints.fabric_attr->prov_name = "lpp"; hints.domain_attr->mr_mode = FI_MR_BASIC; - rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &ri->fi); + rc = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, 0, &hints, &ri->fi); if (rc == -ENODATA) { warn("Failed to find provider with FI_HMEM, trying again without\n"); hints.caps &= ~FI_HMEM; INSIST_FI_EQ(ri, - fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, - &hints, &ri->fi), + fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, 0, &hints, &ri->fi), 0); } From 0a125217f5967e2d16fd0f2b4abc4f71d0a6e193 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:51:47 +0000 Subject: [PATCH 060/393] build(deps): bump github/codeql-action from 3.26.7 to 3.26.8 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.7 to 3.26.8. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/8214744c546c1e5c8f03dde8fab3a7353211988d...294a9d92911152fe08befb9ec03e240add280cb3) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index f6c384a539b..89ca6038efc 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL - uses: github/codeql-action/init@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 + uses: github/codeql-action/init@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 + uses: github/codeql-action/autobuild@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 + uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index e1a251618a0..29820d5cdd3 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 + uses: github/codeql-action/upload-sarif@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: sarif_file: results.sarif From b359d1b6db366a2da04b6e04abde06069fee23a6 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 18 Sep 2024 08:30:56 -0500 Subject: [PATCH 061/393] fabtests: Move pingpong logic into pre-posted func Move the current pingpong logic into a pingpong pre-posted RX function. This better describes the behavior of this pingpong test. In addition, this change will allow for a pingpong without pre-posted RX buffers to be defined. Signed-off-by: Ian Ziemba --- fabtests/benchmarks/benchmark_shared.c | 62 ++++++++++++++++---------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index 935b7961cb8..32dc4304c11 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -86,27 +86,10 @@ void ft_benchmark_usage(void) "# of iterations > window size"); } -int pingpong(void) +/* Pingpong latency test with pre-posted receive buffers. 
*/ +static int pingpong_pre_posted_rx(size_t inject_size) { int ret, i; - size_t inject_size = fi->tx_attr->inject_size; - - ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, - &inject_size, &(size_t){sizeof inject_size}); - if (ret && ret != -FI_ENOPROTOOPT) { - FT_PRINTERR("fi_getopt(FI_OPT_INJECT_MSG_SIZE)", ret); - return ret; - } - - if (inject_size_set) - inject_size = opts.inject_size; - - if (opts.options & FT_OPT_ENABLE_HMEM) - inject_size = 0; - - ret = ft_sync(); - if (ret) - return ret; if (opts.dst_addr) { for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { @@ -114,9 +97,11 @@ int pingpong(void) ft_start(); if (opts.transfer_size <= inject_size) - ret = ft_inject(ep, remote_fi_addr, opts.transfer_size); + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); else - ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx); + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); if (ret) return ret; @@ -134,15 +119,46 @@ int pingpong(void) return ret; if (opts.transfer_size <= inject_size) - ret = ft_inject(ep, remote_fi_addr, opts.transfer_size); + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); else - ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx); + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); if (ret) return ret; } } ft_stop(); + return FI_SUCCESS; +} + +int pingpong(void) +{ + int ret; + size_t inject_size = fi->tx_attr->inject_size; + + ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, + &inject_size, &(size_t){sizeof inject_size}); + if (ret && ret != -FI_ENOPROTOOPT) { + FT_PRINTERR("fi_getopt(FI_OPT_INJECT_MSG_SIZE)", ret); + return ret; + } + + if (inject_size_set) + inject_size = opts.inject_size; + + if (opts.options & FT_OPT_ENABLE_HMEM) + inject_size = 0; + + ret = ft_sync(); + if (ret) + return ret; + + ret = pingpong_pre_posted_rx(inject_size); + if (ret) + return ret; + if (opts.machr) show_perf_mr(opts.transfer_size, 
opts.iterations, &start, &end, 2, opts.argc, opts.argv); From 8e8a920bf54196f24230c22bcbe76182239d5b25 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 18 Sep 2024 09:27:00 -0500 Subject: [PATCH 062/393] fabtests: Define common run pingpong function The run() logic in rdm_pingpong and rdm_tagged_pingpong is the same. Consolidate this logic into a single run_pingpong() function. Signed-off-by: Ian Ziemba --- fabtests/benchmarks/benchmark_shared.c | 28 ++++++++++++++++++++ fabtests/benchmarks/benchmark_shared.h | 1 + fabtests/benchmarks/rdm_pingpong.c | 30 +-------------------- fabtests/benchmarks/rdm_tagged_pingpong.c | 32 +---------------------- 4 files changed, 31 insertions(+), 60 deletions(-) diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index 32dc4304c11..32b0fdc926c 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -168,6 +168,34 @@ int pingpong(void) return 0; } +int run_pingpong(void) +{ + int i, ret = 0; + + ret = ft_init_fabric(); + if (ret) + return ret; + + if (!(opts.options & FT_OPT_SIZE)) { + for (i = 0; i < TEST_CNT; i++) { + if (!ft_use_size(i, opts.sizes_enabled)) + continue; + opts.transfer_size = test_size[i].size; + init_test(&opts, test_name, sizeof(test_name)); + ret = pingpong(); + if (ret) + return ret; + } + } else { + init_test(&opts, test_name, sizeof(test_name)); + ret = pingpong(); + if (ret) + return ret; + } + + return ft_finalize(); +} + int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) { int ret, i; diff --git a/fabtests/benchmarks/benchmark_shared.h b/fabtests/benchmarks/benchmark_shared.h index 1dcc7352fea..57f0facb087 100644 --- a/fabtests/benchmarks/benchmark_shared.h +++ b/fabtests/benchmarks/benchmark_shared.h @@ -46,6 +46,7 @@ extern "C" { void ft_parse_benchmark_opts(int op, char *optarg); void ft_benchmark_usage(void); int pingpong(void); +int run_pingpong(void); int bandwidth(void); int 
pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote); int bandwidth_rma(enum ft_rma_opcodes op, struct fi_rma_iov *remote); diff --git a/fabtests/benchmarks/rdm_pingpong.c b/fabtests/benchmarks/rdm_pingpong.c index 9eb5a0e0de5..f5c5871e22d 100644 --- a/fabtests/benchmarks/rdm_pingpong.c +++ b/fabtests/benchmarks/rdm_pingpong.c @@ -36,34 +36,6 @@ #include "shared.h" #include "benchmark_shared.h" -static int run(void) -{ - int i, ret = 0; - - ret = ft_init_fabric(); - if (ret) - return ret; - - if (!(opts.options & FT_OPT_SIZE)) { - for (i = 0; i < TEST_CNT; i++) { - if (!ft_use_size(i, opts.sizes_enabled)) - continue; - opts.transfer_size = test_size[i].size; - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - return ret; - } - } else { - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - return ret; - } - - return ft_finalize(); -} - int main(int argc, char **argv) { int op, ret; @@ -106,7 +78,7 @@ int main(int argc, char **argv) hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; - ret = run(); + ret = run_pingpong(); ft_free_res(); return ft_exit_code(ret); diff --git a/fabtests/benchmarks/rdm_tagged_pingpong.c b/fabtests/benchmarks/rdm_tagged_pingpong.c index a0288ad7ee1..36a11152eb8 100644 --- a/fabtests/benchmarks/rdm_tagged_pingpong.c +++ b/fabtests/benchmarks/rdm_tagged_pingpong.c @@ -36,36 +36,6 @@ #include #include "benchmark_shared.h" -static int run(void) -{ - int i, ret = 0; - - ret = ft_init_fabric(); - if (ret) - return ret; - - if (!(opts.options & FT_OPT_SIZE)) { - for (i = 0; i < TEST_CNT; i++) { - if (!ft_use_size(i, opts.sizes_enabled)) - continue; - opts.transfer_size = test_size[i].size; - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - goto out; - } - } else { - init_test(&opts, test_name, sizeof(test_name)); - ret = pingpong(); - if (ret) - goto out; - } - - ft_finalize(); -out: - return ret; -} - int 
main(int argc, char **argv) { int op, ret; @@ -108,7 +78,7 @@ int main(int argc, char **argv) hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; - ret = run(); + ret = run_pingpong(); ft_free_res(); return ft_exit_code(ret); From 77b3bddd0325e3b6e2babaf69173a15225fd9277 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Thu, 19 Sep 2024 11:54:39 -0500 Subject: [PATCH 063/393] fabtests: Split out ft_sync logic Split out the ft_sync logic into two separate functions: inband sync (ft_sync_inband) and out-of-band sync (ft_sync_oob). The inband sync supports the option to conditionally repost buffers after the sync. The breaking out of the sync logic is needed to support fi_rdm_pingpong/fi_rdm_tagged_pingpong with a no pre-posted RX buffer option. Signed-off-by: Ian Ziemba --- fabtests/common/shared.c | 87 +++++++++++++++++++++++++-------------- fabtests/include/shared.h | 2 + 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 11b7a638fa2..2d4387ba4df 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -3012,49 +3012,74 @@ void eq_readerr(struct fid_eq *eq, const char *eq_str) } } -int ft_sync() +int ft_sync_oob(void) { char buf = 'a'; int ret; if (opts.dst_addr) { - if (!(opts.options & FT_OPT_OOB_SYNC)) { - ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, - FI_DELIVERY_COMPLETE); - if (ret) - return ret; + ret = ft_sock_send(oob_sock, &buf, 1); + if (ret) + return ret; - ret = ft_rx(ep, 1); - } else { - ret = ft_sock_send(oob_sock, &buf, 1); - if (ret) - return ret; + ret = ft_sock_recv(oob_sock, &buf, 1); + if (ret) + return ret; + } else { + ret = ft_sock_recv(oob_sock, &buf, 1); + if (ret) + return ret; - ret = ft_sock_recv(oob_sock, &buf, 1); - if (ret) - return ret; - } + ret = ft_sock_send(oob_sock, &buf, 1); + if (ret) + return ret; + } + + return FI_SUCCESS; +} + +int ft_sync_inband(bool repost_rx) +{ + int ret; + + if 
(opts.dst_addr) { + ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, + FI_DELIVERY_COMPLETE); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; } else { - if (!(opts.options & FT_OPT_OOB_SYNC)) { - ret = ft_rx(ep, 1); - if (ret) - return ret; + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; - ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, - FI_DELIVERY_COMPLETE); - if (ret) - return ret; - } else { - ret = ft_sock_recv(oob_sock, &buf, 1); - if (ret) - return ret; + ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, + FI_DELIVERY_COMPLETE); + if (ret) + return ret; + } - ret = ft_sock_send(oob_sock, &buf, 1); - if (ret) - return ret; - } + if (repost_rx) { + ret = ft_post_rx(ep, rx_size, &rx_ctx); + if (ret) + return ret; } + return FI_SUCCESS; +} + +int ft_sync() +{ + int ret; + + if (ft_check_opts(FT_OPT_OOB_SYNC)) + ret = ft_sync_oob(); + else + ret = ft_sync_inband(true); + return ret; } diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 4bed8cb572b..80fd5538fe4 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -560,6 +560,8 @@ void *ft_get_aligned_addr(void *ptr, size_t alignment) int ft_read_cq(struct fid_cq *cq, uint64_t *cur, uint64_t total, int timeout, uint64_t tag); +int ft_sync_oob(void); +int ft_sync_inband(bool repost_rx); int ft_sync(void); int ft_sync_pair(int status); int ft_fork_and_pair(void); From 256d55ef67637bd354da17797d85b692037afcd6 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 18 Sep 2024 09:09:28 -0500 Subject: [PATCH 064/393] fabtests: Support no prepost RX pingpong test The new pingpong test allows for TX operations to be posted and processed, if necessary, before post the RX buffer. This better aligns to how OSU latency works. By doing this, fi_rdm_tagged_latency is now lower than OSU latency which makes sense since less SW is involved. The no prepost RX pingpong test can be enabled by using the -r option. 
Signed-off-by: Ian Ziemba --- fabtests/benchmarks/benchmark_shared.c | 102 +++++++++++++++++++++++-- fabtests/benchmarks/benchmark_shared.h | 2 +- fabtests/common/shared.c | 8 ++ fabtests/include/shared.h | 1 + 4 files changed, 106 insertions(+), 7 deletions(-) diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index 32b0fdc926c..6c863bbcf3a 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -70,6 +70,9 @@ void ft_parse_benchmark_opts(int op, char *optarg) case 'W': opts.window_size = atoi(optarg); break; + case 'r': + opts.options |= FT_OPT_NO_PRE_POSTED_RX; + break; default: break; } @@ -84,6 +87,10 @@ void ft_benchmark_usage(void) "* The following condition is required to have at least " "one window\nsize # of messsages to be sent: " "# of iterations > window size"); + FT_PRINT_OPTS_USAGE("-r", "Do not pre post RX buffers"); + FT_PRINT_OPTS_USAGE("", "Only the following tests support this option for now:"); + FT_PRINT_OPTS_USAGE("", "\tfi_rdm_tagged_pingpong"); + FT_PRINT_OPTS_USAGE("", "\tfi_rdm_pingpong"); } /* Pingpong latency test with pre-posted receive buffers. */ @@ -133,6 +140,68 @@ static int pingpong_pre_posted_rx(size_t inject_size) return FI_SUCCESS; } +/* Pingpong latency test without pre-posted receive buffers. 
*/ +static int pingpong_no_pre_posted_rx(size_t inject_size) +{ + int ret, i; + + if (opts.dst_addr) { + for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { + if (i == opts.warmup_iterations) + ft_start(); + + if (opts.transfer_size <= inject_size) + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); + else + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); + if (ret) + return ret; + + ret = ft_post_rx(ep, opts.transfer_size, &rx_ctx); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; + } + } else { + for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { + if (i == opts.warmup_iterations) + ft_start(); + + ret = ft_post_rx(ep, opts.transfer_size, &rx_ctx); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; + + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + ret = ft_check_buf((char *) rx_buf + ft_rx_prefix_size(), + opts.transfer_size); + if (ret) + return ret; + } + + if (opts.transfer_size <= inject_size) + ret = ft_inject(ep, remote_fi_addr, + opts.transfer_size); + else + ret = ft_tx(ep, remote_fi_addr, + opts.transfer_size, &tx_ctx); + if (ret) + return ret; + } + } + ft_stop(); + + return FI_SUCCESS; +} + int pingpong(void) { int ret; @@ -151,13 +220,34 @@ int pingpong(void) if (opts.options & FT_OPT_ENABLE_HMEM) inject_size = 0; - ret = ft_sync(); - if (ret) - return ret; + if (ft_check_opts(FT_OPT_NO_PRE_POSTED_RX)) { + if (ft_check_opts(FT_OPT_OOB_SYNC)) { + ret = ft_sync_oob(); + if (ret) + return ret; + } else { + /* Repost RX buffers to support inband sync. 
*/ + ret = ft_post_rx(ep, rx_size, &rx_ctx); + if (ret) + return ret; - ret = pingpong_pre_posted_rx(inject_size); - if (ret) - return ret; + ret = ft_sync_inband(false); + if (ret) + return ret; + } + + ret = pingpong_no_pre_posted_rx(inject_size); + if (ret) + return ret; + } else { + ret = ft_sync(); + if (ret) + return ret; + + ret = pingpong_pre_posted_rx(inject_size); + if (ret) + return ret; + } if (opts.machr) show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2, diff --git a/fabtests/benchmarks/benchmark_shared.h b/fabtests/benchmarks/benchmark_shared.h index 57f0facb087..fbaf3eb3075 100644 --- a/fabtests/benchmarks/benchmark_shared.h +++ b/fabtests/benchmarks/benchmark_shared.h @@ -40,7 +40,7 @@ extern "C" { #include -#define BENCHMARK_OPTS "vkj:W:" +#define BENCHMARK_OPTS "rvkj:W:" #define FT_BENCHMARK_MAX_MSG_SIZE (test_size[TEST_CNT - 1].size) void ft_parse_benchmark_opts(int op, char *optarg); diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 2d4387ba4df..eb95127b6f0 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -1340,6 +1340,14 @@ int ft_init_fabric(void) if (ft_check_opts(FT_OPT_FORK_CHILD)) ft_fork_child(); + if (ft_check_opts(FT_OPT_NO_PRE_POSTED_RX) && + !ft_check_opts(FT_OPT_SKIP_MSG_ALLOC) && + (fi->caps & (FI_MSG | FI_TAGGED))) { + ret = ft_sync_inband(false); + if (ret) + return ret; + } + return 0; } diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 80fd5538fe4..7d56fdd7257 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -145,6 +145,7 @@ enum { FT_OPT_DISABLE_TAG_VALIDATION = 1 << 25, FT_OPT_ADDR_IS_OOB = 1 << 26, FT_OPT_REG_DMABUF_MR = 1 << 27, + FT_OPT_NO_PRE_POSTED_RX = 1 << 28, FT_OPT_OOB_CTRL = FT_OPT_OOB_SYNC | FT_OPT_OOB_ADDR_EXCH, }; From 16a4445aa4a50fa3a2bae9949961626b0408feb3 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 19 Sep 2024 13:26:31 +0200 Subject: [PATCH 065/393] fabtests/prov/lpp: remove invalid 
condition in fi_tsenddata Remove invalid condition in parameter check in fi_tsenddata test, and rework the code to use a #define number of buffers. Signed-off-by: Tadeusz Struk --- fabtests/prov/lpp/src/rcq_data.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fabtests/prov/lpp/src/rcq_data.c b/fabtests/prov/lpp/src/rcq_data.c index 3aeb20df3f6..a3ca76f8d03 100644 --- a/fabtests/prov/lpp/src/rcq_data.c +++ b/fabtests/prov/lpp/src/rcq_data.c @@ -1,6 +1,7 @@ #include "test_util.h" static const uint64_t context = 0xabcd; +#define BUF_NUM 4 int run_fi_tsenddata(struct rank_info *ri){ struct wait_tx_cq_params wait_tx_cq_params = { 0 }; @@ -8,23 +9,16 @@ int run_fi_tsenddata(struct rank_info *ri){ struct verify_buf_params verify_buf_params = { 0 }; struct mr_params mr_params = { 0 }; struct ep_params ep_params = { 0 }; - const size_t buff_lens[] = { (1<<15), (1<<14), 1024, 64 }; + const size_t buff_lens[BUF_NUM] = { (1<<15), (1<<14), 1024, 64 }; + const uint64_t tags[BUF_NUM] = {0xffff0001, 0xffff0002, 0xffff0003, 0xffff0004}; + uint64_t rcq_data[BUF_NUM] = { 0x1000, 0x2000, 0x3000, 0x4000}; struct rank_info *pri = NULL; - const uint64_t tags[] = {0xffff0001, 0xffff0002, 0xffff0003, 0xffff0004}; - uint64_t rcq_data[] = { 0x1000, 0x2000, 0x3000, 0x4000}; - size_t ndata = sizeof(rcq_data)/sizeof(*rcq_data); - size_t nbufflens = sizeof(buff_lens)/sizeof(*buff_lens); - - if (ndata == nbufflens) - return -EINVAL; - - for(int i = 0; iiteration; - } TRACE(ri, util_init(ri)); - for(int i = 0; i < ndata; i++){ + for (int i = 0; i < BUF_NUM; i++) { mr_params.idx = i; mr_params.length = buff_lens[i]; mr_params.access = FI_SEND | FI_RECV; @@ -34,10 +28,9 @@ int run_fi_tsenddata(struct rank_info *ri){ ep_params.idx = 0; TRACE(ri, util_create_ep(ri, &ep_params)); - TRACE(ri, util_sync(ri, &pri)); - for(int i= 0; iep_info[0].fid, ri->mr_info[i].uaddr, From ebdb7c7ed88734befcde17dd7e28e11b6ce211ba Mon Sep 17 00:00:00 2001 From: Shi Jin Date: 
Wed, 25 Sep 2024 22:19:41 +0000 Subject: [PATCH 066/393] prov/efa: Adjust log level for setopt/getopt For unimplemented options, we shouldn't need a warning. Application can tell from the FI_ENOPROTOOPT rc and do the fall back accordingly. Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 956c2804f82..6aa0cf96d63 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1808,7 +1808,7 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, efa_rdm_ep->write_in_order_aligned_128_bytes = *(bool *)optval; break; default: - EFA_WARN(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); + EFA_INFO(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; } @@ -1913,7 +1913,7 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, *optlen = sizeof(bool); break; default: - EFA_WARN(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); + EFA_INFO(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; } From f4b7d2841e3f778b145e2ccea99e1cf2dc8f7787 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 24 Sep 2024 22:24:10 +0000 Subject: [PATCH 067/393] fabtests/prov/efa: Add efa_rdma_checker This code checks whether efa device support rdma read or write capability. 
Signed-off-by: Shi Jin --- fabtests/prov/efa/Makefile.include | 9 ++- fabtests/prov/efa/src/efa_rdma_checker.c | 99 ++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 fabtests/prov/efa/src/efa_rdma_checker.c diff --git a/fabtests/prov/efa/Makefile.include b/fabtests/prov/efa/Makefile.include index 895885e9e54..bfb0049d2d3 100644 --- a/fabtests/prov/efa/Makefile.include +++ b/fabtests/prov/efa/Makefile.include @@ -34,7 +34,8 @@ bin_PROGRAMS += prov/efa/src/fi_efa_rnr_read_cq_error \ prov/efa/src/fi_efa_rnr_queue_resend \ prov/efa/src/fi_efa_info_test if HAVE_VERBS_DEVEL -bin_PROGRAMS += prov/efa/src/fi_efa_exhaust_mr_reg_rdm_pingpong +bin_PROGRAMS += prov/efa/src/fi_efa_exhaust_mr_reg_rdm_pingpong \ + prov/efa/src/fi_efa_rdma_checker endif HAVE_VERBS_DEVEL efa_rnr_srcs = \ @@ -65,4 +66,10 @@ prov_efa_src_fi_efa_exhaust_mr_reg_rdm_pingpong_SOURCES = \ $(efa_exhaust_mr_reg_srcs) \ $(benchmarks_srcs) prov_efa_src_fi_efa_exhaust_mr_reg_rdm_pingpong_LDADD = libfabtests.la + +prov_efa_src_fi_efa_rdma_checker_SOURCES = \ + prov/efa/src/efa_rdma_checker.c +prov_efa_src_fi_efa_rdma_checker_LDADD = libfabtests.la +prov_efa_src_fi_efa_rdma_checker_LDFLAGS = -lefa + endif HAVE_VERBS_DEVEL diff --git a/fabtests/prov/efa/src/efa_rdma_checker.c b/fabtests/prov/efa/src/efa_rdma_checker.c new file mode 100644 index 00000000000..b764bbba91b --- /dev/null +++ b/fabtests/prov/efa/src/efa_rdma_checker.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +enum rdma_op { + READ, + WRITE, +}; + +/* + * Check whether rdma read/write is enabled on the instance by querying the rdma device. + * Return 0 if rdma read/write is enabled, otherwise return -1. 
+ */ +int main(int argc, char *argv[]) +{ + struct ibv_device **device_list; + struct ibv_context *ibv_ctx; + struct ibv_device_attr_ex ibv_dev_attr = {0}; + struct efadv_device_attr efadv_attr = {0}; + int dev_cnt; + int err, opt; + enum rdma_op op = READ; + + while ((opt = getopt(argc, argv, "ho:")) != -1) { + switch (opt) { + case 'o': + if (!strcasecmp(optarg, "read")) { + op = READ; + } else if (!strcasecmp(optarg, "write")) { + op = WRITE; + } else { + fprintf(stderr, "Unknown operation '%s. Allowed: read | write'\n", optarg); + return EXIT_FAILURE; + } + break; + case '?': + case 'h': + default: + fprintf(stderr, "Usage:\n"); + FT_PRINT_OPTS_USAGE("fi_efa_rdma_checker -o ", "rdma operation type: read|write"); + return EXIT_FAILURE; + } + } + + device_list = ibv_get_device_list(&dev_cnt); + if (dev_cnt <= 0) { + fprintf(stderr, "No ibv device found!\n"); + return -ENODEV; + } + + ibv_ctx = ibv_open_device(device_list[0]); + if (!ibv_ctx) { + fprintf(stderr, "cannot open device %d\n", 0); + return EXIT_FAILURE; + } + + err = ibv_query_device_ex(ibv_ctx, NULL, &ibv_dev_attr); + if (!err) { + fprintf(stdout, "ibv_dev_attr.device_cap_flags_ex: %lx\n", ibv_dev_attr.device_cap_flags_ex); + } + + err = efadv_query_device(ibv_ctx, (struct efadv_device_attr *)&efadv_attr, sizeof(efadv_attr)); + ibv_close_device(ibv_ctx); + if (err) { + fprintf(stderr, "cannot query device\n"); + goto out; + } + + if (efadv_attr.max_rdma_size == 0) { + fprintf(stderr, "rdma is not enabled \n"); + err = EXIT_FAILURE; + goto out; + } + fprintf(stdout, "rdma read is enabled \n"); + fprintf(stdout, "efa_dev_attr.max_rdma_size: %d\n", efadv_attr.max_rdma_size); + + if (op == READ) + goto out; + + if (efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE) { + fprintf(stdout, "rdma write is enabled \n"); + } else { + fprintf(stderr, "rdma write is NOT enabled \n"); + err = op == WRITE ? 
1 : 0; + } + +out: + ibv_free_device_list(device_list); + return err; +} From 46328d10885183e6eb3daf633473a3e5af736baf Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 24 Sep 2024 22:24:53 +0000 Subject: [PATCH 068/393] prov/efa: Update help message for inter_min_read_write_size The env inter_min_read_write_size only applies to emulated write path. If the efa device supports RDMA write, device RDMA write will always be used. Signed-off-by: Shi Jin --- prov/efa/src/efa_env.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index 484f544ddd6..79a315c7cbe 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -216,7 +216,7 @@ void efa_env_define() fi_param_define(&efa_prov, "inter_max_gdrcopy_message_size", FI_PARAM_INT, "The maximum message size to use gdrcopy. If instance support gdrcopy, messages whose size is smaller than this value will be sent by eager/longcts protocol (Default 32768)."); fi_param_define(&efa_prov, "inter_min_read_write_size", FI_PARAM_INT, - "The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536)."); + "The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536). 
If the efa device supports RDMA write, device RDMA write will always be used."); fi_param_define(&efa_prov, "inter_read_segment_size", FI_PARAM_INT, "Calls to RDMA read is segmented using this value."); fi_param_define(&efa_prov, "fork_safe", FI_PARAM_BOOL, From 0dcaceb2e69b4ed98a5272b323f97142654ecd11 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 24 Sep 2024 22:26:23 +0000 Subject: [PATCH 069/393] fabtests/pytest/efa: Skip inter_min_write_write_size test when rdma write is on FI_EFA_INTER_MIN_READ_WRITE_SIZE is only applied to emulated write protocols. When efa device supports rdma write, rdma write should always be used. Signed-off-by: Shi Jin --- fabtests/pytest/efa/efa_common.py | 24 +++++++++++++++++++ .../pytest/efa/test_efa_protocol_selection.py | 5 +++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/fabtests/pytest/efa/efa_common.py b/fabtests/pytest/efa/efa_common.py index 4f5da4faf02..d5f0eb959c0 100644 --- a/fabtests/pytest/efa/efa_common.py +++ b/fabtests/pytest/efa/efa_common.py @@ -1,3 +1,4 @@ +import os import subprocess import functools from common import SshConnectionError, is_ssh_connection_error, has_ssh_connection_err_msg, ClientServerTest @@ -66,6 +67,29 @@ def has_gdrcopy(hostname): process = subprocess.run(command, shell=True, check=False, stdout=subprocess.PIPE) return process.returncode == 0 +def has_rdma(cmdline_args, operation): + """ + determine whether a host has rdma enabled in efa device + hostname: a host + operation: rdma operation name, allowed values are read and write + return: a boolean + """ + assert operation in ["read", "write"] + binpath = cmdline_args.binpath or "" + cmd = "timeout " + str(cmdline_args.timeout) \ + + " " + os.path.join(binpath, f"fi_efa_rdma_checker -o {operation}") + if cmdline_args.environments: + cmd = cmdline_args.environments + " " + cmd + proc = subprocess.run("ssh {} {}".format(cmdline_args.server_id, cmd), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + 
universal_newlines=True) + if has_ssh_connection_err_msg(proc.stdout): + raise SshConnectionError() + + return proc.returncode == 0 + def efa_retrieve_gid(hostname): """ return the GID of efa device on a host diff --git a/fabtests/pytest/efa/test_efa_protocol_selection.py b/fabtests/pytest/efa/test_efa_protocol_selection.py index 76212febc10..949f2982304 100644 --- a/fabtests/pytest/efa/test_efa_protocol_selection.py +++ b/fabtests/pytest/efa/test_efa_protocol_selection.py @@ -1,6 +1,6 @@ import pytest -from efa.efa_common import has_gdrcopy +from efa.efa_common import has_gdrcopy, has_rdma # TODO Expand this test to run on all memory types (and rename) @@ -17,6 +17,9 @@ def test_transfer_with_read_protocol_cuda(cmdline_args, fabtest_name, cntrl_env_ from common import has_cuda, has_hmem_support from efa.efa_common import efa_run_client_server_test, efa_retrieve_hw_counter_value + if cntrl_env_var == "FI_EFA_INTER_MIN_READ_WRITE_SIZE" and has_rdma(cmdline_args, "write"): + pytest.skip("FI_EFA_INTER_MIN_READ_WRITE_SIZE is only applied to emulated write protocols") + if cmdline_args.server_id == cmdline_args.client_id: pytest.skip("No read for intra-node communication") From 8818b04d0d68c2be67a1a3b75fd42ad075ec5f7c Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 24 Sep 2024 16:27:47 -0700 Subject: [PATCH 070/393] prov/efa: Make NACK protocol fall back to DC longCTS when DC is requested When application requests FI_DELIVERY_COMPLETE, it should fallback to the DC version of LONG CTS RTMs, as the default LongCTS is not DC. 
Signed-off-by: Jessie Yang --- fabtests/pytest/efa/test_rdm.py | 4 ++-- prov/efa/src/rdm/efa_rdm_ope.c | 6 ++++-- prov/efa/src/rdm/efa_rdm_pke_cmd.c | 22 ++++++++++++++++++---- prov/efa/src/rdm/efa_rdm_pke_nonreq.c | 18 +++++++++++++++--- 4 files changed, 39 insertions(+), 11 deletions(-) diff --git a/fabtests/pytest/efa/test_rdm.py b/fabtests/pytest/efa/test_rdm.py index ec1f3044c34..112893c8ce6 100644 --- a/fabtests/pytest/efa/test_rdm.py +++ b/fabtests/pytest/efa/test_rdm.py @@ -16,9 +16,9 @@ def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_ @pytest.mark.functional @pytest.mark.serial -def test_mr_exhaustion_rdm_pingpong(cmdline_args): +def test_mr_exhaustion_rdm_pingpong(cmdline_args, completion_semantic): efa_run_client_server_test(cmdline_args, "fi_efa_exhaust_mr_reg_rdm_pingpong", "short", - "transmit_complete", "host_to_host", "all", timeout=1000) + completion_semantic, "host_to_host", "all", timeout=1000) @pytest.mark.functional def test_rdm_pingpong_range(cmdline_args, completion_semantic, memory_type_bi_dir, message_size): diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 80332ef70b9..d3c2d94b6ad 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1773,6 +1773,8 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, int pkt_type, ssize_t err) { + bool delivery_complete_requested = ope->fi_flags & FI_DELIVERY_COMPLETE; + if (err == -FI_ENOMR) { /* Long read and runting read protocols could fail because of a * lack of memory registrations. In that case, we retry with @@ -1786,7 +1788,7 @@ ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, "protocol because memory registration limit " "was reached on the sender\n"); return efa_rdm_ope_post_send_or_queue( - ope, EFA_RDM_LONGCTS_MSGRTM_PKT); + ope, delivery_complete_requested ? 
EFA_RDM_DC_LONGCTS_MSGRTM_PKT : EFA_RDM_LONGCTS_MSGRTM_PKT); case EFA_RDM_LONGREAD_TAGRTM_PKT: case EFA_RDM_RUNTREAD_TAGRTM_PKT: EFA_INFO(FI_LOG_EP_CTRL, @@ -1794,7 +1796,7 @@ ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, "because memory registration limit was " "reached on the sender\n"); return efa_rdm_ope_post_send_or_queue( - ope, EFA_RDM_LONGCTS_TAGRTM_PKT); + ope, delivery_complete_requested ? EFA_RDM_DC_LONGCTS_TAGRTM_PKT : EFA_RDM_LONGCTS_TAGRTM_PKT); default: return err; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index 97741ebbd27..f095cc1f772 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -112,14 +112,18 @@ int efa_rdm_pke_fill_data(struct efa_rdm_pke *pkt_entry, /* The data_offset will be non-zero when the long CTS RTM packet * is sent to continue a runting read transfer after the * receiver has run out of memory registrations */ - assert((data_offset == 0 || ope->internal_flags & EFA_RDM_OPE_READ_NACK) && data_size == -1); + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_longcts_msgrtm(pkt_entry, ope); break; case EFA_RDM_LONGCTS_TAGRTM_PKT: /* The data_offset will be non-zero when the long CTS RTM packet * is sent to continue a runting read transfer after the * receiver has run out of memory registrations */ - assert((data_offset == 0 || ope->internal_flags & EFA_RDM_OPE_READ_NACK) && data_size == -1); + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_longcts_tagrtm(pkt_entry, ope); break; case EFA_RDM_LONGREAD_MSGRTM_PKT: @@ -187,11 +191,21 @@ int efa_rdm_pke_fill_data(struct efa_rdm_pke *pkt_entry, ret = efa_rdm_pke_init_dc_medium_tagrtm(pkt_entry, ope, data_offset, data_size); break; case EFA_RDM_DC_LONGCTS_MSGRTM_PKT: - assert(data_offset == 0 && data_size == -1); + /* The data_offset will 
be non-zero when the DC long CTS RTM packet + * is sent to continue a runting read transfer after the + * receiver has run out of memory registrations */ + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_dc_longcts_msgrtm(pkt_entry, ope); break; case EFA_RDM_DC_LONGCTS_TAGRTM_PKT: - assert(data_offset == 0 && data_size == -1); + /* The data_offset will be non-zero when the DC long CTS tagged RTM packet + * is sent to continue a runting read transfer after the + * receiver has run out of memory registrations */ + assert(data_offset == 0 || + ope->internal_flags & EFA_RDM_OPE_READ_NACK); + assert(data_size == -1); ret = efa_rdm_pke_init_dc_longcts_tagrtm(pkt_entry, ope); break; case EFA_RDM_DC_EAGER_RTW_PKT: diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index 3c384743c77..b1b7be31460 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -690,6 +690,7 @@ void efa_rdm_pke_handle_read_nack_recv(struct efa_rdm_pke *pkt_entry) { struct efa_rdm_read_nack_hdr *nack_hdr; struct efa_rdm_ope *txe; + bool delivery_complete_requested; efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight -= 1; @@ -700,23 +701,34 @@ void efa_rdm_pke_handle_read_nack_recv(struct efa_rdm_pke *pkt_entry) efa_rdm_pke_release_rx(pkt_entry); txe->internal_flags |= EFA_RDM_OPE_READ_NACK; + delivery_complete_requested = txe->fi_flags & FI_DELIVERY_COMPLETE; + if (txe->op == ofi_op_write) { EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to emulated long CTS write " "protocol because p2p is not available\n"); - efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_RTW_PKT); + efa_rdm_ope_post_send_or_queue( + txe, delivery_complete_requested ? 
+ EFA_RDM_DC_LONGCTS_RTW_PKT : + EFA_RDM_LONGCTS_RTW_PKT); } else if (txe->op == ofi_op_tagged) { EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS tagged " "protocol because memory registration limit " "was reached on the receiver\n"); - efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_TAGRTM_PKT); + efa_rdm_ope_post_send_or_queue( + txe, delivery_complete_requested ? + EFA_RDM_DC_LONGCTS_TAGRTM_PKT : + EFA_RDM_LONGCTS_TAGRTM_PKT); } else { EFA_INFO(FI_LOG_EP_CTRL, "Sender fallback to long CTS untagged " "protocol because memory registration limit " "was reached on the receiver\n"); - efa_rdm_ope_post_send_or_queue(txe, EFA_RDM_LONGCTS_MSGRTM_PKT); + efa_rdm_ope_post_send_or_queue( + txe, delivery_complete_requested ? + EFA_RDM_DC_LONGCTS_MSGRTM_PKT : + EFA_RDM_LONGCTS_MSGRTM_PKT); } } From 5573b3f3f4ebc6d1857c52dd05d03e71352b8fcc Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 24 Sep 2024 11:12:01 -0700 Subject: [PATCH 071/393] prov/efa: differentiate unresponsive receiver errors following rdma-core Add a new vendor error code EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE from rdma core to indicate the remote is unreachable. Add a new EFA provider error code UNESTABLISHED_RECV_UNRESP to distinguish unresponsive receiver error when the peer is reachable by the EFA device but libfabric failed to complete a handshake. Add unit test for EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_errno.h | 12 ++++++++---- prov/efa/src/efa_strerror.c | 16 +++++++++++----- prov/efa/src/rdm/efa_rdm_cq.c | 9 ++++++--- prov/efa/test/efa_unit_test_cq.c | 17 +++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 6 files changed, 44 insertions(+), 12 deletions(-) diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 1a147f0fbdf..4a68fe2488e 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -69,8 +69,9 @@ _(10, REMOTE_ERROR_RNR, Destination resource not ready (no work queue entries posted on receive queue)) \ _(11, REMOTE_ERROR_BAD_LENGTH, Remote scatter-gather list too short) \ _(12, REMOTE_ERROR_BAD_STATUS, Unexpected status returned by responder) \ - _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (detected locally)) \ - _(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) + _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (was previously responsive)) \ + _(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) \ + _(15, LOCAL_ERROR_UNREACH_REMOTE, Unreachable remote (never received a response)) /** * @brief EFA provider proprietary error codes @@ -105,7 +106,8 @@ _(4122, SHM_INTERNAL_ERROR, SHM internal error) \ _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \ - _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) + _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \ + _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) /** @} */ @@ -156,13 +158,15 @@ static inline int to_fi_errno(enum efa_errno err) { case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP: case 
EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS: return FI_EINVAL; - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE: return FI_EHOSTUNREACH; case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH: case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH: return FI_EMSGSIZE; case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: + case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP: return FI_ECONNABORTED; case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN: case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER: diff --git a/prov/efa/src/efa_strerror.c b/prov/efa/src/efa_strerror.c index 35710501d0e..895ebfd83e7 100644 --- a/prov/efa/src/efa_strerror.c +++ b/prov/efa/src/efa_strerror.c @@ -67,10 +67,10 @@ void efa_show_help(enum efa_errno err) { help = "This error is detected remotely; " "typically encountered when the peer process is no longer present"; break; - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE: help = "This error is detected locally. " - "The connection status is unknown or was never established via " - "handshake. This typically indicates one or more misconfigured " + "The peer is not reachable by the EFA device. " + "This typically indicates one or more misconfigured " "EC2 instances; most often due to incorrect inbound/outbound " "security group rules and/or instances placed in different " "subnets. Refer to the public AWS documentation for EFA for " @@ -80,8 +80,14 @@ void efa_show_help(enum efa_errno err) { case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: help = "This error is detected locally. 
" "The connection was previously established via handshake, " - "which indicates the error is likely due to the peer process no " - "longer being present."; + "which indicates the error is likely due to a hardware failure " + "on the remote peer, or the peer process no longer being present."; + break; + case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP: + help = "This error is detected locally. " + "The peer is reachable by the EFA device but libfabric failed " + "to complete a handshake, which indicates the error is likely " + "due to the peer process no longer being present."; break; case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX: help = "This error is detected locally. " diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 17a540c4da7..67a02e55f3d 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -399,7 +399,9 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * * @todo Currently, this only checks for unresponsive receiver * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to - * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other + * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP if a handshake was made, or + * #FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP if the handshake failed. + * This should be expanded to handle other * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate * error reporting */ @@ -418,8 +420,9 @@ static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) { switch (vendor_err) { case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: { - if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) - vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP; + vendor_err = (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) ? 
+ FI_EFA_ERR_ESTABLISHED_RECV_UNRESP : + FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP; break; } default: diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 7c521f8dfec..76d45368e87 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -227,6 +227,23 @@ void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); } +/** + * @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns + * unreachable remote error for send. + * + * When send operation failed, fi_cq_read() should return -FI_EAVAIL, which means error available. + * then user should call fi_cq_readerr() to get an error CQ entry that contain error code. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_rdm_cq_read_bad_send_status_unreachable_receiver(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + test_rdm_cq_read_bad_send_status(resource, + 0x1234567812345678, 0x8765432187654321, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE); +} + /** * @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns * invalid qpn error for send. 
diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 883130d2320..e6b7a324d81 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -118,6 +118,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unreachable_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index a6c0773ad38..2e62473a717 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -133,6 +133,7 @@ void test_ibv_cq_ex_read_failed_poll(); void test_rdm_cq_create_error_handling(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id(); +void test_rdm_cq_read_bad_send_status_unreachable_receiver(); void test_rdm_cq_read_bad_send_status_invalid_qpn(); void test_rdm_cq_read_bad_send_status_message_too_long(); void test_ibv_cq_ex_read_bad_recv_status(); From 3f40b495d4cc809e4275ef15af25310539d3c58a Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Wed, 25 Sep 2024 13:46:08 -0700 
Subject: [PATCH 072/393] actions: Upgrade macOS version This resolves warnings re: unsupported macOS version Signed-off-by: Darryl Abbate --- .github/workflows/pr-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 5d3e3b5f886..cc372c9532c 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -120,7 +120,7 @@ jobs: name: hmem-config.log path: config.log macos: - runs-on: macos-12 + runs-on: macos-13 steps: - name: Install dependencies (Mac OS) run: | @@ -141,5 +141,5 @@ jobs: if: failure() uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: macos-12-config.log + name: macos-config.log path: config.log From 96a535599e07bb04d933d87d3696c1db8fe18292 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Wed, 25 Sep 2024 13:49:33 -0700 Subject: [PATCH 073/393] actions: Suppress warnings if libtool is already installed Signed-off-by: Darryl Abbate --- .github/workflows/pr-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index cc372c9532c..268b8cec795 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -125,7 +125,7 @@ jobs: - name: Install dependencies (Mac OS) run: | brew install automake - brew install libtool + brew install --quiet libtool - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Build Check run: | From 23dadff37818bee16db341479fa6eac329ac2db4 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 27 Sep 2024 13:45:00 -0700 Subject: [PATCH 074/393] prov/efa: Correctly handle fallback longcts-rtw send completion Fallback long cts rtw doesn't have any payload. In this case, this function shouldn't touch the tx entry as it may be released earlier as the CTSDATA pkts have already kicked off and finished the send. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_pke_rtw.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index 2a1b6366d40..5872302136f 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -12,6 +12,7 @@ #include "efa_rdm_rma.h" #include "efa_rdm_ope.h" #include "efa_rdm_pke.h" +#include "efa_rdm_pke_rtw.h" #include "efa_rdm_pke_utils.h" #include "efa_rdm_protocol.h" #include "efa_rdm_pke_req.h" @@ -348,6 +349,18 @@ void efa_rdm_pke_handle_longcts_rtw_send_completion(struct efa_rdm_pke *pkt_entr { struct efa_rdm_ope *txe; + /** + * A zero-payload longcts rtw pkt currently should only happen when it's + * used for the READ NACK protocol. In this case, this pkt doesn't + * contribute to the send completion, and the associated tx entry + * may be released earlier as the CTSDATA pkts have already kicked off + * and finished the send. + */ + if (pkt_entry->payload_size == 0) { + assert(efa_rdm_pke_get_rtw_base_hdr(pkt_entry)->flags & EFA_RDM_REQ_READ_NACK); + return; + } + txe = pkt_entry->ope; txe->bytes_acked += pkt_entry->payload_size; if (txe->total_len == txe->bytes_acked) From b39465ed7fc2350f16de69387d76b56484e572ee Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 27 Sep 2024 13:59:51 -0700 Subject: [PATCH 075/393] prov/efa: remove DC NACK packet from rxe map after recv completed We need to remove the ope for READ NACK packet from rxe map after recv completed for both DC and non-DC. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_ope.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index d3c2d94b6ad..ccd348992fc 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1066,6 +1066,12 @@ void efa_rdm_ope_handle_recv_completed(struct efa_rdm_ope *ope) efa_rdm_rxe_report_completion(rxe); } + if (ope->internal_flags & EFA_RDM_OPE_READ_NACK) { + assert(ope->type == EFA_RDM_RXE); + /* Apply to both DC and non-DC */ + efa_rdm_rxe_map_remove(&ope->ep->rxe_map, ope->msg_id, ope->peer->efa_fiaddr, ope); + } + /* As can be seen, this function does not release rxe when * efa_rdm_ope_post_send_or_queue() was successful. * @@ -1106,9 +1112,6 @@ void efa_rdm_ope_handle_recv_completed(struct efa_rdm_ope *ope) return; } - if (ope->internal_flags & EFA_RDM_OPE_READ_NACK) - efa_rdm_rxe_map_remove(&ope->ep->rxe_map, ope->msg_id, ope->peer->efa_fiaddr, ope); - if (ope->type == EFA_RDM_TXE) { efa_rdm_txe_release(ope); } else { From fc5e85c3dbaf76a3293e4cfb6b415dc40d18e1a9 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 25 Sep 2024 15:41:34 +0000 Subject: [PATCH 076/393] fabtests/configure, efa: call provider specific configure Create efa provider's own configure.m4 and make configure.ac call it. Build efa_rdma_checker conditionally when all symbols are available. Signed-off-by: Shi Jin --- fabtests/configure.ac | 3 +++ fabtests/prov/efa/Makefile.include | 8 ++++++-- fabtests/prov/efa/configure.m4 | 32 ++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 fabtests/prov/efa/configure.m4 diff --git a/fabtests/configure.ac b/fabtests/configure.ac index bfeeda6dda8..ed5e8e767b8 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -80,6 +80,9 @@ AS_IF([test -z "$CFLAGS"], # <3), but it is necessary in AM 1.12.x. 
m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) +dnl Call the provider's CONFIGURE and CONDITIONALS macros +m4_include([prov/efa/configure.m4]) + AM_PROG_LIBTOOL AC_ARG_WITH([valgrind], diff --git a/fabtests/prov/efa/Makefile.include b/fabtests/prov/efa/Makefile.include index bfb0049d2d3..ee08fb6c312 100644 --- a/fabtests/prov/efa/Makefile.include +++ b/fabtests/prov/efa/Makefile.include @@ -34,8 +34,10 @@ bin_PROGRAMS += prov/efa/src/fi_efa_rnr_read_cq_error \ prov/efa/src/fi_efa_rnr_queue_resend \ prov/efa/src/fi_efa_info_test if HAVE_VERBS_DEVEL -bin_PROGRAMS += prov/efa/src/fi_efa_exhaust_mr_reg_rdm_pingpong \ - prov/efa/src/fi_efa_rdma_checker +bin_PROGRAMS += prov/efa/src/fi_efa_exhaust_mr_reg_rdm_pingpong +if BUILD_EFA_RDMA_CHECKER +bin_PROGRAMS += prov/efa/src/fi_efa_rdma_checker +endif BUILD_EFA_RDMA_CHECKER endif HAVE_VERBS_DEVEL efa_rnr_srcs = \ @@ -67,9 +69,11 @@ prov_efa_src_fi_efa_exhaust_mr_reg_rdm_pingpong_SOURCES = \ $(benchmarks_srcs) prov_efa_src_fi_efa_exhaust_mr_reg_rdm_pingpong_LDADD = libfabtests.la +if BUILD_EFA_RDMA_CHECKER prov_efa_src_fi_efa_rdma_checker_SOURCES = \ prov/efa/src/efa_rdma_checker.c prov_efa_src_fi_efa_rdma_checker_LDADD = libfabtests.la prov_efa_src_fi_efa_rdma_checker_LDFLAGS = -lefa +endif BUILD_EFA_RDMA_CHECKER endif HAVE_VERBS_DEVEL diff --git a/fabtests/prov/efa/configure.m4 b/fabtests/prov/efa/configure.m4 new file mode 100644 index 00000000000..bf0f3b624e9 --- /dev/null +++ b/fabtests/prov/efa/configure.m4 @@ -0,0 +1,32 @@ +dnl +dnl SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +dnl SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. +dnl +dnl Configure specific to the fabtests Amazon EFA provider + + +dnl Checks for presence of efadv verbs. Needed for building tests that calls efadv verbs. 
+have_efadv=0 +AC_CHECK_HEADER([infiniband/efadv.h], + [AC_CHECK_LIB(efa, efadv_query_device, + [have_efadv=1])]) + +efa_rdma_checker_happy=0 +AS_IF([test x"$have_efadv" = x"1"], [ + efa_rdma_checker_happy=1 + AC_CHECK_MEMBER(struct efadv_device_attr.max_rdma_size, + [], + [efa_rdma_checker_happy=0], + [[#include ]]) + + AC_CHECK_MEMBER(struct efadv_device_attr.device_caps, + [], + [efa_rdma_checker_happy=0], + [[#include ]]) + + AC_CHECK_DECL(EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE, + [], + [efa_rdma_checker_happy=0], + [[#include ]]) +]) +AM_CONDITIONAL([BUILD_EFA_RDMA_CHECKER], [test $efa_rdma_checker_happy -eq 1]) From cec672e90a5b542bd2239a4009a6d6bfddde6c69 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:21:18 +0000 Subject: [PATCH 077/393] build(deps): bump github/codeql-action from 3.26.8 to 3.26.10 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.8 to 3.26.10. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/294a9d92911152fe08befb9ec03e240add280cb3...e2b3eafc8d227b0241d48be5f425d47c2d750a13) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 89ca6038efc..cbc03660646 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL - uses: github/codeql-action/init@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 + uses: github/codeql-action/init@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 + uses: github/codeql-action/autobuild@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 + uses: github/codeql-action/analyze@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 29820d5cdd3..6b793fabd75 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 + uses: github/codeql-action/upload-sarif@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 with: sarif_file: results.sarif From 1b6f3ef4c187a5d2cd5e9328c52bfbfd5a8335f2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:21:23 +0000 Subject: [PATCH 078/393] build(deps): bump actions/checkout from 4.1.7 to 4.2.0 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.7 to 4.2.0. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/692973e3d937129bcbf40652eb9f2f61becf3332...d632683dd7b4114ad314bca15554477dd762a938) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/clang-format-check-cn.yml | 2 +- .github/workflows/clang-format-check.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/coverity.yml | 2 +- .github/workflows/gh-man.yaml | 2 +- .github/workflows/nroff-elves.yaml | 2 +- .github/workflows/pr-ci.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/clang-format-check-cn.yml b/.github/workflows/clang-format-check-cn.yml index 74c03301ff2..4cfa1bb2a5b 100644 --- a/.github/workflows/clang-format-check-cn.yml +++ b/.github/workflows/clang-format-check-cn.yml @@ -9,7 +9,7 @@ jobs: path: - 'prov/opx' steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Run clang-format style check for C/C++/Protobuf programs (Cornelis Networks-specific). 
uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 with: diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 3e3eb43755d..7afb11d4147 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -11,7 +11,7 @@ jobs: path: - 'prov/sm2' steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Run clang-format style check for C/C++/Protobuf programs. uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 with: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index cbc03660646..30e1e3ee04c 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -48,7 +48,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 73f34bb228a..84e3956734c 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -52,7 +52,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Download Coverity tools run: | wget https://scan.coverity.com/download/linux64 --post-data "token=${{ secrets.COVERITY_SCAN_TOKEN }}&project=ofiwg%2Flibfabric" -O coverity_tool.tgz diff --git a/.github/workflows/gh-man.yaml b/.github/workflows/gh-man.yaml index 5400ec94dd2..4f29c5eabc6 100644 --- a/.github/workflows/gh-man.yaml +++ b/.github/workflows/gh-man.yaml @@ -25,7 +25,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Update the man pages in branch gh-pages run: .github/workflows/gh-man.sh diff --git a/.github/workflows/nroff-elves.yaml b/.github/workflows/nroff-elves.yaml index 669a06bb4dd..e2caa2d9213 100644 --- a/.github/workflows/nroff-elves.yaml +++ b/.github/workflows/nroff-elves.yaml @@ -23,7 +23,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Get the required packages run: sudo apt install -y pandoc diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 268b8cec795..450d2aed66b 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -56,7 +56,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + 
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Build Check run: | set -x @@ -96,7 +96,7 @@ jobs: sudo apt-add-repository 'deb [arch=amd64] https://repositories.intel.com/graphics/ubuntu focal main' sudo apt-get update sudo apt-get install -y level-zero level-zero-dev - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: HMEM Checks run: | set -x @@ -126,7 +126,7 @@ jobs: run: | brew install automake brew install --quiet libtool - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Build Check run: | ./autogen.sh diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 6b793fabd75..7855f755c8c 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -33,7 +33,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 with: persist-credentials: false From 1ef91c8215fe22d44abe2ae5a325521dd3a13593 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 30 Sep 2024 10:14:23 -0700 Subject: [PATCH 079/393] prov/efa: Remove tx_iov_limit and rx_iov_limit from efa_rdm_ep tx_iov_limit is a duplication of info->tx_attr->iov_limit. rx_iov_limit is a duplication of info->rx_attr->iov_limit. Access tx_iov_limit and rx_iov_limit from fi_info to save the memory of efa_rdm_ep struct. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_atomic.c | 2 +- prov/efa/src/rdm/efa_rdm_ep.h | 2 -- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 2 -- prov/efa/src/rdm/efa_rdm_ep_utils.c | 2 +- prov/efa/src/rdm/efa_rdm_msg.c | 4 ++-- prov/efa/src/rdm/efa_rdm_rma.c | 4 ++-- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_atomic.c b/prov/efa/src/rdm/efa_rdm_atomic.c index eb997a77906..961e9f0695e 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.c +++ b/prov/efa/src/rdm/efa_rdm_atomic.c @@ -148,7 +148,7 @@ ssize_t efa_rdm_atomic_generic_efa(struct efa_rdm_ep *efa_rdm_ep, ssize_t err; struct util_srx_ctx *srx_ctx; - assert(msg->iov_count <= efa_rdm_ep->tx_iov_limit); + assert(msg->iov_count <= efa_rdm_ep->base_ep.info->tx_attr->iov_limit); efa_perfset_start(efa_rdm_ep, perf_efa_tx); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 8970c90ba58..75ac34857e1 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -63,8 +63,6 @@ struct efa_rdm_ep { size_t rx_size; size_t tx_size; size_t mtu_size; - size_t rx_iov_limit; - size_t tx_iov_limit; size_t inject_size; /* Endpoint's capability to support zero-copy rx */ diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 6aa0cf96d63..15a20256c87 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -559,8 +559,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->rx_size = info->rx_attr->size; efa_rdm_ep->tx_size = info->tx_attr->size; - efa_rdm_ep->rx_iov_limit = info->rx_attr->iov_limit; - efa_rdm_ep->tx_iov_limit = info->tx_attr->iov_limit; efa_rdm_ep->inject_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; diff --git 
a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index bf98cb08ef3..69812d0f90c 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -211,7 +211,7 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe size_t rx_iov_offset = 0; int err, rx_iov_index = 0; - assert(rxe->iov_count > 0 && rxe->iov_count <= ep->rx_iov_limit); + assert(rxe->iov_count > 0 && rxe->iov_count <= ep->base_ep.info->rx_attr->iov_limit); assert(rxe->iov[0].iov_len >= ep->msg_prefix_size); pkt_entry = efa_rdm_pke_alloc(ep, ep->user_rx_pkt_pool, EFA_RDM_PKE_FROM_USER_RX_POOL); if (OFI_UNLIKELY(!pkt_entry)) { diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 2e95a4a721f..a4c8071f367 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -167,7 +167,7 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee srx_ctx = efa_rdm_ep_get_peer_srx_ctx(ep); - assert(msg->iov_count <= ep->tx_iov_limit); + assert(msg->iov_count <= ep->base_ep.info->tx_attr->iov_limit); efa_perfset_start(ep, perf_efa_tx); ofi_genlock_lock(srx_ctx->lock); @@ -893,7 +893,7 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg struct efa_rdm_ope *rxe; struct util_srx_ctx *srx_ctx; - assert(msg->iov_count <= ep->rx_iov_limit); + assert(msg->iov_count <= ep->base_ep.info->rx_attr->iov_limit); efa_perfset_start(ep, perf_efa_recv); diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 15769f5bc56..6a6562b2849 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -189,7 +189,7 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin if (err) return err; - assert(msg->iov_count <= efa_rdm_ep->tx_iov_limit); + assert(msg->iov_count <= efa_rdm_ep->base_ep.info->tx_attr->iov_limit); efa_perfset_start(efa_rdm_ep, perf_efa_tx); 
ofi_genlock_lock(srx_ctx->lock); @@ -480,7 +480,7 @@ ssize_t efa_rdm_rma_writemsg(struct fid_ep *ep, if (err) return err; - assert(msg->iov_count <= efa_rdm_ep->tx_iov_limit); + assert(msg->iov_count <= efa_rdm_ep->base_ep.info->tx_attr->iov_limit); peer = efa_rdm_ep_get_peer(efa_rdm_ep, msg->addr); assert(peer); From 075c734e0f5db59a9c87364062632a864c57eeba Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 30 Sep 2024 13:04:34 -0700 Subject: [PATCH 080/393] prov/efa: Remove tx_size and rx_size from efa_rdm_ep tx_size is a duplication of info->tx_attr->size. rx_size is a duplication of info->rx_attr->size. Access them via fi_info to save the memory of efa_rdm_ep. Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_ep.h | 11 ++--------- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 10 ++++------ prov/efa/src/rdm/efa_rdm_srx.c | 2 +- prov/efa/test/efa_unit_test_ep.c | 8 +++++--- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 75ac34857e1..eba552d39ad 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -55,13 +55,6 @@ struct efa_rdm_ep { /* shm provider fid */ struct fid_ep *shm_ep; - /* - * EFA RDM endpoint rx/tx queue sizes. These may be different from the core - * provider's rx/tx size and will either limit the number of possible - * receives/sends or allow queueing. 
- */ - size_t rx_size; - size_t tx_size; size_t mtu_size; size_t inject_size; @@ -227,12 +220,12 @@ void efa_rdm_ep_record_tx_op_completed(struct efa_rdm_ep *ep, struct efa_rdm_pke static inline size_t efa_rdm_ep_get_rx_pool_size(struct efa_rdm_ep *ep) { - return MIN(ep->efa_max_outstanding_rx_ops, ep->rx_size); + return MIN(ep->efa_max_outstanding_rx_ops, ep->base_ep.info->rx_attr->size); } static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep) { - return MIN(ep->efa_max_outstanding_tx_ops, ep->tx_size); + return MIN(ep->efa_max_outstanding_tx_ops, ep->base_ep.info->tx_attr->size); } static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 15a20256c87..abc4f89c720 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -236,7 +236,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) ret = ofi_bufpool_create(&ep->user_rx_pkt_pool, sizeof(struct efa_rdm_pke), EFA_RDM_BUFPOOL_ALIGNMENT, - 0,ep->rx_size,0); + 0, ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -285,7 +285,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) sizeof(struct efa_rdm_rxe_map_entry), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit for max_cnt */ - ep->rx_size, 0); + ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -301,7 +301,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) sizeof(struct efa_rdm_ope), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit for max_cnt */ - ep->tx_size + ep->rx_size, 0); + ep->base_ep.info->tx_attr->size + ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -309,7 +309,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) sizeof(struct efa_rdm_peer_overflow_pke_list_entry), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit for max_cnt */ - ep->rx_size, 0); + ep->base_ep.info->rx_attr->size, 0); if (ret) goto err_free; @@ -557,8 +557,6 @@ int 
efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id); } - efa_rdm_ep->rx_size = info->rx_attr->size; - efa_rdm_ep->tx_size = info->tx_attr->size; efa_rdm_ep->inject_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; diff --git a/prov/efa/src/rdm/efa_rdm_srx.c b/prov/efa/src/rdm/efa_rdm_srx.c index 47919dc5667..4efbe01da72 100644 --- a/prov/efa/src/rdm/efa_rdm_srx.c +++ b/prov/efa/src/rdm/efa_rdm_srx.c @@ -151,7 +151,7 @@ int efa_rdm_peer_srx_construct(struct efa_rdm_ep *ep) { int ret; ret = util_ep_srx_context(&efa_rdm_ep_domain(ep)->util_domain, - ep->rx_size, EFA_RDM_IOV_LIMIT, + ep->base_ep.info->rx_attr->size, EFA_RDM_IOV_LIMIT, ep->min_multi_recv_size, &efa_rdm_srx_update_mr, &efa_rdm_ep_domain(ep)->srx_lock, diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 75f100a80f0..f3a70183c64 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1126,6 +1126,7 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res int err, numaddr; struct efa_rdm_pke **pkt_entry_vec; int i; + size_t tx_size; /* disable shm to force using efa device to send */ efa_unit_test_resource_construct_rdm_shm_disabled(resource); @@ -1139,6 +1140,7 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res assert_int_equal(numaddr, 1); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + tx_size = efa_rdm_ep->base_ep.info->tx_attr->size; /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think * a REQ packet has been sent to the peer (so no need to send again) @@ -1148,11 +1150,11 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res peer->flags = 
EFA_RDM_PEER_REQ_SENT; peer->is_local = false; - pkt_entry_vec = calloc(efa_rdm_ep->tx_size, sizeof(struct efa_rdm_pke *)); + pkt_entry_vec = calloc(tx_size, sizeof(struct efa_rdm_pke *)); assert_non_null(pkt_entry_vec); /* Exhaust the tx pkt pool */ - for (i = 0; i < efa_rdm_ep->tx_size; i++) { + for (i = 0; i < tx_size; i++) { pkt_entry_vec[i] = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_tx_pkt_pool, EFA_RDM_PKE_FROM_EFA_TX_POOL); assert_non_null(pkt_entry_vec[i]); } @@ -1162,7 +1164,7 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res assert_int_equal(efa_rdm_ep_post_handshake(efa_rdm_ep, peer), -FI_EAGAIN); assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - for (i = 0; i < efa_rdm_ep->tx_size; i++) + for (i = 0; i < tx_size; i++) efa_rdm_pke_release_tx(pkt_entry_vec[i]); free(pkt_entry_vec); From 7f9b5601d802ab2c5ab82b4e6f4ca8b8146395e7 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 25 Sep 2024 16:16:19 -0700 Subject: [PATCH 081/393] prov/efa: Use tclass to prioritize the messages from an ep To prioritize the messages from a given endpoint, user can specify ` fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY` in the fi_endpoint() call to set the service level in rdma-core. All other tclass values will be ignored. Signed-off-by: Jessie Yang --- man/fi_efa.7.md | 2 ++ prov/efa/configure.m4 | 9 +++++++++ prov/efa/src/efa_base_ep.c | 8 ++++++-- prov/efa/src/efa_base_ep.h | 4 +++- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 ++-- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index 02ef1d80b73..b6eefc19dc1 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -205,6 +205,8 @@ struct fi_efa_mr_attr { **query_mr()** returns 0 on success, or the value of errno on failure (which indicates the failure reason). 
+# Traffic Class (tclass) in EFA +To prioritize the messages from a given endpoint, user can specify `fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY` in the fi_endpoint() call to set the service level in rdma-core. All other tclass values will be ignored. # RUNTIME PARAMETERS diff --git a/prov/efa/configure.m4 b/prov/efa/configure.m4 index f807ce9bc51..71152f72ed4 100644 --- a/prov/efa/configure.m4 +++ b/prov/efa/configure.m4 @@ -77,6 +77,7 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ efadv_support_extended_cq=0 have_efa_dmabuf_mr=0 have_efadv_query_mr=0 + have_efadv_sl=0 dnl $have_neuron is defined at top-level configure.ac AM_CONDITIONAL([HAVE_NEURON], [ test x"$have_neuron" = x1 ]) @@ -159,6 +160,11 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ [], [have_efadv_query_mr=0], [[#include ]]) + + AC_CHECK_MEMBER(struct efadv_qp_init_attr.sl, + [have_efadv_sl=1], + [have_efadv_sl=0], + [[#include ]]) ]) AC_DEFINE_UNQUOTED([HAVE_RDMA_SIZE], @@ -188,6 +194,9 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ AC_DEFINE_UNQUOTED([HAVE_EFADV_QUERY_MR], [$have_efadv_query_mr], [Indicates if efadv_query_mr verbs is available]) + AC_DEFINE_UNQUOTED([HAVE_EFADV_SL], + [$have_efadv_sl], + [Indicates if efadv_qp_init_attr has sl]) CPPFLAGS=$save_CPPFLAGS diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 8c55fee2387..d022c0e3ef6 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -167,7 +167,7 @@ static int efa_base_ep_modify_qp_rst2rts(struct efa_base_ep *base_ep, * @param init_attr_ex ibv_qp_init_attr_ex * @return int 0 on success, negative integer on failure */ -int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex) +int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, uint32_t tclass) { struct efadv_qp_init_attr efa_attr = { 0 }; @@ -185,6 +185,10 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex) efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; #endif efa_attr.driver_qp_type = 
EFADV_QP_DRIVER_TYPE_SRD; +#if HAVE_EFADV_SL + if (tclass == FI_TC_LOW_LATENCY) + efa_attr.sl = EFA_QP_DEFAULT_SERVICE_LEVEL; +#endif (*qp)->ibv_qp = efadv_create_qp_ex( init_attr_ex->pd->context, init_attr_ex, &efa_attr, sizeof(struct efadv_qp_init_attr)); @@ -206,7 +210,7 @@ int efa_base_ep_create_qp(struct efa_base_ep *base_ep, { int ret; - ret = efa_qp_create(&base_ep->qp, init_attr_ex); + ret = efa_qp_create(&base_ep->qp, init_attr_ex, base_ep->info->tx_attr->tclass); if (ret) return ret; diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index bbcd0d26a2d..6cde8f9f4bf 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -12,6 +12,8 @@ #include "ofi_util.h" #include "rdm/efa_rdm_protocol.h" +#define EFA_QP_DEFAULT_SERVICE_LEVEL 8 + struct efa_qp { struct ibv_qp *ibv_qp; struct ibv_qp_ex *ibv_qp_ex; @@ -72,7 +74,7 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, int efa_base_ep_getname(fid_t fid, void *addr, size_t *addrlen); -int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex); +int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, uint32_t tclass); void efa_qp_destruct(struct efa_qp *qp); diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index abc4f89c720..1ce5041ba36 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -105,7 +105,7 @@ int efa_rdm_ep_create_base_ep_ibv_qp(struct efa_rdm_ep *ep) * without any headers. 
*/ if (ep->use_zcpy_rx) { - ret = efa_qp_create(&ep->base_ep.user_recv_qp, &attr_ex); + ret = efa_qp_create(&ep->base_ep.user_recv_qp, &attr_ex, ep->base_ep.info->tx_attr->tclass); if (ret) { efa_base_ep_destruct_qp(&ep->base_ep); return ret; @@ -1642,7 +1642,7 @@ int efa_rdm_ep_check_qp_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, /* Create a dummy qp for query only */ efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq_ex, ibv_cq_ex); - ret = efa_qp_create(&qp, &attr_ex); + ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC); if (ret) goto out; From 92f2058ad4bb8eed9723362230313e9b33402388 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Tue, 1 Oct 2024 16:59:43 +0000 Subject: [PATCH 082/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_efa.7 | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index 43214ca5ef3..99396e71bbf 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_efa" "7" "2024\-08\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_efa" "7" "2024\-10\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -242,6 +242,12 @@ It is only valid when \f[C]ic_id_validity\f[R] has the .PP \f[B]query_mr()\f[R] returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH Traffic Class (tclass) in EFA +.PP +To prioritize the messages from a given endpoint, user can specify +\f[C]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the +fi_endpoint() call to set the service level in rdma-core. +All other tclass values will be ignored. 
.SH RUNTIME PARAMETERS .TP \f[I]FI_EFA_IFACE\f[R] From 48a5da09b4a930a235ec76ff754410083128c82d Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 27 Sep 2024 11:40:11 -0700 Subject: [PATCH 083/393] fabtests: Make building efa conditional Add --enable-efa argument to fabtests to disable efa building. Default is enabled. This is to disable efa for Intel CI. Signed-off-by: Zach Dworkin --- fabtests/configure.ac | 7 +++++++ fabtests/prov/efa/Makefile.include | 2 ++ 2 files changed, 9 insertions(+) diff --git a/fabtests/configure.ac b/fabtests/configure.ac index ed5e8e767b8..dc0d0c87b5f 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -54,6 +54,13 @@ AS_IF([test x"$enable_debug" != x"no"], AC_DEFINE_UNQUOTED([ENABLE_DEBUG], [$dbg], [defined to 1 if configured with --enable-debug]) +AC_ARG_ENABLE([efa], + [AS_HELP_STRING([--enable-efa], + [Enable efa provider specific tests - default YES])], + [], [enable_efa=yes]) + +AM_CONDITIONAL([ENABLE_EFA], [test x"$enable_efa" = x"yes"]) + AC_DEFUN([FI_ARG_ENABLE_SANITIZER],[ AC_ARG_ENABLE([$1], [AS_HELP_STRING([--enable-$1], diff --git a/fabtests/prov/efa/Makefile.include b/fabtests/prov/efa/Makefile.include index ee08fb6c312..f9d2d343354 100644 --- a/fabtests/prov/efa/Makefile.include +++ b/fabtests/prov/efa/Makefile.include @@ -30,6 +30,7 @@ # SOFTWARE. # +if ENABLE_EFA bin_PROGRAMS += prov/efa/src/fi_efa_rnr_read_cq_error \ prov/efa/src/fi_efa_rnr_queue_resend \ prov/efa/src/fi_efa_info_test @@ -77,3 +78,4 @@ prov_efa_src_fi_efa_rdma_checker_LDFLAGS = -lefa endif BUILD_EFA_RDMA_CHECKER endif HAVE_VERBS_DEVEL +endif ENABLE_EFA From c410477dd053365c7bad9b287a79e78aaa326cc3 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 27 Sep 2024 11:42:32 -0700 Subject: [PATCH 084/393] fabtests: Add option for conditionally building lpp Add --enable-lpp option to configure. Default on. This is to turn it off for CI that doesn't need to build this provider. 
Signed-off-by: Zach Dworkin --- fabtests/configure.ac | 7 +++++++ fabtests/prov/lpp/Makefile.include | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/fabtests/configure.ac b/fabtests/configure.ac index dc0d0c87b5f..18a4d1d17d9 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -61,6 +61,13 @@ AC_ARG_ENABLE([efa], AM_CONDITIONAL([ENABLE_EFA], [test x"$enable_efa" = x"yes"]) +AC_ARG_ENABLE([lpp], + [AS_HELP_STRING([--enable-lpp], + [Enable lpp provider specific tests - default YES])], + [], [enable_lpp=yes]) + +AM_CONDITIONAL([ENABLE_LPP], [test x"$enable_lpp" = x"yes"]) + AC_DEFUN([FI_ARG_ENABLE_SANITIZER],[ AC_ARG_ENABLE([$1], [AS_HELP_STRING([--enable-$1], diff --git a/fabtests/prov/lpp/Makefile.include b/fabtests/prov/lpp/Makefile.include index 1d831f2372d..ac7d69b0e30 100644 --- a/fabtests/prov/lpp/Makefile.include +++ b/fabtests/prov/lpp/Makefile.include @@ -30,6 +30,8 @@ # SOFTWARE. # +if ENABLE_LPP + LPP_REGRESSION_SRCS = prov/lpp/src/rcq_data.c \ prov/lpp/src/main.c \ prov/lpp/src/ipc.c \ @@ -64,3 +66,5 @@ endif prov_lpp_src_lpp_regression_SOURCES = $(LPP_REGRESSION_SRCS) prov_lpp_src_lpp_regression_LDADD = libfabtests.la + +endif ENABLE_LPP From 1bcfb3b82d3e562a2a5da6a929ac450499938131 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Wed, 25 Sep 2024 13:18:43 -0700 Subject: [PATCH 085/393] prov/efa: Split RDM EP inject size field into MSG,RMA variants Signed-off-by: Darryl Abbate --- prov/efa/src/rdm/efa_rdm_ep.h | 10 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 158 +++++----------------------- prov/efa/src/rdm/efa_rdm_msg.c | 6 +- prov/efa/src/rdm/efa_rdm_ope.c | 2 +- prov/efa/src/rdm/efa_rdm_rma.c | 4 +- prov/efa/test/efa_unit_test_ep.c | 10 +- 6 files changed, 42 insertions(+), 148 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index eba552d39ad..83e0a7c399a 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -56,7 +56,10 @@ struct efa_rdm_ep { struct fid_ep 
*shm_ep; size_t mtu_size; - size_t inject_size; + size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ + size_t max_rma_size; /**< #FI_OPT_MAX_RMA_SIZE */ + size_t inject_msg_size; /**< #FI_OPT_INJECT_MSG_SIZE */ + size_t inject_rma_size; /**< #FI_OPT_INJECT_RMA_SIZE */ /* Endpoint's capability to support zero-copy rx */ bool use_zcpy_rx; @@ -73,11 +76,6 @@ struct efa_rdm_ep { /* Resource management flag */ uint64_t rm_full; - /* Application's maximum msg size hint */ - size_t max_msg_size; - - /** Application's maximum RMA size */ - size_t max_rma_size; /* Applicaiton's message prefix size. */ size_t msg_prefix_size; diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 1ce5041ba36..265c6f94c1d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -557,13 +557,14 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id); } - efa_rdm_ep->inject_size = info->tx_attr->inject_size; + efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; + efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size; + efa_rdm_ep->inject_msg_size = info->tx_attr->inject_size; + efa_rdm_ep->inject_rma_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); efa_rdm_ep->shm_permitted = true; - efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; - efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size; efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size; efa_rdm_ep->mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size; @@ -1251,7 +1252,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) * when supported */ if (ep->use_zcpy_rx) - ep->inject_size 
= MIN(ep->inject_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + ep->inject_rma_size = MIN(ep->inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); ret = efa_rdm_ep_create_base_ep_ibv_qp(ep); if (ret) @@ -1440,110 +1441,6 @@ static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool sh return 0; } -/** - * @brief Conditionally set efa_rdm_ep#max_msg_size per user's request - * - * If the requested msg size exceeds the EFA provider's default value, the - * request is rejected. - * - * @param[in,out] ep EFA RDM endpoint - * @param[in] max_msg_size Requested maximum msg size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_MAX_MSG_SIZE - */ -static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_size) -{ - if (max_msg_size > ep->base_ep.info->ep_attr->max_msg_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_MAX_MSG_SIZE " - "exceeds the maximum (%zu)\n", - max_msg_size, ep->base_ep.info->ep_attr->max_msg_size); - return -FI_EINVAL; - } - ep->max_msg_size = max_msg_size; - return 0; -} - -/** - * @brief Conditionally set efa_rdm_ep#max_rma_size per user's request - * - * If the requested inject size exceeds the EFA provider's default value, the - * request is rejected. 
- * - * @param[in,out] ep EFA RDM endpoint - * @param[in] max_rma_size Requested max RMA size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_MAX_RMA_SIZE - */ -static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_size) -{ - if (max_rma_size > ep->base_ep.info->ep_attr->max_msg_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_MAX_RMA_SIZE " - "exceeds the maximum (%zu)\n", - max_rma_size, ep->base_ep.info->ep_attr->max_msg_size); - return -FI_EINVAL; - } - ep->max_rma_size = max_rma_size; - return 0; -} - -/** - * @brief Conditionally set efa_rdm_ep#inject_size per user's request - * - * If the requested inject size exceeds the EFA provider's default value, the - * request is rejected. - * - * @param[in,out] ep EFA RDM endpoint - * @param[in] inject_size Requested inject size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_INJECT_MSG_SIZE - */ -static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_msg_size) -{ - if (inject_msg_size > ep->base_ep.info->tx_attr->inject_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_INJECT_MSG_SIZE " - "exceeds the maximum (%zu)\n", - inject_msg_size, ep->base_ep.info->tx_attr->inject_size); - return -FI_EINVAL; - } - ep->inject_size = inject_msg_size; - return 0; -} - -/** - * @brief Conditionally set efa_rdm_ep#inject_size per user's request - * - * If the requested inject size exceeds the EFA provider's default value, the - * request is rejected. 
- * - * @param[in,out] ep EFA RDM endpoint - * @param[in] inject_size Requested inject size - * - * @return 0 on success, -FI_EINVAL otherwise - * - * @sa #FI_OPT_INJECT_RMA_SIZE - */ -static int efa_rdm_ep_set_inject_rma_size(struct efa_rdm_ep *ep, size_t inject_rma_size) -{ - if (inject_rma_size > ep->base_ep.info->tx_attr->inject_size) { - EFA_WARN(FI_LOG_EP_CTRL, - "Requested size of %zu for FI_OPT_INJECT_RMA_SIZE " - "exceeds the maximum (%zu)\n", - inject_rma_size, ep->base_ep.info->tx_attr->inject_size); - return -FI_EINVAL; - } - ep->inject_size = inject_rma_size; - return 0; -} - /** * @brief set use_device_rdma flag in efa_rdm_ep. * @@ -1662,6 +1559,23 @@ int efa_rdm_ep_check_qp_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, return ret; } +/** + * Convenience macro for setopt with an enforced threshold + */ +#define EFA_RDM_EP_SETOPT_THRESHOLD(opt, field, threshold) { \ + size_t _val = *(size_t *) optval; \ + if (optlen != sizeof field) \ + return -FI_EINVAL; \ + if (_val > threshold) { \ + EFA_WARN(FI_LOG_EP_CTRL, \ + "Requested size of %zu for FI_OPT_" #opt " " \ + "exceeds the maximum (%zu)\n", \ + _val, threshold); \ + return -FI_EINVAL; \ + } \ + field = _val; \ +} + /** * @brief implement the fi_setopt() API for EFA RDM endpoint * @param[in] fid fid to endpoint @@ -1745,32 +1659,16 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return ret; break; case FI_OPT_MAX_MSG_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_max_msg_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_RDM_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->max_msg_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_RMA_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_max_rma_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_RDM_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->max_rma_size, 
efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_INJECT_MSG_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_inject_msg_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->inject_msg_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_RMA_SIZE: - if (optlen != sizeof (size_t)) - return -FI_EINVAL; - ret = efa_rdm_ep_set_inject_rma_size(efa_rdm_ep, *(size_t *) optval); - if (ret) - return ret; + EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_EFA_USE_DEVICE_RDMA: if (optlen != sizeof(bool)) @@ -1863,13 +1761,13 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, case FI_OPT_INJECT_MSG_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->inject_size; + *(size_t *) optval = efa_rdm_ep->inject_msg_size; *optlen = sizeof (size_t); break; case FI_OPT_INJECT_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->inject_size; + *(size_t *) optval = efa_rdm_ep->inject_rma_size; *optlen = sizeof (size_t); break; case FI_OPT_EFA_EMULATED_READ: diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index a4c8071f367..2126bb8575a 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -356,7 +356,7 @@ ssize_t efa_rdm_msg_inject(struct fid_ep *ep, const void *buf, size_t len, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->inject_msg_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -384,7 +384,7 @@ ssize_t efa_rdm_msg_injectdata(struct fid_ep *ep, const void *buf, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep, struct 
efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->inject_msg_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -559,7 +559,6 @@ ssize_t efa_rdm_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -586,7 +585,6 @@ ssize_t efa_rdm_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t l struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index ccd348992fc..bdab59510b4 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1509,7 +1509,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) if (ope->fi_flags & FI_INJECT) { assert(ope->iov_count == 1); - assert(ope->total_len <= ep->inject_size); + assert(ope->total_len <= ep->inject_rma_size); copied = efa_rdm_pke_copy_from_hmem_iov( ope->desc[iov_idx], pkt_entry, ope, sizeof(struct efa_rdm_rma_context_pkt), 0, diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 6a6562b2849..ae04af66e1e 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -642,7 +642,7 @@ ssize_t efa_rdm_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->inject_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -679,7 +679,7 @@ ssize_t efa_rdm_rma_inject_writedata(struct fid_ep *ep, const void *buf, 
size_t int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_size); + assert(len <= efa_rdm_ep->inject_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index f3a70183c64..c1fc290cd4a 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -924,7 +924,7 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, struct efa_domain *efa_domain; struct efa_rdm_ep *ep; size_t max_msg_size = 1000; - size_t inject_size = 0; + size_t inject_rma_size = 0; bool shm_permitted = false; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), @@ -967,12 +967,12 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, assert_true(ep->use_zcpy_rx == expected_use_zcpy_rx); assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, - &inject_size, &(size_t){sizeof inject_size}), 0); - assert_int_equal(ep->inject_size, inject_size); + &inject_rma_size, &(size_t){sizeof inject_rma_size}), 0); + assert_int_equal(ep->inject_rma_size, inject_rma_size); if (expected_use_zcpy_rx) - assert_int_equal(inject_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + assert_int_equal(inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); else - assert_int_equal(inject_size, resource->info->tx_attr->inject_size); + assert_int_equal(inject_rma_size, resource->info->tx_attr->inject_size); } /** From 68d6467e5211f5bd0b4d3d472b9d056514781bf1 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Wed, 25 Sep 2024 13:20:52 -0700 Subject: [PATCH 086/393] prov/efa: Add setopt/getopt support for remaining EP sizes This adds support for FI_OPT_{MSG,INJECT}_{TAGGED,ATOMIC}_SIZE Signed-off-by: Darryl Abbate --- prov/efa/src/rdm/efa_rdm_ep.h | 4 +++ prov/efa/src/rdm/efa_rdm_ep_fiops.c 
| 40 +++++++++++++++++++++++++++++ prov/efa/src/rdm/efa_rdm_msg.c | 2 ++ 3 files changed, 46 insertions(+) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 83e0a7c399a..0a67d23d49d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -57,9 +57,13 @@ struct efa_rdm_ep { size_t mtu_size; size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ + size_t max_tagged_size; /**< #FI_OPT_MAX_TAGGED_SIZE */ size_t max_rma_size; /**< #FI_OPT_MAX_RMA_SIZE */ + size_t max_atomic_size; /**< #FI_OPT_MAX_ATOMIC_SIZE */ size_t inject_msg_size; /**< #FI_OPT_INJECT_MSG_SIZE */ + size_t inject_tagged_size; /**< #FI_OPT_INJECT_TAGGED_SIZE */ size_t inject_rma_size; /**< #FI_OPT_INJECT_RMA_SIZE */ + size_t inject_atomic_size; /**< #FI_OPT_INJECT_ATOMIC_SIZE */ /* Endpoint's capability to support zero-copy rx */ bool use_zcpy_rx; diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 265c6f94c1d..d6f70f0992e 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -558,9 +558,13 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, } efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; + efa_rdm_ep->max_tagged_size = info->ep_attr->max_msg_size; efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size; + efa_rdm_ep->max_atomic_size = info->ep_attr->max_msg_size; efa_rdm_ep->inject_msg_size = info->tx_attr->inject_size; + efa_rdm_ep->inject_tagged_size = info->tx_attr->inject_size; efa_rdm_ep->inject_rma_size = info->tx_attr->inject_size; + efa_rdm_ep->inject_atomic_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); @@ -1661,15 +1665,27 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int 
optname, case FI_OPT_MAX_MSG_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->max_msg_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; + case FI_OPT_MAX_TAGGED_SIZE: + EFA_RDM_EP_SETOPT_THRESHOLD(MAX_TAGGED_SIZE, efa_rdm_ep->max_tagged_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + break; case FI_OPT_MAX_RMA_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->max_rma_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; + case FI_OPT_MAX_ATOMIC_SIZE: + EFA_RDM_EP_SETOPT_THRESHOLD(MAX_ATOMIC_SIZE, efa_rdm_ep->max_atomic_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + break; case FI_OPT_INJECT_MSG_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->inject_msg_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; + case FI_OPT_INJECT_TAGGED_SIZE: + EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_TAGGED_SIZE, efa_rdm_ep->inject_tagged_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + break; case FI_OPT_INJECT_RMA_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; + case FI_OPT_INJECT_ATOMIC_SIZE: + EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_ATOMIC_SIZE, efa_rdm_ep->inject_atomic_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + break; case FI_OPT_EFA_USE_DEVICE_RDMA: if (optlen != sizeof(bool)) return -FI_EINVAL; @@ -1752,24 +1768,48 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, *(size_t *) optval = efa_rdm_ep->max_msg_size; *optlen = sizeof (size_t); break; + case FI_OPT_MAX_TAGGED_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->max_tagged_size; + *optlen = sizeof (size_t); + break; case FI_OPT_MAX_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; *(size_t *) optval = efa_rdm_ep->max_rma_size; *optlen = sizeof (size_t); break; + case FI_OPT_MAX_ATOMIC_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + 
*(size_t *) optval = efa_rdm_ep->max_atomic_size; + *optlen = sizeof (size_t); + break; case FI_OPT_INJECT_MSG_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; *(size_t *) optval = efa_rdm_ep->inject_msg_size; *optlen = sizeof (size_t); break; + case FI_OPT_INJECT_TAGGED_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->inject_tagged_size; + *optlen = sizeof (size_t); + break; case FI_OPT_INJECT_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; *(size_t *) optval = efa_rdm_ep->inject_rma_size; *optlen = sizeof (size_t); break; + case FI_OPT_INJECT_ATOMIC_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = efa_rdm_ep->inject_atomic_size; + *optlen = sizeof (size_t); + break; case FI_OPT_EFA_EMULATED_READ: if (*optlen < sizeof(bool)) return -FI_ETOOSMALL; diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 2126bb8575a..839cde917f0 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -559,6 +559,7 @@ ssize_t efa_rdm_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); + assert(len <= efa_rdm_ep->inject_tagged_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -585,6 +586,7 @@ ssize_t efa_rdm_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t l struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); + assert(len <= efa_rdm_ep->inject_tagged_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); From 071adfe0b4c02087e822fa1bdcebc53e6c2acf24 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 1 Oct 2024 10:45:13 -0700 Subject: [PATCH 087/393] contrib/aws: Add g4dn nccl tests with tcp provider Signed-off-by: Jessie Yang --- contrib/aws/Jenkinsfile | 1 + 1 file changed, 1 insertion(+) 
diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index e8d11d4edfa..74c6adf4f88 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -223,6 +223,7 @@ pipeline { stages["2_c6g_alinux2023_tcp"] = get_test_stage("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") stages["2_c6g_ubuntu2004_tcp"] = get_test_stage("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") stages["2_c6g_rhel8_tcp"] = get_test_stage("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") + stages["3_g4dn_alinux2_tcp"] = get_test_stage("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false --test-list test_nccl_tests") // Multi Node Tests - SOCKETS stages["2_c6g_alinux2_sockets"] = get_test_stage("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") From 2db87b1c4ec72e882665d798f6f96e8a87fdb9e3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:17:53 +0000 Subject: [PATCH 088/393] build(deps): bump actions/upload-artifact from 4.4.0 to 4.4.1 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.4.0 to 4.4.1. 
- [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/50769540e7f4bd5e21e526ee35c689e35e0d6874...604373da6381bf24206979c74d06a550515601b9) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/coverity.yml | 2 +- .github/workflows/pr-ci.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 84e3956734c..6bfdb0ff69d 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -94,7 +94,7 @@ jobs: --form description="`$PWD/install/bin/fi_info -l`" \ https://scan.coverity.com/builds?project=ofiwg%2Flibfabric - name: Upload build logs - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 with: name: coverity-build-log.txt path: cov-int/build-log.txt diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 450d2aed66b..b8894e1dbf8 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -70,7 +70,7 @@ jobs: $PWD/install/bin/fi_info -l - name: Upload build logs if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 with: name: ${{ matrix.os }}-${{ matrix.cc }}-config.log path: config.log @@ -115,7 +115,7 @@ jobs: $PWD/install/bin/fi_info -c FI_HMEM - name: Upload build logs if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 with: name: hmem-config.log path: config.log @@ -139,7 +139,7 @@ jobs: make -j2 
- name: Upload build logs if: failure() - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 with: name: macos-config.log path: config.log diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 7855f755c8c..8913357f9fe 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -60,7 +60,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 with: name: SARIF file path: results.sarif From 3e9b17b71bfe05507a5f35a02792306a691d4c06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:18:05 +0000 Subject: [PATCH 089/393] build(deps): bump github/codeql-action from 3.26.10 to 3.26.11 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.10 to 3.26.11. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/e2b3eafc8d227b0241d48be5f425d47c2d750a13...6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 30e1e3ee04c..390b8a62945 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 + uses: github/codeql-action/init@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 + uses: github/codeql-action/autobuild@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 + uses: github/codeql-action/analyze@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 8913357f9fe..d2fe181a360 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@e2b3eafc8d227b0241d48be5f425d47c2d750a13 # v3.26.10 + uses: github/codeql-action/upload-sarif@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 with: sarif_file: results.sarif From 765cc161231964cf64ea24c819b177d4296b5829 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 8 Oct 2024 18:17:38 +0000 Subject: [PATCH 090/393] prov/efa: report correct inject_msg_size for zcpy rx When zcpy rx is on, both inject_msg_size and inject_rma_size should be reported as inline buf size. Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 +++- prov/efa/test/efa_unit_test_ep.c | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index d6f70f0992e..282a384e325 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1255,8 +1255,10 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) * TODO: Distinguish between inline data sizes for RDMA {send,write} * when supported */ - if (ep->use_zcpy_rx) + if (ep->use_zcpy_rx) { + ep->inject_msg_size = MIN(ep->inject_msg_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); ep->inject_rma_size = MIN(ep->inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + } ret = efa_rdm_ep_create_base_ep_ibv_qp(ep); if (ret) diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index c1fc290cd4a..f01efc72560 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -924,6 +924,7 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, struct efa_domain *efa_domain; struct efa_rdm_ep *ep; size_t max_msg_size = 1000; + size_t inject_msg_size = 0; size_t inject_rma_size = 0; bool shm_permitted = false; @@ -966,13 +967,22 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, 
assert_int_equal(fi_enable(resource->ep), 0); assert_true(ep->use_zcpy_rx == expected_use_zcpy_rx); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, + &inject_msg_size, &(size_t){sizeof inject_msg_size}), 0); + assert_int_equal(ep->inject_msg_size, inject_msg_size); + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, &inject_rma_size, &(size_t){sizeof inject_rma_size}), 0); assert_int_equal(ep->inject_rma_size, inject_rma_size); - if (expected_use_zcpy_rx) + + if (expected_use_zcpy_rx) { + assert_int_equal(inject_msg_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); assert_int_equal(inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); - else + } else { + assert_int_equal(inject_msg_size, resource->info->tx_attr->inject_size); assert_int_equal(inject_rma_size, resource->info->tx_attr->inject_size); + } } /** From df5472b8b62f25ed942c9614fb18d5d70b7b0402 Mon Sep 17 00:00:00 2001 From: Steve Welch Date: Thu, 8 Aug 2024 11:11:52 -0500 Subject: [PATCH 091/393] prov/util: Allow providers to update cache MR IOV A provider may update the memory region that is added to accommodate for instance alignment of the region to a larger page boundary. In such cases, the MR cache info used to search the cache should use the updated region. This allows the provider to avoid walking /proc/pid/smaps if the underlying kernel component may more efficiently determine the backing page size. Signed-off-by: Steve Welch Signed-off-by: Ian Ziemba --- include/ofi_mr.h | 9 +++++---- prov/util/src/util_mr_cache.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 1ebb07a8e11..64bae5f0755 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -417,14 +417,15 @@ bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru); * a new ofi_mr_entry and assign it to entry. 
* * @param[in] cache The cache the entry belongs to - * @param[in] info Information about the mr entry to search + * @param[in out] info Information about the mr entry to search. Info IOV may + * be updated by providers to reflect region registered by + * the provider. * @param[out] entry The registered entry corresponding to the * region described in info. * @returns On success, returns 0. On failure, returns a negative error code. */ -int ofi_mr_cache_search(struct ofi_mr_cache *cache, - const struct ofi_mr_info *info, - struct ofi_mr_entry **entry); +int ofi_mr_cache_search(struct ofi_mr_cache *cache, struct ofi_mr_info *info, + struct ofi_mr_entry **entry); /** * Given an attr (with an iov range), if the iov range is already registered, diff --git a/prov/util/src/util_mr_cache.c b/prov/util/src/util_mr_cache.c index f2148e56267..ea8bf15570f 100644 --- a/prov/util/src/util_mr_cache.c +++ b/prov/util/src/util_mr_cache.c @@ -267,7 +267,7 @@ void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) * restart the entire operation. */ static int -util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, +util_mr_cache_create(struct ofi_mr_cache *cache, struct ofi_mr_info *info, struct ofi_mr_entry **entry) { struct ofi_mr_entry *cur; @@ -291,6 +291,12 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, if (ret) goto free; + /* Providers may have expanded the MR. Update MR info input + * accordingly. 
+ */ + assert(ofi_iov_within(&(*info).iov, &(*entry)->info.iov)); + *info = (*entry)->info; + pthread_mutex_lock(&mm_lock); cur = ofi_mr_rbt_find(&cache->tree, info); if (cur) { @@ -329,7 +335,7 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, return ret; } -int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, +int ofi_mr_cache_search(struct ofi_mr_cache *cache, struct ofi_mr_info *info, struct ofi_mr_entry **entry) { struct ofi_mem_monitor *monitor; From 043e968534491319d1b9afc22e482783d5779677 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 8 Oct 2024 21:27:14 +0000 Subject: [PATCH 092/393] prov/efa: Only do dmabuf reg when FI_MR_DMABUF is set Application should use FI_MR_DMABUF API to pass the dmabuf fd and offset to make Libfabric register the mr via dmabuf. The only exception is for synapseai, beacuse dmabuf is the only way to register Gaudi device buffer and it was implemented before the FI_MR_DMABUF API. Keep this behavior unchanged for compatibility. Signed-off-by: Shi Jin --- prov/efa/src/efa_mr.c | 51 ++++++++++--------------------------------- 1 file changed, 11 insertions(+), 40 deletions(-) diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index a9a37a4cd0b..dd1115f4d52 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -529,61 +529,32 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr ); /* - * TODO: remove the synapseai and neuron blocks by onboarding the - * ofi_hmem_get_dmabuf_fd API. + * When FI_MR_DMABUF flag is not set, + * only do ibv_reg_mr. + * The only exception is synapseai, + * because dmabuf is the only way + * to register Gaudi device buffer and + * it was implemented before the FI_MR_DMABUF API. 
*/ -#if HAVE_SYNAPSEAI if (efa_mr_is_synapseai(efa_mr)) { int dmabuf_fd; uint64_t offset; int ret; - ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base, - (uint64_t) mr_attr->mr_iov->iov_len, - &dmabuf_fd, &offset); + ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_SYNAPSEAI, + mr_attr->mr_iov->iov_base, + (uint64_t) mr_attr->mr_iov->iov_len, + &dmabuf_fd, &offset); if (ret != FI_SUCCESS) { EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n"); return NULL; } - return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset, - mr_attr->mr_iov->iov_len, - (uint64_t)mr_attr->mr_iov->iov_base, - dmabuf_fd, access); - } -#endif - -#if HAVE_NEURON - if (efa_mr_is_neuron(efa_mr)) { - int dmabuf_fd; - uint64_t offset; - int ret; - - ret = neuron_get_dmabuf_fd( - mr_attr->mr_iov->iov_base, - mr_attr->mr_iov->iov_len, - &dmabuf_fd, - &offset); - if (ret == FI_SUCCESS) { - /* Success => invoke ibv_reg_dmabuf_mr */ - return efa_mr_reg_ibv_dmabuf_mr( - efa_mr->domain->ibv_pd, 0, + return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset, mr_attr->mr_iov->iov_len, (uint64_t)mr_attr->mr_iov->iov_base, dmabuf_fd, access); - } else if (ret == -FI_EOPNOTSUPP) { - /* Protocol not availabe => fallback */ - EFA_INFO(FI_LOG_MR, - "Unable to get dmabuf fd for Neuron device buffer, " - "Fall back to ibv_reg_mr\n"); - return ibv_reg_mr( - efa_mr->domain->ibv_pd, - (void *)mr_attr->mr_iov->iov_base, - mr_attr->mr_iov->iov_len, access); - } - return NULL; } -#endif return ibv_reg_mr(efa_mr->domain->ibv_pd, (void *)mr_attr->mr_iov->iov_base, From e31af238cb83b0aa7d1fc608cef075d7301f09d3 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Wed, 9 Oct 2024 11:45:54 -0700 Subject: [PATCH 093/393] contrib/intel/jenkins: Temporarily disable psm3 in onecclgpu psm3 is failing onecclgpu because of a missing package. Disable it until the package dependency is resolved. 
Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index b9a2aa1ad93..fd5bc54a65a 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -767,9 +767,9 @@ pipeline { steps { script { dir (RUN_LOCATION) { - run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") + // run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", + // "gpu", "torchic", "1", null, null, + // "FI_HMEM_DISABLE_P2P=1") run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", "gpu", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") From c1820bf2603e9e4d308609e09e760166ccbc8ef5 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 7 Oct 2024 10:43:49 +0200 Subject: [PATCH 094/393] fabtests/lpp: remove deprecated FI_MR_BASIC Remove deprecated FI_MR_BASIC flag Signed-off-by: Tadeusz Struk --- fabtests/prov/lpp/src/test_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabtests/prov/lpp/src/test_util.c b/fabtests/prov/lpp/src/test_util.c index 7757899bb82..2d3b55204f0 100644 --- a/fabtests/prov/lpp/src/test_util.c +++ b/fabtests/prov/lpp/src/test_util.c @@ -132,7 +132,7 @@ void util_init(struct rank_info *ri) hints.mode = 0; hints.fabric_attr->prov_name = "lpp"; - hints.domain_attr->mr_mode = FI_MR_BASIC; + hints.domain_attr->mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP; rc = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0, &hints, &ri->fi); From 2c7d7861db4e8d894df08f435533af56e6e4a50c Mon Sep 17 00:00:00 2001 From: Nikhil Nanal Date: Thu, 3 Oct 2024 14:21:02 -0700 Subject: [PATCH 095/393] Fabtests: Added inband sync to ft_init_fabric_cm. This patch adds the missing inband sync in ft_fabric_init_cm to handle the case where rx buffers are not pre-posted by the application. 
The default behaviour in fabtests is to pre-post a rx buffer. This change enables fabtests using ft_fabric_init_cm to consume the posted receive with an inband sync by setting the test option FT_OPT_NO_PRE_POSTED_RX. Similar changes have been made to ft_init_fabric https://github.com/ofiwg/libfabric/pull/10394 Signed-off-by: Nikhil Nanal --- fabtests/common/shared.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index eb95127b6f0..aea1c46d4aa 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -1067,6 +1067,13 @@ int ft_init_fabric_cm(void) ret = opts.dst_addr ? ft_client_connect() : ft_server_connect(); + if (ft_check_opts(FT_OPT_NO_PRE_POSTED_RX) && + !ft_check_opts(FT_OPT_SKIP_MSG_ALLOC) && + (fi->caps & (FI_MSG | FI_TAGGED))) { + ret = ft_sync_inband(false); + if (ret) + return ret; + } return ret; } From 8708b5cd021ceb17caa16437832c577e92cf255a Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Fri, 20 Sep 2024 23:53:34 -0700 Subject: [PATCH 096/393] prov/efa: avoid gdr_pin/gdr_map for dmabuf mrs efa_mr_hmem_setup previously always called ofi_hmem_dev_register on all FI_HMEM_CUDA calls, regardless of the presence of FI_MR_DMABUF in flags. When gdrcopy is enabled, this means deconstructing the fi_mr_dmabuf into a struct iovec from its {base, offset, len} 3-tuple, then passing the resulting iovec to gdr_pin followed by gdr_map. a dmabuf cannot be exported by the nvidia module without an implicit promise that the address space is already reserved and mapped in the current pid, of appropriate size and alignment, and that all pages/ranges backing it can be made available to an importer. All requirements are enforced by the cuda APIs used to acquire one. 
At best, calls to libgdrcopy here are unnecessary for dmabufs, and at worst the pgprots set by gdrdrv are different enough from the ones setup by cuda proper to cause issues, or the redundant mappings become costly for the driver to maintain. Prior to this patch, apps can only prevent these gdr_map calls on dmabuf arguments by disabling gdrcopy entirely through environment variables before launch. But apps may wish to use fi_mr_regattr with dmabuf arguments in the default case, while still reserving the right to call fi_mr_regattr with iov arguments on the same domain, where the gdr flow may still be desired in the latter case. This makes that possible. Signed-off-by: Nicholas Sielicki --- prov/efa/src/efa_mr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index dd1115f4d52..0307914aff2 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -184,12 +184,6 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, { int err; struct iovec mr_iov = {0}; - - if (flags & FI_MR_DMABUF) - ofi_mr_get_iov_from_dmabuf(&mr_iov, attr->dmabuf, 1); - else - mr_iov = *attr->mr_iov; - efa_mr->peer.flags = flags; if (attr->iface == FI_HMEM_SYSTEM) { @@ -227,7 +221,8 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, efa_mr->needs_sync = true; efa_mr->peer.device.cuda = attr->device.cuda; - if (cuda_is_gdrcopy_enabled()) { + if (!(flags & FI_MR_DMABUF) && cuda_is_gdrcopy_enabled()) { + mr_iov = *attr->mr_iov; err = ofi_hmem_dev_register(FI_HMEM_CUDA, mr_iov.iov_base, mr_iov.iov_len, (uint64_t *)&efa_mr->peer.hmem_data); efa_mr->peer.flags |= OFI_HMEM_DATA_DEV_REG_HANDLE; From 030d7341e5775c52800b5daa99b482f8ae1b06d3 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Mon, 7 Oct 2024 18:09:09 -0400 Subject: [PATCH 097/393] fabtests: Update runmultinode.py with args fi_multinode command line arguments changed. Update script to accommodate the change. 
Signed-off-by: Amir Shehata --- fabtests/scripts/runmultinode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fabtests/scripts/runmultinode.py b/fabtests/scripts/runmultinode.py index a8c836f532e..b1749990dd1 100644 --- a/fabtests/scripts/runmultinode.py +++ b/fabtests/scripts/runmultinode.py @@ -6,7 +6,7 @@ def parse_args(): parser = argparse.ArgumentParser(description="libfabric multinode test with slurm") parser.add_argument('--dry-run', action='store_true', help='Perform a dry run without making any changes.') parser.add_argument("--ci", type=str, help="Commands to prepend to test call. Only used with the internal launcher option", default="") - parser.add_argument("-C", "--capability", type=str, help="libfabric capability", default="msg") + parser.add_argument("-x", "--capability", type=str, help="libfabric capability", default="msg") parser.add_argument("-i", "--iterations", type=int , help="Number of iterations", default=1) parser.add_argument("-l", "--launcher", type=str, choices=['internal', 'srun', 'mpirun'], help="launcher to use for running job. If nothing is specified, test manages processes internally. 
Available options: internal, srun and mpirun", default="internal") @@ -172,11 +172,11 @@ def is_srun_pm_supported(): if args.provider in no_addr_prov: cmd = f"fi_multinode -n {args.num_procs} -s {socket.gethostname()} " \ - f"-p {args.provider} -C {args.capability} -z {mnode['pattern']} " \ - f"-I {args.iterations} -u {args.launcher.lower()} -E -T" + f"-p {args.provider} -x {args.capability} -z {mnode['pattern']} " \ + f"-I {args.iterations} -u {args.launcher.lower()} -T" else: cmd = f"fi_multinode -n {args.num_procs} -s {socket.gethostname()} " \ - f"-p {args.provider} -C {args.capability} -z '{mnode['pattern']}' " \ + f"-p {args.provider} -x {args.capability} -z '{mnode['pattern']}' " \ f"-I {args.iterations} -u {args.launcher.lower()} -T" if args.launcher.lower() == 'mpirun': @@ -196,7 +196,7 @@ def is_srun_pm_supported(): exit() hl = ",".join(expand_host_list(os.environ['SLURM_NODELIST'])) mpi = f"runmultinode.sh -h {hl} -n {args.procs_per_node} -p {args.provider} " \ - f"-C {args.capability} -I {args.iterations} -z {mnode['pattern']}" + f"-x {args.capability} -I {args.iterations} -z {mnode['pattern']}" if args.ci: mpi += f" --ci '{args.ci}'" else: From cd63ccf7cf1311db02141e6e0fec36065ba7956f Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Mon, 7 Oct 2024 11:51:27 -0400 Subject: [PATCH 098/393] fabtests: fi_multinode update set FT_OPT_ADDR_IS_OOB by default. It enables out of band address exchange which is needed by CXI. 
Signed-off-by: Amir Shehata --- fabtests/multinode/src/harness.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabtests/multinode/src/harness.c b/fabtests/multinode/src/harness.c index 0df5417521e..959a5940387 100644 --- a/fabtests/multinode/src/harness.c +++ b/fabtests/multinode/src/harness.c @@ -355,7 +355,7 @@ int main(int argc, char **argv) opts = INIT_OPTS; opts.options |= FT_OPT_SIZE | FT_OPT_OOB_ADDR_EXCH | - FT_OPT_DISABLE_TAG_VALIDATION; + FT_OPT_ADDR_IS_OOB | FT_OPT_DISABLE_TAG_VALIDATION; pm_job.clients = NULL; pm_job.pattern = -1; From ed239a5abece22732dcf4e0302748058a05618b4 Mon Sep 17 00:00:00 2001 From: Peinan Zhang Date: Tue, 1 Oct 2024 10:30:14 -0700 Subject: [PATCH 099/393] prov/hook/trace: Add trace log for domain_attr. Signed-off-by: Peinan Zhang --- prov/hook/trace/src/hook_trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prov/hook/trace/src/hook_trace.c b/prov/hook/trace/src/hook_trace.c index 5813d47bce3..b5cdce4fb4f 100644 --- a/prov/hook/trace/src/hook_trace.c +++ b/prov/hook/trace/src/hook_trace.c @@ -262,6 +262,8 @@ static void hook_trace_prof_init(void *context) fi_tostr_r(buf,len, info->tx_attr, FI_TYPE_TX_ATTR)); \ FI_TRACE(dom->fabric->hprov, FI_LOG_DOMAIN, "\n%s", \ fi_tostr_r(buf,len, info->rx_attr, FI_TYPE_RX_ATTR)); \ + FI_TRACE(dom->fabric->hprov, FI_LOG_DOMAIN, "\n%s", \ + fi_tostr_r(buf,len, info->domain_attr, FI_TYPE_DOMAIN_ATTR)); \ } while (0); typedef void (*trace_cq_entry_fn)(const struct fi_provider *prov, From dbc685d8cb8c7c92cfc9abd3aac1a4ac35fb56b9 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 14 Aug 2024 10:34:29 -0700 Subject: [PATCH 100/393] man/fi_peer: update peer fid initialization language Add clarification in the man page indicating that the owner is responsible for creating unique fi_peer_*_contexts for each peer and that the peers are only allowed to set the peer ops of that context. 
Signed-off-by: Alexia Ingerson --- man/fi_peer.3.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/man/fi_peer.3.md b/man/fi_peer.3.md index 0dc4dd43077..fc58c16f507 100644 --- a/man/fi_peer.3.md +++ b/man/fi_peer.3.md @@ -83,6 +83,20 @@ similar, independent from the object being shared. However, because the goal of using peer providers is to avoid overhead, providers must be explicitly written to support the peer provider mechanisms. +When importing any shared fabric object into a peer, the owner will create a +separate fid_peer_* for each peer provider it intends to import into. The owner +will pass this unique fid_peer_* into each peer through the context parameter of +the init call for the resource (i.e. fi_cq_open, fi_srx_context, fi_cntr_open, +etc). The fi_peer_*_context will indicate the owner-allocated fid_peer_* for +the peer to use but is temporary for the init call and may not be accessed by +the peer after initialization. The peer will set just the peer_ops of the +owner-allocated fid and save a reference to the imported fid_peer_* for use in +the peer API flow. The peer will allocate its own fid for internal uses and +return that fid to the owner through the regular fid parameter of the init call +(as if it were just another opened resource). The owner is responsible for +saving the returned peer fid from the open call in order to close it later +(or to drive progress in the case of the cq_fid). + There are two peer provider models. In the example listed above, both peers are full providers in their own right and usable in a stand-alone fashion. In a second model, one of the peers is known as an offload provider. An From 472dc4987a6eb675b4d0784d8b51d80dbb8155c0 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 17 Sep 2024 12:20:49 -0700 Subject: [PATCH 101/393] prov/shm: use owner-allocated srx The peer API has been updated to specify that the owner must allocate the peer's fid_peer_srx. 
The shm implementation was allocating its own internal fid_peer_srx. This updates the shm implementation to assume it has a unique fid_peer_srx and updates the imported fid_peer_srx peer_ops, saving a pointer to the fid_peer_srx instead of the internal fid_ep which required a wrapper function to get back to the fid_peer_srx It also returns an internal fid_ep for the created srx which is used to close the srx by the owner. Even though shm doesn't need anything attached to the internal fid_ep, it is there for consistency and to track the domain reference counting for errors. This patch also moves the srx specific functions into smr_domain where they belong Signed-off-by: Alexia Ingerson --- prov/shm/src/smr.h | 11 ++---- prov/shm/src/smr_av.c | 5 ++- prov/shm/src/smr_domain.c | 72 +++++++++++++++++++++++++++++++++++++ prov/shm/src/smr_ep.c | 56 +++++++++-------------------- prov/shm/src/smr_msg.c | 28 ++++++++------- prov/shm/src/smr_progress.c | 25 +++++++------ 6 files changed, 119 insertions(+), 78 deletions(-) diff --git a/prov/shm/src/smr.h b/prov/shm/src/smr.h index d71f3903f12..1992ecbca9c 100644 --- a/prov/shm/src/smr.h +++ b/prov/shm/src/smr.h @@ -159,6 +159,7 @@ struct smr_domain { int fast_rma; /* cache for use with hmem ipc */ struct ofi_mr_cache *ipc_cache; + struct fid_ep rx_ep; struct fid_peer_srx *srx; }; @@ -220,7 +221,7 @@ struct smr_ep { const char *name; uint64_t msg_id; struct smr_region *volatile region; - struct fid_ep *srx; + struct fid_peer_srx *srx; struct ofi_bufpool *cmd_ctx_pool; struct ofi_bufpool *unexp_buf_pool; struct ofi_bufpool *pend_buf_pool; @@ -236,11 +237,6 @@ struct smr_ep { void (*smr_progress_ipc_list)(struct smr_ep *ep); }; -static inline struct fid_peer_srx *smr_get_peer_srx(struct smr_ep *ep) -{ - return container_of(ep->srx, struct fid_peer_srx, ep_fid); -} - #define smr_ep_rx_flags(smr_ep) ((smr_ep)->util_ep.rx_op_flags) #define smr_ep_tx_flags(smr_ep) ((smr_ep)->util_ep.tx_op_flags) @@ -251,9 +247,6 @@ static inline int 
smr_mmap_name(char *shm_name, const char *ep_name, ep_name, msg_id); } -int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, - struct fid_ep **rx_ep, void *context); - int smr_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id); diff --git a/prov/shm/src/smr_av.c b/prov/shm/src/smr_av.c index 355d3bcad64..de12e152545 100644 --- a/prov/shm/src/smr_av.c +++ b/prov/shm/src/smr_av.c @@ -113,7 +113,6 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, struct util_ep *util_ep; struct smr_av *smr_av; struct smr_ep *smr_ep; - struct fid_peer_srx *srx; struct dlist_entry *av_entry; fi_addr_t util_addr; int64_t shm_id = -1; @@ -173,8 +172,8 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, smr_ep = container_of(util_ep, struct smr_ep, util_ep); smr_ep->region->max_sar_buf_per_peer = SMR_MAX_PEERS / smr_av->smr_map.num_peers; - srx = smr_get_peer_srx(smr_ep); - srx->owner_ops->foreach_unspec_addr(srx, &smr_get_addr); + smr_ep->srx->owner_ops->foreach_unspec_addr(smr_ep->srx, + &smr_get_addr); } } diff --git a/prov/shm/src/smr_domain.c b/prov/shm/src/smr_domain.c index 188cea31410..909298fbc2d 100644 --- a/prov/shm/src/smr_domain.c +++ b/prov/shm/src/smr_domain.c @@ -35,6 +35,78 @@ #include "smr.h" +extern struct fi_ops_srx_peer smr_srx_peer_ops; + +static int smr_srx_close(struct fid *fid) +{ + struct smr_domain *domain = container_of(fid, struct smr_domain, + rx_ep.fid); + + ofi_atomic_dec32(&domain->util_domain.ref); + + return FI_SUCCESS; +} + +static struct fi_ops smr_srx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = smr_srx_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_msg smr_srx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + 
.send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops_tagged smr_srx_tagged_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + .inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +static int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct smr_domain *smr_domain; + + smr_domain = container_of(domain, struct smr_domain, + util_domain.domain_fid); + + if (attr->op_flags & FI_PEER) { + smr_domain->srx = ((struct fi_peer_srx_context *) + (context))->srx; + smr_domain->srx->peer_ops = &smr_srx_peer_ops; + smr_domain->rx_ep.msg = &smr_srx_msg_ops; + smr_domain->rx_ep.tagged = &smr_srx_tagged_ops; + smr_domain->rx_ep.fid.ops = &smr_srx_fi_ops; + smr_domain->rx_ep.fid.fclass = FI_CLASS_SRX_CTX; + *rx_ep = &smr_domain->rx_ep; + ofi_atomic_inc32(&smr_domain->util_domain.ref); + return FI_SUCCESS; + } + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "shared srx only supported with FI_PEER flag\n"); + return -FI_EINVAL; +} + + static struct fi_ops_domain smr_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = smr_av_open, diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index 8803495e382..8ad190711fb 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -119,8 +119,8 @@ int smr_ep_getopt(fid_t fid, int level, int optname, void *optval, struct smr_ep *smr_ep = container_of(fid, struct smr_ep, util_ep.ep_fid); - return smr_ep->srx->ops->getopt(&smr_ep->srx->fid, level, optname, - optval, optlen); + return smr_ep->srx->ep_fid.ops->getopt(&smr_ep->srx->ep_fid.fid, level, + optname, 
optval, optlen); } int smr_ep_setopt(fid_t fid, int level, int optname, const void *optval, @@ -134,7 +134,7 @@ int smr_ep_setopt(fid_t fid, int level, int optname, const void *optval, return -FI_ENOPROTOOPT; if (optname == FI_OPT_MIN_MULTI_RECV) { - srx = util_get_peer_srx(smr_ep->srx)->ep_fid.fid.context; + srx = smr_ep->srx->ep_fid.fid.context; srx->min_multi_recv_size = *(size_t *)optval; return FI_SUCCESS; } @@ -159,7 +159,7 @@ static ssize_t smr_ep_cancel(fid_t ep_fid, void *context) struct smr_ep *ep; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid); - return ep->srx->ops->cancel(&ep->srx->fid, context); + return ep->srx->ep_fid.ops->cancel(&ep->srx->ep_fid.fid, context); } static struct fi_ops_ep smr_ep_ops = { @@ -808,9 +808,7 @@ static int smr_ep_close(struct fid *fid) if (ep->srx) { /* shm is an owner provider */ if (ep->util_ep.ep_fid.msg != &smr_no_recv_msg_ops) - (void) util_srx_close(&ep->srx->fid); - else /* shm is a peer provider */ - free(ep->srx); + (void) util_srx_close(&ep->srx->ep_fid.fid); } ofi_endpoint_close(&ep->util_ep); @@ -1062,30 +1060,11 @@ static void smr_update(struct util_srx_ctx *srx, struct util_rx_entry *rx_entry) //by another provider } -int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, - struct fid_ep **rx_ep, void *context) -{ - struct smr_domain *smr_domain; - - smr_domain = container_of(domain, struct smr_domain, - util_domain.domain_fid); - - if (attr->op_flags & FI_PEER) { - smr_domain->srx = ((struct fi_peer_srx_context *) - (context))->srx; - return FI_SUCCESS; - } - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "shared srx only supported with FI_PEER flag\n"); - return -FI_EINVAL; -} - static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) { struct smr_ep *ep; struct util_av *av; int ret = 0; - struct fid_peer_srx *srx, *srx_b; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); switch (bfid->fclass) { @@ -1109,16 +1088,10 @@ static int smr_ep_bind(struct fid 
*ep_fid, struct fid *bfid, uint64_t flags) struct util_cntr, cntr_fid.fid), flags); break; case FI_CLASS_SRX_CTX: - srx = calloc(1, sizeof(*srx)); - srx_b = container_of(bfid, struct fid_peer_srx, ep_fid.fid); - srx->peer_ops = &smr_srx_peer_ops; - srx->owner_ops = srx_b->owner_ops; - srx->ep_fid.fid.context = srx_b->ep_fid.fid.context; - ep->srx = &srx->ep_fid; + ep->srx = (container_of(bfid, struct smr_domain, rx_ep.fid))->srx; break; default: - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "invalid fid class\n"); + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "invalid fid class\n"); ret = -FI_EINVAL; break; } @@ -1131,6 +1104,7 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) struct smr_domain *domain; struct smr_ep *ep; struct smr_av *av; + struct fid_ep *srx; int ret; ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid); @@ -1171,15 +1145,17 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) ret = util_ep_srx_context(&domain->util_domain, ep->rx_size, SMR_IOV_LIMIT, SMR_INJECT_SIZE, &smr_update, - &ep->util_ep.lock, &ep->srx); + &ep->util_ep.lock, &srx); if (ret) return ret; - util_get_peer_srx(ep->srx)->peer_ops = - &smr_srx_peer_ops; - ret = util_srx_bind(&ep->srx->fid, - &ep->util_ep.rx_cq->cq_fid.fid, - FI_RECV); + ep->srx = container_of(srx, struct fid_peer_srx, + ep_fid.fid); + ep->srx->peer_ops = &smr_srx_peer_ops; + + ret = util_srx_bind(&ep->srx->ep_fid.fid, + &ep->util_ep.rx_cq->cq_fid.fid, + FI_RECV); if (ret) return ret; } else { diff --git a/prov/shm/src/smr_msg.c b/prov/shm/src/smr_msg.c index 641645ea5d3..0a34ae637ff 100644 --- a/prov/shm/src/smr_msg.c +++ b/prov/shm/src/smr_msg.c @@ -45,7 +45,7 @@ static ssize_t smr_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_recv(ep->srx, msg->msg_iov, msg->desc, + return util_srx_generic_recv(&ep->srx->ep_fid, msg->msg_iov, msg->desc, msg->iov_count, msg->addr, msg->context, flags | 
ep->util_ep.rx_msg_flags); } @@ -58,8 +58,8 @@ static ssize_t smr_recvv(struct fid_ep *ep_fid, const struct iovec *iov, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_recv(ep->srx, iov, desc, count, src_addr, - context, smr_ep_rx_flags(ep)); + return util_srx_generic_recv(&ep->srx->ep_fid, iov, desc, count, + src_addr, context, smr_ep_rx_flags(ep)); } static ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, @@ -73,8 +73,8 @@ static ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, iov.iov_base = buf; iov.iov_len = len; - return util_srx_generic_recv(ep->srx, &iov, &desc, 1, src_addr, context, - smr_ep_rx_flags(ep)); + return util_srx_generic_recv(&ep->srx->ep_fid, &iov, &desc, 1, src_addr, + context, smr_ep_rx_flags(ep)); } static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, @@ -293,8 +293,9 @@ static ssize_t smr_trecv(struct fid_ep *ep_fid, void *buf, size_t len, iov.iov_base = buf; iov.iov_len = len; - return util_srx_generic_trecv(ep->srx, &iov, &desc, 1, src_addr, context, - tag, ignore, smr_ep_rx_flags(ep)); + return util_srx_generic_trecv(&ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, tag, ignore, + smr_ep_rx_flags(ep)); } static ssize_t smr_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, @@ -305,8 +306,9 @@ static ssize_t smr_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_trecv(ep->srx, iov, desc, count, src_addr, - context, tag, ignore, smr_ep_rx_flags(ep)); + return util_srx_generic_trecv(&ep->srx->ep_fid, iov, desc, count, + src_addr, context, tag, ignore, + smr_ep_rx_flags(ep)); } static ssize_t smr_trecvmsg(struct fid_ep *ep_fid, @@ -316,10 +318,10 @@ static ssize_t smr_trecvmsg(struct fid_ep *ep_fid, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return util_srx_generic_trecv(ep->srx, msg->msg_iov, msg->desc, - msg->iov_count, 
msg->addr, msg->context, - msg->tag, msg->ignore, - flags | ep->util_ep.rx_msg_flags); + return util_srx_generic_trecv(&ep->srx->ep_fid, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, msg->context, + msg->tag, msg->ignore, + flags | ep->util_ep.rx_msg_flags); } static ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index 141826b9bba..c5315aa4b1f 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -781,7 +781,7 @@ static int smr_start_common(struct smr_ep *ep, struct smr_cmd *cmd, FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "unable to process rx completion\n"); } - smr_get_peer_srx(ep)->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); } return 0; @@ -836,7 +836,7 @@ static int smr_copy_saved(struct smr_cmd_ctx *cmd_ctx, "unable to process rx completion\n"); return ret; } - smr_get_peer_srx(cmd_ctx->ep)->owner_ops->free_entry(rx_entry); + cmd_ctx->ep->srx->owner_ops->free_entry(rx_entry); return FI_SUCCESS; } @@ -983,7 +983,6 @@ static int smr_alloc_cmd_ctx(struct smr_ep *ep, static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) { - struct fid_peer_srx *peer_srx = smr_get_peer_srx(ep); struct fi_peer_match_attr attr; struct fi_peer_rx_entry *rx_entry; int ret; @@ -992,33 +991,33 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) attr.msg_size = cmd->msg.hdr.size; attr.tag = cmd->msg.hdr.tag; if (cmd->msg.hdr.op == ofi_op_tagged) { - ret = peer_srx->owner_ops->get_tag(peer_srx, &attr, &rx_entry); + ret = ep->srx->owner_ops->get_tag(ep->srx, &attr, &rx_entry); if (ret == -FI_ENOENT) { ret = smr_alloc_cmd_ctx(ep, rx_entry, cmd); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } - ret = peer_srx->owner_ops->queue_tag(rx_entry); + ret = ep->srx->owner_ops->queue_tag(rx_entry); if (ret) { - 
peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } goto out; } } else { - ret = peer_srx->owner_ops->get_msg(peer_srx, &attr, &rx_entry); + ret = ep->srx->owner_ops->get_msg(ep->srx, &attr, &rx_entry); if (ret == -FI_ENOENT) { ret = smr_alloc_cmd_ctx(ep, rx_entry, cmd); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } - ret = peer_srx->owner_ops->queue_msg(rx_entry); + ret = ep->srx->owner_ops->queue_msg(rx_entry); if (ret) { - peer_srx->owner_ops->free_entry(rx_entry); + ep->srx->owner_ops->free_entry(rx_entry); return ret; } goto out; @@ -1338,7 +1337,7 @@ void smr_progress_ipc_list(struct smr_ep *ep) ipc_entry->async_event); dlist_remove(&ipc_entry->entry); if (ipc_entry->rx_entry) - smr_get_peer_srx(ep)->owner_ops->free_entry(ipc_entry->rx_entry); + ep->srx->owner_ops->free_entry(ipc_entry->rx_entry); ofi_buf_free(ipc_entry); } } @@ -1444,7 +1443,7 @@ static void smr_progress_sar_list(struct smr_ep *ep) "unable to process rx completion\n"); } if (sar_entry->rx_entry) - smr_get_peer_srx(ep)->owner_ops->free_entry(sar_entry->rx_entry); + ep->srx->owner_ops->free_entry(sar_entry->rx_entry); dlist_remove(&sar_entry->entry); ofi_buf_free(sar_entry); From 527d3207aadf38e7afb03e7ee48cc13d76427370 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 5 Sep 2024 08:10:23 -0700 Subject: [PATCH 102/393] prov/efa: update efa shm implementation to allocate fi_peer_srx_context The previous definition of the peer API didn't specify who allocated the second peer structure (the one referenced by the peer). The shm implementation was choosing to duplicate the imported srx and set it internally. The new definition specifies that the owner handle the duplication of the peer resource which is then imported into the peer to just set. 
Shm has been updated accordingly but efa needs to be updated to create a second peer_srx and set the fields to the original one for the peer to reference the owner_ops correctly. This also adds a missing fi_close for the shm srx resource Signed-off-by: Alexia Ingerson Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep.h | 4 ++ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 82 +++++++++++++++++++++-------- 2 files changed, 64 insertions(+), 22 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 0a67d23d49d..316bab93d98 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -54,6 +54,10 @@ struct efa_rdm_ep { /* shm provider fid */ struct fid_ep *shm_ep; + /* shm srx fid (shm-owned) */ + struct fid_ep *shm_srx; + /* shm peer_srx (efa-owned) */ + struct fid_peer_srx *shm_peer_srx; size_t mtu_size; size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 282a384e325..01a4b3fd909 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -941,6 +941,42 @@ void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) } } +/** + * @brief Clean efa_rdm_ep's shm ep level resources as the best effort + * + * @param efa_rdm_ep pointer to efa rdm ep + * @return int FI_SUCCESS on success, negative integer on failure + */ +static int efa_rdm_ep_close_shm_ep_resources(struct efa_rdm_ep *efa_rdm_ep) +{ + int ret, retv = 0; + + if (efa_rdm_ep->shm_srx) { + ret = fi_close(&efa_rdm_ep->shm_srx->fid); + if (ret) { + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm srx\n"); + retv = ret; + } + efa_rdm_ep->shm_srx = NULL; + } + + if (efa_rdm_ep->shm_peer_srx) { + free(efa_rdm_ep->shm_peer_srx); + efa_rdm_ep->shm_peer_srx = NULL; + } + + if (efa_rdm_ep->shm_ep) { + ret = fi_close(&efa_rdm_ep->shm_ep->fid); + if (ret) { + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n"); + retv = ret; + } + 
efa_rdm_ep->shm_ep = NULL; + } + + return retv; +} + /** * @brief implement the fi_close() API for the EFA RDM endpoint * @param[in,out] fid Endpoint to close @@ -981,13 +1017,9 @@ static int efa_rdm_ep_close(struct fid *fid) retv = ret; } - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - if (ret) { - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm EP\n"); - retv = ret; - } - } + ret = efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep); + if (ret) + retv = ret; efa_rdm_ep_destroy_buffer_pools(efa_rdm_ep); @@ -1053,12 +1085,8 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) struct efa_av *efa_av; struct efa_rdm_cq *efa_rdm_cq; - if (efa_rdm_ep->shm_ep) { - ret = fi_close(&efa_rdm_ep->shm_ep->fid); - if (ret) - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n"); - efa_rdm_ep->shm_ep = NULL; - } + + (void) efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep); efa_av = efa_rdm_ep->base_ep.av; if (efa_av->shm_rdm_av) { @@ -1230,7 +1258,6 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) int ret = 0; struct fi_peer_srx_context peer_srx_context = {0}; struct fi_rx_attr peer_srx_attr = {0}; - struct fid_ep *peer_srx_ep = NULL; struct util_srx_ctx *srx_ctx; switch (command) { @@ -1296,26 +1323,36 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) * shared memory region. 
*/ if (ep->shm_ep) { - peer_srx_context.srx = util_get_peer_srx(ep->peer_srx_ep); + ep->shm_peer_srx = calloc(1, sizeof(*ep->shm_peer_srx)); + if (!ep->shm_peer_srx) { + ret = -FI_ENOMEM; + goto err_close_shm; + } + memcpy(ep->shm_peer_srx, util_get_peer_srx(ep->peer_srx_ep), + sizeof(*ep->shm_peer_srx)); + + peer_srx_context.size = sizeof(peer_srx_context); + peer_srx_context.srx = ep->shm_peer_srx; + peer_srx_attr.op_flags |= FI_PEER; ret = fi_srx_context(efa_rdm_ep_domain(ep)->shm_domain, - &peer_srx_attr, &peer_srx_ep, &peer_srx_context); + &peer_srx_attr, &ep->shm_srx, &peer_srx_context); if (ret) - goto err_unlock; + goto err_close_shm; shm_ep_name_len = EFA_SHM_NAME_MAX; ret = efa_shm_ep_name_construct(shm_ep_name, &shm_ep_name_len, &ep->base_ep.src_addr); if (ret < 0) - goto err_unlock; + goto err_close_shm; fi_setname(&ep->shm_ep->fid, shm_ep_name, shm_ep_name_len); /* Bind srx to shm ep */ - ret = fi_ep_bind(ep->shm_ep, &ep->peer_srx_ep->fid, 0); + ret = fi_ep_bind(ep->shm_ep, &ep->shm_srx->fid, 0); if (ret) - goto err_unlock; + goto err_close_shm; ret = fi_enable(ep->shm_ep); if (ret) - goto err_unlock; + goto err_close_shm; } ofi_genlock_unlock(srx_ctx->lock); break; @@ -1326,7 +1363,8 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) return ret; -err_unlock: +err_close_shm: + efa_rdm_ep_close_shm_ep_resources(ep); ofi_genlock_unlock(srx_ctx->lock); err_destroy_qp: efa_base_ep_destruct_qp(&ep->base_ep); From 64426b9499810a2183f7a3fd77a0ae2527185b70 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Fri, 11 Oct 2024 17:09:50 +0000 Subject: [PATCH 103/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- fabtests/man/man7/fabtests.7 | 28 +++++++++---- man/man1/fi_info.1 | 28 +++++++++---- man/man1/fi_pingpong.1 | 48 ++++++++++++++-------- man/man1/fi_strerror.1 | 26 +++++++++--- man/man3/fi_atomic.3 | 30 ++++++++++---- man/man3/fi_av.3 | 72 +++++++++++++++++++-------------- man/man3/fi_av_set.3 | 24 ++++++++--- 
man/man3/fi_cm.3 | 24 ++++++++--- man/man3/fi_cntr.3 | 28 +++++++++---- man/man3/fi_collective.3 | 38 ++++++++++++------ man/man3/fi_control.3 | 22 ++++++++-- man/man3/fi_cq.3 | 44 +++++++++++++------- man/man3/fi_domain.3 | 48 ++++++++++++++-------- man/man3/fi_endpoint.3 | 78 +++++++++++++++++++++--------------- man/man3/fi_eq.3 | 34 +++++++++++----- man/man3/fi_errno.3 | 20 +++++++-- man/man3/fi_fabric.3 | 32 ++++++++++----- man/man3/fi_getinfo.3 | 52 +++++++++++++++--------- man/man3/fi_mr.3 | 36 ++++++++++++----- man/man3/fi_msg.3 | 32 ++++++++++----- man/man3/fi_nic.3 | 28 +++++++++---- man/man3/fi_peer.3 | 48 ++++++++++++++++++---- man/man3/fi_poll.3 | 24 ++++++++--- man/man3/fi_profile.3 | 20 +++++++-- man/man3/fi_provider.3 | 28 +++++++++---- man/man3/fi_rma.3 | 30 ++++++++++---- man/man3/fi_tagged.3 | 30 ++++++++++---- man/man3/fi_trigger.3 | 22 ++++++++-- man/man3/fi_version.3 | 20 +++++++-- man/man7/fabric.7 | 52 +++++++++++++++--------- man/man7/fi_arch.7 | 18 ++++++++- man/man7/fi_cxi.7 | 78 +++++++++++++++++++++--------------- man/man7/fi_direct.7 | 22 ++++++++-- man/man7/fi_efa.7 | 50 ++++++++++++++--------- man/man7/fi_guide.7 | 24 ++++++++--- man/man7/fi_hook.7 | 20 +++++++-- man/man7/fi_intro.7 | 28 +++++++++---- man/man7/fi_lpp.7 | 20 +++++++-- man/man7/fi_mrail.7 | 30 ++++++++++---- man/man7/fi_opx.7 | 78 +++++++++++++++++++++--------------- man/man7/fi_provider.7 | 50 ++++++++++++++--------- man/man7/fi_psm2.7 | 28 +++++++++---- man/man7/fi_psm3.7 | 30 ++++++++++---- man/man7/fi_rxd.7 | 20 +++++++-- man/man7/fi_rxm.7 | 22 ++++++++-- man/man7/fi_setup.7 | 23 +++++++++-- man/man7/fi_shm.7 | 22 ++++++++-- man/man7/fi_sockets.7 | 20 +++++++-- man/man7/fi_tcp.7 | 20 +++++++-- man/man7/fi_ucx.7 | 20 +++++++-- man/man7/fi_udp.7 | 20 +++++++-- man/man7/fi_usnic.7 | 72 +++++++++++++++++++-------------- man/man7/fi_verbs.7 | 22 ++++++++-- 53 files changed, 1273 insertions(+), 510 deletions(-) diff --git a/fabtests/man/man7/fabtests.7 
b/fabtests/man/man7/fabtests.7 index 98f90ca9099..ec2d2c07aae 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fabtests" "7" "2024\-09\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fabtests" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -268,7 +282,7 @@ FI_ENORX) can be read by the application, if RNR happens. \f[I]fi_efa_rnr_queue_resend\f[R] This test modifies the RNR retry count (rnr_retry) to 0 via fi_setopt, and then tests RNR queue/re-send logic for different packet types. -To run the test, one needs to use \f[C]-c\f[R] option to specify the +To run the test, one needs to use \f[V]-c\f[R] option to specify the category of packet types. .SS Component tests .PP @@ -448,9 +462,9 @@ The default endpoint type is rdm. Allocate data buffers on the specified device, rather than in host memory. Valid options are ze, cuda and synapseai. +.TP *-a -.IP \[bu] 2 -: The name of a shared address vector. +The name of a shared address vector. This option only applies to tests that support shared address vectors. .TP \f[I]-B \f[R] @@ -462,9 +476,9 @@ endpoints to the server. .TP \f[I]-P \f[R] Specifies the port number of the peer endpoint, overriding the default. +.TP *-s -.IP \[bu] 2 -: Specifies the address of the local endpoint. +Specifies the address of the local endpoint. .TP *-F Specifies the address format. 
diff --git a/man/man1/fi_info.1 b/man/man1/fi_info.1 index 6f1dbc213e8..ac741a93b27 100644 --- a/man/man1/fi_info.1 +++ b/man/man1/fi_info.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_info" "1" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_info" "1" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -56,7 +70,7 @@ For more information on address formats, see fi_getinfo(3). .TP \f[I]-p, \[en]provider=\f[R] Filter fabric interfaces by the provider implementation. -For a list of providers, see the \f[C]--list\f[R] option. +For a list of providers, see the \f[V]--list\f[R] option. .TP \f[I]-d, \[en]domain=\f[R] Filter interfaces to only those with the given domain name. @@ -130,7 +144,7 @@ provider: tcp \f[R] .fi .PP -To see the full fi_info structure, specify the \f[C]-v\f[R] option. +To see the full fi_info structure, specify the \f[V]-v\f[R] option. .IP .nf \f[C] @@ -223,7 +237,7 @@ fi_info: \f[R] .fi .PP -To see libfabric related environment variables \f[C]-e\f[R] option. +To see libfabric related environment variables \f[V]-e\f[R] option. .IP .nf \f[C] @@ -243,7 +257,7 @@ $ ./fi_info -e .fi .PP To see libfabric related environment variables with substring use -\f[C]-g\f[R] option. +\f[V]-g\f[R] option. .IP .nf \f[C] @@ -281,6 +295,6 @@ $ ./fi_info -g tcp .fi .SH SEE ALSO .PP -\f[C]fi_getinfo(3)\f[R], \f[C]fi_endpoint(3)\f[R] +\f[V]fi_getinfo(3)\f[R], \f[V]fi_endpoint(3)\f[R] .SH AUTHORS OpenFabrics. 
diff --git a/man/man1/fi_pingpong.1 b/man/man1/fi_pingpong.1 index eced1397e6c..419269ebf71 100644 --- a/man/man1/fi_pingpong.1 +++ b/man/man1/fi_pingpong.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_pingpong" "1" "2024\-04\-04" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_pingpong" "1" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -21,7 +35,7 @@ fi_pingpong also displays aggregated statistics after each test run, and can additionally verify data integrity upon receipt. .PP By default, the datagram (FI_EP_DGRAM) endpoint is used for the test, -unless otherwise specified via \f[C]-e\f[R]. +unless otherwise specified via \f[V]-e\f[R]. .SH HOW TO RUN TESTS .PP Two copies of the program must be launched: first, one copy must be @@ -47,15 +61,15 @@ client$ fi_pingpong .PP The server and client must be able to communicate properly for the fi_pingpong utility to function. -If any of the \f[C]-e\f[R], \f[C]-I\f[R], \f[C]-S\f[R], or \f[C]-p\f[R] +If any of the \f[V]-e\f[R], \f[V]-I\f[R], \f[V]-S\f[R], or \f[V]-p\f[R] options are used, then they must be specified on the invocation for both the server and the client process. -If the \f[C]-d\f[R] option is specified on the server, then the client +If the \f[V]-d\f[R] option is specified on the server, then the client will select the appropriate domain if no hint is provided on the client side. -If the \f[C]-d\f[R] option is specified on the client, then it must also +If the \f[V]-d\f[R] option is specified on the client, then it must also be specified on the server. 
-If both the server and client specify the \f[C]-d\f[R] option and the +If both the server and client specify the \f[V]-d\f[R] option and the given domains cannot communicate, then the application will fail. .SS Control Messaging .TP @@ -110,19 +124,19 @@ Activate output debugging (warning: highly verbose) Displays help output for the pingpong test. .SH USAGE EXAMPLES .SS A simple example -.SS Server: \f[C]fi_pingpong -p \f[R] +.SS Server: \f[V]fi_pingpong -p \f[R] .PP -\f[C]server$ fi_pingpong -p sockets\f[R] -.SS Client: \f[C]fi_pingpong -p \f[R] +\f[V]server$ fi_pingpong -p sockets\f[R] +.SS Client: \f[V]fi_pingpong -p \f[R] .PP -\f[C]client$ fi_pingpong -p sockets 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p sockets 192.168.0.123\f[R] .SS An example with various options .SS Server: .PP -\f[C]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] +\f[V]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] .SS Client: .PP -\f[C]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] .PP Specifically, this will run a pingpong test with: .IP \[bu] 2 @@ -136,14 +150,14 @@ server node as 192.168.0.123 .SS A longer test .SS Server: .PP -\f[C]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] +\f[V]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] .SS Client: .PP -\f[C]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] .SH DEFAULTS .PP There is no default provider; if a provider is not specified via the -\f[C]-p\f[R] switch, the test will pick one from the list of available +\f[V]-p\f[R] switch, the test will pick one from the list of available providers (as returned by fi_getinfo(3)). .PP If no endpoint type is specified, `dgram' is used. 
@@ -178,6 +192,6 @@ client per second .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3) \f[C]fabric\f[R](7), +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3) \f[V]fabric\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man1/fi_strerror.1 b/man/man1/fi_strerror.1 index d652db9229f..70605dabbd6 100644 --- a/man/man1/fi_strerror.1 +++ b/man/man1/fi_strerror.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_strerror" "1" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_strerror" "1" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -14,16 +28,16 @@ fi_strerror FI_ERROR_CODE .fi .SH DESCRIPTION .PP -Display the error string for the given numeric \f[C]FI_ERROR_CODE\f[R]. -\f[C]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal +Display the error string for the given numeric \f[V]FI_ERROR_CODE\f[R]. +\f[V]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal constant. -Although the \f[C]fi_strerror\f[R](3) library function only accepts +Although the \f[V]fi_strerror\f[R](3) library function only accepts positive error values, for convenience this utility accepts both positive and negative error values. .PP This is primarily a convenience tool for developers. .SH SEE ALSO .PP -\f[C]fabric\f[R](7) \f[C]fi_errno\f[R](3) +\f[V]fabric\f[R](7) \f[V]fi_errno\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_atomic.3 b/man/man3/fi_atomic.3 index 9c629486cdf..a1b0bd7c716 100644 --- a/man/man3/fi_atomic.3 +++ b/man/man3/fi_atomic.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_atomic" "3" "2024\-08\-06" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_atomic" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -129,7 +143,7 @@ Local data buffer to store initial value of remote buffer \f[I]desc / compare_desc / result_desc\f[R] Data descriptor associated with the local data buffer, local compare buffer, and local result buffer, respectively. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]dest_addr\f[R] Destination address for connectionless atomic operations. @@ -693,11 +707,11 @@ parameter specifying the tag. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -741,11 +755,11 @@ assigned to the transmitting and receiving endpoints. Both message and data ordering are required if the results of two atomic operations to the same memory buffers are to reflect the second operation acting on the results of the first. -See \f[C]fi_endpoint\f[R](3) for further details and message size +See \f[V]fi_endpoint\f[R](3) for further details and message size restrictions. 
.SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_rma\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_rma\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_av.3 b/man/man3/fi_av.3 index 49ebc052318..45d58d2dbf0 100644 --- a/man/man3/fi_av.3 +++ b/man/man3/fi_av.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_av" "3" "2024\-08\-06" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_av" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -149,14 +163,14 @@ been deprecated, see below). See the NOTES section for AV restrictions on duplicate addresses. .PP \f[B]Deprecated\f[R]: AV operations may be set to operate asynchronously -by specifying the FI_EVENT flag to \f[C]fi_av_open\f[R]. +by specifying the FI_EVENT flag to \f[V]fi_av_open\f[R]. When requesting asynchronous operation, the application must first bind an event queue to the AV before inserting addresses. .SS fi_av_open .PP fi_av_open allocates or opens an address vector. The properties and behavior of the address vector are defined by -\f[C]struct fi_av_attr\f[R]. +\f[V]struct fi_av_attr\f[R]. .IP .nf \f[C] @@ -287,7 +301,7 @@ The context field in all completions will be the context specified to the insert call, and the data field in the final completion entry will report the number of addresses successfully inserted. 
If an error occurs during the asynchronous insertion, an error -completion entry is returned (see \f[C]fi_eq\f[R](3) for a discussion of +completion entry is returned (see \f[V]fi_eq\f[R](3) for a discussion of the fi_eq_err_entry error completion struct). The context field of the error completion will be the context that was specified in the insert call; the data field will contain the index of @@ -347,10 +361,10 @@ the call will return -FI_EBUSY. .SS fi_av_bind (deprecated) .PP Associates an event queue with the AV. -If an AV has been opened with \f[C]FI_EVENT\f[R], then an event queue +If an AV has been opened with \f[V]FI_EVENT\f[R], then an event queue must be bound to the AV before any insertion calls are attempted. Any calls to insert addresses before an event queue has been bound will -fail with \f[C]-FI_ENOEQ\f[R]. +fail with \f[V]-FI_ENOEQ\f[R]. Flags are reserved for future use and must be 0. .SS fi_av_insert .PP @@ -361,7 +375,7 @@ AV. Addresses inserted into an address vector must be in the same format as specified in the addr_format field of the fi_info struct provided when opening the corresponding domain. -When using the \f[C]FI_ADDR_STR\f[R] format, the \f[C]addr\f[R] +When using the \f[V]FI_ADDR_STR\f[R] format, the \f[V]addr\f[R] parameter should reference an array of strings (char **). .PP \f[B]Deprecated\f[R]: For AV\[cq]s of type FI_AV_MAP, once inserted @@ -395,14 +409,14 @@ buffer must remain valid until the insertion operation completes. Note that if fi_addr is NULL and synchronous operation is requested without using FI_SYNC_ERR flag, individual insertion failures cannot be reported and the application must use other calls, such as -\f[C]fi_av_lookup\f[R] to learn which specific addresses failed to +\f[V]fi_av_lookup\f[R] to learn which specific addresses failed to insert. 
.PP If the address vector is configured with authorization keys, the fi_addr parameter may be used as input to define the authorization keys associated with the endpoint addresses being inserted. This is done by setting the fi_addr to an authorization key fi_addr_t -generated from \f[C]fi_av_insert_auth_key\f[R] and setting the +generated from \f[V]fi_av_insert_auth_key\f[R] and setting the FI_AUTH_KEY flag. If the FI_AUTH_KEY flag is not set, addresses being inserted will not be associated with any authorization keys. @@ -416,7 +430,7 @@ authorization keys. These fi_addr_t\[cq]s can be used as the target for local data transfer operations. .PP -If the endpoint supports \f[C]FI_DIRECTED_RECV\f[R], these +If the endpoint supports \f[V]FI_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to restrict receive buffers to a specific endpoint address and authorization key. .PP @@ -479,10 +493,10 @@ Node should be a string that corresponds to a hostname or network address. The service string corresponds to a textual representation of a transport address. -Applications may also pass in an \f[C]FI_ADDR_STR\f[R] formatted address +Applications may also pass in an \f[V]FI_ADDR_STR\f[R] formatted address as the node parameter. In such cases, the service parameter must be NULL. -See fi_getinfo.3 for details on using \f[C]FI_ADDR_STR\f[R]. +See fi_getinfo.3 for details on using \f[V]FI_ADDR_STR\f[R]. Supported flags are the same as for fi_av_insert. .SS fi_av_insertsym .PP @@ -526,7 +540,7 @@ Note that removing an address may not disable receiving data from the peer endpoint. fi_av_close will automatically cleanup any associated resource. .PP -If the address being removed came from \f[C]fi_av_insert_auth_key\f[R], +If the address being removed came from \f[V]fi_av_insert_auth_key\f[R], the address will only be removed if all endpoints, which have been enabled against the corresponding authorization key, have been closed. 
If all endpoints are not closed, -FI_EBUSY will be returned. @@ -576,8 +590,8 @@ fi_av_straddr returns a pointer to buf. .SS fi_av_insert_auth_key .PP This function associates authorization keys with an address vector. -This requires the domain to be opened with \f[C]FI_AV_AUTH_KEY\f[R]. -\f[C]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be +This requires the domain to be opened with \f[V]FI_AV_AUTH_KEY\f[R]. +\f[V]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be associated with authorization keys from the address vector. This behavior enables a single endpoint or memory region to be associated with multiple authorization keys. @@ -587,38 +601,38 @@ address vector authorization keys at that point in time. Later authorization key insertions will not propagate to already enabled endpoints and memory regions. .PP -The \f[C]auth_key\f[R] and \f[C]auth_key_size\f[R] parameters are used +The \f[V]auth_key\f[R] and \f[V]auth_key_size\f[R] parameters are used to input the authorization key into the address vector. The structure of the authorization key is provider specific. -If the \f[C]auth_key_size\f[R] does not align with provider specific +If the \f[V]auth_key_size\f[R] does not align with provider specific structure, -FI_EINVAL will be returned. .PP -The output of \f[C]fi_av_insert_auth_key\f[R] is an authorization key +The output of \f[V]fi_av_insert_auth_key\f[R] is an authorization key fi_addr_t handle representing all endpoint addresses against this specific authorization key. For all operations, including address vector, memory registration, and data transfers, which may accept an authorization key fi_addr_t as input, the FI_AUTH_KEY flag must be specified. Otherwise, the fi_addr_t will be treated as an fi_addr_t returned from -the \f[C]fi_av_insert\f[R] and related functions. +the \f[V]fi_av_insert\f[R] and related functions. 
.PP For endpoints enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict incoming messages to only endpoint addresses within the authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_recvmsg\f[R] and \f[C]fi_trecvmsg\f[R]. +\f[V]fi_recvmsg\f[R] and \f[V]fi_trecvmsg\f[R]. .PP For domains enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict memory region access to only endpoint addresses within the authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_mr_regattr\f[R]. +\f[V]fi_mr_regattr\f[R]. .PP These authorization key fi_addr_t\[cq]s can later be used an input for endpoint address insertion functions to generate an fi_addr_t for a specific endpoint address and authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_av_insert\f[R] and related functions. +\f[V]fi_av_insert\f[R] and related functions. .PP For address vectors configured with FI_AV_USER_ID and endpoints with FI_SOURCE_ERR, all subsequent FI_EADDRNOTAVAIL error events will return @@ -636,7 +650,7 @@ Flags are reserved for future use and must be 0. This functions returns the authorization key associated with a fi_addr_t. Acceptable fi_addr_t\[cq]s input are the output of -\f[C]fi_av_insert_auth_key\f[R] and AV address insertion functions. +\f[V]fi_av_insert_auth_key\f[R] and AV address insertion functions. The returned authorization key is in a provider specific format. On input, the auth_key_size parameter should indicate the size of the auth_key buffer. @@ -745,14 +759,14 @@ function. This function is used to set the group ID portion of an fi_addr_t. .SH RETURN VALUES .PP -Insertion calls, excluding \f[C]fi_av_insert_auth_key\f[R], for an AV +Insertion calls, excluding \f[V]fi_av_insert_auth_key\f[R], for an AV opened for synchronous operation will return the number of addresses that were successfully inserted. 
In the case of failure, the return value will be less than the number of addresses that was specified. .PP \f[B]Deprecated\f[R]: Insertion calls, excluding -\f[C]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous +\f[V]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous operation (with FI_EVENT flag specified) will return FI_SUCCESS if the operation was successfully initiated. In the case of failure, a negative fabric errno will be returned. @@ -767,10 +781,10 @@ FI_ADDR_NOTAVAIL. .PP All other calls return FI_SUCCESS on success, or a negative value corresponding to fabric errno on error. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_av_set.3 b/man/man3/fi_av_set.3 index f64f51c7d9c..71cff544452 100644 --- a/man/man3/fi_av_set.3 +++ b/man/man3/fi_av_set.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_av_set" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_av_set" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -92,7 +106,7 @@ The creation and manipulation of an AV set is a local operation. No fabric traffic is exchanged between peers. As a result, each peer is responsible for creating matching AV sets as part of their collective membership definition. -See \f[C]fi_collective\f[R](3) for a discussion of membership models. 
+See \f[V]fi_collective\f[R](3) for a discussion of membership models. .SS fi_av_set .PP The fi_av_set call creates a new AV set. @@ -263,9 +277,9 @@ It is an error for a user to request an unsupported collective. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_av\f[R](3), \f[C]fi_collective\f[R](3) +\f[V]fi_av\f[R](3), \f[V]fi_collective\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cm.3 b/man/man3/fi_cm.3 index 85c8d2b5ea9..36fff256d99 100644 --- a/man/man3/fi_cm.3 +++ b/man/man3/fi_cm.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cm" "3" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cm" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -261,7 +275,7 @@ or an error will occur. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .SH NOTES .PP @@ -279,7 +293,7 @@ events, or as additional err_data to fi_eq_err_entry, in the case of a rejected connection. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_cntr.3 b/man/man3/fi_cntr.3 index b01c900efea..53c35f5b5fe 100644 --- a/man/man3/fi_cntr.3 +++ b/man/man3/fi_cntr.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cntr" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cntr" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -91,14 +105,14 @@ That is, a counter actually stores two distinct values, with error completions updating an error specific value. .PP Counters are updated following the completion event semantics defined in -\f[C]fi_cq\f[R](3). +\f[V]fi_cq\f[R](3). The timing of the update is based on the type of transfer and any specified operation flags. .SS fi_cntr_open .PP fi_cntr_open allocates a new fabric counter. The properties and behavior of the counter are defined by -\f[C]struct fi_cntr_attr\f[R]. +\f[V]struct fi_cntr_attr\f[R]. .IP .nf \f[C] @@ -278,7 +292,7 @@ On error, a negative value corresponding to fabric errno is returned. fi_cntr_read / fi_cntr_readerr Returns the current value of the counter. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH NOTES .PP In order to support a variety of counter implementations, updates made @@ -300,7 +314,7 @@ fi_cntr_set / fi_cntr_seterr and results of related operations are reflected in the observed value of the counter. 
.SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_collective.3 b/man/man3/fi_collective.3 index 1343d121c74..13627bb8a9c 100644 --- a/man/man3/fi_collective.3 +++ b/man/man3/fi_collective.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_collective" "3" "2024\-01\-08" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_collective" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -152,7 +166,7 @@ be used for required input. .PP In general collective operations can be thought of as coordinated atomic operations between a set of peer endpoints. -Readers should refer to the \f[C]fi_atomic\f[R](3) man page for details +Readers should refer to the \f[V]fi_atomic\f[R](3) man page for details on the atomic operations and datatypes defined by libfabric. .PP A collective operation is a group communication exchange. @@ -199,7 +213,7 @@ provider by creating and configuring an address vector set (AV set). An AV set represents an ordered subset of addresses in an address vector (AV). Details on creating and configuring an AV set are available in -\f[C]fi_av_set\f[R](3). +\f[V]fi_av_set\f[R](3). .PP Once an AV set has been programmed with the collective membership information, an endpoint is joined to the set. @@ -258,7 +272,7 @@ Applications must call fi_close on the collective group to disconnect the endpoint from the group. 
After a join operation has completed, the fi_mc_addr call may be used to retrieve the address associated with the multicast group. -See \f[C]fi_cm\f[R](3) for additional details on fi_mc_addr(). +See \f[V]fi_cm\f[R](3) for additional details on fi_mc_addr(). .SS Barrier (fi_barrier) .PP The fi_barrier operation provides a mechanism to synchronize peers. @@ -509,7 +523,7 @@ struct fi_collective_attr { \f[R] .fi .PP -For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[R](3). +For a description of struct fi_atomic_attr, see \f[V]fi_atomic\f[R](3). .TP \f[I]op\f[R] On input, this specifies the atomic operation involved with the @@ -552,7 +566,7 @@ collective operation through the provider. .PP Collective operations map to underlying fi_atomic operations. For a discussion of atomic completion semantics, see -\f[C]fi_atomic\f[R](3). +\f[V]fi_atomic\f[R](3). The completion, ordering, and atomicity of collective operations match those defined for point to point atomic operations. .SH FLAGS @@ -567,11 +581,11 @@ collective operation. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -587,11 +601,11 @@ As such, they follow most of the conventions and restrictions as peer to peer atomic operations. This includes data atomicity, data alignment, and message ordering semantics. -See \f[C]fi_atomic\f[R](3) for additional information on the datatypes +See \f[V]fi_atomic\f[R](3) for additional information on the datatypes and operations defined for atomic and collective operations. 
.SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_av\f[R](3), \f[C]fi_atomic\f[R](3), -\f[C]fi_cm\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_av\f[R](3), \f[V]fi_atomic\f[R](3), +\f[V]fi_cm\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_control.3 b/man/man3/fi_control.3 index 1e853d74718..27d8a52afd9 100644 --- a/man/man3/fi_control.3 +++ b/man/man3/fi_control.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_control" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_control" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -50,7 +64,7 @@ header files (\[cq]rdma/fi_ext_*.h\[cq]). Please refer to the provider man pages for details. .SH SEE ALSO .PP -\f[C]fi_endpoint\f[R](3), \f[C]fi_cm\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_cq\f[R](3), \f[C]fi_eq\f[R](3), +\f[V]fi_endpoint\f[R](3), \f[V]fi_cm\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_cq\f[R](3), \f[V]fi_eq\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cq.3 b/man/man3/fi_cq.3 index da07f4fcb2f..d788b5af76f 100644 --- a/man/man3/fi_cq.3 +++ b/man/man3/fi_cq.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cq" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_cq" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -123,7 +137,7 @@ Unlike event queues, completion queues are associated with a resource domain and may be offloaded entirely in provider hardware. .PP The properties and behavior of a completion queue are defined by -\f[C]struct fi_cq_attr\f[R]. +\f[V]struct fi_cq_attr\f[R]. .IP .nf \f[C] @@ -354,8 +368,9 @@ Multiple completions may be retrieved from a CQ in a single call. The maximum number of entries to return is limited to the specified count parameter, with the number of entries successfully read from the CQ returned by the call. -(See return values section below.) A count value of 0 may be used to -drive progress on associated endpoints when manual progress is enabled. +(See return values section below.) +A count value of 0 may be used to drive progress on associated endpoints +when manual progress is enabled. .PP CQs are optimized to report operations which have completed successfully. @@ -429,7 +444,7 @@ fi_cq_readerr is a non-blocking call, returning immediately whether an error completion was found or not. .PP Error information is reported to the user through -\f[C]struct fi_cq_err_entry\f[R]. +\f[V]struct fi_cq_err_entry\f[R]. The format of this structure is defined below. .IP .nf @@ -522,8 +537,9 @@ Flags are set for all relevant completions. .TP \f[I]len\f[R] This len field applies to completed receive operations (e.g.\ fi_recv, -fi_trecv, etc.) and the completed write with remote cq data on the -responder side (e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). +fi_trecv, etc.) +and the completed write with remote cq data on the responder side +(e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). It indicates the size of transferred \f[I]message\f[R] data \[en] i.e.\ how many data bytes were placed into the associated receive/target buffer by a corresponding fi_send/fi_tsend/fi_write et al call. 
@@ -954,7 +970,7 @@ When heterogenous memory is involved, the concept of memory domains come into play. Memory domains identify the physical separation of memory, which may or may not be accessible through the same virtual address space. -See the \f[C]fi_mr\f[R](3) man page for further details on memory +See the \f[V]fi_mr\f[R](3) man page for further details on memory domains. .PP Completion ordering and data visibility are only well-defined for @@ -1014,7 +1030,7 @@ As a result, applications can request a lower completion semantic when posting receives. That indicates to the provider that the application will be responsible for handling any device specific flush operations that might be needed. -See \f[C]fi_msg\f[R](3) FLAGS. +See \f[V]fi_msg\f[R](3) FLAGS. .PP For data transfers that do not generate a completion at the target, such as RMA or atomics, it is the responsibility of the application to ensure @@ -1117,11 +1133,11 @@ returns -FI_EAGAIN. : Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_domain.3 b/man/man3/fi_domain.3 index 539d7e6919e..c3a1b104a28 100644 --- a/man/man3/fi_domain.3 +++ b/man/man3/fi_domain.3 @@ -1,7 +1,21 @@ -.\"t -.\" Automatically generated by Pandoc 2.9.2.1 +'\" t +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_domain" "3" "2024\-08\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. 
+.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_domain" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -76,7 +90,7 @@ parameter. .PP Similar to fi_domain, but accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening peer domain. -See \f[C]fi_peer\f[R](3). +See \f[V]fi_peer\f[R](3). .SS fi_open_ops .PP fi_open_ops is used to open provider specific interfaces. @@ -173,9 +187,9 @@ through the event queue. If an event queue is not bound to the domain with the FI_REG_MR flag, then memory registration requests complete synchronously. .PP -See \f[C]fi_av_bind\f[R](3), \f[C]fi_ep_bind\f[R](3), -\f[C]fi_mr_bind\f[R](3), \f[C]fi_pep_bind\f[R](3), and -\f[C]fi_scalable_ep_bind\f[R](3) for more information. +See \f[V]fi_av_bind\f[R](3), \f[V]fi_ep_bind\f[R](3), +\f[V]fi_mr_bind\f[R](3), \f[V]fi_pep_bind\f[R](3), and +\f[V]fi_scalable_ep_bind\f[R](3) for more information. .SS fi_close .PP The fi_close call is used to release all resources associated with a @@ -184,7 +198,7 @@ All objects associated with the opened domain must be released prior to calling fi_close, otherwise the call will return -FI_EBUSY. .SH DOMAIN ATTRIBUTES .PP -The \f[C]fi_domain_attr\f[R] structure defines the set of attributes +The \f[V]fi_domain_attr\f[R] structure defines the set of attributes associated with a domain. .IP .nf @@ -649,7 +663,7 @@ size as the endpoint queue(s) that are bound to it. .SS AV Type (av_type) .PP Specifies the type of address vectors that are usable with this domain. -For additional details on AV type, see \f[C]fi_av\f[R](3). +For additional details on AV type, see \f[V]fi_av\f[R](3). The following values may be specified. .TP \f[I]FI_AV_MAP\f[R] (deprecated) @@ -673,7 +687,7 @@ optimal AV type supported by this domain. 
.SS Memory Registration Mode (mr_mode) .PP Defines memory registration specific mode bits used with this domain. -Full details on MR mode options are available in \f[C]fi_mr\f[R](3). +Full details on MR mode options are available in \f[V]fi_mr\f[R](3). The following values may be specified. .TP \f[I]FI_MR_ALLOCATED\f[R] @@ -854,7 +868,7 @@ If this domain capability is not set, address vectors cannot be opened with FI_AV_USER_ID. Note that FI_AV_USER_ID can still be supported through the AV insert calls without this domain capability set. -See \f[C]fi_av\f[R](3). +See \f[V]fi_av\f[R](3). .TP \f[I]FI_PEER\f[R] Specifies that the domain must support importing resources to be used in @@ -885,7 +899,7 @@ provider, for example. Indicates that the domain supports the ability to share address vectors among multiple processes using the named address vector feature. .PP -See \f[C]fi_getinfo\f[R](3) for a discussion on primary versus secondary +See \f[V]fi_getinfo\f[R](3) for a discussion on primary versus secondary capabilities. .SS Default authorization key (auth_key) .PP @@ -932,7 +946,7 @@ cache or lookup tables. .PP This specifies the default traffic class that will be associated any endpoints created within the domain. -See \f[C]fi_endpoint\f[R](3) for additional information. +See \f[V]fi_endpoint\f[R](3) for additional information. .SS Max Authorization Keys per Endpoint (max_ep_auth_key) .PP The maximum number of authorization keys which can be supported per @@ -941,7 +955,7 @@ connectionless endpoint. .PP The maximum value that a peer group may be assigned, inclusive. Valid peer group id\[cq]s must be between 0 and max_group_id. -See \f[C]fi_av\f[R](3) for additional information on peer groups and +See \f[V]fi_av\f[R](3) for additional information on peer groups and their use. Users may request support for peer groups by setting this to a non-zero value. @@ -953,7 +967,7 @@ the application. .PP Returns 0 on success. 
On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -972,7 +986,7 @@ lightly loaded systems, without an administrator configuring system resources appropriately for the installed provider(s). .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_av\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_mr\f[R](3) \f[C]fi_peer\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_av\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_mr\f[R](3) \f[V]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index a9dad7e3d40..5f17bd5c5b1 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_endpoint" "3" "2024\-08\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_endpoint" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -263,7 +277,7 @@ been used. .PP Similar to fi_endpoint, buf accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening endpoints that use peer transfer feature. -See \f[C]fi_peer\f[R](3) +See \f[V]fi_peer\f[R](3) .SS fi_close .PP Closes an endpoint and release all resources associated with it. @@ -576,7 +590,7 @@ FI_HMEM_P2P_DISABLED: Peer to peer support should not be used. fi_setopt() will return -FI_EOPNOTSUPP if the mode requested cannot be supported by the provider. 
The FI_HMEM_DISABLE_P2P environment variable discussed in -\f[C]fi_mr\f[R](3) takes precedence over this setopt option. +\f[V]fi_mr\f[R](3) takes precedence over this setopt option. .RE \[bu] .RS 2 .TP @@ -609,10 +623,10 @@ Define the maximum message size that can be transferred by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -621,10 +635,10 @@ Define the maximum message size that can be transferred by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -633,10 +647,10 @@ Define the maximum message size that can be transferred by the endpoint via a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -645,10 +659,10 @@ Define the maximum data size that can be transferred by the endpoint via a single atomic operation. 
The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -657,10 +671,10 @@ Define the maximum message size that can be injected by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -669,10 +683,10 @@ Define the maximum message size that can be injected by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -681,10 +695,10 @@ Define the maximum data size that can be injected by the endpoint in a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. 
.RE \[bu] .RS 2 .TP @@ -693,10 +707,10 @@ Define the maximum data size that can be injected by the endpoint in a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE .SS fi_tc_dscp_set .PP @@ -1778,7 +1792,7 @@ value of transmit or receive context attributes of an endpoint. \f[I]FI_COMMIT_COMPLETE\f[R] Indicates that a completion should not be generated (locally or at the peer) until the result of an operation have been made persistent. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_COMPLETION\f[R] Indicates that a completion queue entry should be written for data @@ -1791,7 +1805,7 @@ See the fi_ep_bind section above for more detail. \f[I]FI_DELIVERY_COMPLETE\f[R] Indicates that a completion should be generated when the operation has been processed by the destination endpoint(s). -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_INJECT\f[R] Indicates that all outbound data buffers should be returned to the @@ -1806,7 +1820,7 @@ This limit is indicated using inject_size (see inject_size above). \f[I]FI_INJECT_COMPLETE\f[R] Indicates that a completion should be generated when the source buffer(s) may be reused. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. 
.TP \f[I]FI_MULTICAST\f[R] Indicates that data transfers will target multicast addresses by @@ -1830,7 +1844,7 @@ space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). \f[I]FI_TRANSMIT_COMPLETE\f[R] Indicates that a completion should be generated when the transmit operation has completed relative to the local provider. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -1839,10 +1853,10 @@ fabric endpoint. Endpoints allocated with the FI_CONTEXT or FI_CONTEXT2 mode bits set must typically provide struct fi_context(2) as their per operation context parameter. -(See fi_getinfo.3 for details.) However, when FI_SELECTIVE_COMPLETION is -enabled to suppress CQ completion entries, and an operation is initiated -without the FI_COMPLETION flag set, then the context parameter is -ignored. +(See fi_getinfo.3 for details.) +However, when FI_SELECTIVE_COMPLETION is enabled to suppress CQ +completion entries, and an operation is initiated without the +FI_COMPLETION flag set, then the context parameter is ignored. An application does not need to pass in a valid struct fi_context(2) into such data transfers. .PP @@ -1881,7 +1895,7 @@ submitted for processing. For fi_setopt/fi_getopt, a return value of -FI_ENOPROTOOPT indicates the provider does not support the requested option. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EDOMAIN\f[R] @@ -1895,8 +1909,8 @@ The endpoint has not been configured with necessary completion queue. The endpoint\[cq]s state does not permit the requested operation. 
.SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) -\f[C]fi_msg\f[R](3), \f[C]fi_tagged\f[R](3), \f[C]fi_rma\f[R](3) -\f[C]fi_peer\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) +\f[V]fi_msg\f[R](3), \f[V]fi_tagged\f[R](3), \f[V]fi_rma\f[R](3) +\f[V]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_eq.3 b/man/man3/fi_eq.3 index 249c086cc3b..359feed69b9 100644 --- a/man/man3/fi_eq.3 +++ b/man/man3/fi_eq.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_eq" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_eq" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -109,7 +123,7 @@ as listening for connection requests. fi_eq_open allocates a new event queue. .PP The properties and behavior of an event queue are defined by -\f[C]struct fi_eq_attr\f[R]. +\f[V]struct fi_eq_attr\f[R]. .IP .nf \f[C] @@ -259,7 +273,7 @@ These include the following types of events: memory registration, address vector resolution, and multicast joins. .PP Control requests report their completion by inserting a -\f[C]struct fi_eq_entry\f[R] into the EQ. +\f[V]struct fi_eq_entry\f[R] into the EQ. The format of this structure is: .IP .nf @@ -283,7 +297,7 @@ The context field will be set to the context specified as part of the operation, if available, otherwise the context will be associated with the fabric descriptor. 
The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .TP @@ -293,7 +307,7 @@ setup or tear down connections between endpoints. There are three connection notification events: FI_CONNREQ, FI_CONNECTED, and FI_SHUTDOWN. Connection notifications are reported using -\f[C]struct fi_eq_cm_entry\f[R]: +\f[V]struct fi_eq_cm_entry\f[R]: .IP .nf \f[C] @@ -432,7 +446,7 @@ The context field will be set to the context specified as part of the operation. .PP The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .PP @@ -558,10 +572,10 @@ fi_eq_strerror Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), \f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), \f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_errno.3 b/man/man3/fi_errno.3 index dcac687918e..e73772c435f 100644 --- a/man/man3/fi_errno.3 +++ b/man/man3/fi_errno.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_errno" "3" "2024\-03\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. 
ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_errno" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -184,6 +198,6 @@ Receiver not ready, no receive buffers available Memory registration limit exceeded .SH SEE ALSO .PP -\f[C]fabric\f[R](7) +\f[V]fabric\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_fabric.3 b/man/man3/fi_fabric.3 index 3049e798a3d..205a8d96691 100644 --- a/man/man3/fi_fabric.3 +++ b/man/man3/fi_fabric.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_fabric" "3" "2023\-09\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_fabric" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -146,11 +160,11 @@ The data parameter is ignored. .TP \f[I]FI_TYPE_EQ_EVENT\f[R] uint32_t event parameter returned from fi_eq_read(). -See \f[C]fi_eq(3)\f[R] for a list of known values. +See \f[V]fi_eq(3)\f[R] for a list of known values. .TP \f[I]FI_TYPE_CQ_EVENT_FLAGS\f[R] uint64_t flags field in fi_cq_xxx_entry structures. -See \f[C]fi_cq(3)\f[R] for valid flags. +See \f[V]fi_cq(3)\f[R] for valid flags. .TP \f[I]FI_TYPE_MR_MODE\f[R] struct fi_domain_attr::mr_mode flags @@ -245,7 +259,7 @@ these environment variables in a production setting. Version information for the fabric provider, in a major.minor format. The use of the FI_MAJOR() and FI_MINOR() version macros may be used to extract the major and minor version data. -See \f[C]fi_version(3)\f[R]. +See \f[V]fi_version(3)\f[R]. .PP In case of an utility provider layered over a core provider, the version would always refer to that of the utility provider. 
@@ -253,16 +267,16 @@ would always refer to that of the utility provider. .PP The interface version requested by the application. This value corresponds to the version parameter passed into -\f[C]fi_getinfo(3)\f[R]. +\f[V]fi_getinfo(3)\f[R]. .SH RETURN VALUE .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_endpoint\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_endpoint\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index 9d2c8496612..5cf752d53fd 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_getinfo" "3" "2024\-08\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_getinfo" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -218,8 +232,8 @@ manner. The fi_info::handle field is also used by fi_endpoint() and fi_reject() calls when processing connection requests or to inherit another endpoint\[cq]s attributes. -See \f[C]fi_eq\f[R](3), \f[C]fi_reject\f[R](3), and -\f[C]fi_endpoint\f[R](3). +See \f[V]fi_eq\f[R](3), \f[V]fi_reject\f[R](3), and +\f[V]fi_endpoint\f[R](3). The info->handle field will be ignored by fi_dupinfo and fi_freeinfo. .TP \f[I]tx_attr - transmit context attributes\f[R] @@ -252,7 +266,7 @@ set. 
On output, the actual endpoint attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[C]fi_endpoint\f[R](3) for details. +See \f[V]fi_endpoint\f[R](3) for details. .TP \f[I]domain_attr - domain attributes\f[R] Optionally supplied domain attributes. @@ -262,7 +276,7 @@ be set. On output, the actual domain attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[C]fi_domain\f[R](3) for details. +See \f[V]fi_domain\f[R](3) for details. .TP \f[I]fabric_attr - fabric attributes\f[R] Optionally supplied fabric attributes. @@ -271,14 +285,14 @@ When provided as hints, requested values of struct fi_fabric_attr should be set. On output, the actual fabric attributes that can be provided will be returned. -See \f[C]fi_fabric\f[R](3) for details. +See \f[V]fi_fabric\f[R](3) for details. .TP \f[I]nic - network interface details\f[R] Optional attributes related to the hardware NIC associated with the specified fabric, domain, and endpoint data. This field is only valid for providers where the corresponding attributes are closely associated with a hardware NIC. -See \f[C]fi_nic\f[R](3) for details. +See \f[V]fi_nic\f[R](3) for details. .SH CAPABILITIES .PP Interface capabilities are obtained by OR-ing the following flags @@ -310,12 +324,12 @@ Requests that the provider support the association of a user specified identifier with each address vector (AV) address. User identifiers are returned with completion data in place of the AV address. -See \f[C]fi_domain\f[R](3) and \f[C]fi_av\f[R](3) for more details. +See \f[V]fi_domain\f[R](3) and \f[V]fi_av\f[R](3) for more details. .TP \f[I]FI_COLLECTIVE\f[R] Requests support for collective operations. Endpoints that support this capability support the collective operations -defined in \f[C]fi_collective\f[R](3). +defined in \f[V]fi_collective\f[R](3). 
.TP \f[I]FI_DIRECTED_RECV\f[R] Requests that the communication endpoint use the source address of an @@ -482,7 +496,7 @@ endpoint as send-only or receive-only. \f[I]FI_TRIGGER\f[R] Indicates that the endpoint should support triggered operations. Endpoints support this capability must meet the usage model as described -by \f[C]fi_trigger\f[R](3). +by \f[V]fi_trigger\f[R](3). .TP \f[I]FI_WRITE\f[R] Indicates that the user requires an endpoint capable of initiating @@ -493,7 +507,7 @@ This flag requires that FI_RMA and/or FI_ATOMIC be set. Specifies that the endpoint should support transfers that may be initiated from heterogenous computation devices, such as GPUs. This flag requires that FI_TRIGGER be set. -For additional details on XPU triggers see \f[C]fi_trigger\f[R](3). +For additional details on XPU triggers see \f[V]fi_trigger\f[R](3). .PP Capabilities may be grouped into three general categories: primary, secondary, and primary modifiers. @@ -596,8 +610,8 @@ application for access domains opened with this capability. This flag is defined for compatibility and is ignored if the application version is 1.5 or later and the domain mr_mode is set to anything other than FI_MR_BASIC or FI_MR_SCALABLE. -See the domain attribute mr_mode \f[C]fi_domain\f[R](3) and -\f[C]fi_mr\f[R](3). +See the domain attribute mr_mode \f[V]fi_domain\f[R](3) and +\f[V]fi_mr\f[R](3). .TP \f[I]FI_MSG_PREFIX\f[R] Message prefix mode indicates that an application will provide buffer @@ -657,7 +671,7 @@ these operations. A provider may support one or more of the following addressing formats. In some cases, a selected addressing format may need to be translated or mapped into an address which is native to the fabric. -See \f[C]fi_av\f[R](3). +See \f[V]fi_av\f[R](3). .TP \f[I]FI_ADDR_EFA\f[R] Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format. @@ -745,7 +759,7 @@ This flag is often used with passive endpoints. fi_getinfo() returns 0 on success. 
On error, fi_getinfo() returns a negative value corresponding to fabric errno. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .PP fi_allocinfo() returns a pointer to a new fi_info structure on success, or NULL on error. @@ -800,11 +814,11 @@ by fi_getinfo. If neither node, service or hints are provided, then fi_getinfo simply returns the list all available communication interfaces. .PP -Multiple threads may call \f[C]fi_getinfo\f[R] simultaneously, without +Multiple threads may call \f[V]fi_getinfo\f[R] simultaneously, without any requirement for serialization. .SH SEE ALSO .PP -\f[C]fi_open\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), -\f[C]fi_nic\f[R](3) \f[C]fi_trigger\f[R](3) +\f[V]fi_open\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), +\f[V]fi_nic\f[R](3) \f[V]fi_trigger\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index f9ffbc1e841..dd1a03d27a0 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_mr" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_mr" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -690,7 +704,7 @@ specifying the FI_MR_DMABUF flag. The number of entries in the mr_iov array. The maximum number of memory buffers that may be associated with a single memory region is specified as the mr_iov_limit domain attribute. -See \f[C]fi_domain(3)\f[R]. +See \f[V]fi_domain(3)\f[R]. 
.SS access .PP Indicates the type of \f[I]operations\f[R] that the local or a peer @@ -772,7 +786,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key_size must equal -\f[C]sizeof(struct fi_mr_auth_key)\f[R]. +\f[V]sizeof(struct fi_mr_auth_key)\f[R]. .SS auth_key .PP Indicates the key to associate with this memory registration. @@ -785,7 +799,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key must point to a -user-defined \f[C]struct fi_mr_auth_key\f[R]. +user-defined \f[V]struct fi_mr_auth_key\f[R]. .SS iface .PP Indicates the software interfaces used by the application to allocate @@ -888,7 +902,7 @@ keys in the AV. .PP If the domain was opened with FI_DIRECTED_RECV, addr can be used to limit the memory region to a specific fi_addr_t, including -fi_addr_t\[cq]s return from \f[C]fi_av_insert_auth_key\f[R]. +fi_addr_t\[cq]s return from \f[V]fi_av_insert_auth_key\f[R]. .SH NOTES .PP Direct access to an application\[cq]s memory by a remote peer requires @@ -1011,7 +1025,7 @@ For example, the physical pages referenced by a virtual address range could migrate between host memory and GPU memory, depending on which computational unit is actively using it. .PP -See the \f[C]fi_endpoint\f[R](3) and \f[C]fi_cq\f[R](3) man pages for +See the \f[V]fi_endpoint\f[R](3) and \f[V]fi_cq\f[R](3) man pages for addition discussion on message, data, and completion ordering semantics, including the impact of memory domains. .SH RETURN VALUES @@ -1019,7 +1033,7 @@ including the impact of memory domains. Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. 
.SH ERRORS .TP \f[I]-FI_ENOKEY\f[R] @@ -1125,8 +1139,8 @@ Some level of control over the cache is possible through the above mentioned environment variables. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_rma\f[R](3), \f[C]fi_msg\f[R](3), -\f[C]fi_atomic\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_rma\f[R](3), \f[V]fi_msg\f[R](3), +\f[V]fi_atomic\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index 0fe3a855391..81dbfdfc34b 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_msg" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_msg" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -65,7 +79,7 @@ Count of vectored data entries. .TP \f[I]desc\f[R] Descriptor associated with the data buffer. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent message. @@ -142,7 +156,7 @@ parameter to a remote endpoint as a single message. The fi_sendmsg call supports data transfers over both connected and connectionless endpoints, with the ability to control the send operation per call through the use of flags. -The fi_sendmsg function takes a \f[C]struct fi_msg\f[R] as input. +The fi_sendmsg function takes a \f[V]struct fi_msg\f[R] as input. 
.IP .nf \f[C] @@ -259,7 +273,7 @@ Note that an entry to the associated receive completion queue will always be generated when the buffer has been consumed, even if other receive completions have been suppressed (i.e.\ the Rx context has been configured for FI_SELECTIVE_COMPLETION). -See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3). +See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). .TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_sendmsg. @@ -274,7 +288,7 @@ tracked by the provider. For receive operations, indicates that a completion may be generated as soon as the message has been processed by the local provider, even if the message data may not be visible to all processing elements. -See \f[C]fi_cq\f[R](3) for target side completion semantics. +See \f[V]fi_cq\f[R](3) for target side completion semantics. .TP \f[I]FI_DELIVERY_COMPLETE\f[R] Applies to fi_sendmsg. @@ -320,7 +334,7 @@ buffer length. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .PP See the discussion below for details handling FI_EAGAIN. .SH ERRORS @@ -353,7 +367,7 @@ acknowledgements or flow control messages may need to be processed in order to resume execution. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_nic.3 b/man/man3/fi_nic.3 index fdfeb9f9aa7..c04a55d30e1 100644 --- a/man/man3/fi_nic.3 +++ b/man/man3/fi_nic.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_nic" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_nic" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -9,7 +23,7 @@ fi_nic - Fabric network interface card attributes .PP The fid_nic structure defines attributes for a struct fi_info that is directly associated with underlying networking hardware and may be -returned directly from calling \f[C]fi_getinfo\f[R](3). +returned directly from calling \f[V]fi_getinfo\f[R](3). The format of fid_nic and the related substructures are defined below. .PP Note that not all fields of all structures may be available. @@ -135,7 +149,7 @@ Ethernet or InfiniBand. .PP Provider attributes reference provider specific details of the device. These attributes are both provider and device specific. -The attributes can be interpreted by \f[C]fi_tostr\f[R](3). +The attributes can be interpreted by \f[V]fi_tostr\f[R](3). Applications may also use the other attribute fields, such as related fi_fabric_attr: prov_name field, to determine an appropriate structure to cast the attributes. @@ -145,10 +159,10 @@ specific header file included with libfabric package. .SH NOTES .PP The fid_nic structure is returned as part of a call to -\f[C]fi_getinfo\f[R](3). -It is automatically freed as part of calling \f[C]fi_freeinfo\f[R](3) +\f[V]fi_getinfo\f[R](3). 
+It is automatically freed as part of calling \f[V]fi_freeinfo\f[R](3) .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3) +\f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_peer.3 b/man/man3/fi_peer.3 index 044390d40a7..1279c23dcd2 100644 --- a/man/man3/fi_peer.3 +++ b/man/man3/fi_peer.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_peer" "3" "2024\-08\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_peer" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -85,6 +99,24 @@ However, because the goal of using peer providers is to avoid overhead, providers must be explicitly written to support the peer provider mechanisms. .PP +When importing any shared fabric object into a peer, the owner will +create a separate fid_peer_* for each peer provider it intends to import +into. +The owner will pass this unique fid_peer_* into each peer through the +context parameter of the init call for the resource (i.e.\ fi_cq_open, +fi_srx_context, fi_cntr_open, etc). +The fi_peer_\f[I]\f[R]context will indicate the owner-allocated +fid_peer\f[I]\f[R] for the peer to use but is temporary for the init +call and may not be accessed by the peer after initialization. +The peer will set just the peer_ops of the owner-allocated fid and save +a reference to the imported fid_peer_* for use in the peer API flow. +The peer will allocate its own fid for internal uses and return that fid +to the owner through the regular fid parameter of the init call (as if +it were just another opened resource). 
+The owner is responsible for saving the returned peer fid from the open +call in order to close it later (or to drive progress in the case of the +cq_fid). +.PP There are two peer provider models. In the example listed above, both peers are full providers in their own right and usable in a stand-alone fashion. @@ -255,8 +287,8 @@ If manual progress is needed on the peer CQ, the owner should drive progress by using the fi_cq_read() function with the buf parameter set to NULL and count equal 0. The peer provider should set other functions that attempt to read the -peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) to return --FI_ENOSYS. +peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) +to return -FI_ENOSYS. .SS fi_ops_cq_owner::write() .PP This call directs the owner to insert new completions into the CQ. @@ -347,8 +379,8 @@ Similar to the peer CQ, if manual progress is needed on the peer counter, the owner should drive progress by using the fi_cntr_read() and the fi_cntr_read() should do nothing but progress the peer cntr. The peer provider should set other functions that attempt to access the -peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) to return --FI_ENOSYS. +peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) +to return -FI_ENOSYS. .SS fi_ops_cntr_owner::inc() .PP This call directs the owner to increment the value of the cntr. @@ -783,9 +815,9 @@ callbacks. .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_provider\f[R](7), \f[C]fi_provider\f[R](3), \f[C]fi_cq\f[R](3), +\f[V]fi_provider\f[R](7), \f[V]fi_provider\f[R](3), \f[V]fi_cq\f[R](3), .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_poll.3 b/man/man3/fi_poll.3 index 285965f3589..b49b9d35174 100644 --- a/man/man3/fi_poll.3 +++ b/man/man3/fi_poll.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_poll" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_poll" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -315,7 +329,7 @@ or fid. Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .TP fi_poll On success, if events are available, returns the number of entries @@ -392,7 +406,7 @@ The use of the fi_trywait() function is still required if accessing wait objects directly. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_profile.3 b/man/man3/fi_profile.3 index 3476bc77ddd..f24305abf82 100644 --- a/man/man3/fi_profile.3 +++ b/man/man3/fi_profile.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_profile" "3" "2023\-10\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_profile" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -323,6 +337,6 @@ be returned. For fi_profile_query_vars and fi_profile_query_events, a positive return value indicates the number of variables or events returned in the list. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_provider.3 b/man/man3/fi_provider.3 index 50166da32ca..0f123ba3246 100644 --- a/man/man3/fi_provider.3 +++ b/man/man3/fi_provider.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_provider" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_provider" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -183,7 +197,7 @@ the service or resource to which they correspond. The mr_cache object references the internal memory registration cache used by the different providers. Additional information on the cache is available in the -\f[C]fi_mr(3)\f[R] man page. +\f[V]fi_mr(3)\f[R] man page. .TP \f[I]logging\f[R] The logging object references the internal logging subsystem used by the @@ -193,8 +207,8 @@ Can be opened only once and only the last import is used if imported multiple times. .SS fi_import .PP -This helper function is a combination of \f[C]fi_open\f[R] and -\f[C]fi_import_fid\f[R]. +This helper function is a combination of \f[V]fi_open\f[R] and +\f[V]fi_import_fid\f[R]. It may be used to import a fabric object created and owned by the libfabric user. 
This allows the upper level libraries or the application to override or @@ -264,9 +278,9 @@ For integrated providers .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) \f[C]fi_mr\f[R](3), +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) \f[V]fi_mr\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_rma.3 b/man/man3/fi_rma.3 index f9abc8b5ba7..dea50502f69 100644 --- a/man/man3/fi_rma.3 +++ b/man/man3/fi_rma.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rma" "3" "2023\-11\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rma" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -78,7 +92,7 @@ FI_MR_SCALABLE. Protection key associated with the remote memory. .TP \f[I]desc\f[R] -Descriptor associated with the local data buffer See \f[C]fi_mr\f[R](3). +Descriptor associated with the local data buffer See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the operation. @@ -175,7 +189,7 @@ struct fi_rma_iov { .PP The write inject call is an optimized version of fi_write. It provides similar completion semantics as fi_inject -\f[C]fi_msg\f[R](3). +\f[V]fi_msg\f[R](3). .SS fi_writedata .PP The write data call is similar to fi_write, but allows for the sending @@ -276,15 +290,15 @@ operation (inclusive) to the posting of a subsequent fenced operation .PP Returns 0 on success. 
On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index 62d48a21298..1ac5bf963f8 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_tagged" "3" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_tagged" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -74,7 +88,7 @@ Mask of bits to ignore applied to the tag for receive operations. .TP \f[I]desc\f[R] Memory descriptor associated with the data buffer. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent data. @@ -199,7 +213,7 @@ struct fi_msg_tagged { .PP The tagged inject call is an optimized version of fi_tsend. It provides similar completion semantics as fi_inject -\f[C]fi_msg\f[R](3). +\f[V]fi_msg\f[R](3). .SS fi_tsenddata .PP The tagged send data call is similar to fi_tsend, but allows for the @@ -360,11 +374,11 @@ ignored. The tagged send and receive calls return 0 on success. 
On error, a negative value corresponding to fabric \f[I]errno \f[R] is returned. -Fabric errno values are defined in \f[C]fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EINVAL\f[R] @@ -374,7 +388,7 @@ Indicates that an invalid argument was supplied by the user. Indicates that an unspecified error occurred. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_trigger.3 b/man/man3/fi_trigger.3 index 68586cd026a..fea5833b591 100644 --- a/man/man3/fi_trigger.3 +++ b/man/man3/fi_trigger.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_trigger" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_trigger" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -199,7 +213,7 @@ If a specific request is not supported by the provider, it will fail the operation with -FI_ENOSYS. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_mr\f[R](3), -\f[C]fi_alias\f[R](3), \f[C]fi_cntr\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_mr\f[R](3), +\f[V]fi_alias\f[R](3), \f[V]fi_cntr\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_version.3 b/man/man3/fi_version.3 index c188f06b210..5352c05255e 100644 --- a/man/man3/fi_version.3 +++ b/man/man3/fi_version.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_version" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_version" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -34,6 +48,6 @@ The upper 16-bits of the version correspond to the major number, and the lower 16-bits correspond with the minor number. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index 91659d5f7b3..73ed2109b19 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fabric" "7" "2024\-09\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fabric" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -16,7 +30,7 @@ fabric - Fabric Interface Library Libfabric is a high-performance fabric software library designed to provide low-latency interfaces to fabric hardware. For an in-depth discussion of the motivation and design see -\f[C]fi_guide\f[R](7). +\f[V]fi_guide\f[R](7). 
.SH OVERVIEW .PP Libfabric provides `process direct I/O' to application software @@ -37,7 +51,7 @@ All fabric hardware devices and their software drivers are required to support this framework. Devices and the drivers that plug into the libfabric framework are referred to as fabric providers, or simply providers. -Provider details may be found in \f[C]fi_provider\f[R](7). +Provider details may be found in \f[V]fi_provider\f[R](7). .TP \f[I]Fabric Interfaces\f[R] The second component is a set of communication operations. @@ -282,18 +296,18 @@ If the list begins with the `\[ha]' symbol, then the list will be negated. .PP Example: To enable the udp and tcp providers only, set: -\f[C]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] +\f[V]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] .PP When libfabric is installed, DL providers are put under the \f[I]default provider path\f[R], which is determined by how libfabric is built and installed. Usually the default provider path is -\f[C]/lib/libfabric\f[R] or -\f[C]/lib64/libfabric\f[R]. +\f[V]/lib/libfabric\f[R] or +\f[V]/lib64/libfabric\f[R]. By default, libfabric tries to find DL providers in the following order: .IP "1." 3 Use `dlopen' to load provider libraries named -\f[C]lib-fi.so\f[R] for all providers enabled at build time. +\f[V]lib-fi.so\f[R] for all providers enabled at build time. The search path of `ld.so' is used to locate the files. This step is skipped if libfabric is configured with the option `\[en]enable-restricted-dl'. @@ -363,7 +377,7 @@ can be used to retrieve information about which providers are available in the system. Additionally, it can retrieve a list of all environment variables that may be used to configure libfabric and each provider. -See \f[C]fi_info\f[R](1) for more details. +See \f[V]fi_info\f[R](1) for more details. 
.SH ENVIRONMENT VARIABLE CONTROLS .PP Core features of libfabric and its providers may be configured by an @@ -400,22 +414,22 @@ may not be available in a child process because of copy on write restrictions. .SS CUDA deadlock .PP -In some cases, calls to \f[C]cudaMemcpy()\f[R] within libfabric may +In some cases, calls to \f[V]cudaMemcpy()\f[R] within libfabric may result in a deadlock. This typically occurs when a CUDA kernel blocks until a -\f[C]cudaMemcpy\f[R] on the host completes. +\f[V]cudaMemcpy\f[R] on the host completes. Applications which can cause such behavior can restrict Libfabric\[cq]s ability to invoke CUDA API operations with the endpoint option -\f[C]FI_OPT_CUDA_API_PERMITTED\f[R]. -See \f[C]fi_endpoint\f[R](3) for more details. +\f[V]FI_OPT_CUDA_API_PERMITTED\f[R]. +See \f[V]fi_endpoint\f[R](3) for more details. .PP Another mechanism which can be used to avoid deadlock is Nvidia\[cq]s GDRCopy. Using GDRCopy requires an external library and kernel module available at https://github.com/NVIDIA/gdrcopy. Libfabric must be configured with GDRCopy support using the -\f[C]--with-gdrcopy\f[R] option, and be run with -\f[C]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. +\f[V]--with-gdrcopy\f[R] option, and be run with +\f[V]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. This may not be supported by all providers. .SH ABI CHANGES .PP @@ -509,9 +523,9 @@ Added new fields to the following attributes: Added max_group_id .SH SEE ALSO .PP -\f[C]fi_info\f[R](1), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3), -\f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_av\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_mr\f[R](3) +\f[V]fi_info\f[R](1), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3), +\f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_av\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_mr\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_arch.7 b/man/man7/fi_arch.7 index 7a749d4fca1..6bd4ad0abc5 100644 --- a/man/man7/fi_arch.7 +++ b/man/man7/fi_arch.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_arch" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_arch" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .IP .nf diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index c0ad9d32a0d..0360e4cb466 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,7 +1,21 @@ -.\"t -.\" Automatically generated by Pandoc 2.9.2.1 +'\" t +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cxi" "7" "2024\-03\-21" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cxi" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -176,7 +190,7 @@ Classes. .PP While a libfabric user provided authorization key is optional, it is highly encouraged that libfabric users provide an authorization key -through the domain attribute hints during \f[C]fi_getinfo()\f[R]. +through the domain attribute hints during \f[V]fi_getinfo()\f[R]. How libfabric users acquire the authorization key may vary between the users and is outside the scope of this document. .PP @@ -192,18 +206,18 @@ authorization key using them. .IP \[bu] 2 \f[I]SLINGSHOT_VNIS\f[R]: Comma separated list of VNIs. 
The CXI provider will only use the first VNI if multiple are provided.
-- Linux \f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC +- Linux \f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC support for on-demand paging (ODP) .SS Linux madvise() MADV_DONTFORK and MADV_DOFORK .PP The generic (i.e.\ non-vendor specific) RDMA NIC solution to the Linux COW fork policy and RDMA problem is to use the following -\f[C]madvise()\f[R] operations during memory registration and +\f[V]madvise()\f[R] operations during memory registration and deregistration: - MADV_DONTFORK: Do not make the pages in this range -available to the child after a \f[C]fork()\f[R]. +available to the child after a \f[V]fork()\f[R]. This is useful to prevent copy-on-write semantics from changing the physical location of a page if the parent writes to it after a -\f[C]fork()\f[R]. +\f[V]fork()\f[R]. (Such page relocations cause problems for hardware that DMAs into the -page.) - MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the -default behavior, whereby a mapping is inherited across -\f[C]fork()\f[R]. +page.) +- MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the default +behavior, whereby a mapping is inherited across \f[V]fork()\f[R]. .PP In the Linux kernel, MADV_DONTFORK will result in the virtual memory area struct (VMA) being marked with the VM_DONTCOPY flag. @@ -677,14 +691,14 @@ Should the child reference the virtual address corresponding to the VMA which was not duplicated, it will segfault. .PP In the previous example, if Process A issued -\f[C]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing +\f[V]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing RDMA memory registration, the physical address 0x1000 would have remained with Process A. This would prevent the Process A data corruption as well. If Process B were to reference virtual address 0xffff0000, it will segfault due to the hole in the virtual address space. 
.PP -Using \f[C]madvise()\f[R] with MADV_DONTFORK may be problematic for +Using \f[V]madvise()\f[R] with MADV_DONTFORK may be problematic for applications performing RDMA and page aliasing. Paging aliasing is where the parent process uses part or all of a page to share information with the child process. @@ -738,7 +752,7 @@ The CXI provider is subjected to the Linux COW fork policy and RDMA issues described in section \f[I]RDMA and Fork Overview\f[R]. To prevent data corruption with fork, the CXI provider supports the following options: - CXI specific fork environment variables to enable -\f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* +\f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* .PP **Formal ODP support pending.* .SS CXI Specific Fork Environment Variables @@ -746,27 +760,27 @@ following options: - CXI specific fork environment variables to enable The CXI software stack has two environment variables related to fork: 0 CXI_FORK_SAFE: Enables base fork safe support. With this environment variable set, regardless of value, libcxi will -issue \f[C]madvise()\f[R] with MADV_DONTFORK on the virtual address +issue \f[V]madvise()\f[R] with MADV_DONTFORK on the virtual address range being registered for RDMA. -In addition, libcxi always align the \f[C]madvise()\f[R] to the system +In addition, libcxi always align the \f[V]madvise()\f[R] to the system default page size. On x86, this is 4 KiB. -To prevent redundant \f[C]madvise()\f[R] calls with MADV_DONTFORK +To prevent redundant \f[V]madvise()\f[R] calls with MADV_DONTFORK against the same virtual address region, reference counting is used -against each tracked \f[C]madvise()\f[R] region. -In addition, libcxi will spilt and merge tracked \f[C]madvise()\f[R] +against each tracked \f[V]madvise()\f[R] region. +In addition, libcxi will spilt and merge tracked \f[V]madvise()\f[R] regions if needed. 
Once the reference count reaches zero, libcxi will call -\f[C]madvise()\f[R] with MADV_DOFORK, and no longer track the region. +\f[V]madvise()\f[R] with MADV_DOFORK, and no longer track the region. - CXI_FORK_SAFE_HP: With this environment variable set, in conjunction with CXI_FORK_SAFE, libcxi will not assume the page size is system default page size. -Instead, libcxi will walk \f[C]/proc//smaps\f[R] to determine the -correct page size and align the \f[C]madvise()\f[R] calls accordingly. +Instead, libcxi will walk \f[V]/proc//smaps\f[R] to determine the +correct page size and align the \f[V]madvise()\f[R] calls accordingly. This environment variable should be set if huge pages are being used for RDMA. To amortize the per memory registration walk of -\f[C]/proc//smaps\f[R], the libfabric MR cache should be used. +\f[V]/proc//smaps\f[R], the libfabric MR cache should be used. .PP Setting these environment variables will prevent data corruption when the parent issues a fork. @@ -800,7 +814,7 @@ transfer. The following is the CXI provider fork support guidance: - Enable CXI_FORK_SAFE. If huge pages are also used, CXI_FORK_SAFE_HP should be enabled as well. -Since enabling this will result in \f[C]madvice()\f[R] with +Since enabling this will result in \f[V]madvice()\f[R] with MADV_DONTFORK, the following steps should be taken to prevent a child process segfault: - Avoid using stack memory for RDMA - Avoid child process having to access a virtual address range the parent process is @@ -1616,7 +1630,7 @@ It can only be changed prior to any MR being created. .PP CXI domain extensions have been named \f[I]FI_CXI_DOM_OPS_6\f[R]. The flags parameter is ignored. -The fi_open_ops function takes a \f[C]struct fi_cxi_dom_ops\f[R]. +The fi_open_ops function takes a \f[V]struct fi_cxi_dom_ops\f[R]. See an example of usage below: .IP .nf @@ -1709,7 +1723,7 @@ removed from the domain opts prior to software release 2.2. 
.PP CXI counter extensions have been named \f[I]FI_CXI_COUNTER_OPS\f[R]. The flags parameter is ignored. -The fi_open_ops function takes a \f[C]struct fi_cxi_cntr_ops\f[R]. +The fi_open_ops function takes a \f[V]struct fi_cxi_cntr_ops\f[R]. See an example of usage below. .IP .nf @@ -1838,7 +1852,7 @@ memory operation as a PCIe operation as compared to a NIC operation. The CXI provider extension flag FI_CXI_PCIE_AMO is used to signify this. .PP Since not all libfabric atomic memory operations can be executed as a -PCIe atomic memory operation, \f[C]fi_query_atomic()\f[R] could be used +PCIe atomic memory operation, \f[V]fi_query_atomic()\f[R] could be used to query if a given libfabric atomic memory operation could be executed as PCIe atomic memory operation. .PP @@ -2156,6 +2170,6 @@ In this case, the target NIC is reachable. FI_EIO: Catch all errno. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_direct.7 b/man/man7/fi_direct.7 index 66415f928c5..c014a10b104 100644 --- a/man/man7/fi_direct.7 +++ b/man/man7/fi_direct.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_direct" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_direct" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -71,7 +85,7 @@ The provider sets FI_LOCAL_MR for fi_info:mode. See fi_getinfo for additional details. 
.SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index 99396e71bbf..24809adde15 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_efa" "7" "2024\-10\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_efa" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -95,7 +109,7 @@ No support for counters for the DGRAM endpoint. No support for inject. .SS zero-copy receive mode .IP \[bu] 2 -The receive operation cannot be cancelled via \f[C]fi_cancel()\f[R]. +The receive operation cannot be cancelled via \f[V]fi_cancel()\f[R]. .IP \[bu] 2 Zero-copy receive mode can be enabled only if SHM transfer is disabled. .IP \[bu] 2 @@ -166,12 +180,12 @@ If endpoint is not able to support this feature, it will return .PP The efa provider exports extensions for operations that are not provided by the standard libfabric interface. -These extensions are available via the \[lq]\f[C]fi_ext_efa.h\f[R]\[rq] +These extensions are available via the \[lq]\f[V]fi_ext_efa.h\f[R]\[rq] header file. 
.SS Domain Operation Extension .PP -Domain operation extension is obtained by calling \f[C]fi_open_ops\f[R] -(see \f[C]fi_domain(3)\f[R]) +Domain operation extension is obtained by calling \f[V]fi_open_ops\f[R] +(see \f[V]fi_domain(3)\f[R]) .IP .nf \f[C] @@ -180,9 +194,9 @@ int fi_open_ops(struct fid *domain, const char *name, uint64_t flags, \f[R] .fi .PP -and requesting \f[C]FI_EFA_DOMAIN_OPS\f[R] in \f[C]name\f[R]. -\f[C]fi_open_ops\f[R] returns \f[C]ops\f[R] as the pointer to the -function table \f[C]fi_efa_ops_domain\f[R] defined as follows: +and requesting \f[V]FI_EFA_DOMAIN_OPS\f[R] in \f[V]name\f[R]. +\f[V]fi_open_ops\f[R] returns \f[V]ops\f[R] as the pointer to the +function table \f[V]fi_efa_ops_domain\f[R] defined as follows: .IP .nf \f[C] @@ -224,20 +238,20 @@ FI_EFA_MR_ATTR_RDMA_RECV_IC_ID: rdma_recv_ic_id has a valid value. \f[I]recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for receive operation. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. .TP \f[I]rdma_read_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA read operation. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. .TP \f[I]rdma_recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA write receive. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. .SS Return value .PP \f[B]query_mr()\f[R] returns 0 on success, or the value of errno on @@ -245,7 +259,7 @@ failure (which indicates the failure reason). 
.SH Traffic Class (tclass) in EFA .PP To prioritize the messages from a given endpoint, user can specify -\f[C]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the +\f[V]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the fi_endpoint() call to set the service level in rdma-core. All other tclass values will be ignored. .SH RUNTIME PARAMETERS @@ -328,7 +342,7 @@ to a peer after a receiver not ready error. Enable SHM provider to provide the communication across all intra-node processes. SHM transfer will be disabled in the case where -\f[C]ptrace protection\f[R] is turned on. +\f[V]ptrace protection\f[R] is turned on. You can turn it off to enable shm transfer. .PP FI_EFA_ENABLE_SHM_TRANSFER is parsed during the fi_domain call and is @@ -423,6 +437,6 @@ available. Setting this environment variable to 0 can disable this feature. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_guide.7 b/man/man7/fi_guide.7 index 9706838dfd4..00befca6279 100644 --- a/man/man7/fi_guide.7 +++ b/man/man7/fi_guide.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_guide" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_guide" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -20,16 +34,16 @@ This guide describes the libfabric architecture and interfaces. Due to the length of the guide, it has been broken into multiple pages. 
These sections are: .TP -\f[I]Introduction \f[BI]\f[CBI]fi_intro\f[BI]\f[I](7)\f[R] +\f[I]Introduction \f[VI]fi_intro\f[I](7)\f[R] This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. .TP -\f[I]Architecture \f[BI]\f[CBI]fi_arch\f[BI]\f[I](7)\f[R] +\f[I]Architecture \f[VI]fi_arch\f[I](7)\f[R] This describes the exposed architecture of libfabric, including the object-model and their related operations .TP -\f[I]Setup \f[BI]\f[CBI]fi_setup\f[BI]\f[I](7)\f[R] +\f[I]Setup \f[VI]fi_setup\f[I](7)\f[R] This provides basic bootstrapping and setup for using the libfabric API. .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_hook.7 b/man/man7/fi_hook.7 index cd628f4e22e..6e52f716321 100644 --- a/man/man7/fi_hook.7 +++ b/man/man7/fi_hook.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_hook" "7" "2023\-04\-26" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_hook" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -166,6 +180,6 @@ Application that use FI_TRIGGER operations that attempt to hook calls will likely crash. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7) .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_intro.7 b/man/man7/fi_intro.7 index 97bca65b254..e15ccf34126 100644 --- a/man/man7/fi_intro.7 +++ b/man/man7/fi_intro.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_intro" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_intro" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -8,7 +22,7 @@ fi_intro - libfabric introduction .SH OVERVIEW .PP This introduction is part of the libfabric\[cq]s programmer\[cq]s guide. -See \f[C]fi_guide\f[R](7). +See \f[V]fi_guide\f[R](7). This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. @@ -1124,9 +1138,9 @@ If an application is using 1000 endpoints and posts 100 buffers, each 4 KB, that results in 400 MB of memory space being consumed to receive data. (We can start to realize that by eliminating memory copies, one of the -trade offs is increased memory consumption.) While 400 MB seems like a -lot of memory, there is less than half a megabyte allocated to a single -receive queue. +trade offs is increased memory consumption.) +While 400 MB seems like a lot of memory, there is less than half a +megabyte allocated to a single receive queue. At today\[cq]s networking speeds, that amount of space can be consumed within milliseconds. The result is that if only a few endpoints are in use, the application @@ -1415,6 +1429,6 @@ but it does allow for optimizing network utilization. Libfabric is well architected to support the previously discussed features. 
For further information on the libfabric architecture, see the next -programmer\[cq]s guide section: \f[C]fi_arch\f[R](7). +programmer\[cq]s guide section: \f[V]fi_arch\f[R](7). .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_lpp.7 b/man/man7/fi_lpp.7 index 9a7007308d3..5b8626464bc 100644 --- a/man/man7/fi_lpp.7 +++ b/man/man7/fi_lpp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_lpp" "7" "2024\-08\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_lpp" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -74,6 +88,6 @@ Use the memcpy implementation in the system libc rather than provider-specific memcpy. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_mrail.7 b/man/man7/fi_mrail.7 index 97e3b44caee..cd608b1e134 100644 --- a/man/man7/fi_mrail.7 +++ b/man/man7/fi_mrail.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_mrail" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_mrail" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -83,18 +97,18 @@ Deprecated. Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[R]. 
.TP \f[I]FI_OFI_MRAIL_CONFIG\f[R] -Comma separated list of \f[C]:\f[R] pairs, sorted in -ascending order of \f[C]\f[R]. +Comma separated list of \f[V]:\f[R] pairs, sorted in +ascending order of \f[V]\f[R]. Each pair indicated the rail sharing policy to be used for messages up -to the size \f[C]\f[R] and not covered by all previous pairs. -The value of \f[C]\f[R] can be \f[I]fixed\f[R] (a fixed rail is +to the size \f[V]\f[R] and not covered by all previous pairs. +The value of \f[V]\f[R] can be \f[I]fixed\f[R] (a fixed rail is used), \f[I]round-robin\f[R] (one rail per message, selected in round-robin fashion), or \f[I]striping\f[R] (striping across all the rails). -The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[R]. +The default configuration is \f[V]16384:fixed,ULONG_MAX:striping\f[R]. The value ULONG_MAX can be input as -1. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index 8d9d27d42ff..170ed2bdccc 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_opx" "7" "2024\-09\-12" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_opx" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -158,35 +172,35 @@ Defaults to \[lq]No\[rq] \f[I]FI_OPX_HFI_SELECT\f[R] String. Controls how OPX chooses which HFI to use when opening a context. -Has two forms: - \f[C]\f[R] Force OPX provider to use -\f[C]hfi-unit\f[R]. 
-- \f[C][,[,...,]]\f[R] Select HFI based -on first matching \f[C]selector\f[R] +Has two forms: - \f[V]\f[R] Force OPX provider to use +\f[V]hfi-unit\f[R]. +- \f[V][,[,...,]]\f[R] Select HFI based +on first matching \f[V]selector\f[R] .PP -Where \f[C]selector\f[R] is one of the following forms: - -\f[C]default\f[R] to use the default logic - \f[C]fixed:\f[R] -to fix to one \f[C]hfi-unit\f[R] - -\f[C]::\f[R] +Where \f[V]selector\f[R] is one of the following forms: - +\f[V]default\f[R] to use the default logic - \f[V]fixed:\f[R] +to fix to one \f[V]hfi-unit\f[R] - +\f[V]::\f[R] .PP -The above fields have the following meaning: - \f[C]selector-type\f[R] +The above fields have the following meaning: - \f[V]selector-type\f[R] The selector criteria the caller opening the context is evaluated against. -- \f[C]hfi-unit\f[R] The HFI to use if the caller matches the selector. -- \f[C]selector-data\f[R] Data the caller must match (e.g.\ NUMA node +- \f[V]hfi-unit\f[R] The HFI to use if the caller matches the selector. +- \f[V]selector-data\f[R] Data the caller must match (e.g.\ NUMA node ID). .PP -Where \f[C]selector-type\f[R] is one of the following: - \f[C]numa\f[R] +Where \f[V]selector-type\f[R] is one of the following: - \f[V]numa\f[R] True when caller is local to the NUMA node ID given by -\f[C]selector-data\f[R]. -- \f[C]core\f[R] True when caller is local to the CPU core given by -\f[C]selector-data\f[R]. +\f[V]selector-data\f[R]. +- \f[V]core\f[R] True when caller is local to the CPU core given by +\f[V]selector-data\f[R]. .PP -And \f[C]selector-data\f[R] is one of the following: - \f[C]value\f[R] -The specific value to match - \f[C]-\f[R] +And \f[V]selector-data\f[R] is one of the following: - \f[V]value\f[R] +The specific value to match - \f[V]-\f[R] Matches with any value in that range .PP In the second form, when opening a context, OPX uses the -\f[C]hfi-unit\f[R] of the first-matching selector. +\f[V]hfi-unit\f[R] of the first-matching selector. 
Selectors are evaluated left-to-right. OPX will return an error if the caller does not match any selector. .PP @@ -202,27 +216,27 @@ For the second form, as which HFI is selected depends on properties of the caller, deterministic HFI selection requires deterministic caller properties. E.g. -for the \f[C]numa\f[R] selector, if the caller can migrate between NUMA +for the \f[V]numa\f[R] selector, if the caller can migrate between NUMA domains, then HFI selection will not be deterministic. .PP The logic used will always be the first valid in a selector list. -For example, \f[C]default\f[R] and \f[C]fixed\f[R] will match all +For example, \f[V]default\f[R] and \f[V]fixed\f[R] will match all callers, so if either are in the beginning of a selector list, you will -only use \f[C]fixed\f[R] or \f[C]default\f[R] regardles of if there are +only use \f[V]fixed\f[R] or \f[V]default\f[R] regardles of if there are any more selectors. .PP -Examples: - \f[C]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts +Examples: - \f[V]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts on HFI 0. -- \f[C]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. -- \f[C]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] +- \f[V]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. +- \f[V]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] callers local to NUMA nodes 0 and 2 will use HFI 0, callers local to NUMA domains 1 and 3 will use HFI 1. -- \f[C]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA +- \f[V]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA nodes 0 thru 3 (including 0 and 3) will use HFI 0, and all else will use default selection logic. -- \f[C]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core +- \f[V]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core 0 will use HFI 1, and all others will use HFI 0. 
-- \f[C]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use +- \f[V]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use default HFI selection logic. .TP \f[I]FI_OPX_DELIVERY_COMPLETION_THRESHOLD\f[R] @@ -274,9 +288,9 @@ This feature is not currently supported. \f[I]FI_OPX_PROG_AFFINITY\f[R] String. This sets the affinity to be used for any progress threads. -Set as a colon-separated triplet as \f[C]start:end:stride\f[R], where +Set as a colon-separated triplet as \f[V]start:end:stride\f[R], where stride controls the interval between selected cores. -For example, \f[C]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores +For example, \f[V]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores for progress threads. By default no affinity is set. .TP @@ -320,6 +334,6 @@ Needs to be set to 1 in case of mixed network. Default is 0. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_provider.7 b/man/man7/fi_provider.7 index fb5f2541d83..7d71f23f586 100644 --- a/man/man7/fi_provider.7 +++ b/man/man7/fi_provider.7 @@ -1,13 +1,27 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_provider" "7" "2024\-03\-18" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_provider" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP fi_provider - Fabric Interface Providers .SH OVERVIEW .PP -See \f[C]fi_arch\f[R](7) for a brief description of how providers fit +See \f[V]fi_arch\f[R](7) for a brief description of how providers fit into the libfabric architecture. 
.PP Conceptually, a fabric provider implements and maps the libfabric API @@ -74,52 +88,52 @@ This list is not exhaustive. .TP \f[I]CXI\f[R] Provider for Cray\[cq]s Slingshot network. -See \f[C]fi_cxi\f[R](7) for more information. +See \f[V]fi_cxi\f[R](7) for more information. .TP \f[I]EFA\f[R] A provider for the Amazon EC2 Elastic Fabric Adapter (EFA) (https://aws.amazon.com/hpc/efa/), a custom-built OS bypass hardware interface for inter-instance communication on EC2. -See \f[C]fi_efa\f[R](7) for more information. +See \f[V]fi_efa\f[R](7) for more information. .TP \f[I]OPX\f[R] Supports Omni-Path networking from Cornelis Networks. -See \f[C]fi_opx\f[R](7) for more information. +See \f[V]fi_opx\f[R](7) for more information. .TP \f[I]PSM2\f[R] Older provider for Omni-Path networks. -See \f[C]fi_psm2\f[R](7) for more information. +See \f[V]fi_psm2\f[R](7) for more information. .TP \f[I]PSM3\f[R] Provider for Ethernet networking from Intel. -See \f[C]fi_psm3\f[R](7) for more information. +See \f[V]fi_psm3\f[R](7) for more information. .TP \f[I]SHM\f[R] A provider for intra-node communication using shared memory. -See \f[C]fi_shm\f[R](7) for more information. +See \f[V]fi_shm\f[R](7) for more information. .TP \f[I]TCP\f[R] A provider which runs over the TCP/IP protocol and is available on multiple operating systems. This provider enables develop of libfabric applications on most platforms. -See \f[C]fi_tcp\f[R](7) for more information. +See \f[V]fi_tcp\f[R](7) for more information. .TP \f[I]UCX\f[R] A provider which runs over the UCX library which is currently supported by Infiniband fabrics from NVIDIA. -See \f[C]fi_ucx\f[R](7) for more information. +See \f[V]fi_ucx\f[R](7) for more information. .TP \f[I]UDP\f[R] A provider which runs over the UDP/IP protocol and is available on multiple operating systems. This provider enables develop of libfabric applications on most platforms. -See \f[C]fi_udp\f[R](7) for more information. 
+See \f[V]fi_udp\f[R](7) for more information. .TP \f[I]Verbs\f[R] This provider targets RDMA NICs for both Linux and Windows platforms. -See \f[C]fi_verbs\f[R](7) for more information. +See \f[V]fi_verbs\f[R](7) for more information. .SH Utility Providers .PP Utility providers are named with a starting prefix of \[lq]ofi_\[rq]. @@ -132,17 +146,17 @@ simpler endpoint type. .PP Utility providers show up as part of the return\[cq]s provider\[cq]s name. -See \f[C]fi_fabric\f[R](3). +See \f[V]fi_fabric\f[R](3). Utility providers are enabled automatically for core providers that do not support the feature set requested by an application. .TP \f[I]RxM\f[R] Implements RDM endpoint semantics over MSG endpoints. -See \f[C]fi_rxm\f[R](7) for more information. +See \f[V]fi_rxm\f[R](7) for more information. .TP \f[I]RxD\f[R] Implements RDM endpoint semantis over DGRAM endpoints. -See \f[C]fi_rxd\f[R](7) for more information. +See \f[V]fi_rxd\f[R](7) for more information. .SH Hooking Providers .PP Hooking providers are mostly used for debugging purposes. @@ -153,7 +167,7 @@ Hooking providers can layer over all other providers and intercept, or hook, their calls in order to perform some dedicated task, such as gathering performance data on call paths or providing debug output. .PP -See \f[C]fi_hook\f[R](7) for more information. +See \f[V]fi_hook\f[R](7) for more information. .SH Offload Providers .PP Offload providers start with the naming prefix \[lq]off_\[rq]. @@ -165,6 +179,6 @@ have been offloaded into hardware, though actual hardware offload support is not a requirement. .SH SEE ALSO .PP -\f[C]fabric\f[R](7) \f[C]fi_provider\f[R](3) +\f[V]fabric\f[R](7) \f[V]fi_provider\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_psm2.7 b/man/man7/fi_psm2.7 index 38e6fef9084..a64cc54a270 100644 --- a/man/man7/fi_psm2.7 +++ b/man/man7/fi_psm2.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_psm2" "7" "2023\-06\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_psm2" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -214,11 +228,11 @@ See \f[I]FI_PSM2_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[C][:[:]][,[:[:]]]*\f[R], -where each triplet \f[C]::\f[R] defines a block of +\f[V][:[:]][,[:[:]]]*\f[R], +where each triplet \f[V]::\f[R] defines a block of core_ids. -Both \f[C]\f[R] and \f[C]\f[R] can be either the -\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). +Both \f[V]\f[R] and \f[V]\f[R] can be either the +\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -324,6 +338,6 @@ Valid parameter names are defined in the header file \f[I]rdma/fi_ext_psm2.h\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm3\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm3\f[R](7), .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 index e66e84677a9..2b95e16ebc9 100644 --- a/man/man7/fi_psm3.7 +++ b/man/man7/fi_psm3.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_psm3" "7" "2023\-06\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_psm3" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -11,7 +25,7 @@ The \f[I]psm3\f[R] provider implements a Performance Scaled Messaging capability which supports most verbs UD and sockets devices. Additional features and optimizations can be enabled when running over Intel\[cq]s E810 Ethernet NICs and/or using Intel\[cq]s rendezvous -kernel module (\f[C]rv\f[R]). +kernel module (\f[V]rv\f[R]). PSM 3.x fully integrates the OFI provider and the underlying PSM3 protocols/implementation and only exports the OFI APIs. .SH SUPPORTED FEATURES @@ -209,11 +223,11 @@ See \f[I]FI_PSM3_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[C][:[:]][,[:[:]]]*\f[R], -where each triplet \f[C]::\f[R] defines a block of +\f[V][:[:]][,[:[:]]]*\f[R], +where each triplet \f[V]::\f[R] defines a block of core_ids. -Both \f[C]\f[R] and \f[C]\f[R] can be either the -\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). +Both \f[V]\f[R] and \f[V]\f[R] can be either the +\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -304,6 +318,6 @@ Notice that if the provider is compiled with macro runtime option will be disabled. 
.SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm2\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm2\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxd.7 b/man/man7/fi_rxd.7 index 7590676f118..a87b61ac96b 100644 --- a/man/man7/fi_rxd.7 +++ b/man/man7/fi_rxd.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rxd" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rxd" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -61,6 +75,6 @@ Maximum number of packets (per peer) to send at a time. Default: 128 .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxm.7 b/man/man7/fi_rxm.7 index 037fc74d926..33c293ebb01 100644 --- a/man/man7/fi_rxm.7 +++ b/man/man7/fi_rxm.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rxm" "7" "2024\-03\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rxm" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -212,7 +226,7 @@ to only required values. .PP The data transfer API may return -FI_EAGAIN during on-demand connection setup of the core provider FI_MSG_EP. 
-See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH Troubleshooting / Known issues .PP @@ -229,6 +243,6 @@ The workaround is to use shared receive contexts for the MSG provider (FI_OFI_RXM_MSG_TX_SIZE / FI_OFI_RXM_MSG_RX_SIZE). .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_setup.7 b/man/man7/fi_setup.7 index ebb88bddee3..a7126fbdd89 100644 --- a/man/man7/fi_setup.7 +++ b/man/man7/fi_setup.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_setup" "7" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_setup" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -459,8 +473,9 @@ libfabric defines a unique threading model. The libfabric design is heavily influenced by object-oriented programming concepts. A multi-threaded application must determine how libfabric objects -(domains, endpoints, completion queues, etc.) will be allocated among -its threads, or if any thread can access any object. +(domains, endpoints, completion queues, etc.) +will be allocated among its threads, or if any thread can access any +object. For example, an application may spawn a new thread to handle each new connected endpoint. 
The domain threading field provides a mechanism for an application to diff --git a/man/man7/fi_shm.7 b/man/man7/fi_shm.7 index 6353533c667..7b9d24db54d 100644 --- a/man/man7/fi_shm.7 +++ b/man/man7/fi_shm.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_shm" "7" "2023\-08\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_shm" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -89,7 +103,7 @@ was provided by the application), no supplemental information is required to make it unique and it will remain with only the application-defined address. Note that the actual endpoint name will not include the FI_ADDR_STR -\[dq]*://\[dq] prefix since it cannot be included in any shared memory +\[lq]*://\[rq] prefix since it cannot be included in any shared memory region names. The provider will strip off the prefix before setting the endpoint name. As a result, the addresses \[lq]fi_prefix1://my_node:my_service\[rq] and @@ -204,6 +218,6 @@ different systems. Default 262144 .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_sockets.7 b/man/man7/fi_sockets.7 index 4f48b3ea613..bd9b568ce93 100644 --- a/man/man7/fi_sockets.7 +++ b/man/man7/fi_sockets.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_sockets" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. 
+.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_sockets" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -128,6 +142,6 @@ The recommended parameters for large scale runs are \f[I]FI_SOCKETS_DEF_CQ_SZ\f[R], \f[I]FI_SOCKETS_DEF_EQ_SZ\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_tcp.7 b/man/man7/fi_tcp.7 index bf3bfe4d8fb..3f661e6ec0b 100644 --- a/man/man7/fi_tcp.7 +++ b/man/man7/fi_tcp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_tcp" "7" "2023\-03\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_tcp" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -114,6 +128,6 @@ from the tcp provider. This will provide the best performance. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_ucx.7 b/man/man7/fi_ucx.7 index c3b3e48c868..d160b304a13 100644 --- a/man/man7/fi_ucx.7 +++ b/man/man7/fi_ucx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_ucx" "7" "2023\-02\-24" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. 
ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_ucx" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -52,6 +66,6 @@ any). Check request leak (default: disabled). .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_udp.7 b/man/man7/fi_udp.7 index ea65939d131..5c5a5915e0f 100644 --- a/man/man7/fi_udp.7 +++ b/man/man7/fi_udp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_udp" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_udp" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -51,6 +65,6 @@ No support for counters. No runtime parameters are currently defined. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_usnic.7 b/man/man7/fi_usnic.7 index cf03f28a0f7..01f035652b5 100644 --- a/man/man7/fi_usnic.7 +++ b/man/man7/fi_usnic.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_usnic" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_usnic" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -29,7 +43,7 @@ installing libnl from RPM or other packaging system, install the If you have libnl (either v1 or v3) installed in a non-standard location (e.g., not in /usr/lib or /usr/lib64), you may need to tell libfabric\[cq]s configure where to find libnl via the -\f[C]--with-libnl=DIR\f[R] command line option (where DIR is the +\f[V]--with-libnl=DIR\f[R] command line option (where DIR is the installation prefix of the libnl package). .RE .IP \[bu] 2 @@ -56,7 +70,7 @@ In particular, there are known bugs in RDM support in the presence of congestion or packet loss (issue 1621). RMA is not yet supported. .IP \[bu] 2 -\f[C]fi_provider\f[R](7) lists requirements for all providers. +\f[V]fi_provider\f[R](7) lists requirements for all providers. The following limitations exist in the \f[I]usnic\f[R] provider: .RS 2 .IP \[bu] 2 @@ -69,13 +83,13 @@ CM operations. Passive endpoints only support listen, setname, and getname CM operations. .IP \[bu] 2 -\f[I]FI_EP_DGRAM\f[R] endpoints support \f[C]fi_sendmsg()\f[R] and -\f[C]fi_recvmsg()\f[R], but some flags are ignored. -\f[C]fi_sendmsg()\f[R] supports \f[C]FI_INJECT\f[R] and -\f[C]FI_COMPLETION\f[R]. -\f[C]fi_recvmsg()\f[R] supports \f[C]FI_MORE\f[R]. +\f[I]FI_EP_DGRAM\f[R] endpoints support \f[V]fi_sendmsg()\f[R] and +\f[V]fi_recvmsg()\f[R], but some flags are ignored. +\f[V]fi_sendmsg()\f[R] supports \f[V]FI_INJECT\f[R] and +\f[V]FI_COMPLETION\f[R]. +\f[V]fi_recvmsg()\f[R] supports \f[V]FI_MORE\f[R]. .IP \[bu] 2 -Address vectors only support \f[C]FI_AV_MAP\f[R]. +Address vectors only support \f[V]FI_AV_MAP\f[R]. .IP \[bu] 2 No counters are supported. .IP \[bu] 2 @@ -119,19 +133,19 @@ file. 
Version 2 of the \[lq]fabric getinfo\[rq] extension was introduced in Libfabric release v1.3.0 and can be used to retrieve IP and SR-IOV information about a usNIC device obtained from the -\f[C]fi_getinfo\f[R](3) function. +\f[V]fi_getinfo\f[R](3) function. .PP The \[lq]fabric getinfo\[rq] extension is obtained by calling -\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[R] to +\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_FABRIC_OPS_1\f[R] to get the usNIC fabric extension operations. -The \f[C]getinfo\f[R] function accepts a version parameter that can be +The \f[V]getinfo\f[R] function accepts a version parameter that can be used to select different versions of the extension. The information returned by the \[lq]fabric getinfo\[rq] extension is -accessible through a \f[C]fi_usnic_info\f[R] struct that uses a version +accessible through a \f[V]fi_usnic_info\f[R] struct that uses a version tagged union. The accessed union member must correspond with the requested version. It is recommended that applications explicitly request a version rather -than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[R]. +than using the header provided \f[V]FI_EXT_USNIC_INFO_VERSION\f[R]. Although there is a version 1 of the extension, its use is discouraged, and it may not be available in future releases. .SS Compatibility issues @@ -244,8 +258,8 @@ struct fi_usnic_info_v1 { .fi .PP Version 1 of the \[lq]fabric getinfo\[rq] extension can be used by -explicitly requesting it in the call to \f[C]getinfo\f[R] and accessing -the \f[C]v1\f[R] portion of the \f[C]fi_usnic_info.ui\f[R] union. +explicitly requesting it in the call to \f[V]getinfo\f[R] and accessing +the \f[V]v1\f[R] portion of the \f[V]fi_usnic_info.ui\f[R] union. Use of version 1 is not recommended and it may be removed from future releases. .PP @@ -327,7 +341,7 @@ Libfabric release v1.0.0 and can be used to retrieve the network distance of an address. 
.PP The \[lq]get_distance\[rq] extension is obtained by calling -\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_AV_OPS_1\f[R] to get +\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_AV_OPS_1\f[R] to get the usNIC address vector extension operations. .IP .nf @@ -343,9 +357,9 @@ Address vector Destination address .TP \f[I]metric\f[R] -On output this will contain \f[C]-1\f[R] if the destination host is -unreachable, \f[C]0\f[R] is the destination host is locally connected, -and \f[C]1\f[R] otherwise. +On output this will contain \f[V]-1\f[R] if the destination host is +unreachable, \f[V]0\f[R] is the destination host is locally connected, +and \f[V]1\f[R] otherwise. .PP See fi_ext_usnic.h for more details. .SH VERSION DIFFERENCES @@ -355,28 +369,28 @@ The release of libfabric v1.4 introduced a new naming convention for fabric and domain. However the usNIC provider remains backward compatible with applications supporting the old scheme and decides which one to use based on the -version passed to \f[C]fi_getinfo\f[R]: +version passed to \f[V]fi_getinfo\f[R]: .IP \[bu] 2 -When \f[C]FI_VERSION(1,4)\f[R] or higher is used: +When \f[V]FI_VERSION(1,4)\f[R] or higher is used: .RS 2 .IP \[bu] 2 fabric name is the network address with the CIDR notation (i.e., -\f[C]a.b.c.d/e\f[R]) +\f[V]a.b.c.d/e\f[R]) .IP \[bu] 2 -domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) +domain name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) .RE .IP \[bu] 2 -When a lower version number is used, like \f[C]FI_VERSION(1, 3)\f[R], it +When a lower version number is used, like \f[V]FI_VERSION(1, 3)\f[R], it follows the same behavior the usNIC provider exhibited in libfabric <= v1.3: .RS 2 .IP \[bu] 2 -fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) +fabric name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) .IP \[bu] 2 -domain name is \f[C]NULL\f[R] +domain name is \f[V]NULL\f[R] .RE .SH SEE ALSO .PP -\f[C]fabric\f[R](7), 
\f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_open_ops\f[R](3), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_verbs.7 b/man/man7/fi_verbs.7 index a4b8653ea0d..866829ed8e0 100644 --- a/man/man7/fi_verbs.7 +++ b/man/man7/fi_verbs.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_verbs" "7" "2024\-08\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_verbs" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -147,7 +161,7 @@ to be re-mapped when the process is forked (MADV_DONTFORK). .PP The XRC transport is intended to be used when layered with the RXM provider and requires the use of shared receive contexts. -See \f[C]fi_rxm\f[R](7). +See \f[V]fi_rxm\f[R](7). To enable XRC, the following environment variables must usually be set: FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX. .SH RUNTIME PARAMETERS @@ -280,6 +294,6 @@ post excess receives without draining the CQ. CQ overruns can make the MSG endpoints unusable. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. 
From d38a92e70074a6051cc2d938a4ae23be662f741c Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 11 Oct 2024 10:00:01 -0700 Subject: [PATCH 104/393] contrib/intel/jenkins: Split mpichtestsuite into multiple stages Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index fd5bc54a65a..8d8116ec8db 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -691,11 +691,28 @@ pipeline { } } } - stage('mpichtestsuite') { + stage('mpichtestsuite-tcp') { steps { script { dir (RUN_LOCATION) { - def providers = [['tcp', null], ["verbs","rxm"]] + def providers = [['tcp', null]] + def MPIS = ["mpich"] + if (env.WEEKLY.toBoolean()) { + MPIS = ["impi", "mpich"] + } + for (def mpi in MPIS) { + run_middleware(providers, "mpichtestsuite", "mpichtestsuite", + "grass", "bulbasaur", "2", "${mpi}") + } + } + } + } + } + stage('mpichtestsuite-verbs') { + steps { + script { + dir (RUN_LOCATION) { + def providers = [["verbs","rxm"]] def MPIS = ["mpich"] if (env.WEEKLY.toBoolean()) { MPIS = ["impi", "mpich"] From 8e01a3f9479e74d176baf83cd437405502cc9f79 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 11 Oct 2024 10:03:30 -0700 Subject: [PATCH 105/393] contrib/intel/jenkins: Re-enable PSM3 to run in OneCCL-GPU Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 8d8116ec8db..30e0089c3d1 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -784,18 +784,18 @@ pipeline { steps { script { dir (RUN_LOCATION) { - // run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", - // "gpu", "torchic", "1", null, null, - // "FI_HMEM_DISABLE_P2P=1") - run_middleware([["verbs", 
null]], "oneCCL-GPU-v3", "onecclgpu", + run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", + "gpu", "torchic", "1", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", + "gpu", "torchic", "1", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", + "gpu", "torchic", "1", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["shm", null]], "oneCCL-GPU-v3", "onecclgpu", "gpu", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") - run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") - run_middleware([["shm", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") } } } From 781327d5cd665a31cfc7c947f109085f08f8f40d Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Wed, 2 Mar 2022 17:00:40 -0600 Subject: [PATCH 106/393] prov/util: Handle page faults in uffd monitor In order to receive unmap events, uffd uses 'mode missing' when registering memory regions. This implies getting page fault events as well. So handle them by returning a zero-filled page. Page faults come in 3 flavors: reads, writes and writes to protected pages. The only ones we can handle are writes to non-backed pages. Signed-off-by: Mike Uttormark Signed-off-by: Ian Ziemba --- configure.ac | 33 ++++++++- prov/util/src/util_mem_monitor.c | 117 ++++++++++++++++++++++++++++++- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index e56e370ee7a..777a870f69a 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ dnl dnl Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2019-2021 Intel, Inc. All rights reserved. dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
-dnl (C) Copyright 2020 Hewlett Packard Enterprise Development LP +dnl (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP dnl Copyright (c) 2022 DataDirect Networks, Inc. All rights reserved. dnl Copyright (c) 2023 Tactical Computing Labs, LLC. All rights reserved. dnl @@ -557,6 +557,37 @@ AS_IF([test $have_uffd -eq 1], AC_DEFINE_UNQUOTED([HAVE_UFFD_UNMAP], [$have_uffd], [Define to 1 if platform supports userfault fd unmap]) +dnl Check uffd thread id support +have_uffd_thread_id=0 +AS_IF([test $have_uffd -eq 1], + [AC_MSG_CHECKING([for userfaultfd thread id support]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include + #include + #include + #include + #include + #include + ]], + [[ + int fd; + struct uffdio_api api_obj; + api_obj.api = UFFD_API; + api_obj.features = UFFD_FEATURE_THREAD_ID | + UFFD_FEATURE_EVENT_UNMAP | + UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_EVENT_REMAP; + fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + return ioctl(fd, UFFDIO_API, &api_obj); + ]]) + ], + [AC_MSG_RESULT([yes]) + have_uffd_thread_id=1], + [AC_MSG_RESULT([no])])]) + +AC_DEFINE_UNQUOTED([HAVE_UFFD_THREAD_ID], [$have_uffd_thread_id], + [Define to 1 if platform supports userfault fd thread id]) + dnl restricted DL open restricted_dl=0 AC_ARG_ENABLE([restricted_dl], diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 9b4c0bc954d..746cf50fb59 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -3,7 +3,7 @@ * Copyright (c) 2017-2021 Intel Inc. All rights reserved. * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. - * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP * Copyright (C) 2024 Cornelis Networks. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -555,6 +555,8 @@ void ofi_monitor_unsubscribe_no_op(struct ofi_mem_monitor *notifier, #include #include +static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); + /* The userfault fd monitor requires for events that could * trigger it to be handled outside of the monitor functions * itself. When a fault occurs on a monitored region, the @@ -588,6 +590,8 @@ static void *ofi_uffd_handler(void *arg) continue; } + FI_DBG(&core_prov, FI_LOG_MR, "Received UFFD event %d\n", msg.event); + switch (msg.event) { case UFFD_EVENT_REMOVE: ofi_monitor_unsubscribe(&uffd.monitor, @@ -606,6 +610,9 @@ static void *ofi_uffd_handler(void *arg) (void *) (uintptr_t) msg.arg.remap.from, (size_t) msg.arg.remap.len); break; + case UFFD_EVENT_PAGEFAULT: + ofi_uffd_pagefault_handler(&msg); + break; default: FI_WARN(&core_prov, FI_LOG_MR, "Unhandled uffd event %d\n", msg.event); @@ -617,6 +624,114 @@ static void *ofi_uffd_handler(void *arg) return NULL; } +static void ofi_uffd_pagefault_handler(struct uffd_msg *msg) +{ + struct uffdio_zeropage zp; + int i; + int ret; + void * const address = (void *) (uintptr_t) msg->arg.pagefault.address; + uint64_t const flags = (uint64_t) msg->arg.pagefault.flags; +#if HAVE_UFFD_THREAD_ID + uint32_t const ptid = (uint32_t) msg->arg.pagefault.feat.ptid; +#endif + /* ofi_uffd_register sets the mode to + * UFFDIO_REGISTER_MODE_MISSING. As a result, we can + * get read, write or write-protect notifications via + * UFFD_EVENT_PAGEFAULT. The only ones we can sensibly + * handle are writes to non-backed pages. + * (Read and write-protect notifications are likely + * application bugs.) 
+ */ + + if (flags != UFFD_PAGEFAULT_FLAG_WRITE) { +#if HAVE_UFFD_THREAD_ID + FI_WARN(&core_prov, FI_LOG_MR, + "UFFD pagefault with unrecognized flags: %lu, address %p, thread %u\n", + flags, address, ptid); +#else + FI_WARN(&core_prov, FI_LOG_MR, + "UFFD pagefault with unrecognized flags: %lu, address %p\n", + flags, address); +#endif + /* The faulting thread is halted at this point. In + * theory we could wake it up with UFFDIO_WAKE. In + * practice that requires the address range of the + * fault, information we don't have from the + * pagefault event. + */ + + return; + } + + /* The event tells us the address of the fault + * (which can be anywhere on the page). It does not + * tell us the size of the page so we have to guess + * from the list of known page_sizes. + * + * We employ the standard resolution: install a zeroed page. + */ + + for (i = 0; i < num_page_sizes; ) { + /* setup a zeropage reqest for this pagesize */ + zp.range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(address, page_sizes[i]); + zp.range.len = (uint64_t) page_sizes[i]; + zp.mode = 0; + zp.zeropage = 0; + + ret = ioctl(uffd.fd, UFFDIO_ZEROPAGE, &zp); + + if (ret == 0) /* success */ + return; + + /* Note: the documentation (man ioctl_userfaultfd) says + * that the ioctl() returns -1 on error and errno is set + * to indicate the error. It also says that the zeropage + * member of struct uffdio_zeropage is set to the negated + * error. The unit tests for uffd say + * real retval in uffdio_zeropage.zeropage + * so that's what we use here. + */ + + if (zp.zeropage == -EAGAIN) + /* This is a tough case. If the memory map is + * changing, the kernel returns EAGAIN before + * installing the zeroed page. So the page + * fault has not been rectified. If we don't try + * again, the application will crash. If we add + * a maximum retry count we could still end up + * with an unresolved page fault. 
+ * + * It's likely a kernel bug or (something else + * bad like OOM) if it returns EAGAIN forever. + * So we retry until we get something besides + * EAGAIN. + */ + continue; /* retry this page size */ + + i++; /* try next page size */ + + if (zp.zeropage == -EINVAL) /* wrong page size */ + continue; + + /* If we get here we failed to install the zeroed + * page for this page size and it wasn't a size error. + * We could either stop trying or go on to the + * next pagesize. We choose to print a message and try + * another page size. + */ + + FI_DBG(&core_prov, FI_LOG_MR, + "Unable to install zeroed page of size %zu to handle page fault." + " address = %p zeropage = %lld errno = %d\n", + page_sizes[i], address, zp.zeropage, errno); + } + + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to handle event UFFD_EVENT_PAGEFAULT for address %p.\n", + address); +} + static int ofi_uffd_register(const void *addr, size_t len, size_t page_size) { struct uffdio_register reg; From e1a6a7a13a03310a7696e194e10964345ab68901 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:37:11 +0000 Subject: [PATCH 107/393] build(deps): bump actions/checkout from 4.2.0 to 4.2.1 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.0 to 4.2.1. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/d632683dd7b4114ad314bca15554477dd762a938...eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/clang-format-check-cn.yml | 2 +- .github/workflows/clang-format-check.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/coverity.yml | 2 +- .github/workflows/gh-man.yaml | 2 +- .github/workflows/nroff-elves.yaml | 2 +- .github/workflows/pr-ci.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/clang-format-check-cn.yml b/.github/workflows/clang-format-check-cn.yml index 4cfa1bb2a5b..8474d3326d4 100644 --- a/.github/workflows/clang-format-check-cn.yml +++ b/.github/workflows/clang-format-check-cn.yml @@ -9,7 +9,7 @@ jobs: path: - 'prov/opx' steps: - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Run clang-format style check for C/C++/Protobuf programs (Cornelis Networks-specific). uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 with: diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 7afb11d4147..47ca4512d60 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -11,7 +11,7 @@ jobs: path: - 'prov/sm2' steps: - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Run clang-format style check for C/C++/Protobuf programs. 
uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 with: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 390b8a62945..8c61b6de241 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -48,7 +48,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 6bfdb0ff69d..745d79d4b3e 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -52,7 +52,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Download Coverity tools run: | wget https://scan.coverity.com/download/linux64 --post-data "token=${{ secrets.COVERITY_SCAN_TOKEN }}&project=ofiwg%2Flibfabric" -O coverity_tool.tgz diff --git a/.github/workflows/gh-man.yaml b/.github/workflows/gh-man.yaml index 4f29c5eabc6..754604332a1 100644 --- a/.github/workflows/gh-man.yaml +++ b/.github/workflows/gh-man.yaml @@ -25,7 +25,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Update the man pages in branch gh-pages run: .github/workflows/gh-man.sh diff --git a/.github/workflows/nroff-elves.yaml b/.github/workflows/nroff-elves.yaml index e2caa2d9213..5d72b4d1ea8 100644 --- a/.github/workflows/nroff-elves.yaml +++ b/.github/workflows/nroff-elves.yaml @@ -23,7 +23,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: 
actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Get the required packages run: sudo apt install -y pandoc diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index b8894e1dbf8..9c491198f5a 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -56,7 +56,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Build Check run: | set -x @@ -96,7 +96,7 @@ jobs: sudo apt-add-repository 'deb [arch=amd64] https://repositories.intel.com/graphics/ubuntu focal main' sudo apt-get update sudo apt-get install -y level-zero level-zero-dev - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: HMEM Checks run: | set -x @@ -126,7 +126,7 @@ jobs: run: | brew install automake brew install --quiet libtool - - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Build Check run: | ./autogen.sh diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index d2fe181a360..1ce2ff7cb73 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -33,7 +33,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 with: persist-credentials: false From 3c23d9d6d42e7ad1031707626a65be76ade8cdce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:37:22 +0000 Subject: [PATCH 108/393] build(deps): bump 
github/codeql-action from 3.26.11 to 3.26.13 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.11 to 3.26.13. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea...f779452ac5af1c261dce0346a8f964149f49322b) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8c61b6de241..934c372aa2b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 + uses: github/codeql-action/init@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 + uses: github/codeql-action/autobuild@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 # â„šī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 + uses: github/codeql-action/analyze@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 1ce2ff7cb73..e78012684e8 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@6db8d6351fd0be61f9ed8ebd12ccd35dcec51fea # v3.26.11 + uses: github/codeql-action/upload-sarif@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 with: sarif_file: results.sarif From 2cda1fdc80ece1e4527e85545f81fd5a3fd3b9ba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:37:26 +0000 Subject: [PATCH 109/393] build(deps): bump actions/upload-artifact from 4.4.1 to 4.4.3 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.4.1 to 4.4.3. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/604373da6381bf24206979c74d06a550515601b9...b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/coverity.yml | 2 +- .github/workflows/pr-ci.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 745d79d4b3e..3eb28e6a0e8 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -94,7 +94,7 @@ jobs: --form description="`$PWD/install/bin/fi_info -l`" \ https://scan.coverity.com/builds?project=ofiwg%2Flibfabric - name: Upload build logs - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: coverity-build-log.txt path: cov-int/build-log.txt diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 9c491198f5a..1a73450e5de 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -70,7 +70,7 @@ jobs: $PWD/install/bin/fi_info -l - name: Upload build logs if: failure() - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: ${{ matrix.os }}-${{ matrix.cc }}-config.log path: config.log @@ -115,7 +115,7 @@ jobs: $PWD/install/bin/fi_info -c FI_HMEM - name: Upload build logs if: failure() - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: hmem-config.log path: config.log @@ -139,7 +139,7 @@ jobs: make -j2 - name: Upload build logs if: failure() - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: macos-config.log path: config.log diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index e78012684e8..2fb410afcfe 100644 
--- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -60,7 +60,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: SARIF file path: results.sarif From 50a90d5d692ca46483e1c8773bdf057f7aa5fcd3 Mon Sep 17 00:00:00 2001 From: Itai Masuari Date: Tue, 13 Aug 2024 11:30:30 +0300 Subject: [PATCH 110/393] use new synapse api Signed-off-by: Itai Masuari --- fabtests/common/hmem_synapseai.c | 85 ++------------------------------ 1 file changed, 3 insertions(+), 82 deletions(-) diff --git a/fabtests/common/hmem_synapseai.c b/fabtests/common/hmem_synapseai.c index 62a34b7358b..c6858638784 100644 --- a/fabtests/common/hmem_synapseai.c +++ b/fabtests/common/hmem_synapseai.c @@ -41,31 +41,6 @@ #include "habanalabs/synapse_api.h" #include "habanalabs/hlthunk.h" -#define SCAL_SUCCESS 0 - -#define DECLARE_HANDLE(name) struct name##__ { int unused; }; \ - typedef struct name##__ *name - -DECLARE_HANDLE(scal_handle_t); -DECLARE_HANDLE(scal_pool_handle_t); - -typedef struct _scal_memory_pool_infoV2 -{ - scal_handle_t scal; - const char * name; - unsigned idx; - uint64_t device_base_address; - void *host_base_address; - uint32_t core_base_address; // 0 when the pool is not mapped to the cores - uint64_t totalSize; - uint64_t freeSize; - uint64_t device_base_allocated_address; -} scal_memory_pool_infoV2; - -int scal_get_handle_from_fd(int fd, scal_handle_t* scal); -int scal_get_pool_handle_by_name(const scal_handle_t scal, const char *pool_name, scal_pool_handle_t *pool); -int scal_pool_get_infoV2(const scal_pool_handle_t pool, scal_memory_pool_infoV2 *info); - #define ACCEL_PAGE_SIZE 4096 struct synapseai_ops { synStatus (*synInitialize)(void); @@ -88,20 +63,14 
@@ struct synapseai_ops { synStatus (*synDeviceGetInfoV2)(const synDeviceId deviceId, synDeviceInfoV2 *pDeviceInfo); int (*hlthunk_device_mapped_memory_export_dmabuf_fd)(int fd, uint64_t addr, uint64_t size, uint64_t offset, uint32_t flags); - int (*scal_pool_get_infoV2)(const scal_pool_handle_t pool, scal_memory_pool_infoV2 *info); - int (*scal_get_pool_handle_by_name)(const scal_handle_t scal, const char *pool_name, - scal_pool_handle_t *pool); - int (*scal_get_handle_from_fd)(int fd, scal_handle_t *scal); }; static void *synapseai_handle; static void *hlthunk_handle; -static void *scal_handle; static struct synapseai_ops synapseai_ops; static synDeviceId synapseai_fd = -1; static synStreamHandle synapseai_stream_handle; static synDeviceInfoV2 deviceInfo; -static uint64_t device_fd; static void cleanup_synapseai_ops(void) { @@ -114,11 +83,6 @@ static void cleanup_synapseai_ops(void) dlclose(hlthunk_handle); hlthunk_handle = NULL; } - - if (scal_handle) { - dlclose(scal_handle); - scal_handle = NULL; - } } int init_synapseai_ops(void) @@ -215,31 +179,6 @@ int init_synapseai_ops(void) goto err_dlclose; } - scal_handle = dlopen("libscal.so", RTLD_NOW); - if (!scal_handle) { - FT_ERR("Falid to dlopen libscal.so\n"); - goto err_dlclose; - } - - synapseai_ops.scal_pool_get_infoV2 = dlsym(scal_handle, "scal_pool_get_infoV2"); - if (!synapseai_ops.scal_pool_get_infoV2) { - FT_ERR("Failed to find scal_pool_get_infoV2\n"); - goto err_dlclose; - } - - synapseai_ops.scal_get_pool_handle_by_name = - dlsym(scal_handle, "scal_get_pool_handle_by_name"); - if (!synapseai_ops.scal_get_pool_handle_by_name) { - FT_ERR("Failed to find scal_get_pool_handle_by_name\n"); - goto err_dlclose; - } - - synapseai_ops.scal_get_handle_from_fd = dlsym(scal_handle, "scal_get_handle_from_fd"); - if (!synapseai_ops.scal_get_handle_from_fd) { - FT_ERR("Failed to find scal_get_handle_from_fd\n"); - goto err_dlclose; - } - return FI_SUCCESS; err_dlclose: @@ -284,7 +223,6 @@ int 
ft_synapseai_init(void) FT_ERR("Failed to synDeviceGetInfoV2()\n"); goto err; } - device_fd = deviceInfo.fd; if (synapseai_ops.synStreamCreateGeneric(&synapseai_stream_handle, synapseai_fd, 0) != synSuccess) { @@ -383,29 +321,12 @@ int ft_synapseai_copy_from_hmem(uint64_t device, void *dst, const void *src, siz int ft_synapseai_get_dmabuf_fd(void *buf, size_t len, int *dmabuf_fd, uint64_t *dmabuf_offset) { - scal_pool_handle_t mpHandle; - scal_memory_pool_infoV2 mpInfo; - scal_handle_t a = 0; - - if (synapseai_ops.scal_get_handle_from_fd(device_fd, &a) != SCAL_SUCCESS) { - return -FI_ENOBUFS; - } - - if (synapseai_ops.scal_get_pool_handle_by_name(a, "global_hbm", &mpHandle) != SCAL_SUCCESS) { - return -FI_ENOBUFS; - } - - if (synapseai_ops.scal_pool_get_infoV2(mpHandle, &mpInfo) != SCAL_SUCCESS) { - return -FI_ENOBUFS; - } - uint64_t baseAddress = mpInfo.device_base_allocated_address; - size_t buf_size = (len + ACCEL_PAGE_SIZE - 1) & ~(ACCEL_PAGE_SIZE - 1); *dmabuf_fd = - synapseai_ops.hlthunk_device_mapped_memory_export_dmabuf_fd(device_fd, - baseAddress, + synapseai_ops.hlthunk_device_mapped_memory_export_dmabuf_fd(deviceInfo.fd, + deviceInfo.globalHbmBaseAddress, buf_size, - (uint64_t)buf - baseAddress, + (uint64_t)buf - deviceInfo.globalHbmBaseAddress, (O_RDWR | O_CLOEXEC)); if (*dmabuf_fd < 0) { From 382f92f23ba29c423f8c62ca0c6e2f7d3c4dd01f Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 14 Oct 2024 11:44:55 -0700 Subject: [PATCH 111/393] windows/osd.h: fix and refactor logical operations on complex numbers Fix incorrect atomic LOR on complex numbers. The values were incorrectly getting ANDed together instead of ORed. This went unnoticed because the code was very difficult to read. This also refactors the logical checks with a helper function to make it more readible and less prone to errors. 
Signed-off-by: Alexia Ingerson --- include/windows/osd.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/include/windows/osd.h b/include/windows/osd.h index 88b1754979e..d9698bd9724 100644 --- a/include/windows/osd.h +++ b/include/windows/osd.h @@ -1054,12 +1054,16 @@ static inline ofi_complex_## type ofi_complex_prod_## type \ res.imag = a.real * b.imag + a.imag * b.real; \ return res; \ } \ +static bool ofi_complex_is_true_## type (ofi_complex_ ## type a)\ +{ \ + return a.real != 0 || a.imag != 0; \ +} \ static inline ofi_complex_## type ofi_complex_land_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ { \ ofi_complex_## type res; \ - res.real = (type)(((a.real != 0) || (a.imag != 0)) && \ - ((b.real != 0) || (b.imag != 0))); \ + res.real = (type)(ofi_complex_is_true_## type (a) && \ + ofi_complex_is_true_## type (b)); \ res.imag = 0; \ return res; \ } \ @@ -1067,8 +1071,8 @@ static inline ofi_complex_## type ofi_complex_lor_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ { \ ofi_complex_## type res; \ - res.real = (type)(((a.real != 0) || (a.imag != 0)) && \ - ((b.real != 0) || (b.imag != 0))); \ + res.real = (type)(ofi_complex_is_true_## type (a) || \ + ofi_complex_is_true_## type (b)); \ res.imag = 0; \ return res; \ } \ @@ -1076,10 +1080,10 @@ static inline ofi_complex_## type ofi_complex_lxor_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ { \ ofi_complex_## type res; \ - res.real = (type)((((a.real != 0) || (a.imag != 0)) && \ - !((b.real != 0) || (b.imag != 0))) || \ - (!((a.real != 0) || (a.imag != 0)) && \ - ((b.real != 0) || (b.imag != 0)))); \ + res.real = (type)((ofi_complex_is_true_## type (a) && \ + !ofi_complex_is_true_## type (b))) || \ + (!ofi_complex_is_true_## type (a) && \ + ofi_complex_is_true_## type (b)); \ res.imag = 0; \ return res; \ } From 0d5359a82b4f09a25281a957aaf67d4c2c7d0492 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 18 Sep 2024 17:19:18 -0700 
Subject: [PATCH 112/393] fabtests/common: move ubertest atomic validation code to common This allows fabtests to make use of atomic validation code There were many Windows atomics bugs, inconsistencies, and missing definitions. This patch also cleans up the entire ofi_atomic.c implementation for unix and windows The following changes are included: - Separate fill and check based on real or complex types as setting and reading complexes on windows is not allowed (not native datatype, abstracted). Complex versions use eq and set functions specific for complexes defined in osd.h - Remove duplicated ofi_complex definitions in ofi_atomic (already in osd.h file) - Add general check_atomic and fill_atomic calls and use them in ubertest - Add EXPAND ( x ) x define to work nicely with windows VA_ARGS handling - Fix inconsistency with ofi_complex_type/or naming ('complex' always should come first) - Fix inconsistency with op names "equ" and "mul" -> "eq" and "prod" - Add missing lxor complex op definitions on Windows Signed-off-by: Alexia Ingerson --- fabtests/Makefile.am | 4 +- fabtests/Makefile.win | 2 +- fabtests/{ubertest => common}/ofi_atomic.c | 8 +- fabtests/common/shared.c | 106 ++++++++++++++- fabtests/{ubertest => include}/ofi_atomic.h | 44 +----- fabtests/include/shared.h | 103 ++++++++++++++ fabtests/include/unix/osd.h | 16 ++- fabtests/include/windows/osd.h | 25 +++- fabtests/ubertest/test_ctrl.c | 4 +- fabtests/ubertest/verify.c | 140 +++----------------- 10 files changed, 265 insertions(+), 187 deletions(-) rename fabtests/{ubertest => common}/ofi_atomic.c (98%) rename fabtests/{ubertest => include}/ofi_atomic.h (72%) diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index b97ca169e5f..6d830668833 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -229,6 +229,8 @@ libfabtests_la_SOURCES = \ common/hmem_ze.c \ common/hmem_neuron.c \ common/hmem_synapseai.c \ + common/ofi_atomic.c \ + include/ofi_atomic.h \ include/shared.h \ 
include/ft_list.h \ include/hmem.h \ @@ -468,8 +470,6 @@ unit_fi_setopt_test_LDADD = libfabtests.la ubertest_fi_ubertest_SOURCES = \ ubertest/fabtest.h \ - ubertest/ofi_atomic.h \ - ubertest/ofi_atomic.c \ ubertest/uber.c \ ubertest/connect.c \ ubertest/cq.c \ diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index dc3a28fe0b6..da244c78735 100644 --- a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -46,7 +46,7 @@ CFLAGS = $(CFLAGS) /O2 /MT basedeps = common\hmem.c common\shared.c \ common\windows\getopt.c common\windows\osd.c \ common\hmem_cuda.c common\hmem_rocr.c common\hmem_ze.c \ - common\hmem_neuron.c common\hmem_synapseai.c + common\hmem_neuron.c common\hmem_synapseai.c common\ofi_atomic.c includes = /Iinclude /Iinclude\windows /I..\include /FIft_osd.h \ /Iinclude\windows\getopt /Imultinode\include diff --git a/fabtests/ubertest/ofi_atomic.c b/fabtests/common/ofi_atomic.c similarity index 98% rename from fabtests/ubertest/ofi_atomic.c rename to fabtests/common/ofi_atomic.c index 311747175d5..8483284f8ff 100644 --- a/fabtests/ubertest/ofi_atomic.c +++ b/fabtests/common/ofi_atomic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. + * Copyright (c) Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -141,7 +141,7 @@ ofi_complex_##type *r = (res); \ OFI_UNUSED(src); \ for (i = 0; i < cnt; i++) \ - r[i] = d[i]; \ + ofi_complex_set_##type (&r[i], d[i]); \ } /* @@ -173,7 +173,7 @@ const ofi_complex_##type *s = (src); \ ofi_complex_##type *r = (res); \ for (i = 0; i < cnt; i++) { \ - r[i] = d[i]; \ + ofi_complex_set_##type (&r[i], d[i]); \ op(type, d[i], s[i]); \ } \ } @@ -211,7 +211,7 @@ const ofi_complex_##type *c = (cmp); \ ofi_complex_##type *r = (res); \ for (i = 0; i < cnt; i++) { \ - r[i] = d[i]; \ + ofi_complex_set_##type (&r[i], d[i]); \ op(type, d[i], s[i], c[i]); \ } \ } diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index aea1c46d4aa..5a1ee5787c6 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -161,10 +161,6 @@ struct test_size_param *test_size = def_test_sizes; /* range of messages is dynamically allocated */ struct test_size_param *user_test_sizes; -static const char integ_alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; -static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_alphabet)) - 1; - - int ft_poll_fd(int fd, int timeout) { struct pollfd fds; @@ -3738,6 +3734,108 @@ int ft_fill_buf(void *buf, size_t size) return ret; } +int ft_fill_atomic(void *buf, size_t count, enum fi_datatype datatype) +{ + switch (datatype) { + case FI_INT8: + case FI_UINT8: + case FI_INT16: + case FI_UINT16: + case FI_INT32: + case FI_UINT32: + case FI_INT64: + case FI_UINT64: + case FI_INT128: + case FI_UINT128: + case FI_FLOAT: + case FI_DOUBLE: + case FI_LONG_DOUBLE: + SWITCH_REAL_TYPES(datatype, FT_FILL, buf, count); + break; + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + case FI_LONG_DOUBLE_COMPLEX: + SWITCH_COMPLEX_TYPES(datatype, FT_FILL_COMPLEX, buf, count); + break; + default: + return -FI_EOPNOTSUPP; + } + return 0; +} + +static int ft_check_atomic_compare(void *buf, void *cmp, + enum fi_datatype 
datatype, size_t count) +{ + switch (datatype) { + case FI_INT8: + case FI_UINT8: + case FI_INT16: + case FI_UINT16: + case FI_INT32: + case FI_UINT32: + case FI_INT64: + case FI_UINT64: + case FI_INT128: + case FI_UINT128: + case FI_FLOAT: + case FI_DOUBLE: + case FI_LONG_DOUBLE: + SWITCH_REAL_TYPES(datatype, FT_CHECK, buf, cmp, count); + break; + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + case FI_LONG_DOUBLE_COMPLEX: + SWITCH_COMPLEX_TYPES(datatype, FT_CHECK_COMPLEX, buf, cmp, + count); + break; + default: + return -FI_EOPNOTSUPP; + } + return 0; +} + +int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, + enum fi_datatype type, void *src, void *dst_cpy, void *dst, + void *cmp, void *res, size_t count) +{ + /* + * If we don't have the test function, return > 0 to indicate + * verification is unsupported. + */ + if (atomic == FT_ATOMIC_COMPARE) { + if (!ofi_atomic_swap_handler(op, type)) + return 1; + } else if (atomic == FT_ATOMIC_FETCH) { + if (!ofi_atomic_readwrite_handler(op, type)) + return 1; + } else { + if (!ofi_atomic_write_handler(op, type)) + return 1; + } + + if (atomic == FT_ATOMIC_COMPARE || atomic == FT_ATOMIC_FETCH) { + if (ft_check_atomic_compare(dst_cpy, res, type, count)) { + printf("Data check error on atomic fetch buffer\n"); + return -1; + } + } + + if (atomic == FT_ATOMIC_COMPARE) { + ofi_atomic_swap_op(op, type, dst_cpy, src, cmp, res, count); + } else if (atomic == FT_ATOMIC_FETCH) { + ofi_atomic_readwrite_op(op, type, dst_cpy, src, res, count); + } else { + ofi_atomic_write_op(op, type, dst_cpy, src, count); + } + + if (ft_check_atomic_compare(dst_cpy, dst, type, count)) { + printf("Data check error on atomic target buffer\n"); + return -1; + } + + return FI_SUCCESS; +} + int ft_check_buf(void *buf, size_t size) { char *recv_data; diff --git a/fabtests/ubertest/ofi_atomic.h b/fabtests/include/ofi_atomic.h similarity index 72% rename from fabtests/ubertest/ofi_atomic.h rename to fabtests/include/ofi_atomic.h index 
a61a7bae432..765a4a8137f 100644 --- a/fabtests/ubertest/ofi_atomic.h +++ b/fabtests/include/ofi_atomic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Intel Corporation. All rights reserved. + * Copyright (c) Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,16 +33,14 @@ #ifndef _OFI_ATOMIC_H_ #define _OFI_ATOMIC_H_ -#include "fabtest.h" +#include "shared.h" +#include "ft_osd.h" #ifdef __cplusplus extern "C" { #endif typedef long double long_double; -typedef float complex ofi_complex_float; -typedef double complex ofi_complex_double; -typedef long double complex ofi_complex_long_double; #define OFI_WRITE_OP_START FI_MIN #define OFI_WRITE_OP_LAST (FI_ATOMIC_WRITE + 1) @@ -83,42 +81,6 @@ extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT]) #define ofi_atomic_swap_op(op, datatype, dst, src, cmp, res, cnt) \ ofi_atomic_swap_handler(op, datatype)(dst, src, cmp, res, cnt) -#define OFI_DEF_COMPLEX_OPS(type) \ -static inline int ofi_complex_eq_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a == b; \ -} \ -static inline ofi_complex_## type ofi_complex_sum_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a + b; \ -} \ -static inline ofi_complex_## type ofi_complex_prod_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a * b; \ -} \ -static inline ofi_complex_## type ofi_complex_land_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a && b; \ -} \ -static inline ofi_complex_## type ofi_complex_lor_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return a || b; \ -} \ -static inline int ofi_complex_lxor_## type \ - (ofi_complex_## type a, ofi_complex_## type b) \ -{ \ - return (a && !b) || (!a && b); \ -} \ - -OFI_DEF_COMPLEX_OPS(float) -OFI_DEF_COMPLEX_OPS(double) -OFI_DEF_COMPLEX_OPS(long_double) - #ifdef 
__cplusplus } #endif diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 7d56fdd7257..c18909c7fb7 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -43,11 +43,14 @@ #include #include #include +#include #include #include #include +#include "ofi_atomic.h" + #ifdef __cplusplus extern "C" { #endif @@ -276,7 +279,11 @@ void ft_mcusage(char *name, char *desc); void ft_csusage(char *name, char *desc); int ft_fill_buf(void *buf, size_t size); +int ft_fill_atomic(void *buf, size_t count, enum fi_datatype datatype); int ft_check_buf(void *buf, size_t size); +int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, + enum fi_datatype type, void *src, void *orig_dst, void *dst, + void *cmp, void *res, size_t count); int ft_check_opts(uint64_t flags); uint64_t ft_init_cq_data(struct fi_info *info); int ft_sock_listen(char *node, char *service); @@ -745,4 +752,100 @@ static inline void *ft_get_page_end(const void *addr, size_t page_size) + page_size, page_size) - 1); } +/* + * Common validation functions and variables + */ + +#define integ_alphabet "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +#define integ_alphabet_length (sizeof(integ_alphabet) - 1) + +#define FT_FILL(dst,cnt,type) \ + do { \ + int i, a = 0; \ + type *d = (dst); \ + for (i = 0; i < cnt; i++) { \ + d[i] = integ_alphabet[a]; \ + if (++a >= integ_alphabet_length) \ + a = 0; \ + } \ + } while (0); + +#define FT_FILL_COMPLEX(dst,cnt,type) \ + do { \ + int i, a = 0; \ + OFI_COMPLEX(type) *d = (dst); \ + for (i = 0; i < cnt; i++) { \ + ofi_complex_set_##type (&d[i], \ + *(OFI_COMPLEX(type) *) &integ_alphabet[a]); \ + if (++a >= integ_alphabet_length) \ + a = 0; \ + } \ + } while (0); + +#define FT_CHECK(buf,cmp,cnt,type) \ + do { \ + int i; \ + type *b = (buf); \ + type *c = (cmp); \ + for (i = 0; i < cnt; i++) { \ + if (b[i] != c[i]) \ + return -FI_EIO; \ + } \ + } while (0); + +#define FT_CHECK_COMPLEX(buf,cmp,cnt,type) \ + do { \ + int i; \ 
+ OFI_COMPLEX(type) *b = (buf); \ + OFI_COMPLEX(type) *c = (cmp); \ + for (i = 0; i < cnt; i++) { \ + if (!ofi_complex_eq_##type (b[i], c[i])) \ + return -FI_EIO; \ + } \ + } while (0); + + +#ifdef HAVE___INT128 + +/* If __int128 supported, things just work. */ +#define FT_FILL_INT128(...) FT_FILL(__VA_ARGS__) +#define FT_CHECK_INT128(...) FT_CHECK(__VA_ARGS__) + +#else + +/* If __int128, we're not going to fill/verify. */ +#define FT_FILL_INT128(...) +#define FT_CHECK_INT128(...) + +#endif + +#define EXPAND( x ) x + +#define SWITCH_REAL_TYPES(type,FUNC,...) \ + switch (type) { \ + case FI_INT8: EXPAND( FUNC(__VA_ARGS__,int8_t) ); break; \ + case FI_UINT8: EXPAND( FUNC(__VA_ARGS__,uint8_t) ); break; \ + case FI_INT16: EXPAND( FUNC(__VA_ARGS__,int16_t) ); break; \ + case FI_UINT16: EXPAND( FUNC(__VA_ARGS__,uint16_t) ); break; \ + case FI_INT32: EXPAND( FUNC(__VA_ARGS__,int32_t) ); break; \ + case FI_UINT32: EXPAND( FUNC(__VA_ARGS__,uint32_t) ); break; \ + case FI_INT64: EXPAND( FUNC(__VA_ARGS__,int64_t) ); break; \ + case FI_UINT64: EXPAND( FUNC(__VA_ARGS__,uint64_t) ); break; \ + case FI_INT128: EXPAND( FUNC##_INT128(__VA_ARGS__,ofi_int128_t) ); break; \ + case FI_UINT128: EXPAND( FUNC##_INT128(__VA_ARGS__,ofi_uint128_t) ); break; \ + case FI_FLOAT: EXPAND( FUNC(__VA_ARGS__,float) ); break; \ + case FI_DOUBLE: EXPAND( FUNC(__VA_ARGS__,double) ); break; \ + case FI_LONG_DOUBLE: EXPAND( FUNC(__VA_ARGS__,long double) ); break; \ + default: return -FI_EOPNOTSUPP; \ + } + +#define SWITCH_COMPLEX_TYPES(type,FUNC,...) 
\ + switch (type) { \ + case FI_FLOAT_COMPLEX: EXPAND( FUNC(__VA_ARGS__,float) ); break; \ + case FI_DOUBLE_COMPLEX: EXPAND( FUNC(__VA_ARGS__,double) ); break; \ + case FI_LONG_DOUBLE_COMPLEX: EXPAND( FUNC(__VA_ARGS__,long_double) ); break;\ + default: return -FI_EOPNOTSUPP; \ + } + + #endif /* _SHARED_H_ */ diff --git a/fabtests/include/unix/osd.h b/fabtests/include/unix/osd.h index ec8ca1020fb..07a3ab09f60 100644 --- a/fabtests/include/unix/osd.h +++ b/fabtests/include/unix/osd.h @@ -84,8 +84,8 @@ static inline int ofi_sockerr(void) } /* complex operations implementation */ -#define OFI_COMPLEX(name) ofi_##name##_complex -#define OFI_COMPLEX_OP(name, op) ofi_complex_##name##_##op +#define OFI_COMPLEX(name) ofi_complex_##name +#define OFI_COMPLEX_OP(name, op) ofi_complex_##op##_##name #define OFI_COMPLEX_TYPE_DECL(name, type) typedef type complex OFI_COMPLEX(name); OFI_COMPLEX_TYPE_DECL(float, float) @@ -97,11 +97,11 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, sum)(OFI_COMPLEX(name) v1, { \ return v1 + v2; \ } \ -static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, mul)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, prod)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1 * v2; \ } \ -static inline int OFI_COMPLEX_OP(name, equ)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline int OFI_COMPLEX_OP(name, eq)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1 == v2; \ } \ @@ -112,6 +112,14 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, land)(OFI_COMPLEX(name) v1, static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1 || v2; \ +} \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lxor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2)\ +{ \ + return (v1 && !v2) || (!v1 && v2); \ +} \ +static inline void OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX(name) v2) \ +{ \ + *v1 = v2; \ } 
OFI_COMPLEX_OPS(float) diff --git a/fabtests/include/windows/osd.h b/fabtests/include/windows/osd.h index 564f4453c16..49bee1d7751 100644 --- a/fabtests/include/windows/osd.h +++ b/fabtests/include/windows/osd.h @@ -726,9 +726,9 @@ ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags) /* complex operations implementation */ -#define OFI_COMPLEX(name) ofi_##name##_complex +#define OFI_COMPLEX(name) ofi_complex_##name #define OFI_COMPLEX_BASE(name) OFI_COMPLEX(name)##_base -#define OFI_COMPLEX_OP(name, op) ofi_complex_##name##_##op +#define OFI_COMPLEX_OP(name, op) ofi_complex_##op##_##name #define OFI_COMPLEX_TYPE_DECL(name, type) \ typedef type OFI_COMPLEX_BASE(name); \ typedef struct { \ @@ -754,29 +754,42 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, sum)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) ret = {.re = v1.re + v2.re, .im = v1.im + v2.im}; \ return ret; \ } \ -static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, mul)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, prod)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ OFI_COMPLEX(name) ret = {.re = (v1.re * v2.re) - (v1.im * v2.im), \ .im = (v1.re * v2.im) + (v1.im * v2.re)}; \ return ret; \ } \ -static inline int OFI_COMPLEX_OP(name, equ)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +static inline int OFI_COMPLEX_OP(name, eq)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ return v1.re == v2.re && v1.im == v2.im; \ } \ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, land)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ OFI_COMPLEX(name) zero = {.re = 0, .im = 0}; \ - int equ = !OFI_COMPLEX_OP(name, equ)(v1, zero) && !OFI_COMPLEX_OP(name, equ)(v2, zero); \ + int equ = !OFI_COMPLEX_OP(name, eq)(v1, zero) && !OFI_COMPLEX_OP(name, eq)(v2, zero); \ OFI_COMPLEX(name) ret = {.re = equ ? 
1.f : 0, .im = 0}; \ return ret; \ } \ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ { \ OFI_COMPLEX(name) zero = {.re = 0, .im = 0}; \ - int equ = !OFI_COMPLEX_OP(name, equ)(v1, zero) || !OFI_COMPLEX_OP(name, equ)(v2, zero); \ + int equ = !OFI_COMPLEX_OP(name, eq)(v1, zero) || !OFI_COMPLEX_OP(name, eq)(v2, zero); \ OFI_COMPLEX(name) ret = {.re = equ ? 1.f : 0, .im = 0}; \ return ret; \ +} \ +static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lxor)(OFI_COMPLEX(name) v1, OFI_COMPLEX(name) v2) \ +{ \ + OFI_COMPLEX(name) zero = {.re = 0, .im = 0}; \ + int equ = (!OFI_COMPLEX_OP(name, eq)(v1, zero) && OFI_COMPLEX_OP(name, eq)(v2, zero)) || \ + (OFI_COMPLEX_OP(name, eq)(v1, zero) && !OFI_COMPLEX_OP(name, eq)(v2, zero)); \ + OFI_COMPLEX(name) ret = {.re = equ ? 1.f : 0, .im = 0}; \ + return ret; \ +} \ +static inline void OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX(name) v2) \ +{ \ + v1->re = v2.re; \ + v1->im = v2.im; \ } OFI_COMPLEX_OPS(float) diff --git a/fabtests/ubertest/test_ctrl.c b/fabtests/ubertest/test_ctrl.c index 4b4ee462813..30cae80ac23 100644 --- a/fabtests/ubertest/test_ctrl.c +++ b/fabtests/ubertest/test_ctrl.c @@ -870,9 +870,9 @@ static int ft_unit_atomic(void) ft_atom_ctrl.count = ft_tx_ctrl.rma_msg_size / ft_atom_ctrl.datatype_size; if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP || - ft_atom_ctrl.count > count || ft_atom_ctrl.count == 0) { + ft_atom_ctrl.count > count || ft_atom_ctrl.count == 0) return 0; - } + if (ret) return ret; diff --git a/fabtests/ubertest/verify.c b/fabtests/ubertest/verify.c index 1503d9ca2e3..ebedc8d4ed7 100644 --- a/fabtests/ubertest/verify.c +++ b/fabtests/ubertest/verify.c @@ -35,79 +35,14 @@ #include "ofi_atomic.h" #include "fabtest.h" -static const char integ_alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; -static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_alphabet)) - 1; - -#define 
CHECK_LOCAL(res,local,cnt,ret,TYPE) \ - do { \ - int i; \ - TYPE *r = (res); \ - TYPE *l = (local); \ - for (i = 0; i < cnt; i++) { \ - if (r[i] != l[i]) { \ - ret = -FI_EIO; \ - break; \ - } \ - } \ - } while (0) \ - - -#define FT_FILL(dst,cnt,TYPE) \ - do { \ - int i, a = 0; \ - TYPE *d = (dst); \ - for (i = 0; i < cnt; i++) { \ - d[i] = (TYPE) (integ_alphabet[a]); \ - if (++a >= integ_alphabet_length) \ - a = 0; \ - } \ - } while (0) - -#ifdef HAVE___INT128 - -/* If __int128 supported, things just work. */ -#define FT_FILL_INT128(...) FT_FILL(__VA_ARGS__) -#define CHECK_LOCAL_INT128(...) CHECK_LOCAL(__VA_ARGS__) - -#else - -/* If __int128, we're not going to fill/verify. */ -#define FT_FILL_INT128(...) -#define CHECK_LOCAL_INT128(...) - -#endif - -#define SWITCH_TYPES(type,FUNC,...) \ - switch (type) { \ - case FI_INT8: FUNC(__VA_ARGS__,int8_t); break; \ - case FI_UINT8: FUNC(__VA_ARGS__,uint8_t); break; \ - case FI_INT16: FUNC(__VA_ARGS__,int16_t); break; \ - case FI_UINT16: FUNC(__VA_ARGS__,uint16_t); break; \ - case FI_INT32: FUNC(__VA_ARGS__,int32_t); break; \ - case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \ - case FI_INT64: FUNC(__VA_ARGS__,int64_t); break; \ - case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break; \ - case FI_INT128: FUNC##_INT128(__VA_ARGS__,ofi_int128_t); break; \ - case FI_UINT128: FUNC##_INT128(__VA_ARGS__,ofi_uint128_t); break; \ - case FI_FLOAT: FUNC(__VA_ARGS__,float); break; \ - case FI_DOUBLE: FUNC(__VA_ARGS__,double); break; \ - case FI_LONG_DOUBLE: FUNC(__VA_ARGS__,long_double); break; \ - case FI_FLOAT_COMPLEX: FUNC(__VA_ARGS__,ofi_complex_float); break; \ - case FI_DOUBLE_COMPLEX: FUNC(__VA_ARGS__,ofi_complex_double); break; \ - case FI_LONG_DOUBLE_COMPLEX: FUNC(__VA_ARGS__,ofi_complex_long_double); break;\ - default: return -FI_EOPNOTSUPP; \ - } - int ft_sync_fill_bufs(size_t size) { int ret; ft_sock_sync(sock, 0); if (test_info.caps & FI_ATOMIC) { - SWITCH_TYPES(ft_atom_ctrl.datatype, FT_FILL, ft_tx_ctrl.buf, - 
ft_atom_ctrl.count); - SWITCH_TYPES(ft_atom_ctrl.datatype, FT_FILL, ft_mr_ctrl.buf, - ft_atom_ctrl.count); + (void)ft_fill_atomic(ft_tx_ctrl.buf, ft_atom_ctrl.count, ft_atom_ctrl.datatype); + (void)ft_fill_atomic(ft_mr_ctrl.buf, ft_atom_ctrl.count, ft_atom_ctrl.datatype); memcpy(ft_atom_ctrl.orig_buf, ft_mr_ctrl.buf, size); memcpy(ft_tx_ctrl.cpy_buf, ft_tx_ctrl.buf, size); } else if (is_read_func(test_info.class_function)) { @@ -131,67 +66,26 @@ int ft_sync_fill_bufs(size_t size) return 0; } -static int verify_atomic(void) -{ - int ret = 0; - void *dst, *src, *cmp, *tmp, *res; - enum fi_datatype type; - enum fi_op op; - size_t count; - - dst = ft_atom_ctrl.orig_buf; - src = ft_tx_ctrl.cpy_buf; - - cmp = ft_atom_ctrl.comp_buf; - tmp = ft_rx_ctrl.buf; - res = ft_atom_ctrl.res_buf; - - type = ft_atom_ctrl.datatype; - op = ft_atom_ctrl.op; - count = ft_atom_ctrl.count; - - /* - * If we don't have the test function, return > 0 to indicate - * verification is unsupported. - */ - if (is_compare_func(test_info.class_function)) { - if (!ofi_atomic_swap_handler(op, type)) - return 1; - } else if (is_fetch_func(test_info.class_function)) { - if (!ofi_atomic_readwrite_handler(op, type)) - return 1; - } else { - if (!ofi_atomic_write_handler(op, type)) - return 1; - } - - if (is_fetch_func(test_info.class_function) || - is_compare_func(test_info.class_function)) { - SWITCH_TYPES(type, CHECK_LOCAL, dst, res, count, ret); - if (ret) - return ret; - } - - if (is_compare_func(test_info.class_function)) { - ofi_atomic_swap_op(op, type, dst, src, cmp, tmp, count); - } else if (is_fetch_func(test_info.class_function)) { - ofi_atomic_readwrite_op(op, type, dst, src, tmp, count); - } else { - ofi_atomic_write_op(op, type, dst, src, count); - } - - SWITCH_TYPES(type, CHECK_LOCAL, dst, ft_mr_ctrl.buf, count, ret); - - return ret; -} - int ft_verify_bufs() { char *compare_buf; size_t compare_size; + enum ft_atomic_opcodes opcode; - if (test_info.caps & FI_ATOMIC) - return verify_atomic(); 
+ if (test_info.caps & FI_ATOMIC) { + if (is_compare_func(test_info.class_function)) + opcode = FT_ATOMIC_COMPARE; + else if (is_fetch_func(test_info.class_function)) + opcode = FT_ATOMIC_FETCH; + else + opcode = FT_ATOMIC_BASE; + + return ft_check_atomic(opcode, ft_atom_ctrl.op, + ft_atom_ctrl.datatype, ft_tx_ctrl.cpy_buf, + ft_atom_ctrl.orig_buf, ft_mr_ctrl.buf, + ft_atom_ctrl.comp_buf, ft_atom_ctrl.res_buf, + ft_atom_ctrl.count); + } if (test_info.caps & FI_RMA) { compare_size = ft_tx_ctrl.rma_msg_size; From a2751ec70c5b5d5643c43cb2cca369ab40019997 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Fri, 5 Apr 2024 15:09:24 -0700 Subject: [PATCH 113/393] fabtests/common: add hmem support to common atomic validation To properly validate atomic data, we need host bounce buffers for the result and compare buffers in addition to the regular bounce buffer for the tx/rx bufs. This adds two extra bufs allocated only for atomic purposes and adds hmem support to the common atomic validation path. It also renames the alloc/free_tx_buf calls to generic alloc/free_host_bufs which allocates all three buffers at once. Signed-off-by: Alexia Ingerson --- fabtests/common/shared.c | 140 ++++++++++++++++++++++++++++++---- fabtests/include/shared.h | 6 +- fabtests/ubertest/test_ctrl.c | 2 +- 3 files changed, 128 insertions(+), 20 deletions(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 5a1ee5787c6..ff3011193a6 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -90,7 +90,7 @@ char *buf = NULL, *tx_buf, *rx_buf; * dev_host_buf are used by ft_fill_buf() to stage data sent over wire, * when tx_buf is on device memory. 
*/ -void *dev_host_buf = NULL; +void *dev_host_buf = NULL, *dev_host_comp = NULL, *dev_host_res = NULL; char **tx_mr_bufs = NULL, **rx_mr_bufs = NULL; size_t buf_size, tx_buf_size, rx_buf_size; @@ -546,14 +546,30 @@ static void ft_set_tx_rx_sizes(size_t *set_tx, size_t *set_rx) *set_tx += ft_tx_prefix_size(); } -void ft_free_host_tx_buf(void) +void ft_free_host_bufs(void) { int ret; - ret = ft_hmem_free_host(opts.iface, dev_host_buf); - if (ret) - FT_PRINTERR("ft_hmem_free_host", ret); - dev_host_buf = NULL; + if (dev_host_buf) { + ret = ft_hmem_free_host(opts.iface, dev_host_buf); + if (ret) + FT_PRINTERR("ft_hmem_free_host", ret); + dev_host_buf = NULL; + } + + if (dev_host_res) { + ret = ft_hmem_free_host(opts.iface, dev_host_res); + if (ret) + FT_PRINTERR("ft_hmem_free_host", ret); + dev_host_res = NULL; + } + + if (dev_host_comp) { + ret = ft_hmem_free_host(opts.iface, dev_host_comp); + if (ret) + FT_PRINTERR("ft_hmem_free_host", ret); + dev_host_comp = NULL; + } } /* @@ -641,6 +657,18 @@ int ft_alloc_msgs(void) max_msg_size * opts.window_size); if (ret) return ret; + + if (fi->caps & FI_ATOMIC) { + ret = ft_hmem_alloc_host(opts.iface, &dev_host_comp, + buf_size); + if (ret) + return ret; + + ret = ft_hmem_alloc_host(opts.iface, &dev_host_res, + buf_size); + if (ret) + return ret; + } } ret = ft_hmem_memset(opts.iface, opts.device, (void *) buf, 0, buf_size); @@ -1913,8 +1941,7 @@ void ft_free_res(void) buf = rx_buf = tx_buf = NULL; buf_size = rx_size = tx_size = tx_mr_size = rx_mr_size = 0; } - if (dev_host_buf) - ft_free_host_tx_buf(); + ft_free_host_bufs(); if (fi_pep) { fi_freeinfo(fi_pep); @@ -3736,6 +3763,16 @@ int ft_fill_buf(void *buf, size_t size) int ft_fill_atomic(void *buf, size_t count, enum fi_datatype datatype) { + void *fill_buf; + int ret = 0; + + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_buf); + fill_buf = dev_host_buf; + } else { + fill_buf = buf; + } + switch (datatype) { case FI_INT8: case FI_UINT8: @@ -3750,17 +3787,24 @@ int 
ft_fill_atomic(void *buf, size_t count, enum fi_datatype datatype) case FI_FLOAT: case FI_DOUBLE: case FI_LONG_DOUBLE: - SWITCH_REAL_TYPES(datatype, FT_FILL, buf, count); + SWITCH_REAL_TYPES(datatype, FT_FILL, fill_buf, count); break; case FI_FLOAT_COMPLEX: case FI_DOUBLE_COMPLEX: case FI_LONG_DOUBLE_COMPLEX: - SWITCH_COMPLEX_TYPES(datatype, FT_FILL_COMPLEX, buf, count); + SWITCH_COMPLEX_TYPES(datatype, FT_FILL_COMPLEX, fill_buf, count); break; default: return -FI_EOPNOTSUPP; } - return 0; + + if (opts.iface != FI_HMEM_SYSTEM) { + ret = ft_hmem_copy_to(opts.iface, opts.device, buf, fill_buf, + count * datatype_to_size(datatype)); + if (ret) + FT_ERR("Failed to fill atomic buffer\n"); + } + return ret; } static int ft_check_atomic_compare(void *buf, void *cmp, @@ -3798,6 +3842,9 @@ int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, enum fi_datatype type, void *src, void *dst_cpy, void *dst, void *cmp, void *res, size_t count) { + int ret = 0; + void *check_res = res, *check_buf, *check_comp; + /* * If we don't have the test function, return > 0 to indicate * verification is unsupported. 
@@ -3814,21 +3861,82 @@ int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, } if (atomic == FT_ATOMIC_COMPARE || atomic == FT_ATOMIC_FETCH) { - if (ft_check_atomic_compare(dst_cpy, res, type, count)) { + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_res); + ret = ft_hmem_copy_from(opts.iface, opts.device, + dev_host_res, res, + count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + + check_res = dev_host_res; + } else { + check_res = res; + } + if (ft_check_atomic_compare(dst_cpy, check_res, type, count)) { printf("Data check error on atomic fetch buffer\n"); return -1; } } if (atomic == FT_ATOMIC_COMPARE) { - ofi_atomic_swap_op(op, type, dst_cpy, src, cmp, res, count); + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_comp); + ret = ft_hmem_copy_from(opts.iface, opts.device, + dev_host_comp, cmp, + count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + check_comp = dev_host_comp; + } else { + check_comp = cmp; + } + } + + if (opts.iface != FI_HMEM_SYSTEM) { + assert(dev_host_buf); + ret = ft_hmem_copy_from(opts.iface, opts.device, dev_host_buf, + src, count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + } + + check_buf = dev_host_buf; + + } else { + check_buf = src; + } + + if (atomic == FT_ATOMIC_COMPARE) { + ofi_atomic_swap_op(op, type, dst_cpy, check_buf, check_comp, + check_res, count); } else if (atomic == FT_ATOMIC_FETCH) { - ofi_atomic_readwrite_op(op, type, dst_cpy, src, res, count); + ofi_atomic_readwrite_op(op, type, dst_cpy, check_buf, + check_res, count); + } else { + ofi_atomic_write_op(op, type, dst_cpy, check_buf, count); + } + + if (opts.iface != FI_HMEM_SYSTEM) { + ret = ft_hmem_copy_from(opts.iface, opts.device, + dev_host_buf, dst, + count * datatype_to_size(type)); + if (ret) { + FT_ERR("Failed to copy from atomic buffer\n"); + return ret; + 
} + + check_buf = dev_host_buf; } else { - ofi_atomic_write_op(op, type, dst_cpy, src, count); + check_buf = dst; } - if (ft_check_atomic_compare(dst_cpy, dst, type, count)) { + if (ft_check_atomic_compare(dst_cpy, check_buf, type, count)) { printf("Data check error on atomic target buffer\n"); return -1; } diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index c18909c7fb7..4720a188bdb 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -243,7 +243,7 @@ extern struct fid_mc *mc; extern fi_addr_t remote_fi_addr; extern char *buf, *tx_buf, *rx_buf; -extern void *dev_host_buf; +extern void *dev_host_buf, *dev_host_comp, *dev_host_res; extern struct ft_context *tx_ctx_arr, *rx_ctx_arr; extern char **tx_mr_bufs, **rx_mr_bufs; extern size_t buf_size, tx_size, rx_size, tx_mr_size, rx_mr_size; @@ -458,8 +458,8 @@ int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, struct fid_cntr **new_rma_cntr, struct fid_av **new_av); int ft_alloc_msgs(void); -int ft_alloc_host_tx_buf(size_t size); -void ft_free_host_tx_buf(void); +int ft_alloc_host_bufs(size_t size); +void ft_free_host_bufs(void); int ft_alloc_active_res(struct fi_info *fi); int ft_enable_ep_recv(void); int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av *bind_av, diff --git a/fabtests/ubertest/test_ctrl.c b/fabtests/ubertest/test_ctrl.c index 30cae80ac23..30b43749499 100644 --- a/fabtests/ubertest/test_ctrl.c +++ b/fabtests/ubertest/test_ctrl.c @@ -1018,7 +1018,7 @@ void ft_cleanup(void) FT_CLOSE_FID(ft_atom_ctrl.comp_mr); ft_cleanup_xcontrol(&ft_rx_ctrl); ft_cleanup_xcontrol(&ft_tx_ctrl); - ft_free_host_tx_buf(); + ft_free_host_bufs(); ft_cleanup_mr_control(&ft_mr_ctrl); ft_cleanup_atomic_control(&ft_atom_ctrl); ft_cleanup_random(); From becdc4945b8694049ed7d22b4d10f940767cbd0b Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Apr 2024 10:50:25 -0700 Subject: [PATCH 114/393] fabtests/common: fix atomic buffer ft_post_atomic 
posted "buf" which is the base address for the entire send and recv buffer allocation. The first half of the allocation is the receive buffer and the second half is the send buffer. Posting just "buf" meant it was sending the receive buffer. This changes it to send the tx buf and do an atomic on the rx buf which allows us to properly do atomic validation Signed-off-by: Alexia Ingerson --- fabtests/common/shared.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index ff3011193a6..436a831df79 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -2471,18 +2471,18 @@ ssize_t ft_post_atomic(enum ft_atomic_opcodes opcode, struct fid_ep *ep, switch (opcode) { case FT_ATOMIC_BASE: FT_POST(fi_atomic, ft_progress, txcq, tx_seq, &tx_cq_cntr, - "fi_atomic", ep, buf, count, mr_desc, remote_fi_addr, + "fi_atomic", ep, tx_buf, count, mr_desc, remote_fi_addr, remote->addr, remote->key, datatype, atomic_op, context); break; case FT_ATOMIC_FETCH: FT_POST(fi_fetch_atomic, ft_progress, txcq, tx_seq, &tx_cq_cntr, - "fi_fetch_atomic", ep, buf, count, mr_desc, result, + "fi_fetch_atomic", ep, tx_buf, count, mr_desc, result, result_desc, remote_fi_addr, remote->addr, remote->key, datatype, atomic_op, context); break; case FT_ATOMIC_COMPARE: FT_POST(fi_compare_atomic, ft_progress, txcq, tx_seq, - &tx_cq_cntr, "fi_compare_atomic", ep, buf, count, + &tx_cq_cntr, "fi_compare_atomic", ep, tx_buf, count, mr_desc, compare, compare_desc, result, result_desc, remote_fi_addr, remote->addr, remote->key, datatype, atomic_op, context); From 0c635e7c2f10aa476005afac2e87e252480a764c Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Apr 2024 12:49:04 -0700 Subject: [PATCH 115/393] fabtests/common: change sync message to be 0 bytes instead of 1 byte This allows us to post the rx buf without corrupting memory in case its needed for validation Signed-off-by: Alexia Ingerson --- fabtests/common/shared.c 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 436a831df79..40f53c84f6b 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -3081,7 +3081,7 @@ int ft_sync_inband(bool repost_rx) int ret; if (opts.dst_addr) { - ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, + ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 0, &tx_ctx, FI_DELIVERY_COMPLETE); if (ret) return ret; @@ -3094,7 +3094,7 @@ int ft_sync_inband(bool repost_rx) if (ret) return ret; - ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 1, &tx_ctx, + ret = ft_tx_msg(ep, remote_fi_addr, tx_buf, 0, &tx_ctx, FI_DELIVERY_COMPLETE); if (ret) return ret; From ef4949ea8a2cfb18dd17437c5f5591b5c5ee1f63 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 8 Jul 2024 13:06:55 -0700 Subject: [PATCH 116/393] fabtests/hmem: change ZE memset to use uint8 Match the behavior of memset() where the value passed in is an int, but it is interpreted as a char. 
While ZE can technically handle this scenario, others may not so we need to standardize across ifaces Signed-off-by: Alexia Ingerson --- fabtests/common/hmem.c | 4 ++++ fabtests/common/hmem_ze.c | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fabtests/common/hmem.c b/fabtests/common/hmem.c index 1c724510c4e..0cd736441b4 100644 --- a/fabtests/common/hmem.c +++ b/fabtests/common/hmem.c @@ -185,6 +185,10 @@ int ft_hmem_free_host(enum fi_hmem_iface iface, void *buf) return hmem_ops[iface].free_host(buf); } +/* + * Matches the behavior of memset where value is an int but + * used as a unsigned char + */ int ft_hmem_memset(enum fi_hmem_iface iface, uint64_t device, void *buf, int value, size_t size) { diff --git a/fabtests/common/hmem_ze.c b/fabtests/common/hmem_ze.c index 507470f06d1..305c58ff57d 100644 --- a/fabtests/common/hmem_ze.c +++ b/fabtests/common/hmem_ze.c @@ -382,6 +382,7 @@ int ft_ze_free(void *buf) int ft_ze_memset(uint64_t device, void *buf, int value, size_t size) { + unsigned char set_value = (unsigned char) value; ze_result_t ze_ret; ze_ret = (*libze_ops.zeCommandListReset)(cmd_list); @@ -389,7 +390,8 @@ int ft_ze_memset(uint64_t device, void *buf, int value, size_t size) return -FI_EINVAL; ze_ret = (*libze_ops.zeCommandListAppendMemoryFill)( - cmd_list, buf, &value, sizeof(value), + cmd_list, buf, &set_value, + sizeof(set_value), size, NULL, 0, NULL); if (ze_ret) return -FI_EINVAL; From 70411a19070c3e79706dda73a85981499e3a8d5b Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Apr 2024 12:47:24 -0700 Subject: [PATCH 117/393] functional/rdm_atomic: add data validation Add data validation to the atomic test by using the newly added atomic fill and check support imported from ubertest. This code uses a macro that switches on datatype for filling and checking the buffer contents. 
The atomic validation path requires an extra buffer to copy the contents of the original atomic buffer in order to recreate the atomic function locally and check the buffer against the simulated atomic operation. This patch also refactors the entire test to remove the extremely confusing macros used for the base/fetch/compare operations. The macros made the code extremely difficult to read and debug and also made it difficult to add data validation. Separating it into three explicit functions is about the same amount of code and significantly more readable Synchronization messages are added in the validation case to ensure the atomic operation completed on both sides before validation occurs. This requires the addition of the FI_ORDER_SAW and FI_ORDER_SAR message ordering to ensure that we get the completion for the send/recv sync after the atomic message is processed Signed-off-by: Alexia Ingerson --- fabtests/functional/rdm_atomic.c | 313 ++++++++++++++++++++----------- fabtests/include/shared.h | 2 +- 2 files changed, 202 insertions(+), 113 deletions(-) diff --git a/fabtests/functional/rdm_atomic.c b/fabtests/functional/rdm_atomic.c index cef31ecb229..b329c6a66e7 100644 --- a/fabtests/functional/rdm_atomic.c +++ b/fabtests/functional/rdm_atomic.c @@ -40,13 +40,13 @@ static enum fi_op op_type = FI_MIN; static void *result; static void *compare; +static void *cpy_dst; static struct fid_mr *mr_result; static struct fid_mr *mr_compare; static struct fi_context fi_ctx_atomic; static enum fi_datatype datatype; -static size_t *count; static int run_all_ops = 1, run_all_datatypes = 1; static enum fi_op get_fi_op(char *op) @@ -148,92 +148,15 @@ static void print_opts_usage(char *name) FT_PRINT_OPTS_USAGE("", "int32|uint32|int64|uint64|int128|uint128|" "float|double|float_complex|double_complex|"); FT_PRINT_OPTS_USAGE("", "long_double|long_double_complex (default: all)"); + FT_PRINT_OPTS_USAGE("-v", "enables data_integrity checks"); } -#define 
create_atomic_op_executor(type) \ -static inline int execute_atomic_ ## type ## _op(enum fi_op op_type, \ - enum fi_datatype datatype) \ -{ \ - int ret = FI_SUCCESS, len, i; \ - len = snprintf((test_name), sizeof(test_name), "%s_", \ - fi_tostr(&(datatype), FI_TYPE_ATOMIC_TYPE)); \ - snprintf((test_name) + len, sizeof(test_name) - len, "%s_"#type"_lat", \ - fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); \ - opts.transfer_size = datatype_to_size(datatype); \ - \ - ft_start(); \ - for (i = 0; i < opts.iterations; i++) { \ - ret = execute_ ## type ## _atomic_op(op_type); \ - if (ret) \ - break; \ - } \ - ft_stop(); \ - report_perf(); \ - \ - return ret; \ -} - -#define create_atomic_op_handler(type) \ -create_atomic_op_executor(type) \ -static inline int handle_atomic_ ## type ## _op(int run_all_datatypes, \ - enum fi_op op_type, \ - size_t *count) \ -{ \ - int ret = FI_SUCCESS; \ - \ - if (run_all_datatypes) { \ - for (datatype = 0; datatype < OFI_DATATYPE_CNT; datatype++) { \ - ret = check_ ## type ## _atomic_op(ep, op_type, \ - datatype, count); \ - if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) { \ - fprintf(stderr, \ - "Provider doesn't support %s ", \ - fi_tostr(&op_type, \ - FI_TYPE_ATOMIC_OP)); \ - fprintf(stderr, \ - #type" atomic operation on %s\n", \ - fi_tostr(&datatype, \ - FI_TYPE_ATOMIC_TYPE)); \ - continue; \ - } else if (ret) { \ - goto fn; \ - } \ - \ - ret = execute_atomic_ ##type ## _op(op_type, datatype); \ - if (ret) \ - goto fn; \ - } \ - } else { \ - ret = check_ ## type ## _atomic_op(ep, op_type, \ - datatype, count); \ - if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) { \ - fprintf(stderr, \ - "Provider doesn't support %s ", \ - fi_tostr(&op_type, \ - FI_TYPE_ATOMIC_OP)); \ - fprintf(stderr, \ - #type" atomic operation on %s\n", \ - fi_tostr(&datatype, \ - FI_TYPE_ATOMIC_TYPE)); \ - goto fn; \ - } else if (ret) { \ - goto fn; \ - } \ - \ - ret = execute_atomic_ ## type ##_op(op_type, datatype); \ - } \ - \ -fn: \ - return ret; \ -} - - -static 
inline int execute_base_atomic_op(enum fi_op op) +static inline int execute_base_atomic_op(void) { int ret; ret = ft_post_atomic(FT_ATOMIC_BASE, ep, NULL, NULL, NULL, NULL, - &remote, datatype, op, &fi_ctx_atomic); + &remote, datatype, op_type, &fi_ctx_atomic); if (ret) return ret; @@ -242,13 +165,13 @@ static inline int execute_base_atomic_op(enum fi_op op) return ret; } -static inline int execute_fetch_atomic_op(enum fi_op op) +static inline int execute_fetch_atomic_op(void) { int ret; ret = ft_post_atomic(FT_ATOMIC_FETCH, ep, NULL, NULL, result, fi_mr_desc(mr_result), &remote, datatype, - op, &fi_ctx_atomic); + op_type, &fi_ctx_atomic); if (ret) return ret; @@ -257,13 +180,13 @@ static inline int execute_fetch_atomic_op(enum fi_op op) return ret; } -static inline int execute_compare_atomic_op(enum fi_op op) +static inline int execute_compare_atomic_op(void) { int ret; ret = ft_post_atomic(FT_ATOMIC_COMPARE, ep, compare, fi_mr_desc(mr_compare), result, fi_mr_desc(mr_result), &remote, datatype, - op, &fi_ctx_atomic); + op_type, &fi_ctx_atomic); if (ret) return ret; @@ -272,8 +195,44 @@ static inline int execute_compare_atomic_op(enum fi_op op) return ret; } +static int fill_data(enum ft_atomic_opcodes opcode) +{ + int ret; + + switch (opcode) { + case FT_ATOMIC_COMPARE: + ft_fill_atomic(compare, 1, datatype); + /* fall through */ + case FT_ATOMIC_FETCH: + ft_hmem_memset(opts.iface, opts.device, result, 0, + datatype_to_size(datatype)); + /* fall through */ + case FT_ATOMIC_BASE: + ft_fill_atomic(tx_buf, 1, datatype); + ft_fill_atomic(rx_buf, 1, datatype); + break; + default: + break; + } + + ret = ft_hmem_copy_from(opts.iface, opts.device, cpy_dst, + rx_buf, datatype_to_size(datatype)); + if (ret) + return ret; + + ft_sync(); + return ret; +} + static void report_perf(void) { + int len; + + len = snprintf((test_name), sizeof(test_name), "%s_", + fi_tostr(&(datatype), FI_TYPE_ATOMIC_TYPE)); + snprintf((test_name) + len, sizeof(test_name) - len, "%s_lat", + 
fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); + if (opts.machr) show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 1, opts.argc, opts.argv); @@ -281,21 +240,117 @@ static void report_perf(void) show_perf(test_name, opts.transfer_size, opts.iterations, &start, &end, 1); } -create_atomic_op_handler(base) -create_atomic_op_handler(fetch) -create_atomic_op_handler(compare) +static int handle_atomic_base_op(void) +{ + int ret = FI_SUCCESS, i; + size_t count = 0; -static int run_op(void) + ret = check_base_atomic_op(ep, op_type, datatype, &count); + if (ret) + return ret; + + opts.transfer_size = datatype_to_size(datatype); + ft_start(); + for (i = 0; i < opts.iterations; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = fill_data(FT_ATOMIC_BASE); + if (ret) + return ret; + } + + ret = execute_base_atomic_op(); + if (ret) + break; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ft_sync(); + ret = ft_check_atomic(FT_ATOMIC_BASE, op_type, datatype, + tx_buf, cpy_dst, rx_buf, compare, + result, 1); + if (ret) + return ret; + } + } + ft_stop(); + report_perf(); + return FI_SUCCESS; +} + +static int handle_atomic_fetch_op(void) { - int ret = -FI_EINVAL; + int ret = FI_SUCCESS, i; + size_t count = 0; + + ret = check_fetch_atomic_op(ep, op_type, datatype, &count); + if (ret) + return ret; - count = (size_t *)malloc(sizeof(*count)); - if (!count) { - ret = -FI_ENOMEM; - perror("malloc"); - goto fn; + opts.transfer_size = datatype_to_size(datatype); + ft_start(); + for (i = 0; i < opts.iterations; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = fill_data(FT_ATOMIC_FETCH); + if (ret) + return ret; + } + + ret = execute_fetch_atomic_op(); + if (ret) + break; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ft_sync(); + ret = ft_check_atomic(FT_ATOMIC_FETCH, op_type, datatype, + tx_buf, cpy_dst, rx_buf, compare, + result, 1); + if (ret) + return ret; + } } - ft_sync(); + ft_stop(); + report_perf(); + return FI_SUCCESS; +} + +static int 
handle_atomic_compare_op(void) +{ + int ret = FI_SUCCESS, i; + size_t count = 0; + + ret = check_compare_atomic_op(ep, op_type, datatype, &count); + if (ret) + return ret; + + opts.transfer_size = datatype_to_size(datatype); + ft_start(); + for (i = 0; i < opts.iterations; i++) { + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = fill_data(FT_ATOMIC_COMPARE); + if (ret) + return ret; + } + + ret = execute_compare_atomic_op(); + if (ret) + break; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ft_sync(); + ret = ft_check_atomic(FT_ATOMIC_COMPARE, op_type, datatype, + tx_buf, cpy_dst, rx_buf, compare, + result, 1); + if (ret) + return ret; + } + } + ft_stop(); + report_perf(); + return FI_SUCCESS; +} + +static int run_dt(void) +{ + int ret = -FI_EINVAL; switch (op_type) { case FI_MIN: @@ -309,12 +364,10 @@ static int run_op(void) case FI_LXOR: case FI_BXOR: case FI_ATOMIC_WRITE: - ret = handle_atomic_base_op(run_all_datatypes, - op_type, count); + ret = handle_atomic_base_op(); break; case FI_ATOMIC_READ: - ret = handle_atomic_fetch_op(run_all_datatypes, - op_type, count); + ret = handle_atomic_fetch_op(); break; case FI_CSWAP: case FI_CSWAP_NE: @@ -323,39 +376,64 @@ static int run_op(void) case FI_CSWAP_GE: case FI_CSWAP_GT: case FI_MSWAP: - ret = handle_atomic_compare_op(run_all_datatypes, - op_type, count); + ret = handle_atomic_compare_op(); break; default: FT_WARN("Invalid atomic operation type %d\n", op_type); break; } - ft_sync(); - free(count); -fn: + + if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) { + fprintf(stderr, "Provider doesn't support %s ", + fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); + fprintf(stderr, "atomic operation on %s\n", + fi_tostr(&datatype, FI_TYPE_ATOMIC_TYPE)); + return FI_SUCCESS; + } + if (ret) { + fprintf(stderr, "Failed atomic op %s ", + fi_tostr(&op_type, FI_TYPE_ATOMIC_OP)); + fprintf(stderr, "with datatype %s\n", + fi_tostr(&datatype, FI_TYPE_ATOMIC_TYPE)); + } return ret; } -static int run_ops(void) +static int run_op(void) { int 
ret; - for (op_type = FI_MIN; op_type < OFI_ATOMIC_OP_CNT; op_type++) { - ret = run_op(); + if (!run_all_datatypes) + return run_dt(); + + for (datatype = 0; datatype < OFI_DATATYPE_CNT; datatype++) { + ret = run_dt(); if (ret && ret != -FI_ENOSYS && ret != -FI_EOPNOTSUPP) { FT_PRINTERR("run_op", ret); return ret; } } - - return 0; + return FI_SUCCESS; } static int run_test(void) { - return run_all_ops ? run_ops() : run_op(); + int ret; + + if (!run_all_ops) + return run_op(); + + for (op_type = FI_MIN; op_type < OFI_ATOMIC_OP_CNT; op_type++) { + ret = run_op(); + if (ret && ret != -FI_ENOSYS && ret != -FI_EOPNOTSUPP) { + FT_PRINTERR("run_op", ret); + return ret; + } + } + + return FI_SUCCESS; } static void free_res(void) @@ -370,6 +448,10 @@ static void free_res(void) ft_hmem_free(opts.iface, compare); compare = NULL; } + if (cpy_dst) { + ft_hmem_free_host(opts.iface, cpy_dst); + cpy_dst = NULL; + } } static uint64_t get_mr_key() @@ -396,6 +478,10 @@ static int alloc_ep_res(struct fi_info *fi) return -1; } + ret = ft_hmem_alloc_host(opts.iface, &cpy_dst, opts.transfer_size); + if (ret) + return ret; + // registers local data buffer that stores results ret = ft_reg_mr(fi, result, buf_size, (mr_local ? 
FI_READ : 0) | FI_REMOTE_WRITE, @@ -453,7 +539,7 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt_long(argc, argv, "ho:Uz:" CS_OPTS INFO_OPTS, + while ((op = getopt_long(argc, argv, "ho:Uz:v" CS_OPTS INFO_OPTS, long_opts, &lopt_idx)) != -1) { switch (op) { case 'o': @@ -483,6 +569,9 @@ int main(int argc, char **argv) } } break; + case 'v': + opts.options |= FT_OPT_VERIFY_DATA; + break; default: if (!ft_parse_long_opts(op, optarg)) continue; diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 4720a188bdb..7ff3aa49cb6 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -243,7 +243,7 @@ extern struct fid_mc *mc; extern fi_addr_t remote_fi_addr; extern char *buf, *tx_buf, *rx_buf; -extern void *dev_host_buf, *dev_host_comp, *dev_host_res; +extern void *dev_host_buf; extern struct ft_context *tx_ctx_arr, *rx_ctx_arr; extern char **tx_mr_bufs, **rx_mr_bufs; extern size_t buf_size, tx_size, rx_size, tx_mr_size, rx_mr_size; From f46fb093263da3ddaa3611c49b0e61e67395f781 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Tue, 30 Apr 2024 13:49:21 -0700 Subject: [PATCH 118/393] fabtests/runfabtests.sh: add rdm_atomic validation tests Run fi_rdm_atomic with data validation in standard and short test suites Signed-off-by: Alexia Ingerson --- fabtests/scripts/runfabtests.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 8b402aa38e1..65cc9958f6f 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -173,6 +173,8 @@ short_tests=( "fi_rma_bw -e rdm -o writedata -I 5 -U" "fi_rdm_atomic -I 5 -o all" "fi_rdm_atomic -I 5 -o all -U" + "fi_rdm_atomic -I 5 -o all -v" + "fi_rdm_atomic -I 5 -o all -U -v" "fi_rdm_cntr_pingpong -I 5" "fi_multi_recv -e rdm -I 5" "fi_multi_recv -e msg -I 5" @@ -209,6 +211,8 @@ standard_tests=( "fi_rma_bw -e rdm -o writedata -U" "fi_rdm_atomic -o all -I 1000" 
"fi_rdm_atomic -o all -I 1000 -U" + "fi_rdm_atomic -o all -I 1000 -v" + "fi_rdm_atomic -o all -I 1000 -U -v" "fi_rdm_cntr_pingpong" "fi_multi_recv -e rdm" "fi_multi_recv -e msg" From fe47c3242ea009d3a926d40ea29333b322dcf508 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 10 Oct 2024 15:22:49 -0700 Subject: [PATCH 119/393] fabtests/runfabtests.cmd: add atomic tests to windows testing Signed-off-by: Alexia Ingerson --- fabtests/scripts/runfabtests.cmd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fabtests/scripts/runfabtests.cmd b/fabtests/scripts/runfabtests.cmd index d362f905cd1..07e21d4f2eb 100644 --- a/fabtests/scripts/runfabtests.cmd +++ b/fabtests/scripts/runfabtests.cmd @@ -94,6 +94,8 @@ set short_tests=^ "rma_bw -e rdm -o read -I 5 -U"^ "rma_bw -e rdm -o writedata -I 5"^ "rma_bw -e rdm -o writedata -I 5 -U"^ + "rdm_atomic -I 5 -o all"^ + "rdm_atomic -I 5 -o all -v"^ "rdm_cntr_pingpong -I 5"^ "multi_recv -e rdm -I 5"^ "rdm_pingpong -I 5"^ @@ -125,6 +127,8 @@ set standard_tests=^ "rma_bw -e rdm -o read -U"^ "rma_bw -e rdm -o writedata"^ "rma_bw -e rdm -o writedata -U"^ + "rdm_atomic -o all"^ + "rdm_atomic -o all -v"^ "rdm_cntr_pingpong"^ "multi_recv -e rdm"^ "rdm_pingpong"^ From 4caa9665cf670f8d9820dbba3d76ba00da222694 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 30 Sep 2024 14:22:48 -0700 Subject: [PATCH 120/393] prov/psm3: disable complex comparison combinations Comparison of complex numbers is undefined and not a valid combination of atomic ops. 
Disable in psm3 Signed-off-by: Alexia Ingerson --- prov/psm3/src/psmx3_atomic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index da0781e654d..c5fa9351298 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -2067,6 +2067,11 @@ static int psmx3_atomic_writevalid_internal(size_t chunk_size, switch (op) { case FI_MIN: case FI_MAX: + if (datatype == FI_FLOAT_COMPLEX || + datatype == FI_DOUBLE_COMPLEX || + datatype == FI_LONG_DOUBLE_COMPLEX) + return -FI_EOPNOTSUPP; + /* fall through */ case FI_SUM: case FI_PROD: case FI_LOR: @@ -2098,6 +2103,11 @@ static int psmx3_atomic_readwritevalid_internal(size_t chunk_size, switch (op) { case FI_MIN: case FI_MAX: + if (datatype == FI_FLOAT_COMPLEX || + datatype == FI_DOUBLE_COMPLEX || + datatype == FI_LONG_DOUBLE_COMPLEX) + return -FI_EOPNOTSUPP; + /* fall through */ case FI_SUM: case FI_PROD: case FI_LOR: From f38320ae23013a92a7eeb629211218e6f33b3e7a Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 3 Oct 2024 20:34:52 -0700 Subject: [PATCH 121/393] prov/psm3: check atomic op error code Report atomic op errors back to the application. Some datatype/op combinations were falsely being reported to the application but failing when the atomic was being performed. These failures were silently treated as successful because the errors were not passed back. 
Check the error code to catch future issues Signed-off-by: Alexia Ingerson --- prov/psm3/src/psmx3_atomic.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index c5fa9351298..d04ca5c95d7 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -601,7 +601,8 @@ int psmx3_am_atomic_handler(psm2_am_token_t token, if (!op_error) { addr += mr->offset; - psmx3_atomic_do_write(addr, src, datatype, op, count); + op_error = psmx3_atomic_do_write(addr, src, datatype, + op, count); if (rx->ep->caps & FI_RMA_EVENT) { cntr = rx->ep->remote_write_cntr; @@ -646,8 +647,8 @@ int psmx3_am_atomic_handler(psm2_am_token_t token, addr += mr->offset; tmp_buf = malloc(len); if (tmp_buf) - psmx3_atomic_do_readwrite(addr, src, tmp_buf, - datatype, op, count); + op_error = psmx3_atomic_do_readwrite(addr, src, + tmp_buf, datatype, op, count); else op_error = -FI_ENOMEM; @@ -698,9 +699,10 @@ int psmx3_am_atomic_handler(psm2_am_token_t token, addr += mr->offset; tmp_buf = malloc(len); if (tmp_buf) - psmx3_atomic_do_compwrite(addr, src, (uint8_t *)src + len, - tmp_buf, datatype, - op, count); + op_error = psmx3_atomic_do_compwrite(addr, src, + (uint8_t *)src + len, + tmp_buf, datatype, + op, count); else op_error = -FI_ENOMEM; From 1c75d4be26a72f412ee0b04cd8b4a8163c97ee3e Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 3 Oct 2024 20:47:29 -0700 Subject: [PATCH 122/393] prov/psm3: fix logical atomic function calls psm3 advertises support for logical ops (lor, land, lxor) with all datatypes but the functions are only defined for integer types. 
When the atomic op is called with a non-integer type, it drops down to the default case and returns an error (FI_ENOTSUPP) Signed-off-by: Alexia Ingerson --- prov/psm3/src/psmx3_atomic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index d04ca5c95d7..b05416c5e07 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -401,12 +401,12 @@ static int psmx3_atomic_do_write(void *dest, void *src, break; case FI_LOR: - SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE, + SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE, dest,src,count,PSMX3_LOR); break; case FI_LAND: - SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE, + SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE, dest,src,count,PSMX3_LAND); break; @@ -421,7 +421,7 @@ static int psmx3_atomic_do_write(void *dest, void *src, break; case FI_LXOR: - SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE, + SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE, dest,src,count,PSMX3_LXOR); break; From 30eb5293f38a7db71f05e8a09788e1d5671459a8 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sun, 29 Sep 2024 19:02:29 -0700 Subject: [PATCH 123/393] core: Move flags only used for memory registration calls to fi_domain.h Signed-off-by: Jianxin Xiong --- include/rdma/fabric.h | 6 +++--- include/rdma/fi_domain.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 420d2eacc05..3b2ddd1b976 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -158,12 +158,12 @@ typedef struct fid *fid_t; #define FI_MATCH_COMPLETE (1ULL << 31) #define FI_PEER_TRANSFER (1ULL << 36) -#define FI_MR_DMABUF (1ULL << 40) +/* #define FI_MR_DMABUF (1ULL << 40) */ #define FI_AV_USER_ID (1ULL << 41) #define FI_PEER (1ULL << 43) /* #define FI_XPU_TRIGGER (1ULL << 44) */ -#define FI_HMEM_HOST_ALLOC (1ULL << 45) -#define FI_HMEM_DEVICE_ONLY (1ULL << 46) +/* #define FI_HMEM_HOST_ALLOC (1ULL << 45) */ +/* 
#define FI_HMEM_DEVICE_ONLY (1ULL << 46) */ #define FI_HMEM (1ULL << 47) /* #define FI_VARIABLE_MSG (1ULL << 48) */ #define FI_RMA_PMEM (1ULL << 49) diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 548e4b6ad3e..321b55c49f6 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -122,6 +122,11 @@ struct fid_av { * Tracks registered memory regions, primarily for remote access, * but also for local access until we can remove that need. */ + +#define FI_MR_DMABUF (1ULL << 40) +#define FI_HMEM_HOST_ALLOC (1ULL << 45) +#define FI_HMEM_DEVICE_ONLY (1ULL << 46) + struct fid_mr { struct fid fid; void *mem_desc; From f41794a3c086f3ec75f7a08ef8f5f1d181b876d5 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sun, 29 Sep 2024 19:47:26 -0700 Subject: [PATCH 124/393] core: Define flag for single use MR Signed-off-by: Jianxin Xiong --- include/rdma/fi_domain.h | 1 + man/fi_mr.3.md | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 321b55c49f6..d42080c5f36 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -124,6 +124,7 @@ struct fid_av { */ #define FI_MR_DMABUF (1ULL << 40) +#define FI_MR_SINGLE_USE (1ULL << 41) #define FI_HMEM_HOST_ALLOC (1ULL << 45) #define FI_HMEM_DEVICE_ONLY (1ULL << 46) diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index 3a8e1fcd554..532b1f48a93 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -900,6 +900,12 @@ The follow flag may be specified to any memory registration call. fi_mr_attr structure. This flag is only usable for domains opened with FI_HMEM capability support. +*FI_MR_SINGLE_USE* +: This flag indicates that the memory region is only used for a single + operation. After the operation is complete, the key associated with the + memory region is automatically invalidated and can no longer be used for + remote access. + *FI_AUTH_KEY* : Only valid with domains configured with FI_AV_AUTH_KEY. 
When used with fi_mr_regattr, this flag denotes that the fi_mr_auth_key::src_addr field From cb4c846d1eb63859ef36c8e43244b3f714ae7a61 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sun, 29 Sep 2024 20:23:50 -0700 Subject: [PATCH 125/393] core: Define capability bit for tagged multi receive Currently FI_MULTI_RECV is effectively only defined for untagged message only. Simply expanding the definition to tagged message would cause difficulties in either provider support or discovery. Define FI_TAGGED_MULTI_RECV to indicate that multi recv is supported in tagged message as well. This is only used as a capability bit. The op flag and cq flag continues to use FI_MULTI_RECV. Signed-off-by: Jianxin Xiong --- include/rdma/fabric.h | 4 ++-- man/fi_endpoint.3.md | 2 +- man/fi_getinfo.3.md | 9 +++++++-- man/fi_tagged.3.md | 18 ++++++++++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 3b2ddd1b976..c6fe8f9d1ea 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -162,8 +162,8 @@ typedef struct fid *fid_t; #define FI_AV_USER_ID (1ULL << 41) #define FI_PEER (1ULL << 43) /* #define FI_XPU_TRIGGER (1ULL << 44) */ -/* #define FI_HMEM_HOST_ALLOC (1ULL << 45) */ -/* #define FI_HMEM_DEVICE_ONLY (1ULL << 46) */ + +#define FI_TAGGED_MULTI_RECV (1ULL << 46) #define FI_HMEM (1ULL << 47) /* #define FI_VARIABLE_MSG (1ULL << 48) */ #define FI_RMA_PMEM (1ULL << 49) diff --git a/man/fi_endpoint.3.md b/man/fi_endpoint.3.md index 146d9c7fcc9..516615cac7f 100644 --- a/man/fi_endpoint.3.md +++ b/man/fi_endpoint.3.md @@ -1339,7 +1339,7 @@ capability bits from the fi_info structure will be used. 
The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, -FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_TAGGED_MULTI_RECV, FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index 6219792257e..cc3835bb7da 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -333,6 +333,10 @@ additional optimizations. : Specifies that the endpoint must support the FI_MULTI_RECV flag when posting receive buffers. +*FI_TAGGED_MULTI_RECV* +: Specifies that the endpoint must support the FI_MULTI_RECV flag when + posting tagged receive buffers. + *FI_NAMED_RX_CTX* : Requests that endpoints which support multiple receive contexts allow an initiator to target (or name) a specific receive context as @@ -468,8 +472,9 @@ FI_AV_USER_ID, FI_PEER Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE -Secondary capabilities: FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SHARED_AV, -FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. +Secondary capabilities: FI_MULTI_RECV, FI_TAGGED_MULTI_RECV, FI_SOURCE, +FI_RMA_EVENT, FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, +FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. # MODE diff --git a/man/fi_tagged.3.md b/man/fi_tagged.3.md index 901a2b648cc..eaabd924323 100644 --- a/man/fi_tagged.3.md +++ b/man/fi_tagged.3.md @@ -264,6 +264,24 @@ and/or fi_tsendmsg. local buffer and transfer out of that buffer. This flag can only be used with messages smaller than inject_size. +*FI_MULTI_RECV* +: Applies to posted tagged receive operations when the FI_TAGGED_MULTI_RECV + capability is enabled. This flag allows the user to post a single + tagged receive buffer that will receive multiple incoming messages. 
+ Received messages will be packed into the receive buffer until the + buffer has been consumed. Use of this flag may cause a single + posted receive operation to generate multiple events as messages are + placed into the buffer. The placement of received data into the + buffer may be subjected to provider specific alignment restrictions. + + The buffer will be released by the provider when the available buffer + space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). + Note that an entry to the associated receive completion queue will + always be generated when the buffer has been consumed, even if other + receive completions have been suppressed (i.e. the Rx context has been + configured for FI_SELECTIVE_COMPLETION). See the FI_MULTI_RECV + completion flag [`fi_cq`(3)](fi_cq.3.html). + *FI_INJECT_COMPLETE* : Applies to fi_tsendmsg. Indicates that a completion should be generated when the source buffer(s) may be reused. From e98e13f900d20e254e5d218293f09758eedeb549 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sun, 29 Sep 2024 20:48:52 -0700 Subject: [PATCH 126/393] core: Define capability for tagged message only directed recv FI_DIRECTED_RECV covers both untagged and tagged message. However, the most often used case is for tagged message. Having a separate bit for tagged message allows the provider to optimize non-tagged message implementation while maintaining support for directed recv over tagged message.
Signed-off-by: Jianxin Xiong --- include/rdma/fabric.h | 1 + man/fi_av.3.md | 5 +++-- man/fi_endpoint.3.md | 3 ++- man/fi_getinfo.3.md | 8 ++++++-- man/fi_tagged.3.md | 6 +++--- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index c6fe8f9d1ea..59d2679aeeb 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -163,6 +163,7 @@ typedef struct fid *fid_t; #define FI_PEER (1ULL << 43) /* #define FI_XPU_TRIGGER (1ULL << 44) */ +#define FI_TAGGED_DIRECTED_RECV (1ULL << 45) #define FI_TAGGED_MULTI_RECV (1ULL << 46) #define FI_HMEM (1ULL << 47) /* #define FI_VARIABLE_MSG (1ULL << 48) */ diff --git a/man/fi_av.3.md b/man/fi_av.3.md index 006ce3f9d73..7aeba1802ea 100644 --- a/man/fi_av.3.md +++ b/man/fi_av.3.md @@ -384,8 +384,9 @@ Upon successful insert with FI_AUTH_KEY flag, the returned fi_addr_t's will map endpoint address against the specified authorization keys. These fi_addr_t's can be used as the target for local data transfer operations. -If the endpoint supports `FI_DIRECTED_RECV`, these fi_addr_t's can be used to -restrict receive buffers to a specific endpoint address and authorization key. +If the endpoint supports `FI_DIRECTED_RECV` or `FI_TAGGED_DIRECTED_RECV`, these +fi_addr_t's can be used to restrict receive buffers to a specific endpoint address +and authorization key. For address vectors configured with FI_AV_USER_ID, all subsequent target events corresponding to the address being inserted will return FI_ADDR_NOTAVAIL until diff --git a/man/fi_endpoint.3.md b/man/fi_endpoint.3.md index 516615cac7f..dfb51d69d00 100644 --- a/man/fi_endpoint.3.md +++ b/man/fi_endpoint.3.md @@ -1339,7 +1339,8 @@ capability bits from the fi_info structure will be used. 
The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, -FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_TAGGED_MULTI_RECV, +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, +FI_TAGGED_DIRECTED_RECV, FI_TAGGED_MULTI_RECV, FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index cc3835bb7da..7a29e56f2b3 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -290,6 +290,10 @@ additional optimizations. capability is not set, then the src_addr parameter for msg and tagged receive operations is ignored. +*FI_TAGGED_DIRECTED_RECV* +: Similar to FI_DIRECTED_RECV, but only applies to tagged receive + operations. + *FI_FENCE* : Indicates that the endpoint support the FI_FENCE flag on data transfer operations. Support requires tracking that all previous @@ -466,8 +470,8 @@ may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_MULTICAST, -FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, FI_XPU, -FI_AV_USER_ID, FI_PEER +FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_TAGGED_DIRECTED_RECV, FI_HMEM, +FI_COLLECTIVE, FI_XPU, FI_AV_USER_ID, FI_PEER Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE diff --git a/man/fi_tagged.3.md b/man/fi_tagged.3.md index eaabd924323..ec8c8ab8eb7 100644 --- a/man/fi_tagged.3.md +++ b/man/fi_tagged.3.md @@ -310,9 +310,9 @@ and/or fi_tsendmsg. *FI_AUTH_KEY* : Only valid with domains configured with FI_AV_AUTH_KEY and connectionless - endpoints configured with FI_DIRECTED_RECV. When used with fi_trecvmsg, this - flag denotes that the src_addr is an authorization key fi_addr_t instead of - an endpoint fi_addr_t. + endpoints configured with FI_DIRECTED_RECV or FI_TAGGED_DIRECTED_RECV. 
When + used with fi_trecvmsg, this flag denotes that the src_addr is an authorization + key fi_addr_t instead of an endpoint fi_addr_t. The following flags may be used with fi_trecvmsg. From e75829128326114909841a17c8f0f8e395de2ab9 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sun, 29 Sep 2024 20:03:55 -0700 Subject: [PATCH 127/393] core: Define capability for directed receive without wildcard src_addr The new bit FI_EXACT_DIRECTED_RECV is similar to FI_DIRECTED_RECV, but requires exact source address. I.e., the wildcard address FI_ADDR_UNSPEC is not allowed. It can be used alone, or be used together with FI_DIRECTED_RECV or FI_TAGGED_DIRECTED_RECV as a modifier. Not allowing wildcard source address allows the provider to better optimize the receive handling. Signed-off-by: Jianxin Xiong --- include/rdma/fabric.h | 2 +- man/fi_getinfo.3.md | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 59d2679aeeb..c96d2c79ddc 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -166,7 +166,7 @@ typedef struct fid *fid_t; #define FI_TAGGED_DIRECTED_RECV (1ULL << 45) #define FI_TAGGED_MULTI_RECV (1ULL << 46) #define FI_HMEM (1ULL << 47) -/* #define FI_VARIABLE_MSG (1ULL << 48) */ +#define FI_EXACT_DIRECTED_RECV (1ULL << 48) #define FI_RMA_PMEM (1ULL << 49) #define FI_SOURCE_ERR (1ULL << 50) #define FI_LOCAL_COMM (1ULL << 51) diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index 7a29e56f2b3..1f5b6e8b5ae 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -294,6 +294,13 @@ additional optimizations. : Similar to FI_DIRECTED_RECV, but only applies to tagged receive operations. +*FI_EXACT_DIRECTED_RECV* +: Similar to FI_DIRECTED_RECV, but requires the source address to be + exact, i.e., FI_ADDR_UNSPEC is not allowed.
This capability can + be used alone, or in conjunction with FI_DIRECTED_RECV or + FI_TAGGED_DIRECTED_RECV as a modifier to disallow FI_ADDR_UNSPEC + being used as the source address. + *FI_FENCE* : Indicates that the endpoint support the FI_FENCE flag on data transfer operations. Support requires tracking that all previous From 3e162dfdf29d72cd6d33c53da23605c6f7181e45 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sun, 18 Aug 2024 16:53:34 -0700 Subject: [PATCH 128/393] core: Introduce Sub-MR Memory registration consists of two parts: map/pin the memory for local access and export with a key for remote access. The first part is usually heavyweight and requires kernel involvement. The second part is less expensive and can be further separated into key allocation and key assignment. Key allocation may need kernel involvement, but key assignment can be done in user space. Here sub-MR is introduced as a way to allow separation of the aforementioned two parts, and key reservation is added to further optimize sub-MR creation. A sub-MR is created from an existing MR (the base MR). It inherits the memory mapping/pinning of the base MR but has its own access key. The address range exposed can be same as the base MR or a subpart of that. The access rights can be different, too. Now the base MR can be created with a few extra keys reserved. These reserved keys will be automatically used for sub-MR registration. This only applies to FI_MR_PROV_KEY mode.
Signed-off-by: Jianxin Xiong --- include/rdma/fi_domain.h | 2 ++ man/fi_mr.3.md | 39 +++++++++++++++++++++++++++++++++++-- prov/util/src/util_mr_map.c | 8 ++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index d42080c5f36..0e6d0acb605 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -182,6 +182,8 @@ struct fi_mr_attr { } device; void *hmem_data; size_t page_size; + const struct fid_mr *base_mr; + size_t sub_mr_cnt; }; struct fi_mr_modify { diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index 532b1f48a93..7e13d587c47 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -139,6 +139,14 @@ attributes (mr_mode field). Each mr_mode bit requires that an application take specific steps in order to use memory buffers with libfabric interfaces. +As a special case, a new memory region can be created from an existing +memory region. Such a new memory region is called a sub-MR, and the existing +memory region is called the base MR. Sub-MRs may be used to shared hardware +resources, such as virtual to physical address translations and page pinning. +This can improve performance when creating and destroying sub-regions that +need different access rights. The base MR itself can also be a sub-MR, +allowing for a hierarchy of memory regions. + The following apply to memory registration. *Default Memory Registration* @@ -575,8 +583,8 @@ into calls as function parameters. ```c struct fi_mr_attr { union { - const struct iovec *mr_iov; - const struct fi_mr_dmabuf *dmabuf; + const struct iovec *mr_iov; + const struct fi_mr_dmabuf *dmabuf; }; size_t iov_count; uint64_t access; @@ -595,6 +603,8 @@ struct fi_mr_attr { } device; void *hmem_data; size_t page_size; + const struct fid_mr *base_mr; + size_t sub_mr_cnt; }; struct fi_mr_auth_key { @@ -810,6 +820,31 @@ or from the region. Providers may choose to ignore page size. This will result in a provider selected page size always being used. 
+## base_mr + +If non-NULL, create a sub-MR from an existing memory region specified by +the base_mr field. + +The sub-MR must be fully contained within the base MR; however, the sub-MR +has its own authorization keys and access rights. The following attributes +are inherited from the base MR, and as a result, are ignored when creating the +sub-MR: + +iface, device, hmem_data, page_size + +The sub-MR should hold a reference to the base MR. When fi_close is called +on the base MR, the call would fail if there are any outstanding sub-MRs. + +The base_mr field must be NULL if the FI_MR_DMABUF flag is set. + +## sub_mr_cnt + +The number of sub-MRs expected to be created from the memory region. This +value is not a limit. Instead, it is a hint to the provider to allow provider +specific optimization for sub-MR creation. For example, the provider may +reserve access keys or pre-allocation fid_mr objects. The provider may +ignore this hint. + ## fi_hmem_ze_device Returns an hmem device identifier for a level zero tuple. diff --git a/prov/util/src/util_mr_map.c b/prov/util/src/util_mr_map.c index f08e350b4db..be337247ad7 100644 --- a/prov/util/src/util_mr_map.c +++ b/prov/util/src/util_mr_map.c @@ -285,6 +285,14 @@ void ofi_mr_update_attr(uint32_t user_version, uint64_t caps, cur_abi_attr->page_size = user_attr->page_size; else cur_abi_attr->page_size = 0; + + if (FI_VERSION_GE(user_version, FI_VERSION(2, 0))) { + cur_abi_attr->base_mr = user_attr->base_mr; + cur_abi_attr->sub_mr_cnt = user_attr->sub_mr_cnt; + } else { + cur_abi_attr->base_mr = NULL; + cur_abi_attr->sub_mr_cnt = 0; + } } int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, From 2669c6211725194a1ea3e2e7ef2adf19743a9818 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Thu, 10 Oct 2024 14:25:08 -0700 Subject: [PATCH 129/393] man: Clarify FI_HMEM support of inject calls Add text to clarify that only FI_HMEM_SYSTEM is allowed for inject calls if FI_MR_HMEM is required. 
Signed-off-by: Jianxin Xiong --- man/fi_msg.3.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/man/fi_msg.3.md b/man/fi_msg.3.md index 4b6e67cf876..1dd5ecd5ae5 100644 --- a/man/fi_msg.3.md +++ b/man/fi_msg.3.md @@ -173,6 +173,11 @@ to write CQ entries for all successful completions. See the flags discussion below for more details. The requested message size that can be used with fi_inject is limited by inject_size. +If FI_HMEM is enabled, the fi_inject call can only accept buffer with +iface equal to FI_HMEM_SYSTEM if the provider requires the FI_MR_HMEM +mr_mode. This limitation applies to all the fi_\*inject\* calls and +does not affect how inject_size is reported. + ## fi_senddata The send data call is similar to fi_send, but allows for the sending From d9c31ef3cbf29b8040b5a06183d09a2f61afc981 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Tue, 15 Oct 2024 21:17:00 +0000 Subject: [PATCH 130/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man3/fi_av.3 | 9 ++++---- man/man3/fi_endpoint.3 | 7 +++--- man/man3/fi_getinfo.3 | 28 ++++++++++++++++++----- man/man3/fi_mr.3 | 52 +++++++++++++++++++++++++++++++++++++++--- man/man3/fi_msg.3 | 8 ++++++- man/man3/fi_tagged.3 | 25 ++++++++++++++++++-- 6 files changed, 110 insertions(+), 19 deletions(-) diff --git a/man/man3/fi_av.3 b/man/man3/fi_av.3 index 45d58d2dbf0..866d0d635cc 100644 --- a/man/man3/fi_av.3 +++ b/man/man3/fi_av.3 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_av" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_av" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -430,9 +430,10 @@ authorization keys. These fi_addr_t\[cq]s can be used as the target for local data transfer operations. .PP -If the endpoint supports \f[V]FI_DIRECTED_RECV\f[R], these -fi_addr_t\[cq]s can be used to restrict receive buffers to a specific -endpoint address and authorization key. 
+If the endpoint supports \f[V]FI_DIRECTED_RECV\f[R] or +\f[V]FI_TAGGED_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to +restrict receive buffers to a specific endpoint address and +authorization key. .PP For address vectors configured with FI_AV_USER_ID, all subsequent target events corresponding to the address being inserted will return diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index 5f17bd5c5b1..ea3b4076c9f 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_endpoint" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_endpoint" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -1502,8 +1502,9 @@ capability bits from the fi_info structure will be used. .PP The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, -FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_MULTI_RECV, -FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, +FI_TAGGED_DIRECTED_RECV, FI_TAGGED_MULTI_RECV, FI_MULTI_RECV, FI_SOURCE, +FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. .PP Many applications will be able to ignore this field and rely solely on the fi_info::caps field. diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index 5cf752d53fd..c4ee256d526 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_getinfo" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_getinfo" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -337,6 +337,17 @@ incoming message when matching it with a receive buffer. If this capability is not set, then the src_addr parameter for msg and tagged receive operations is ignored. 
.TP +\f[I]FI_TAGGED_DIRECTED_RECV\f[R] +Similar to FI_DIRECTED_RECV, but only applies to tagged receive +operations. +.TP +\f[I]FI_EXACT_DIRECTED_RECV\f[R] +Similar to FI_DIRECTED_RECV, but requires the source address to be +exact, i.e., FI_ADDR_UNSPEC is not allowed. +This capability can be used alone, or in conjunction with +FI_DIRECTED_RECV or FI_TAGGED_DIRECTED_RECV as a modifier to disallow +FI_ADDR_UNSPEC being used as the source address. +.TP \f[I]FI_FENCE\f[R] Indicates that the endpoint support the FI_FENCE flag on data transfer operations. @@ -386,6 +397,10 @@ send-only or receive-only. Specifies that the endpoint must support the FI_MULTI_RECV flag when posting receive buffers. .TP +\f[I]FI_TAGGED_MULTI_RECV\f[R] +Specifies that the endpoint must support the FI_MULTI_RECV flag when +posting tagged receive buffers. +.TP \f[I]FI_NAMED_RX_CTX\f[R] Requests that endpoints which support multiple receive contexts allow an initiator to target (or name) a specific receive context as part of a @@ -527,15 +542,16 @@ A provider may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. .PP Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, -FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, -FI_XPU, FI_AV_USER_ID, FI_PEER +FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, +FI_TAGGED_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, FI_XPU, FI_AV_USER_ID, +FI_PEER .PP Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE .PP -Secondary capabilities: FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, -FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, FI_REMOTE_COMM, -FI_SOURCE_ERR, FI_RMA_PMEM. +Secondary capabilities: FI_MULTI_RECV, FI_TAGGED_MULTI_RECV, FI_SOURCE, +FI_RMA_EVENT, FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, +FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. 
.SH MODE .PP The operational mode bits are used to convey requirements that an diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index dd1a03d27a0..c651e29d73d 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_mr" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_mr" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -152,6 +152,17 @@ mode bits, specified through the domain attributes (mr_mode field). Each mr_mode bit requires that an application take specific steps in order to use memory buffers with libfabric interfaces. .PP +As a special case, a new memory region can be created from an existing +memory region. +Such a new memory region is called a sub-MR, and the existing memory +region is called the base MR. +Sub-MRs may be used to shared hardware resources, such as virtual to +physical address translations and page pinning. +This can improve performance when creating and destroying sub-regions +that need different access rights. +The base MR itself can also be a sub-MR, allowing for a hierarchy of +memory regions. +.PP The following apply to memory registration. .TP \f[I]Default Memory Registration\f[R] @@ -630,8 +641,8 @@ passed directly into calls as function parameters. \f[C] struct fi_mr_attr { union { - const struct iovec *mr_iov; - const struct fi_mr_dmabuf *dmabuf; + const struct iovec *mr_iov; + const struct fi_mr_dmabuf *dmabuf; }; size_t iov_count; uint64_t access; @@ -650,6 +661,8 @@ struct fi_mr_attr { } device; void *hmem_data; size_t page_size; + const struct fid_mr *base_mr; + size_t sub_mr_cnt; }; struct fi_mr_auth_key { @@ -880,6 +893,32 @@ failed transfers to or from the region. .PP Providers may choose to ignore page size. This will result in a provider selected page size always being used. +.SS base_mr +.PP +If non-NULL, create a sub-MR from an existing memory region specified by +the base_mr field. 
+.PP +The sub-MR must be fully contained within the base MR; however, the +sub-MR has its own authorization keys and access rights. +The following attributes are inherited from the base MR, and as a +result, are ignored when creating the sub-MR: +.PP +iface, device, hmem_data, page_size +.PP +The sub-MR should hold a reference to the base MR. +When fi_close is called on the base MR, the call would fail if there are +any outstanding sub-MRs. +.PP +The base_mr field must be NULL if the FI_MR_DMABUF flag is set. +.SS sub_mr_cnt +.PP +The number of sub-MRs expected to be created from the memory region. +This value is not a limit. +Instead, it is a hint to the provider to allow provider specific +optimization for sub-MR creation. +For example, the provider may reserve access keys or pre-allocation +fid_mr objects. +The provider may ignore this hint. .SS fi_hmem_ze_device .PP Returns an hmem device identifier for a level zero @@ -979,6 +1018,13 @@ fi_mr_attr structure. This flag is only usable for domains opened with FI_HMEM capability support. .TP +\f[I]FI_MR_SINGLE_USE\f[R] +This flag indicates that the memory region is only used for a single +operation. +After the operation is complete, the key associated with the memory +region is automatically invalidated and can no longer be used for remote +access. +.TP \f[I]FI_AUTH_KEY\f[R] Only valid with domains configured with FI_AV_AUTH_KEY. When used with fi_mr_regattr, this flag denotes that the diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index 81dbfdfc34b..4343f68e1ad 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_msg" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_msg" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -186,6 +186,12 @@ of the endpoint is to write CQ entries for all successful completions. See the flags discussion below for more details. 
The requested message size that can be used with fi_inject is limited by inject_size. +.PP +If FI_HMEM is enabled, the fi_inject call can only accept buffer with +iface equal to FI_HMEM_SYSTEM if the provider requires the FI_MR_HMEM +mr_mode. +This limitation applies to all the fi_*inject* calls and does not affect +how inject_size is reported. .SS fi_senddata .PP The send data call is similar to fi_send, but allows for the sending of diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index 1ac5bf963f8..399c4d278f9 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_tagged" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_tagged" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -283,6 +283,26 @@ This may require that the underlying provider implementation copy the data into a local buffer and transfer out of that buffer. This flag can only be used with messages smaller than inject_size. .TP +\f[I]FI_MULTI_RECV\f[R] +Applies to posted tagged receive operations when the +FI_TAGGED_MULTI_RECV capability is enabled. +This flag allows the user to post a single tagged receive buffer that +will receive multiple incoming messages. +Received messages will be packed into the receive buffer until the +buffer has been consumed. +Use of this flag may cause a single posted receive operation to generate +multiple events as messages are placed into the buffer. +The placement of received data into the buffer may be subjected to +provider specific alignment restrictions. +.PP +The buffer will be released by the provider when the available buffer +space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). 
+Note that an entry to the associated receive completion queue will +always be generated when the buffer has been consumed, even if other +receive completions have been suppressed (i.e.\ the Rx context has been +configured for FI_SELECTIVE_COMPLETION). +See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). +.TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_tsendmsg. Indicates that a completion should be generated when the source @@ -315,7 +335,8 @@ operation (inclusive) to the posting of a subsequent fenced operation .TP \f[I]FI_AUTH_KEY\f[R] Only valid with domains configured with FI_AV_AUTH_KEY and -connectionless endpoints configured with FI_DIRECTED_RECV. +connectionless endpoints configured with FI_DIRECTED_RECV or +FI_TAGGED_DIRECTED_RECV. When used with fi_trecvmsg, this flag denotes that the src_addr is an authorization key fi_addr_t instead of an endpoint fi_addr_t. .PP From 0031e7e3ef726eb8da17bc40ded0bceb3ab0266e Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 16 Oct 2024 16:18:38 +0000 Subject: [PATCH 131/393] contrib/aws: Increase Jenkins Job Timeout to 10h Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 74c6adf4f88..bba5a753093 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -151,7 +151,7 @@ pipeline { } options { buildDiscarder(logRotator(daysToKeepStr: "90")) - timeout(time: 8, unit: 'HOURS') + timeout(time: 10, unit: 'HOURS') } environment { // AWS region where the cluster is created From b66e9d1974db8071be7bd063225c6d57c81e8a7b Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 16 Oct 2024 16:22:46 +0000 Subject: [PATCH 132/393] contrib/aws: Set cluster level timeout for 9 hours Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 
bba5a753093..5b3e22e8444 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -190,7 +190,8 @@ pipeline { script { def stages = [:] // This needs the extra space at the end - def addl_args_pr = "--test-libfabric-pr $env.CHANGE_ID " + // Set 9 hour timeout for all clusters + def addl_args_pr = "--timeout 540 --test-libfabric-pr $env.CHANGE_ID " // Single Node Tests - EFA stages["1_g4dn_alinux2-efa"] = get_test_stage("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) From 56a08e033a08c59f9cae9f63adea72038d919e77 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 16 Oct 2024 16:27:57 +0000 Subject: [PATCH 133/393] contrib/aws: Simplify Jenkinsfile Get rid of unstable b/c our jobs are either pass/fail. Get rid of try/catch when calling shell scripts with throw b/c we don't need to do anything afterwards. Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 49 ++++++++++------------------------------- 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 5b3e22e8444..ad3a5391095 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -23,16 +23,11 @@ def download_and_extract_portafiducia(outputDir) { /* Download PortaFiducia tarball from S3 and extract to outputDir */ def tempPath = "/tmp/portafiducia.tar.gz" def downloadPath = this.get_portafiducia_download_path() - - def ret = sh ( - script: "mkdir -p ${outputDir} && aws s3 cp ${downloadPath} ${tempPath} && " + - "tar xf ${tempPath} -C ${outputDir}", - returnStatus: true, - ) - - if (ret != 0) { - unstable('Failed to download and extract PortaFiducia') - } + sh """ + mkdir -p ${outputDir} + aws s3 cp ${downloadPath} ${tempPath} + tar xf ${tempPath} -C ${outputDir} + """ } def install_porta_fiducia() { @@ -55,17 +50,7 @@ def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_ */ def cluster_name = 
get_cluster_name(build_tag, os, instance_type) def args = "--config configs/${test_config_file} --os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" - def ret = sh ( - script: ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}", - returnStatus: true - ) - if (ret == 65) - unstable('Scripts exited with status 65') - else if (ret != 0) - build_ok = false - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh "exit ${ret}" - } + sh ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}" } def get_random_string(len) { @@ -102,22 +87,12 @@ def get_single_node_windows_test_stage(stage_name) { */ return { stage("${stage_name}") { - def ret = sh ( - script: """ - . venv/bin/activate; - cd PortaFiducia/scripts; - export PULL_REQUEST_ID=${env.CHANGE_ID}; - env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; - """, - returnStatus: true - ) - if (ret == 65) - unstable('Scripts exited with status 65') - else if (ret != 0) - build_ok = false - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh "exit ${ret}" - } + sh """ + . venv/bin/activate; + cd PortaFiducia/scripts; + export PULL_REQUEST_ID=${env.CHANGE_ID}; + env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; + """ } } From a56a951e2aea1402966037f159a631aeec016a77 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 7 Oct 2024 08:50:32 -0500 Subject: [PATCH 134/393] prov/cxi: Update CXI provider Push internal CXI provider upstream. 
Signed-off-by: Ian Ziemba --- prov/cxi/configure.m4 | 4 +- prov/cxi/include/cxip.h | 133 +++- prov/cxi/include/cxip_faults.h | 93 +-- prov/cxi/include/fi_cxi_ext.h | 65 +- prov/cxi/src/cxip_atomic.c | 20 +- prov/cxi/src/cxip_cmdq.c | 128 +++- prov/cxi/src/cxip_cntr.c | 89 ++- prov/cxi/src/cxip_coll.c | 935 ++++++++++++++++++++-------- prov/cxi/src/cxip_coll_trace.c | 36 +- prov/cxi/src/cxip_cq.c | 70 ++- prov/cxi/src/cxip_ctrl.c | 2 +- prov/cxi/src/cxip_curl.c | 370 ++++++++--- prov/cxi/src/cxip_dom.c | 4 +- prov/cxi/src/cxip_ep.c | 15 +- prov/cxi/src/cxip_eq.c | 16 +- prov/cxi/src/cxip_evtq.c | 2 +- prov/cxi/src/cxip_faults.c | 86 ++- prov/cxi/src/cxip_if.c | 2 +- prov/cxi/src/cxip_info.c | 41 +- prov/cxi/src/cxip_iomm.c | 20 +- prov/cxi/src/cxip_mr.c | 65 +- prov/cxi/src/cxip_msg.c | 21 +- prov/cxi/src/cxip_msg_hpc.c | 84 +-- prov/cxi/src/cxip_msg_rnr.c | 24 +- prov/cxi/src/cxip_repsum.c | 4 +- prov/cxi/src/cxip_rma.c | 51 +- prov/cxi/src/cxip_rxc.c | 6 +- prov/cxi/src/cxip_telemetry.c | 53 +- prov/cxi/src/cxip_txc.c | 53 +- prov/cxi/test/atomic.c | 319 +++++++++- prov/cxi/test/cntr.c | 66 ++ prov/cxi/test/coll.c | 70 +-- prov/cxi/test/cuda.c | 10 +- prov/cxi/test/cxi_vm_commit.sh | 97 +++ prov/cxi/test/cxi_vm_pr.sh | 19 + prov/cxi/test/ep.c | 5 +- prov/cxi/test/mr.c | 6 +- prov/cxi/test/multinode/test_coll.c | 876 ++++++++++++-------------- prov/cxi/test/rma.c | 271 +++++++- prov/cxi/test/startvm-setup.sh | 2 +- prov/cxi/test/startvm.sh | 2 +- prov/cxi/test/test.sh | 0 42 files changed, 2902 insertions(+), 1333 deletions(-) create mode 100755 prov/cxi/test/cxi_vm_commit.sh create mode 100755 prov/cxi/test/cxi_vm_pr.sh mode change 100644 => 100755 prov/cxi/test/test.sh diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 index b8b53d9fdb3..ec76590ca2e 100644 --- a/prov/cxi/configure.m4 +++ b/prov/cxi/configure.m4 @@ -97,8 +97,8 @@ AC_DEFUN([FI_CXI_CONFIGURE],[ cxi_LIBS="$cxi_LIBS $libcurl_LIBS" # Add on json if installed in non-default location. 
- if test "$with_json" != "" && test "$with_json" != "no"; then - FI_CHECK_PREFIX_DIR([$with_json], [json]) + if test "$with_json_c" != "" && test "$with_json_c" != "no"; then + FI_CHECK_PREFIX_DIR([$with_json_c], [json]) else json_PREFIX="" json_LIBDIR="" diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index b959ff99eaa..f1b38ed7ea9 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -4,7 +4,7 @@ * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. - * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2018-2024 Hewlett Packard Enterprise Development LP */ #ifndef _CXIP_PROV_H_ @@ -123,6 +123,9 @@ #define CXIP_UX_BUFFER_SIZE (CXIP_OFLOW_BUF_MIN_POSTED * \ CXIP_OFLOW_BUF_SIZE) +#define CXIP_MR_CACHE_EVENTS_DISABLE_POLL_NSECS 100000U +#define CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS 1000000000U + /* When device memory is safe to access via load/store then the * CPU will be used to move data below this threshold. 
*/ @@ -131,8 +134,8 @@ #define CXIP_EP_PRI_CAPS \ (FI_RMA | FI_ATOMICS | FI_TAGGED | FI_RECV | FI_SEND | \ FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE | \ - FI_DIRECTED_RECV | FI_MSG | FI_NAMED_RX_CTX | \ - FI_COLLECTIVE | FI_HMEM) + FI_DIRECTED_RECV | FI_MSG | FI_NAMED_RX_CTX | FI_HMEM | \ + FI_COLLECTIVE) #define CXIP_EP_SEC_CAPS \ (FI_SOURCE | FI_SOURCE_ERR | FI_LOCAL_COMM | \ FI_REMOTE_COMM | FI_RMA_EVENT | FI_MULTI_RECV | FI_FENCE | FI_TRIGGER) @@ -148,8 +151,7 @@ FI_ORDER_ATOMIC_RAR) #define CXIP_EP_CQ_FLAGS \ - (FI_SEND | FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION | \ - FI_COLLECTIVE) + (FI_SEND | FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION) #define CXIP_EP_CNTR_FLAGS \ (FI_SEND | FI_RECV | FI_READ | FI_WRITE | FI_REMOTE_READ | \ FI_REMOTE_WRITE) @@ -177,7 +179,7 @@ #define CXIP_MINOR_VERSION 1 #define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ CXIP_MINOR_VERSION) -#define CXIP_FI_VERSION FI_VERSION(1, 21) +#define CXIP_FI_VERSION FI_VERSION(1, 22) #define CXIP_WIRE_PROTO_VERSION 1 #define CXIP_COLL_MAX_CONCUR 8 @@ -185,19 +187,24 @@ #define CXIP_COLL_MIN_RX_SIZE 4096 #define CXIP_COLL_MIN_MULTI_RECV 64 #define CXIP_COLL_MAX_DATA_SIZE 32 -#define CXIP_COLL_MAX_SEQNO (1 << 10) +#define CXIP_COLL_MAX_SEQNO ((1 << 10) - 1) +#define CXIP_COLL_MOD_SEQNO (CXIP_COLL_MAX_SEQNO - 1) + // TODO adjust based on performance testing -#define CXIP_COLL_MIN_RETRY_USEC 1 -#define CXIP_COLL_MAX_RETRY_USEC 32000 -#define CXIP_COLL_MIN_TIMEOUT_USEC 1 -#define CXIP_COLL_MAX_TIMEOUT_USEC 32000 +#define CXIP_COLL_MIN_RETRY_USEC 1 +#define CXIP_COLL_MAX_RETRY_USEC 32000 +#define CXIP_COLL_MIN_TIMEOUT_USEC 1 +#define CXIP_COLL_MAX_TIMEOUT_USEC 32000 +#define CXIP_COLL_MIN_FM_TIMEOUT_MSEC 1 +#define CXIP_COLL_DFL_FM_TIMEOUT_MSEC 100 +#define CXIP_COLL_MAX_FM_TIMEOUT_MSEC 1000000 #define CXIP_REQ_BUF_HEADER_MAX_SIZE (sizeof(struct c_port_fab_hdr) + \ sizeof(struct c_port_unrestricted_hdr)) #define CXIP_REQ_BUF_HEADER_MIN_SIZE (sizeof(struct c_port_fab_hdr) + 
\ sizeof(struct c_port_small_msg_hdr)) -extern int s_page_size; +extern int sc_page_size; extern char cxip_prov_name[]; extern struct fi_provider cxip_prov; extern struct util_prov cxip_util_prov; @@ -301,6 +308,7 @@ struct cxip_environment { char *coll_job_step_id; size_t coll_retry_usec; size_t coll_timeout_usec; + size_t coll_fm_timeout_msec; char *coll_fabric_mgr_url; char *coll_mcast_token; size_t hwcoll_addrs_per_job; @@ -316,6 +324,8 @@ struct cxip_environment { int enable_trig_op_limit; int hybrid_posted_recv_preemptive; int hybrid_unexpected_msg_preemptive; + size_t mr_cache_events_disable_poll_nsecs; + size_t mr_cache_events_disable_le_poll_nsecs; }; extern struct cxip_environment cxip_env; @@ -713,7 +723,7 @@ struct cxip_lni { /* Software remapped communication profiles. */ struct dlist_entry remap_cps; - ofi_spin_t lock; + pthread_rwlock_t cp_lock; }; /* A portals table define a network endpoint address. The endpoint address is @@ -1144,7 +1154,7 @@ struct cxip_req_recv { uint32_t rdzv_initiator; // Rendezvous initiator used for mrecvs uint32_t rget_nic; uint32_t rget_pid; - bool software_list; // Appended to HW or SW + int multirecv_inflight; // SW EP Multi-receives in progress bool canceled; // Request canceled? bool unlinked; bool multi_recv; @@ -1850,7 +1860,7 @@ struct cxip_rxc { struct cxip_evtq rx_evtq; struct cxip_pte *rx_pte; struct cxip_cmdq *rx_cmdq; - ofi_atomic32_t orx_reqs; + int orx_reqs; /* If FI_MULTI_RECV is supported, minimum receive size required * for buffers posted. 
@@ -2212,7 +2222,7 @@ struct cxip_txc { struct ofi_bufpool *ibuf_pool; struct cxip_cmdq *tx_cmdq; // added during cxip_txc_enable() - ofi_atomic32_t otx_reqs; // outstanding transmit requests + int otx_reqs; // outstanding transmit requests /* Queue of TX messages in flight for the context */ struct dlist_entry msg_queue; @@ -2431,6 +2441,54 @@ struct cxip_ep_obj { struct cxip_portals_table *ptable; }; +static inline void cxip_txc_otx_reqs_inc(struct cxip_txc *txc) +{ + assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); + txc->otx_reqs++; +} + +static inline void cxip_txc_otx_reqs_dec(struct cxip_txc *txc) +{ + assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); + txc->otx_reqs--; + assert(txc->otx_reqs >= 0); +} + +static inline int cxip_txc_otx_reqs_get(struct cxip_txc *txc) +{ + assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); + return txc->otx_reqs; +} + +static inline void cxip_txc_otx_reqs_init(struct cxip_txc *txc) +{ + txc->otx_reqs = 0; +} + +static inline void cxip_rxc_orx_reqs_inc(struct cxip_rxc *rxc) +{ + assert(ofi_genlock_held(&rxc->ep_obj->lock) == 1); + rxc->orx_reqs++; +} + +static inline void cxip_rxc_orx_reqs_dec(struct cxip_rxc *rxc) +{ + assert(ofi_genlock_held(&rxc->ep_obj->lock) == 1); + rxc->orx_reqs--; + assert(rxc->orx_reqs >= 0); +} + +static inline int cxip_rxc_orx_reqs_get(struct cxip_rxc *rxc) +{ + assert(ofi_genlock_held(&rxc->ep_obj->lock) == 1); + return rxc->orx_reqs; +} + +static inline void cxip_rxc_orx_reqs_init(struct cxip_rxc *rxc) +{ + rxc->orx_reqs = 0; +} + /* * CXI endpoint implementations to support FI_CLASS_EP. 
*/ @@ -2711,18 +2769,7 @@ enum cxip_coll_state { CXIP_COLL_STATE_FAULT, }; -/* Similar to C_RC_* provider errors, but pure libfabric */ -/* These should be in priority order, from lowest to highest */ -enum cxip_coll_prov_errno { - CXIP_PROV_ERRNO_OK = -1, // good - CXIP_PROV_ERRNO_PTE = -2, // PTE setup failure - CXIP_PROV_ERRNO_MCAST_INUSE = -3, // multicast in-use - CXIP_PROV_ERRNO_HWROOT_INUSE = -4, // hwroot in-use - CXIP_PROV_ERRNO_MCAST_INVALID = -5, // multicast invalid - CXIP_PROV_ERRNO_HWROOT_INVALID = -6, // hwroot invalid - CXIP_PROV_ERRNO_CURL = -7, // CURL failure - CXIP_PROV_ERRNO_LAST = -8, // last error code (unused) -}; +const char *cxip_strerror(int prov_errno); /* Rosetta reduction engine error codes */ typedef enum cxip_coll_rc { @@ -2778,6 +2825,33 @@ struct cxip_coll_data { bool initialized; }; +struct coll_counters { + int32_t coll_recv_cnt; + int32_t send_cnt; + int32_t recv_cnt; + int32_t pkt_cnt; + int32_t seq_err_cnt; + int32_t tmout_cnt; +}; + +struct cxip_coll_metrics_ep { + int myrank; + bool isroot; +}; +struct cxip_coll_metrics { + long red_count_bad; + long red_count_full; + long red_count_partial; + long red_count_unreduced; + struct cxip_coll_metrics_ep ep_data; +}; + +void cxip_coll_reset_mc_ctrs(struct fid_mc *mc); +void cxip_coll_get_mc_ctrs(struct fid_mc *mc, struct coll_counters *counters); + +void cxip_coll_init_metrics(void); +void cxip_coll_get_metrics(struct cxip_coll_metrics *metrics); + struct cxip_coll_reduction { struct cxip_coll_mc *mc_obj; // parent mc_obj uint32_t red_id; // reduction id @@ -2807,6 +2881,7 @@ struct cxip_coll_mc { struct cxip_zbcoll_obj *zb; // zb object for zbcol struct cxip_coll_pte *coll_pte; // collective PTE struct timespec timeout; // state machine timeout + struct timespec curlexpires; // CURL delete expiration timeout fi_addr_t mynode_fiaddr; // fi_addr of this node int mynode_idx; // av_set index of this node uint32_t hwroot_idx; // av_set index of hwroot node @@ -3165,8 +3240,6 @@ void 
cxip_coll_limit_red_id(struct fid_mc *mc, int max_red_id); void cxip_coll_drop_send(struct cxip_coll_reduction *reduction); void cxip_coll_drop_recv(struct cxip_coll_reduction *reduction); -void cxip_coll_reset_mc_ctrs(struct fid_mc *mc); - void cxip_dbl_to_rep(struct cxip_repsum *x, double d); void cxip_rep_to_dbl(double *d, const struct cxip_repsum *x); void cxip_rep_add(struct cxip_repsum *x, const struct cxip_repsum *y); diff --git a/prov/cxi/include/cxip_faults.h b/prov/cxi/include/cxip_faults.h index e9b28f17fe9..503a178e5dd 100644 --- a/prov/cxi/include/cxip_faults.h +++ b/prov/cxi/include/cxip_faults.h @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + * Copyright (c) 2019-2024 Hewlett Packard Enterprise Development LP */ /* Fault injection. */ @@ -60,89 +60,14 @@ enum { CXIP_TRAP_GETGRP, CXIP_TRAP_BCAST, CXIP_TRAP_REDUCE, + CXIP_TRAP_CURL_FM_URL, + CXIP_TRAP_CURL_TOKEN, + CXIP_TRAP_HWROOT_INVAL, + CXIP_TRAP_HWROOT_INUSE, + CXIP_TRAP_MCAST_INUSE, CXIP_TRAP_INITPTE, - CXIP_TRAP_CURLSND, - CXIP_TRAP_CURLRCV, }; -#if ENABLE_DEBUG -/* structure used to simulate failures */ -struct _cxip_trap { - struct dlist_entry link; - int index; - int trap; - int err; -}; - -struct dlist_entry _trap_list; -bool _trap_initialized; - -static void _cxip_trap_close(void) -{ - struct _cxip_trap *trap_obj; - - if (!_trap_initialized) - return; - while (!dlist_empty(&_trap_list)) { - dlist_pop_front(&_trap_list, struct _cxip_trap, trap_obj, link); - free(trap_obj); - } -} - -static void _cxip_trap_set(int index, int trap, int err) -{ - struct _cxip_trap *trap_obj; - - if (!_trap_initialized) { - dlist_init(&_trap_list); - _trap_initialized = true; - } - trap_obj = calloc(1, sizeof(*trap_obj)); - if (!trap_obj) - return; - dlist_init(&trap_obj->link); - trap_obj->index = index; - trap_obj->trap = trap; - trap_obj->err = err; - dlist_insert_tail(&_trap_list, &trap_obj->link); -} - -static 
bool _cxip_trap_search(int index, int trap, int *err) -{ - struct _cxip_trap *trap_obj; - struct dlist_entry *item; - - if (!_trap_initialized) - return false; - - dlist_foreach(&_trap_list, item) { - trap_obj = container_of(item, struct _cxip_trap, link); - if (trap_obj->index != index) - continue; - if (trap_obj->trap != trap) - continue; - dlist_remove(item); - *err = trap_obj->err; - free(trap_obj); - return true; - } - return false; -} - -static inline void cxip_trap_close(void) -{ - _cxip_trap_close(); -} -static inline void cxip_trap_set(int index, int trap, int err) -{ - _cxip_trap_set(index, trap, err); -} -static inline bool cxip_trap_search(int index, int trap, int *err) -{ - return _cxip_trap_search(index, trap, err); -} -#else -static inline void cxip_trap_close(void) {} -static inline void cxip_trap_set(int a, int b, int c) {} -static inline bool cxip_trap_search(int a, int b, int *c) {return false;} -#endif +void cxip_trap_close(void); +void cxip_trap_set(int index, int trap, int err, int prov_errno); +bool cxip_trap_search(int index, int trap, int *err, int *prov_errno); diff --git a/prov/cxi/include/fi_cxi_ext.h b/prov/cxi/include/fi_cxi_ext.h index a2775cbc253..e8205fc6d2a 100644 --- a/prov/cxi/include/fi_cxi_ext.h +++ b/prov/cxi/include/fi_cxi_ext.h @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2020-2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP */ #ifndef _FI_CXI_EXT_H_ @@ -67,6 +67,15 @@ enum { */ #define FI_CXI_CNTR_EVENTS_BYTES 1 /* FI_CNTR_EVENTS_BYTES */ +/* + * CXI provider specific counter flag to return current/cached counter value + * in host memory. A request to update the count is requested, but the routine + * does not wait for the update to complete. Subsequent reads will pick up + * the updated counter value. 
The normal behavior is to wait for a memory update + * to complete (or to use the domain ops counter routines). + */ +#define FI_CXI_CNTR_CACHED (1ULL << 32) + /* * TODO: Set this to the upstream value prior to releasing software. * This flag returned in a completion and indicates that the message was @@ -396,6 +405,60 @@ enum cxip_comm_key_type { COMM_KEY_MAX }; +/* Extends C_RC_* driver errors for libfabric */ +/* Translated to strings by cxip_strerror() -- keep synchronized */ +enum cxip_coll_prov_errno { + /* C_RC_* from cxi-driver overlaps first 6 bits of space [0,63] */ + + /* collectives CQ reduction error codes + * highest number error predominates + */ + FI_CXI_ERRNO_RED_FIRST = 1024, + FI_CXI_ERRNO_RED_FLT_OVERFLOW = 1024, + /* double precision value overflow */ + FI_CXI_ERRNO_RED_FLT_INVALID = 1025, + /* double precision sNAN/inf value */ + FI_CXI_ERRNO_RED_INT_OVERFLOW = 1026, + /* reproducible sum overflow */ + FI_CXI_ERRNO_RED_CONTR_OVERFLOW = 1027, + /* reduction contribution overflow */ + FI_CXI_ERRNO_RED_OP_MISMATCH = 1028, + /* reduction opcode mismatch */ + FI_CXI_ERRNO_RED_MC_FAILURE = 1029, + /* unused */ + FI_CXI_ERRNO_RED_OTHER = 1030, + /* non-specific reduction error, fatal */ + FI_CXI_ERRNO_RED_LAST = 1031, + + /* collectives EQ join error codes + * highest number error predominates + */ + FI_CXI_ERRNO_JOIN_FIRST = 2048, + FI_CXI_ERRNO_JOIN_MCAST_INUSE = 2048, + /* endpoint already using mcast address */ + FI_CXI_ERRNO_JOIN_HWROOT_INUSE = 2049, + /* endpoint already serving as HWRoot */ + FI_CXI_ERRNO_JOIN_MCAST_INVALID = 2050, + /* mcast address from FM is invalid */ + FI_CXI_ERRNO_JOIN_HWROOT_INVALID = 2051, + /* HWRoot address from FM is invalid */ + FI_CXI_ERRNO_JOIN_CURL_FAILED = 2052, + /* libcurl initiation failed */ + FI_CXI_ERRNO_JOIN_CURL_TIMEOUT = 2053, + /* libcurl timed out */ + FI_CXI_ERRNO_JOIN_SERVER_ERR = 2054, + /* unhandled CURL response code */ + FI_CXI_ERRNO_JOIN_FAIL_PTE = 2055, + /* libfabric PTE allocation failed */ 
+ FI_CXI_ERRNO_JOIN_OTHER = 2056, + /* non-specific JOIN error, fatal */ + FI_CXI_ERRNO_JOIN_LAST = FI_CXI_ERRNO_JOIN_FIRST + 43, + /* LAST is determined by the 43-bit error mask . + * Result is the OR of all bits set by different endpoints. + * This reserves space for all 43 bits for new errors. + */ +}; + typedef unsigned int cxip_coll_op_t; // CXI collective opcode struct cxip_coll_mcast_key { diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c index be5446e517c..49218a324a3 100644 --- a/prov/cxi/src/cxip_atomic.c +++ b/prov/cxi/src/cxip_atomic.c @@ -35,7 +35,7 @@ _Static_assert(CXIP_AMO_MAX_IOV == 1, "Unexpected max IOV #"); /** * Data type codes for all of the supported fi_datatype values. */ -static enum c_atomic_type _cxip_amo_type_code[OFI_DATATYPE_LAST] = { +static enum c_atomic_type _cxip_amo_type_code[] = { [FI_INT8] = C_AMO_TYPE_INT8_T, [FI_UINT8] = C_AMO_TYPE_UINT8_T, [FI_INT16] = C_AMO_TYPE_INT16_T, @@ -48,13 +48,15 @@ static enum c_atomic_type _cxip_amo_type_code[OFI_DATATYPE_LAST] = { [FI_DOUBLE] = C_AMO_TYPE_DOUBLE_T, [FI_FLOAT_COMPLEX] = C_AMO_TYPE_FLOAT_COMPLEX_T, [FI_DOUBLE_COMPLEX] = C_AMO_TYPE_DOUBLE_COMPLEX_T, + /* Only 128-bit op suppported is FI_CSWAP, so FI_INT128 should work. */ + [FI_INT128] = C_AMO_TYPE_UINT128_T, + [FI_UINT128] = C_AMO_TYPE_UINT128_T, }; -//TODO: C_AMO_TYPE_UINT128_T /** * AMO operation codes for all of the fi_op values. */ -static enum c_atomic_op _cxip_amo_op_code[OFI_ATOMIC_OP_LAST] = { +static enum c_atomic_op _cxip_amo_op_code[FI_ATOMIC_OP_LAST] = { [FI_MIN] = C_AMO_OP_MIN, [FI_MAX] = C_AMO_OP_MAX, [FI_SUM] = C_AMO_OP_SUM, @@ -82,7 +84,7 @@ static enum c_atomic_op _cxip_amo_op_code[OFI_ATOMIC_OP_LAST] = { /** * AMO swap operation codes for the CSWAP comparison conditions. 
*/ -static enum c_cswap_op _cxip_amo_swpcode[OFI_ATOMIC_OP_LAST] = { +static enum c_cswap_op _cxip_amo_swpcode[FI_ATOMIC_OP_LAST] = { [FI_CSWAP] = C_AMO_OP_CSWAP_EQ, [FI_CSWAP_NE] = C_AMO_OP_CSWAP_NE, [FI_CSWAP_LE] = C_AMO_OP_CSWAP_LE, @@ -96,7 +98,7 @@ static enum c_cswap_op _cxip_amo_swpcode[OFI_ATOMIC_OP_LAST] = { * correspond to the 14 possible fi_datatype values. The OP_VALID() macro will * return a 1 if the (request,op,dt) triple is supported by Cassini. */ -static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][OFI_ATOMIC_OP_LAST] = { +static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][FI_ATOMIC_OP_LAST] = { [CXIP_RQ_AMO] = { [FI_MIN] = 0x03ff, @@ -126,7 +128,7 @@ static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][OFI_ATOMIC_OP_LAST] = { }, [CXIP_RQ_AMO_SWAP] = { - [FI_CSWAP] = 0x0fff, + [FI_CSWAP] = 0xcfff, [FI_CSWAP_NE] = 0x0fff, [FI_CSWAP_LE] = 0x03ff, [FI_CSWAP_LT] = 0x03ff, @@ -175,8 +177,8 @@ int _cxip_atomic_opcode(enum cxip_amo_req_type req_type, enum fi_datatype dt, int opcode; int dtcode; - if (dt < 0 || dt >= OFI_DATATYPE_LAST || - op < 0 || op >= OFI_ATOMIC_OP_LAST) + if (dt < 0 || dt >= ARRAY_SIZE(_cxip_amo_type_code) || + op < 0 || op >= FI_ATOMIC_OP_LAST) return -FI_EINVAL; if (!OP_VALID(req_type, op, dt)) @@ -448,7 +450,7 @@ static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event) TXC_WARN_RET(txc, ret, "Failed to report error\n"); } - ofi_atomic_dec32(&req->amo.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->amo.txc); cxip_evtq_req_free(req); return FI_SUCCESS; diff --git a/prov/cxi/src/cxip_cmdq.c b/prov/cxi/src/cxip_cmdq.c index d2fae71c92b..b60eb06231c 100644 --- a/prov/cxi/src/cxip_cmdq.c +++ b/prov/cxi/src/cxip_cmdq.c @@ -25,19 +25,13 @@ enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass) } } -static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, +static int cxip_cp_find(struct cxip_lni *lni, uint16_t vni, enum cxi_traffic_class tc, enum cxi_traffic_class_type tc_type, struct cxi_cp **cp) { - int ret; - int i; 
struct cxip_remap_cp *sw_cp; - static const enum cxi_traffic_class remap_tc = CXI_TC_BEST_EFFORT; - ofi_spin_lock(&lni->lock); - - /* Always prefer SW remapped CPs over allocating HW CP. */ dlist_foreach_container(&lni->remap_cps, struct cxip_remap_cp, sw_cp, remap_entry) { if (sw_cp->remap_cp.vni == vni && sw_cp->remap_cp.tc == tc && @@ -47,10 +41,40 @@ static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, cxi_tc_to_str(sw_cp->remap_cp.tc), cxi_tc_type_to_str(sw_cp->remap_cp.tc_type)); *cp = &sw_cp->remap_cp; - goto success_unlock; + return FI_SUCCESS; } } + return -FI_ENOENT; +} + +static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxi_cp **cp) +{ + int ret; + int i; + struct cxip_remap_cp *sw_cp; + static const enum cxi_traffic_class remap_tc = CXI_TC_BEST_EFFORT; + + /* Always prefer SW remapped CPs over allocating HW CP. */ + pthread_rwlock_rdlock(&lni->cp_lock); + ret = cxip_cp_find(lni, vni, tc, tc_type, cp); + pthread_rwlock_unlock(&lni->cp_lock); + + if (ret == FI_SUCCESS) + return FI_SUCCESS; + + /* Need to repeat search with write lock held to ensure no CPs have + * been added in threaded env. + */ + pthread_rwlock_wrlock(&lni->cp_lock); + ret = cxip_cp_find(lni, vni, tc, tc_type, cp); + + if (ret == FI_SUCCESS) + goto success_unlock; + /* Allocate a new SW remapped CP entry and attempt to allocate the * user requested HW CP. 
*/ @@ -113,14 +137,14 @@ static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, *cp = &sw_cp->remap_cp; success_unlock: - ofi_spin_unlock(&lni->lock); + pthread_rwlock_unlock(&lni->cp_lock); return FI_SUCCESS; err_free_sw_cp: free(sw_cp); err_unlock: - ofi_spin_unlock(&lni->lock); + pthread_rwlock_unlock(&lni->cp_lock); return ret; } @@ -144,6 +168,7 @@ int cxip_cmdq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, ret = cxi_cq_emit_cq_lcid(cmdq->dev_cmdq, cp->lcid); if (ret) { CXIP_DBG("Failed to update CMDQ(%p) CP: %d\n", cmdq, ret); + cxi_cq_ring(cmdq->dev_cmdq); ret = -FI_EAGAIN; } else { ret = FI_SUCCESS; @@ -241,6 +266,7 @@ int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, ret = cxi_cq_emit_c_state(cmdq->dev_cmdq, c_state); if (ret) { CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); + cxi_cq_ring(cmdq->dev_cmdq); return -FI_EAGAIN; } @@ -262,7 +288,8 @@ int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -270,17 +297,26 @@ int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, fi_strerror(-ret)); - return ret; + goto err; } ret = cxi_cq_emit_idc_put(cmdq->dev_cmdq, put, buf, len); if (ret) { CXIP_WARN("Failed to emit idc_put command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. 
+ */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, @@ -293,7 +329,8 @@ int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -301,10 +338,19 @@ int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, if (ret) { CXIP_WARN("Failed to emit dma command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. + */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, @@ -333,7 +379,8 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -341,7 +388,7 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, fi_strerror(-ret)); - return ret; + goto err; } /* Fetching AMO with flush requires two commands. Ensure there is enough @@ -349,13 +396,15 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, */ if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } ret = cxi_cq_emit_idc_amo(cmdq->dev_cmdq, amo, fetching); if (ret) { CXIP_WARN("Failed to emit IDC amo\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } if (fetching_flush) { @@ -367,6 +416,14 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. 
+ */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, @@ -394,7 +451,8 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -403,13 +461,15 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, */ if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } ret = cxi_cq_emit_dma_amo(cmdq->dev_cmdq, amo, fetching); if (ret) { CXIP_WARN("Failed to emit DMA amo\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } if (fetching_flush) { @@ -421,6 +481,14 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. + */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, @@ -435,7 +503,8 @@ int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } } @@ -443,15 +512,24 @@ int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, if (ret) { CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, fi_strerror(-ret)); - return ret; + goto err; } ret = cxi_cq_emit_idc_msg(cmdq->dev_cmdq, msg, buf, len); if (ret) { CXIP_WARN("Failed to emit idc_msg command: %d:%s\n", ret, fi_strerror(-ret)); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto err; } return FI_SUCCESS; + +err: + /* On error (e.g. command queue full), always ring the CQ to prevent + * FI_MORE deadlock. 
+ */ + cxi_cq_ring(cmdq->dev_cmdq); + + return ret; } diff --git a/prov/cxi/src/cxip_cntr.c b/prov/cxi/src/cxip_cntr.c index 8a0989b479e..c94933fbd97 100644 --- a/prov/cxi/src/cxip_cntr.c +++ b/prov/cxi/src/cxip_cntr.c @@ -56,20 +56,33 @@ static int cxip_cntr_get_ct_error(struct cxip_cntr *cntr, uint64_t *error) struct c_ct_writeback wb_copy; int ret; - /* Only can reference the ct_failure field directly if dealing with - * system memory. Device memory requires a memcpy of the contents into - * system memory. - */ if (cntr->wb_iface == FI_HMEM_SYSTEM) { - *error = cntr->wb->ct_failure; - return FI_SUCCESS; + do { + if (cntr->wb->ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *error = cntr->wb->ct_failure; + return -FI_SUCCESS; + } + sched_yield(); + } while (true); } - ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); - if (ret) - return ret; + /* Device memory requires a memcpy of the contents into + * system memory. + */ + do { + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + if (wb_copy.ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *error = wb_copy.ct_failure; + return -FI_SUCCESS; + } + sched_yield(); + } while (true); - *error = wb_copy.ct_failure; return FI_SUCCESS; } @@ -78,20 +91,33 @@ static int cxip_cntr_get_ct_success(struct cxip_cntr *cntr, uint64_t *success) struct c_ct_writeback wb_copy; int ret; - /* Only can reference the ct_success field directly if dealing with - * system memory. Device memory requires a memcpy of the contents into - * system memory. - */ if (cntr->wb_iface == FI_HMEM_SYSTEM) { - *success = cntr->wb->ct_success; - return FI_SUCCESS; + do { + if (cntr->wb->ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *success = cntr->wb->ct_success; + return FI_SUCCESS; + } + sched_yield(); + } while (true); } - ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); - if (ret) - return ret; + /* Device memory requires a memcpy of the contents into + * system memory. 
+ */ + do { + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + if (wb_copy.ct_writeback || + cntr->attr.flags & FI_CXI_CNTR_CACHED) { + *success = wb_copy.ct_success; + return FI_SUCCESS; + } + sched_yield(); + } while (true); - *success = wb_copy.ct_success; return FI_SUCCESS; } @@ -306,6 +332,7 @@ int cxip_cntr_mod(struct cxip_cntr *cxi_cntr, uint64_t value, bool set, return FI_SUCCESS; } +/* Caller must hold cntr->lock */ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) { int ret; @@ -313,8 +340,6 @@ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) /* The calling thread which changes CT writeback bit from 1 to 0 must * issue a CT get command. */ - ofi_mutex_lock(&cntr->lock); - ret = cxip_cntr_get_ct_writeback(cntr); if (ret < 0) { CXIP_WARN("Failed to read counter writeback: rc=%d\n", ret); @@ -334,8 +359,6 @@ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) *issue_ct_get = false; } - ofi_mutex_unlock(&cntr->lock); - return FI_SUCCESS; err_unlock: @@ -351,6 +374,8 @@ static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) * Schedule hardware to write the value of a counter to memory. Avoid * scheduling multiple write-backs at once. The counter value will appear in * memory a small amount of time later. + * + * Caller must hold cntr->lock */ static int cxip_cntr_get(struct cxip_cntr *cxi_cntr, bool force) { @@ -367,7 +392,7 @@ static int cxip_cntr_get(struct cxip_cntr *cxi_cntr, bool force) return ret; } - if (!issue_ct_get) + if (!issue_ct_get && cxi_cntr->attr.flags & FI_CXI_CNTR_CACHED) return FI_SUCCESS; } @@ -422,10 +447,13 @@ static uint64_t cxip_cntr_read(struct fid_cntr *fid_cntr) cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); cxip_cntr_progress(cxi_cntr); + + ofi_mutex_lock(&cxi_cntr->lock); cxip_cntr_get(cxi_cntr, false); - /* TODO: Fall back to reading register on error? 
*/ ret = cxip_cntr_get_ct_success(cxi_cntr, &success); + ofi_mutex_unlock(&cxi_cntr->lock); + if (ret != FI_SUCCESS) CXIP_WARN("Failed to read counter success: rc=%d\n", ret); @@ -444,10 +472,13 @@ static uint64_t cxip_cntr_readerr(struct fid_cntr *fid_cntr) cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); cxip_cntr_progress(cxi_cntr); + + ofi_mutex_lock(&cxi_cntr->lock); cxip_cntr_get(cxi_cntr, false); - /* TODO: Fall back to reading register on error? */ ret = cxip_cntr_get_ct_error(cxi_cntr, &error); + ofi_mutex_unlock(&cxi_cntr->lock); + if (ret != FI_SUCCESS) CXIP_WARN("Failed to read counter error: rc=%d\n", ret); @@ -746,9 +777,11 @@ int cxip_set_wb_buffer(struct fid *fid, void *buf, size_t len) } /* Force a counter writeback into the user's provider buffer. */ + ofi_mutex_lock(&cntr->lock); do { ret = cxip_cntr_get(cntr, true); } while (ret == -FI_EAGAIN); + ofi_mutex_unlock(&cntr->lock); return ret; } @@ -825,7 +858,7 @@ static int cxip_cntr_verify_attr(struct fi_cntr_attr *attr) return -FI_ENOSYS; } - if (attr->flags) + if (attr->flags & ~FI_CXI_CNTR_CACHED) return -FI_ENOSYS; return FI_SUCCESS; diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index 40ef8a60f59..0b121496c33 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -3,7 +3,7 @@ * * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP * Support for accelerated collective reductions. */ @@ -31,8 +31,6 @@ #define TRACE_PKT(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_PKT, fmt, \ ##__VA_ARGS__) -#define TRACE_CURL(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_CURL, fmt, \ - ##__VA_ARGS__) #define TRACE_JOIN(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_JOIN, fmt, \ ##__VA_ARGS__) #define TRACE_DEBUG(fmt, ...) 
CXIP_COLL_TRACE(CXIP_TRC_COLL_DEBUG, fmt, \ @@ -50,6 +48,56 @@ #define MAGIC 0x677d +/**************************************************************************** + * Metrics for evaluating collectives + */ + +struct cxip_coll_metrics_loc { + ofi_atomic64_t red_count_bad; + ofi_atomic64_t red_count_full; + ofi_atomic64_t red_count_partial; + ofi_atomic64_t red_count_unreduced; + struct cxip_coll_metrics_ep ep_data; +}; +static struct cxip_coll_metrics_loc _coll_metrics; + +void cxip_coll_init_metrics(void) +{ + ofi_atomic_initialize64(&_coll_metrics.red_count_bad, 0); + ofi_atomic_initialize64(&_coll_metrics.red_count_full, 0); + ofi_atomic_initialize64(&_coll_metrics.red_count_partial, 0); + ofi_atomic_initialize64(&_coll_metrics.red_count_unreduced, 0); + memset(&_coll_metrics.ep_data, 0, sizeof(_coll_metrics.ep_data)); +} + +void cxip_coll_get_metrics(struct cxip_coll_metrics *metrics) +{ + metrics->red_count_bad = + ofi_atomic_get64(&_coll_metrics.red_count_bad); + metrics->red_count_full = + ofi_atomic_get64(&_coll_metrics.red_count_full); + metrics->red_count_partial = + ofi_atomic_get64(&_coll_metrics.red_count_partial); + metrics->red_count_unreduced = + ofi_atomic_get64(&_coll_metrics.red_count_unreduced); + memcpy(&metrics->ep_data, &_coll_metrics.ep_data, + sizeof(struct cxip_coll_metrics_ep)); +} + +static inline void _measure_completions(int red_cnt, size_t total) +{ + if (red_cnt >= total) + ofi_atomic_inc64(&_coll_metrics.red_count_bad); + else if (red_cnt == total-1) + ofi_atomic_inc64(&_coll_metrics.red_count_full); + else if (red_cnt > 1) + ofi_atomic_inc64(&_coll_metrics.red_count_partial); + else if (red_cnt > 0) + ofi_atomic_inc64(&_coll_metrics.red_count_unreduced); + else + ofi_atomic_inc64(&_coll_metrics.red_count_bad); +} + /**************************************************************************** * Reduction packet for hardware accelerated collectives: * @@ -97,7 +145,22 @@ * * retry is a control bit that can be invoked by the hw root 
node to initiate a * retransmission of the data from the leaves, if packets are lost. + * + * A re-arm of an armed switch port may not clear the data in the port, + * resulting in incorrect results. Arming twice will guarantee that the + * old data is cleared. + * + * To disambiguate these two arming packets, it is recommended that the first + * arm use a reserved sequence number, allowing the software to receive the + * first arm packet (and data), identify it as a pre-emptive arm, and discard + * it. + * + * The sequence numbers occupy 10 bits of the packet header. The sequence + * numbers are monotonically incremented modulo ((1 << 10)-1), meaning that + * the largest sequence number will be ((1 << 10)-2). The unreachable value + * of ((1 << 10)-1) is designated the reserved value for pre-emptive arming. */ + struct cxip_coll_cookie { uint32_t mcast_id:13; uint32_t red_id:3; @@ -314,7 +377,6 @@ static cxip_coll_op_t _uint8_16_32_op_to_opcode[FI_CXI_OP_LAST]; static cxip_coll_op_t _int64_op_to_opcode[FI_CXI_OP_LAST]; static cxip_coll_op_t _uint64_op_to_opcode[FI_CXI_OP_LAST]; static cxip_coll_op_t _flt_op_to_opcode[FI_CXI_OP_LAST]; -static enum c_return_code _cxip_rc_to_cxi_rc[16]; static enum cxip_coll_redtype _cxi_op_to_redtype[COLL_OPCODE_MAX]; /* One-time dynamic initialization of FI to CXI opcode. 
@@ -323,7 +385,7 @@ void cxip_coll_populate_opcodes(void) { int i; - if ((int)FI_CXI_MINMAXLOC < (int)OFI_ATOMIC_OP_LAST) { + if ((int)FI_CXI_MINMAXLOC < (int)FI_ATOMIC_OP_LAST) { CXIP_FATAL("Invalid CXI_FMINMAXLOC value\n"); } for (i = 0; i < FI_CXI_OP_LAST; i++) { @@ -384,17 +446,6 @@ void cxip_coll_populate_opcodes(void) _cxi_op_to_redtype[COLL_OPCODE_INT_MINMAXLOC] = REDTYPE_IMINMAX; _cxi_op_to_redtype[COLL_OPCODE_FLT_MINMAXNUMLOC] = REDTYPE_FMINMAX; _cxi_op_to_redtype[COLL_OPCODE_FLT_REPSUM] = REDTYPE_REPSUM; - - for (i = 0; i < 16; i++) - _cxip_rc_to_cxi_rc[i] = C_RC_AMO_ALIGN_ERROR; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_SUCCESS] = C_RC_OK; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INEXACT] = C_RC_AMO_FP_INEXACT; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INVALID] = C_RC_AMO_FP_INVALID; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_REP_INEXACT] = C_RC_AMO_FP_INEXACT; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_INT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_CONTR_OVERFLOW] = C_RC_AMO_LENGTH_ERROR; - _cxip_rc_to_cxi_rc[CXIP_COLL_RC_OP_MISMATCH] = C_RC_AMO_INVAL_OP_ERROR; } static inline int int8_16_32_op_to_opcode(int op) @@ -830,7 +881,7 @@ static void _coll_rx_req_report(struct cxip_req *req) } else { /* non-reduction packet */ err = FI_ENOMSG; - CXIP_INFO("Not reduction pkt: %p (err: %d, %s)\n", + CXIP_WARN("Not reduction pkt: %p (err: %d, %s)\n", req, err, cxi_rc_to_str(err)); } @@ -930,6 +981,11 @@ static void _coll_rx_progress(struct cxip_req *req, return; } #endif + // A re-arm of an armed switch port drop this packet + if (pkt->hdr.seqno == CXIP_COLL_MOD_SEQNO) { + CXIP_INFO("pre-rearm pkt dropped\n"); + return; + } /* Progress the reduction */ _dump_red_pkt(pkt, "recv"); @@ -1247,7 +1303,7 @@ bool _quiesce_nan(double *d) } /** - * Implement NaN comparison in RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM + * Implement NaN comparisons FLT_MINNUM and FLT_MAXNUM * * Only associative mode is 
supported. The old IEEE mode is incorrect, and has * been deprecated. @@ -1324,35 +1380,6 @@ void swpidx(uint64_t *i1, uint64_t i2, int swp) *i1 = i2; } -/* Determine if double precision sum is exact. This shifts the value with the - * lower exponent toward the MSBit by the amount of the bitwise overlap between - * the final sum and the value that resulted in that sum. If any non-zero bits - * remain in that smaller value, they were discarded during the summation, and - * the result is inexact. - */ -static inline -bool exact(double rslt, double d) -{ - // TODO verify sign and shift - unsigned long m1, m2; - int s1, e1, s2, e2; - int shft, dlte; - bool ret; - - _decompose_dbl(rslt, &s1, &e1, &m1); - _decompose_dbl(d, &s2, &e2, &m2); - dlte = e1 - e2; - - if (dlte < 0) { - shft = MIN(52 + dlte, 0); - ret = !(m1 << shft); - } else { - shft= MIN(52 - dlte, 0); - ret = !(m2 << shft); - } - return ret; -} - static inline void _dump_coll_data(const char *tag, const struct cxip_coll_data *coll_data) { @@ -1488,7 +1515,6 @@ static void _reduce(struct cxip_coll_data *accum, /* overflow not possible */ break; case COLL_OPCODE_INT_MINMAXLOC: - /* RSDG 4.5.9.2.2 MINMAXLOC */ /* return smallest value and its index */ if (accum->intminmax.iminval > coll_data->intminmax.iminval) { accum->intminmax.iminval = coll_data->intminmax.iminval; @@ -1526,21 +1552,18 @@ static void _reduce(struct cxip_coll_data *accum, } break; case COLL_OPCODE_FLT_MINNUM: - /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ for (i = 0; i < 4; i++) { swpnan2(&accum->fltval.fval[i], coll_data->fltval.fval[i], 1, &accum->red_rc); } break; case COLL_OPCODE_FLT_MAXNUM: - /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ for (i = 0; i < 4; i++) { swpnan2(&accum->fltval.fval[i], coll_data->fltval.fval[i], 0, &accum->red_rc); } break; case COLL_OPCODE_FLT_MINMAXNUMLOC: - /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ swp = swpnan2(&accum->fltminmax.fminval, coll_data->fltminmax.fminval, 1, &accum->red_rc); 
swpidx(&accum->fltminmax.fminidx, coll_data->fltminmax.fminidx, swp); @@ -1560,10 +1583,6 @@ static void _reduce(struct cxip_coll_data *accum, /* NOTE: arithmetic operations will quiesce snan */ accum->fltval.fval[i] += coll_data->fltval.fval[i]; - if (!exact(accum->fltval.fval[i], - coll_data->fltval.fval[i])) - SET_RED_RC(accum->red_rc, - CXIP_COLL_RC_FLT_INEXACT); if (isinf(accum->fltval.fval[i])) SET_RED_RC(accum->red_rc, CXIP_COLL_RC_FLT_OVERFLOW); @@ -1581,10 +1600,6 @@ static void _reduce(struct cxip_coll_data *accum, /* NOTE: arithmetic operations will quiesce snan */ accum->fltval.fval[i] += coll_data->fltval.fval[i]; - if (!exact(accum->fltval.fval[i], - coll_data->fltval.fval[i])) - SET_RED_RC(accum->red_rc, - CXIP_COLL_RC_FLT_INEXACT); if (isinf(accum->fltval.fval[i])) SET_RED_RC(accum->red_rc, CXIP_COLL_RC_FLT_OVERFLOW); @@ -1727,7 +1742,7 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, bool arm, bool retry) { struct red_pkt *pkt; - int ret; + int ret = FI_SUCCESS; pkt = (struct red_pkt *)reduction->tx_msg; @@ -1758,13 +1773,33 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, pkt->hdr.repsum_ovflid = 0; memset(pkt->data, 0, CXIP_COLL_MAX_DATA_SIZE); } - _dump_red_pkt(pkt, "send"); - _swappkt(pkt); - /* -FI_EAGAIN means HW queue is full, should self-clear */ - do { - ret = _send_pkt(reduction); - } while (ret == -FI_EAGAIN); + // A re-arm of an armed switch port send clearing packet + if (arm && retry) { + int save_seqno = pkt->hdr.seqno; + + // A re-arm of an armed switch port skip illegal value + pkt->hdr.seqno = CXIP_COLL_MOD_SEQNO; + _dump_red_pkt(pkt, "retry"); + _swappkt(pkt); + do { + /* -FI_EAGAIN means HW queue is full, self-clears */ + ret = _send_pkt(reduction); + } while (ret == -FI_EAGAIN); + _swappkt(pkt); + pkt->hdr.seqno = save_seqno; + } + + if (ret == FI_SUCCESS) { + _dump_red_pkt(pkt, "send"); + _swappkt(pkt); + do { + /* -FI_EAGAIN means HW queue is full, self-clears */ + ret = 
_send_pkt(reduction); + } while (ret == -FI_EAGAIN); + _swappkt(pkt); + } + /* any other error is a serious config/hardware issue */ if (ret) CXIP_WARN("Fatal send error = %d\n", ret); @@ -1776,28 +1811,44 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, static void _post_coll_complete(struct cxip_coll_reduction *reduction) { struct cxip_req *req; - int ret; + int ret, prov; /* Indicates collective completion by writing to the endpoint TX CQ */ req = reduction->op_inject_req; if (!req) return; - if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS) { + /* convert Rosetta return codes to CXIP return codes */ + if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS || + reduction->accum.red_rc == CXIP_COLL_RC_FLT_INEXACT) { ret = cxip_cq_req_complete(req); } else { - ret = cxip_cq_req_error(req, 0, - _cxip_rc_to_cxi_rc[reduction->accum.red_rc], - reduction->accum.red_rc, NULL, 0, FI_ADDR_UNSPEC); + switch (reduction->accum.red_rc) { + case CXIP_COLL_RC_FLT_OVERFLOW: + prov = FI_CXI_ERRNO_RED_FLT_OVERFLOW; + break; + case CXIP_COLL_RC_FLT_INVALID: + prov = FI_CXI_ERRNO_RED_FLT_INVALID; + break; + case CXIP_COLL_RC_INT_OVERFLOW: + prov = FI_CXI_ERRNO_RED_INT_OVERFLOW; + break; + case CXIP_COLL_RC_CONTR_OVERFLOW: + prov = FI_CXI_ERRNO_RED_CONTR_OVERFLOW; + break; + case CXIP_COLL_RC_OP_MISMATCH: + prov = FI_CXI_ERRNO_RED_OP_MISMATCH; + break; + default: + prov = FI_CXI_ERRNO_RED_OTHER; + break; + } + ret = cxip_cq_req_error(req, 0, -FI_EOTHER, prov, + NULL, 0, FI_ADDR_UNSPEC); } - if (ret) { - /* Is this possible? The only error is -FI_ENOMEM. It looks like - * send is blocked with -FI_EAGAIN until we are guaranteed EQ - * space in the queue. Display and ignore. 
- */ - CXIP_WARN("Attempt to post completion failed %s\n", + if (ret) + CXIP_FATAL("Attempt to post completion failed %s\n", fi_strerror(-ret)); - } /* req structure no longer needed */ cxip_evtq_req_free(req); @@ -1880,12 +1931,19 @@ static void _unpack_red_data(struct cxip_coll_data *coll_data, #define DECMOD(val, mod) do {(val)=((val)+(mod)-1)%(mod);} while (0) /* MONOTONIC timestamp operations for timeouts/retries */ + +/* get current time */ static inline void _tsget(struct timespec *ts) { - clock_gettime(CLOCK_MONOTONIC, ts); + uint64_t ns; + + ns = ofi_gettime_ns(); + ts->tv_sec = ns / 1000000000; + ts->tv_nsec = ns % 1000000000; } +/* advance time by delta */ static inline void _tsadd(struct timespec *ts, const struct timespec *dt) { @@ -1897,42 +1955,64 @@ void _tsadd(struct timespec *ts, const struct timespec *dt) } } -/* Set a timespec at expiration time (future) */ +/* set current time plus increment */ +static inline +void _tsset(struct timespec *ts, const struct timespec *dt) +{ + _tsget(ts); + _tsadd(ts, dt); +} + +/* test for expiration of time */ static inline -void _tsset(struct cxip_coll_reduction *reduction) +bool _tsexp(struct timespec *ts) { - _tsget(&reduction->tv_expires); - _tsadd(&reduction->tv_expires, &reduction->mc_obj->timeout); + struct timespec tsnow; + + _tsget(&tsnow); + TRACE_JOIN("now=%ld.%ld exp=%ld.%ld\n", + tsnow.tv_sec, tsnow.tv_nsec, + ts->tv_sec, ts->tv_nsec); + if (tsnow.tv_sec < ts->tv_sec) + return false; + if (tsnow.tv_sec > ts->tv_sec) + return true; + return (tsnow.tv_nsec >= ts->tv_nsec); +} + +/* test for {0,0} timestamp */ +static inline +bool _tsnul(struct timespec *ts) +{ + return !(ts->tv_sec | ts->tv_nsec); +} + +/* Set reduction expiration time (future) */ +static inline +void _ts_red_set(struct cxip_coll_reduction *reduction) +{ + _tsset(&reduction->tv_expires, &reduction->mc_obj->timeout); } /* Used to prevent first-use incast */ static inline bool _is_red_first_time(struct cxip_coll_reduction *reduction) { 
- return (reduction->tv_expires.tv_sec == 0L && - reduction->tv_expires.tv_nsec == 0L); + return _tsnul(&reduction->tv_expires); } /* Used to reduce incast congestion during run */ static inline bool _is_red_timed_out(struct cxip_coll_reduction *reduction) { - struct timespec tsnow; - if (reduction->mc_obj->retry_disable) return false; if (_is_red_first_time(reduction)) { - TRACE_DEBUG("=== root first time, retry\n"); + TRACE_DEBUG("=== root redid=%d first time, retry\n", + reduction->red_id); return true; } - _tsget(&tsnow); - if (tsnow.tv_sec < reduction->tv_expires.tv_sec) - return false; - if (tsnow.tv_sec == reduction->tv_expires.tv_sec && - tsnow.tv_nsec < reduction->tv_expires.tv_nsec) - return false; - TRACE_DEBUG("=== root timeout, retry\n"); - return true; + return _tsexp(&reduction->tv_expires); } /* Root node state machine progress. @@ -1954,12 +2034,14 @@ static void _progress_root(struct cxip_coll_reduction *reduction, if (_is_red_timed_out(reduction)) { /* reset reduction for retry send */ reduction->seqno = mc_obj->seqno; - INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + TRACE_PKT("root T/O reduction seqno = %d\n", reduction->seqno); + INCMOD(mc_obj->seqno, CXIP_COLL_MOD_SEQNO); + TRACE_PKT("root T/O mc_obj seqno = %d\n", mc_obj->seqno); ofi_atomic_inc32(&mc_obj->tmout_cnt); ret = cxip_coll_send_red_pkt(reduction, NULL, !mc_obj->arm_disable, true); - _tsset(reduction); + _ts_red_set(reduction); if (ret) { SET_RED_RC(reduction->accum.red_rc, CXIP_COLL_RC_TX_FAILURE); @@ -1971,9 +2053,6 @@ static void _progress_root(struct cxip_coll_reduction *reduction, /* Process received packet */ if (pkt) { - /* Root has received a leaf packet */ - _dump_red_pkt(pkt, "Rrcv"); - /* Drop out-of-date packets */ if (pkt->hdr.resno != reduction->seqno) { TRACE_DEBUG("bad seqno, exp=%d saw=%d\n", @@ -1982,8 +2061,14 @@ static void _progress_root(struct cxip_coll_reduction *reduction, return; } - /* capture and reduce packet information */ + /* capture packet information 
*/ _unpack_red_data(&coll_data, pkt); +#if ENABLE_DEBUG + /* capture completion metrics */ + _measure_completions(coll_data.red_cnt, + mc_obj->av_set_obj->fi_addr_cnt); +#endif + /* perform the reduction */ _reduce(&reduction->accum, &coll_data, false); _dump_coll_data("after leaf contrib to root", &reduction->accum); } @@ -1999,12 +2084,13 @@ static void _progress_root(struct cxip_coll_reduction *reduction, /* send reduction result to leaves, arm new seqno */ reduction->seqno = mc_obj->seqno; - INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + INCMOD(mc_obj->seqno, CXIP_COLL_MOD_SEQNO); reduction->completed = true; + TRACE_DEBUG("root send seqno = %d\n", reduction->seqno); ret = cxip_coll_send_red_pkt(reduction, &reduction->accum, !mc_obj->arm_disable, false); - _tsset(reduction); + _ts_red_set(reduction); if (ret) SET_RED_RC(reduction->accum.red_rc, CXIP_COLL_RC_TX_FAILURE); @@ -2040,19 +2126,21 @@ static void _progress_leaf(struct cxip_coll_reduction *reduction, /* if reduction packet, reset timer, seqno, honor retry */ if (pkt) { - _dump_red_pkt(pkt, "Lrcv"); - _tsset(reduction); + TRACE_DEBUG("%s: packet seen\n", __func__); + _ts_red_set(reduction); reduction->seqno = pkt->hdr.seqno; reduction->resno = pkt->hdr.seqno; if (pkt->hdr.retry) reduction->pktsent = false; + TRACE_PKT("leaf rcv seqno = %d\n", reduction->seqno); } /* leaves lead with sending a packet */ if (!reduction->pktsent) { /* Avoid first-use incast, retry guaranteed */ if (_is_red_first_time(reduction)) { - TRACE_DEBUG("=== leaf first time, wait\n"); + TRACE_DEBUG("=== leaf redid=%d first time, wait\n", + reduction->red_id); return; } @@ -2418,15 +2506,19 @@ union pack_mcast { uint64_t mcast_addr: 16;// maximum anticipated multicast uint64_t hwroot_idx: 27;// 128M endpoints in tree uint64_t valid: 1; // success flag - uint64_t pad: 20; // needed by zbcoll + uint64_t pad: 20; // used by zbcoll } __attribute__((__packed__)); +} __attribute__((__packed__)); + +union pack_errbits { + uint64_t uint64; 
struct { uint64_t error_bits: 43;// up to 43 independent errors - uint64_t valid1: 1; // unused/reserved - uint64_t pad1: 20; // unused/reserved + uint64_t valid: 1; // success flag + uint64_t pad1: 20; // needed by zbcoll } __attribute__((__packed__)); -}; +} __attribute__((__packed__)); /* State structure for carrying data through the join sequence */ struct cxip_join_state { @@ -2434,10 +2526,12 @@ struct cxip_join_state { struct cxip_av_set *av_set_obj; // av set for this collective struct cxip_coll_mc *mc_obj; // mc object for this collective struct cxip_zbcoll_obj *zb; // zb object associated with state + struct timespec curlexpires; // multicast creation expiration timeout struct fid_mc **mc; // user pointer to return mc_obj void *context; // user context for concurrent joins uint64_t join_flags; // user-supplied libfabric join flags union pack_mcast bcast_data; // packed multicast data + union pack_errbits reduce_err; // packed join error bits bool rx_discard; // set if RX events should be discarded bool is_rank; // set if using COLL_RANK simulation model bool is_mcast; // set if using Rosetta multicast tree @@ -2455,49 +2549,65 @@ struct cxip_join_state { }; /* State structure for recovering data from CURL response */ -struct cxip_curl_mcast_usrptr { +struct cxip_curl_mcast_create_usrptr { struct cxip_join_state *jstate; // join state int mcast_id; // multicast address int hwroot_rank; // hardware root index }; +struct cxip_curl_mcast_delete_usrptr { + struct cxip_coll_mc *mc_obj; // multicast object +}; + /* pack provider errors into AND bitmask - address data */ void _proverr_to_bits(struct cxip_join_state *jstate) { int bitno; /* record error as a bit for this endpoint */ - jstate->bcast_data.error_bits = 0L; - if (!jstate->bcast_data.valid) { - bitno = -jstate->prov_errno; - jstate->bcast_data.error_bits |= (1L << bitno); + TRACE_JOIN("%s: prov_errno=%d\n", __func__, jstate->prov_errno); + jstate->reduce_err.error_bits = 0L; + if (jstate->prov_errno) { 
+ if (jstate->prov_errno >= FI_CXI_ERRNO_JOIN_LAST) + jstate->prov_errno = FI_CXI_ERRNO_JOIN_OTHER; + bitno = jstate->prov_errno - FI_CXI_ERRNO_JOIN_FIRST; + jstate->reduce_err.error_bits |= (1L << bitno); } /* invert bits, zbcoll reduce does AND */ - jstate->bcast_data.error_bits ^= -1L; + TRACE_JOIN("%s: error bitmask=%016lx\n", __func__, + (uint64_t)jstate->reduce_err.error_bits); + jstate->reduce_err.error_bits ^= -1L; } -/* unpack AND bitmask into dominant provider error */ +/* unpack bitmask and return largest error */ void _bits_to_proverr(struct cxip_join_state *jstate) { - int bitno; + int prov_errno; + uint64_t bitmask; /* zbcoll reduce does AND, invert bits */ - jstate->bcast_data.error_bits ^= -1L; - - /* if data is valid, bits do not represent errors */ - if (jstate->bcast_data.valid) { - jstate->prov_errno = CXIP_PROV_ERRNO_OK; + jstate->reduce_err.error_bits ^= -1L; + TRACE_JOIN("%s: error bitmask=%016lx\n", __func__, + (uint64_t)jstate->reduce_err.error_bits); + + /* display all errors, capture the highest value error */ + jstate->prov_errno = 0L; + if (!jstate->reduce_err.error_bits) { + TRACE_JOIN("%s: no error seen\n", __func__); return; } - /* bits set represent multiple errors from endpoints */ - for (bitno = -CXIP_PROV_ERRNO_OK; bitno < -CXIP_PROV_ERRNO_LAST; bitno++) { - if (jstate->bcast_data.error_bits & (1 << bitno)) { - jstate->prov_errno = -bitno; - CXIP_WARN("join error %d seen\n", jstate->prov_errno); + bitmask = 1L; + for (prov_errno = FI_CXI_ERRNO_JOIN_FIRST; + prov_errno < FI_CXI_ERRNO_JOIN_LAST; + prov_errno++) { + if (jstate->reduce_err.error_bits & bitmask) { + jstate->prov_errno = prov_errno; + CXIP_WARN("%s\n", cxip_strerror(jstate->prov_errno)); + TRACE_JOIN("%s\n", cxip_strerror(jstate->prov_errno)); } + bitmask <<= 1; } - /* returns most significant of multiple errors as jstate->prov_errno */ } /* Close collective pte object - ep_obj->lock must be held */ @@ -2532,8 +2642,10 @@ static int _acquire_pte(struct cxip_ep_obj 
*ep_obj, int pid_idx, *coll_pte_ret = NULL; coll_pte = calloc(1, sizeof(*coll_pte)); - if (!coll_pte) + if (!coll_pte) { + TRACE_JOIN("out of memory\n"); return -FI_ENOMEM; + } /* initialize coll_pte */ coll_pte->ep_obj = ep_obj; @@ -2546,20 +2658,27 @@ static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx, ret = cxip_pte_alloc(ep_obj->ptable, ep_obj->coll.rx_evtq->eq, pid_idx, is_mcast, &pt_opts, _coll_pte_cb, coll_pte, &coll_pte->pte); - if (ret) - goto fail; + if (ret) { + TRACE_JOIN("cxip_pte_alloc failed=%d\n", ret); + free(coll_pte); + return ret; + } /* enable the PTE */ ret = _coll_pte_enable(coll_pte, CXIP_PTE_IGNORE_DROPS); - if (ret) + if (ret) { + TRACE_JOIN("_coll_pte_enable failed=%d\n", ret); goto fail; + } /* add buffers to the PTE */ ret = _coll_add_buffers(coll_pte, ep_obj->coll.buffer_size, ep_obj->coll.buffer_count); - if (ret) + if (ret) { + TRACE_JOIN("_coll_add_buffers failed=%d\n", ret); goto fail; + } *coll_pte_ret = coll_pte; return FI_SUCCESS; @@ -2569,13 +2688,20 @@ static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx, return ret; } +/* forward references for CURL operations */ +static void _create_mcast_addr(struct cxip_join_state *jstate); +static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle); +static void _curl_delete_mc_obj(struct cxip_coll_mc *mc_obj); +static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle); + /* Close multicast collective object */ -static void _close_mc(struct cxip_coll_mc *mc_obj) +static void _close_mc(struct cxip_coll_mc *mc_obj, bool delete) { int count; if (!mc_obj) return; + TRACE_JOIN("%s starting MC cleanup\n", __func__); /* clear the mcast_addr -> mc_obj reference*/ ofi_idm_clear(&mc_obj->ep_obj->coll.mcast_map, mc_obj->mcast_addr); mc_obj->ep_obj->coll.is_hwroot = false; @@ -2598,19 +2724,33 @@ static void _close_mc(struct cxip_coll_mc *mc_obj) _close_pte(mc_obj->ep_obj->coll.coll_pte); mc_obj->ep_obj->coll.coll_pte = NULL; } - free(mc_obj); + /* index zero 
deletes the multicast address */ + if (delete && mc_obj->is_multicast && !mc_obj->mynode_idx) { + struct timespec expires = { + cxip_env.coll_fm_timeout_msec/1000, + (cxip_env.coll_fm_timeout_msec%1000)*1000000}; + + _tsset(&mc_obj->curlexpires, &expires); + _curl_delete_mc_obj(mc_obj); + } else + free(mc_obj); } +/* The user can close an individual collective MC address. It must do so on + * all endpoints in the collective group, just as fi_join_collective() must + * be called on all endpoints in the group. + */ static int _fi_close_mc(struct fid *fid) { struct cxip_coll_mc *mc_obj; + TRACE_JOIN("%s: closing MC\n", __func__); mc_obj = container_of(fid, struct cxip_coll_mc, mc_fid.fid); - _close_mc(mc_obj); + _close_mc(mc_obj, true); return FI_SUCCESS; } -/* multicast object operational functions */ +/* multicast object libfabric functions */ static struct fi_ops mc_ops = { .size = sizeof(struct fi_ops), .close = _fi_close_mc, @@ -2678,9 +2818,10 @@ static int _initialize_mc(void *ptr) if (!mc_obj) return -FI_ENOMEM; - TRACE_DEBUG("acquiring PTE\n"); + TRACE_JOIN("acquiring PTE\n"); if (jstate->is_rank) { // NETSIM + TRACE_JOIN("acquiring PTE NETSIM\n"); // pid_idx = simulated collective rank pid_idx = CXIP_PTL_IDX_COLL + jstate->simrank; ret = _acquire_pte(ep_obj, pid_idx, false, &coll_pte); @@ -2689,11 +2830,13 @@ static int _initialize_mc(void *ptr) } else if (!jstate->is_mcast) { // UNICAST // pid_idx = simulated collective tree + TRACE_JOIN("acquiring PTE UNICAST\n"); pid_idx = CXIP_PTL_IDX_COLL; ret = _acquire_pte(ep_obj, pid_idx, false, &coll_pte); } else { // MULTICAST // pid_idx = bit-shifted multicast address + TRACE_JOIN("acquiring PTE MULTICAST\n"); memset(&pid_mcast, 0, sizeof(pid_mcast)); pid_mcast.mcast_id = jstate->bcast_data.mcast_addr; pid_mcast.mcast_pte_index = 0; @@ -2829,6 +2972,11 @@ static int _initialize_mc(void *ptr) /* Last field to set */ mc_obj->is_joined = true; + /* Prepare static metrics for this endpoint*/ + 
_coll_metrics.ep_data.myrank = mc_obj->mynode_idx; + _coll_metrics.ep_data.isroot = + mc_obj->hwroot_idx == mc_obj->mynode_idx; + /* Return information to the caller */ jstate->mc_obj = mc_obj; *jstate->mc = &mc_obj->mc_fid; @@ -2838,146 +2986,207 @@ static int _initialize_mc(void *ptr) return FI_SUCCESS; fail: - _close_mc(mc_obj); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_FAIL_PTE; + _close_mc(mc_obj, true); return ret; } /** - * CURL callback function upon completion of a request. + * CURL MODEL + * + * void _cxip_action(void *object); + * void _cxip_action_cb(struct cxip_curl_handle *handle); + * + * The action object must persist until the action has reached a conclusion, + * which may involve multiple CURL requests, particularly retries on busy + * responses. It must retain state for multiple retries of the action if the + * CURL response indicates a retry is needed. This is the cxip_join_state + * object for multicast creation, and the mc_obj object for multicast deletion. * - * This sets jstate->finished_mcast, even if the operation fails. - * This sets jstate->bcast_data.valid if the address is valid. + * The curl_usrptr object is allocated for each CURL request, and deleted after + * the response has been evaluated. The response may be a retry of the same + * CURL request, or it may be some other recovery or completion operation. + * + * This simplifies retries and adaptive responses to the CURL result. The + * callback function runs as an agent of the CURL processing, using the + * curl_usrptr object, and can assume that the CURL implementation (cxip_curl.c) + * will do all CURL memory cleanup, regardless of success or failure. This means + * that the callback can simply re-issue the same command as if for the first + * time to perform a retry on any kind of busy error. + * + * To prevent endless retries, the elapsing time must be recorded in the + * action object (so that it will persist across multiple CURL operations). 
*/ -static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) + +/** + * Perform a CURL request to delete a multicast address. + * + * This is the last thing done after closing down the mc_object in libfabric, so + * all that remains is to remove the actual multicast in the FM and delete + * allocated memory for mc_obj. If the CURL operation cannot complete + * successfully, the multicast delete will occur at the end of the job. + */ +static void _curl_delete_mc_obj(struct cxip_coll_mc *mc_obj) { - struct cxip_curl_mcast_usrptr *curl_usrptr = handle->usrptr; - struct cxip_join_state *jstate = curl_usrptr->jstate; + struct cxip_curl_mcast_delete_usrptr *curl_usrptr; + char *url; + int ret; + + /* early exit will attempt to free these */ + curl_usrptr = NULL; + url = NULL; + + TRACE_JOIN("deleting multicast address via REST\n"); + ret = asprintf(&url, "%s/%d", cxip_env.coll_fabric_mgr_url, + mc_obj->mcast_addr); + if (ret < 0) { + TRACE_JOIN("Failed to construct CURL address\n"); + goto quit; + } + /* create the return pointer */ + curl_usrptr = calloc(1, sizeof(*curl_usrptr)); + if (!curl_usrptr) { + TRACE_JOIN("curl_usrptr calloc() error\n"); + ret = -FI_ENOMEM; + goto quit; + } + curl_usrptr->mc_obj = mc_obj; + ret = cxip_curl_perform(url, NULL, cxip_env.coll_mcast_token, 0, + CURL_DELETE, false, _cxip_delete_mcast_cb, + curl_usrptr); + if (ret < 0) { + TRACE_JOIN("CURL delete mcast %d dispatch failed %d\n", + mc_obj->mcast_addr, ret); + goto quit; + } + TRACE_JOIN("CURL delete mcast %d dispatch successful\n", + mc_obj->mcast_addr); +quit: + free(url); + if (ret < 0) { + TRACE_JOIN("CURL delete mcast %d failed\n", + mc_obj->mcast_addr); + free(curl_usrptr); + free(mc_obj); + } +} + +static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle) +{ + struct cxip_curl_mcast_delete_usrptr *curl_usrptr = handle->usrptr; + struct cxip_coll_mc *mc_obj = curl_usrptr->mc_obj; struct json_object *json_obj; - struct cxip_addr caddr; - const char *hwrootstr; - 
int mcaddr, hwroot; - uint32_t octet[6], n; - int i, ret; + const char *errmsg = ""; - /* Creation process is done */ - TRACE_CURL("CURL COMPLETED!\n"); - jstate->finished_mcast = true; + /* note: allocates space for strings, free at end */ + json_obj = json_tokener_parse(handle->response); + if (json_obj) { + if (cxip_json_string("message", json_obj, &errmsg)) + errmsg = ""; + } else { + TRACE_JOIN("callback: malformed server response: '%s'\n", + handle->response); + } switch (handle->status) { case 200: case 201: - /* CURL succeeded, parse response */ - TRACE_CURL("CURL PARSE RESPONSE:\n%s\n", handle->response); - if (!(json_obj = json_tokener_parse(handle->response))) - break; - if (cxip_json_int("mcastID", json_obj, &mcaddr)) - break; - if (cxip_json_string("hwRoot", json_obj, &hwrootstr)) - break; - - memset(octet, 0, sizeof(octet)); - hwroot = 0; - n = sscanf(hwrootstr, "%x:%x:%x:%x:%x:%x", - &octet[5], &octet[4], &octet[3], - &octet[2], &octet[1], &octet[0]); - if (n < 3) { - TRACE_CURL("bad hwroot address = %s\n", hwrootstr); - break; - } - for (i = 0; i < n; i++) - hwroot |= octet[i] << (8*i); + TRACE_JOIN("callback: %ld SUCCESS MCAST DELETED\n", + handle->status); + free(mc_obj); + break; + case 409: + TRACE_JOIN("callback: delete mcast failed: %ld '%s'\n", + handle->status, errmsg); - TRACE_CURL("mcastID=%d hwRoot='%s'=%x\n", mcaddr, hwrootstr, - hwroot); - for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { - ret = cxip_av_lookup_addr( - jstate->av_set_obj->cxi_av, - jstate->av_set_obj->fi_addr_ary[i], - &caddr); - if (ret < 0) - continue; - TRACE_JOIN("test %d == %d\n", hwroot, caddr.nic); - if (hwroot == caddr.nic) - break; - } - TRACE_CURL("final index=%d\n", i); - if (i >= jstate->av_set_obj->fi_addr_cnt) { - TRACE_CURL("multicast HWroot not found in av_set\n"); - jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; + if (_tsexp(&mc_obj->curlexpires)) { + TRACE_JOIN("callback: FM expired\n"); + free(mc_obj); break; } - /* Production MCAST 
address */ - jstate->bcast_data.valid = true; - jstate->bcast_data.hwroot_idx = i; - jstate->bcast_data.mcast_addr = (uint32_t)mcaddr; - jstate->is_mcast = true; - /* This succeeded */ - TRACE_CURL("curl: mcaddr =%08x\n", - jstate->bcast_data.mcast_addr); - TRACE_CURL("curl: hwrootidx=%d\n", - jstate->bcast_data.hwroot_idx); + /* try again */ + _curl_delete_mc_obj(mc_obj); break; default: - TRACE_CURL("ERRMSK SET CURL error %ld!\n", handle->status); - if (handle->response) - TRACE_CURL("ERROR RESPONSE:\n%s\n", handle->response); - // TODO finer error differentiation from CURL errors - jstate->prov_errno = CXIP_PROV_ERRNO_CURL; + TRACE_JOIN("callback: %ld unknown status\n", handle->status); + free(mc_obj); break; } + /* free json memory */ + json_object_put(json_obj); free(curl_usrptr); - TRACE_CURL("CURL COMPLETED!\n"); - jstate->finished_mcast = true; } /** - * Start a CURL request for a multicast address. + * Perform a CURL request to create a new multicast address. */ -static void _start_curl(void *ptr) +static void _create_mcast_addr(struct cxip_join_state *jstate) { - struct cxip_curl_mcast_usrptr *curl_usrptr; - struct cxip_join_state *jstate = ptr; + struct cxip_curl_mcast_create_usrptr *curl_usrptr; struct cxip_addr caddr; - char *jsonreq, *mac, *url, *p; + char *jsonreq, *mac, *url, *tok, *p; int i, ret; - /* early exit will attempt to free these */ + /* all exit paths attempt to free these */ curl_usrptr = NULL; jsonreq = NULL; mac = NULL; url = NULL; - - /* acquire the environment variables needed */ - TRACE_CURL("jobid = %s\n", cxip_env.coll_job_id); - TRACE_CURL("stepid = %s\n", cxip_env.coll_job_step_id); - TRACE_CURL("fmurl = %s\n", cxip_env.coll_fabric_mgr_url); - TRACE_CURL("token = %s\n", cxip_env.coll_mcast_token); - TRACE_CURL("maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); - TRACE_CURL("minnodes= %ld\n", cxip_env.hwcoll_min_nodes); - TRACE_CURL("retry = %ld\n", cxip_env.coll_retry_usec); - TRACE_CURL("tmout = %ld\n", 
cxip_env.coll_timeout_usec); + tok = NULL; + + /* check the environment variables needed */ + TRACE_JOIN("ENV jobid = %s\n", cxip_env.coll_job_id); + TRACE_JOIN("ENV stepid = %s\n", cxip_env.coll_job_step_id); + TRACE_JOIN("ENV fmurl = %s\n", cxip_env.coll_fabric_mgr_url); + TRACE_JOIN("ENV token = %s\n", cxip_env.coll_mcast_token); + TRACE_JOIN("ENV maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); + TRACE_JOIN("ENV minnodes= %ld\n", cxip_env.hwcoll_min_nodes); + TRACE_JOIN("ENV retry = %ld\n", cxip_env.coll_retry_usec); + TRACE_JOIN("ENV tmout = %ld\n", cxip_env.coll_timeout_usec); + TRACE_JOIN("ENV fmtmout = %ld\n", cxip_env.coll_fm_timeout_msec); /* Generic error for any preliminary failures */ - jstate->prov_errno = CXIP_PROV_ERRNO_CURL; - if (!cxip_env.coll_job_id || - !cxip_env.coll_fabric_mgr_url || - !cxip_env.coll_mcast_token) { - TRACE_JOIN("Check environment variables\n"); + ret = 0; + if (!cxip_env.coll_job_id) { + TRACE_JOIN("missing job id\n"); + ret = -FI_EINVAL; + } + if (!cxip_env.coll_fabric_mgr_url) { + TRACE_JOIN("missing FM url\n"); ret = -FI_EINVAL; - goto quit; } + if (!cxip_env.coll_mcast_token) { + TRACE_JOIN("missing FM token\n"); + ret = -FI_EINVAL; + } + if (ret < 0) + goto quit; - ret = asprintf(&url, "%s", cxip_env.coll_fabric_mgr_url); + if (cxip_trap_search(0, CXIP_TRAP_CURL_FM_URL, NULL, NULL)) + ret = asprintf(&url, "%s-bad", cxip_env.coll_fabric_mgr_url); + else + ret = asprintf(&url, "%s", cxip_env.coll_fabric_mgr_url); if (ret < 0) { - TRACE_JOIN("Failed to construct CURL address\n"); + TRACE_JOIN("failed to construct CURL address\n"); ret = -FI_ENOMEM; goto quit; } + TRACE_JOIN("final fmurl = %s\n", url); + if (cxip_trap_search(0, CXIP_TRAP_CURL_TOKEN, NULL, NULL)) + ret = asprintf(&tok, "%s-bad", cxip_env.coll_mcast_token); + else + ret = asprintf(&tok, "%s", cxip_env.coll_mcast_token); + if (ret < 0) { + TRACE_JOIN("failed to construct CURL token\n"); + ret = -FI_ENOMEM; + goto quit; + } + TRACE_JOIN("final token = %s\n", 
tok); /* five hex digits per mac, two colons, two quotes, comma */ p = mac = malloc(10*jstate->av_set_obj->fi_addr_cnt + 1); if (!mac) { - TRACE_JOIN("Failed to allocate mac list\n"); + TRACE_JOIN("failed to allocate mac list\n"); ret = -FI_ENOMEM; goto quit; } @@ -3008,38 +3217,195 @@ static void _start_curl(void *ptr) cxip_env.coll_job_id, cxip_env.coll_job_step_id); if (ret < 0) { - TRACE_JOIN("Creating JSON request = %d\n", ret); + TRACE_JOIN("failed to create jsonreq= %d\n", ret); ret = -FI_ENOMEM; goto quit; } single_to_double_quote(jsonreq); - TRACE_JOIN("JSON = %s\n", jsonreq); - /* create the mcast address */ + /* create the user return pointer */ curl_usrptr = calloc(1, sizeof(*curl_usrptr)); if (!curl_usrptr) { - TRACE_JOIN("curl_usrptr calloc() error\n"); + TRACE_JOIN("failed to calloc() curl_usrptr\n"); ret = -FI_ENOMEM; goto quit; } /* dispatch CURL request */ curl_usrptr->jstate = jstate; - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_CURLSND, &ret)) + ret = cxip_curl_perform(url, jsonreq, tok, 0, CURL_POST, false, + _cxip_create_mcast_cb, curl_usrptr); + if (ret < 0) { + TRACE_JOIN("CURL create mcast dispatch failed %d\n", ret); goto quit; - ret = cxip_curl_perform(url, jsonreq, cxip_env.coll_mcast_token, 0, - CURL_POST, false, _cxip_create_mcast_cb, - curl_usrptr); + } + TRACE_JOIN("CURL create mcast dispatch successful\n"); quit: + free(tok); free(url); free(mac); free(jsonreq); if (ret < 0) { - TRACE_JOIN("CURL execution failed\n"); + TRACE_JOIN("CURL create mcast failed\n"); free(curl_usrptr); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_CURL_FAILED; jstate->finished_mcast = true; } } +static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) +{ + struct cxip_curl_mcast_create_usrptr *curl_usrptr = handle->usrptr; + struct cxip_join_state *jstate = curl_usrptr->jstate; + struct json_object *json_obj; + struct cxip_addr caddr; + const char *hwrootstr = ""; + const char *message = ""; + const char *cptr; + int mcaddr = -1; + int hwroot 
= -1; + int curl_errcode = 0; + uint32_t octet[6], n; + int i, ret; + + /* note: allocates space for strings, free at end */ + json_obj = json_tokener_parse(handle->response); + if (json_obj) { + if (cxip_json_string("message", json_obj, &message)) + message = ""; + if (cxip_json_string("hwRoot", json_obj, &hwrootstr)) + hwrootstr = ""; + if (cxip_json_int("mcastID", json_obj, &mcaddr)) + mcaddr = -1; + } else { + TRACE_JOIN("callback: malformed server response: '%s'\n", + handle->response); + } + TRACE_JOIN("%s status =%ld\n", __func__, handle->status); + TRACE_JOIN("%s response ='%s'\n", __func__, handle->response); + TRACE_JOIN("%s message ='%s'\n", __func__, message); + TRACE_JOIN("%s hwrootstr='%s'\n", __func__, hwrootstr); + TRACE_JOIN("%s mcaddr ='%d'\n", __func__, mcaddr); + + /* Process result */ + switch (handle->status) { + case 200: + case 201: + if (mcaddr < 0 || mcaddr >= 8192) { + TRACE_JOIN("callback: mcaddr=%d is invalid\n", mcaddr); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INVALID; + jstate->finished_mcast = true; + break; + } + memset(octet, 0, sizeof(octet)); + hwroot = 0; + n = 0; + if (hwrootstr) + n = sscanf(hwrootstr, "%x:%x:%x:%x:%x:%x", + &octet[5], &octet[4], &octet[3], + &octet[2], &octet[1], &octet[0]); + if (n < 3) { + TRACE_JOIN("callback: hwroot '%s' too few octets\n", + hwrootstr); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INVALID; + jstate->finished_mcast = true; + break; + } + for (i = 0; i < n; i++) + hwroot |= octet[i] << (8*i); + + for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { + ret = cxip_av_lookup_addr( + jstate->av_set_obj->cxi_av, + jstate->av_set_obj->fi_addr_ary[i], + &caddr); + if (ret < 0) + continue; + if (hwroot == caddr.nic) + break; + } + if (i >= jstate->av_set_obj->fi_addr_cnt) { + TRACE_JOIN("callback: hwroot rank invalid\n"); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INVALID; + jstate->finished_mcast = true; + break; + } + /* Production MCAST address */ + jstate->bcast_data.valid = 
true; + jstate->bcast_data.hwroot_idx = i; + jstate->bcast_data.mcast_addr = (uint32_t)mcaddr; + jstate->is_mcast = true; + /* This succeeded */ + TRACE_JOIN("callback: SUCCESS mcaddr=%d hwroot=%d\n", + jstate->bcast_data.mcast_addr, + jstate->bcast_data.hwroot_idx); + jstate->prov_errno = 0; + jstate->finished_mcast = true; + break; + case 400: + TRACE_JOIN("callback: create mcast failed: %ld '%s'\n", + handle->status, message ? message : ""); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_SERVER_ERR; + jstate->finished_mcast = true; + break; + case 409: + TRACE_JOIN("callback: create mcast failed: %ld '%s'\n", + handle->status, message); + + if (_tsexp(&jstate->curlexpires)) { + TRACE_JOIN("callback: FM expired\n"); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_CURL_TIMEOUT; + jstate->finished_mcast = true; + break; + } + /* retry */ + _create_mcast_addr(jstate); + break; + case 507: + /* find and parse error instance number */ + cptr = message; + curl_errcode = 0; + while (cptr && *cptr != ':') + cptr++; + if (*cptr == ':') { + cptr -= 2; + sscanf(cptr, "%02d:", &curl_errcode); + TRACE_JOIN("error code = %d\n", curl_errcode); + } + switch (curl_errcode) { + case 1: + TRACE_JOIN("failed: no mcast, exceeded job limit\n"); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INUSE; + break; + case 2: + TRACE_JOIN("failed: no mcast, no addresses left\n"); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INUSE; + break; + case 3: + TRACE_JOIN("failed: no hwroot available in group\n"); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INUSE; + break; + default: + TRACE_JOIN("failed: errcode=%d\n", curl_errcode); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_SERVER_ERR; + break; + } + jstate->finished_mcast = true; + break; + default: + TRACE_JOIN("callback: unhandled CURL error %ld '%s'\n", + handle->status, message ? 
message : ""); + jstate->prov_errno = FI_CXI_ERRNO_JOIN_SERVER_ERR; + jstate->finished_mcast = true; + TRACE_JOIN("jstate->prov_errno = %d\n", jstate->prov_errno); + break; + } + TRACE_JOIN("jstate->prov_errno = %d\n", jstate->prov_errno); + /* free json memory */ + json_object_put(json_obj); + free(curl_usrptr); +} + + /**************************************************************************** * State machine for performing fi_join_collective() * @@ -3115,7 +3481,8 @@ static void _start_getgroup(void *ptr) TRACE_JOIN("%s on %d: entry\n", __func__, jstate->mynode_idx); - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_GETGRP, &zb->error)) + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_GETGRP, &zb->error, + &jstate->prov_errno)) goto quit; /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ zb->error = cxip_zbcoll_getgroup(zb); @@ -3137,6 +3504,10 @@ static void _finish_getgroup(void *ptr) /* Create a multicast address and broadcast it to all endpoints. * If jstate->create_mcast is set, this will use CURL to get an address. * Otherwise, this presumes static initialization, and sets bcast_data.valid. + * + * Caution: re-entrant routine. + * This routine is called repeatedly by rank 0, returning -FI_EAGAIN to drive + * the CURL state. See the branch to 'quit' below. 
*/ static void _start_bcast(void *ptr) { @@ -3146,8 +3517,6 @@ static void _start_bcast(void *ptr) if (!suppress_busy_log) TRACE_JOIN("%s: entry\n", __func__); - /* error will indicate that the multicast request fails */ - jstate->prov_errno = C_RC_INVALID_DFA_FORMAT; /* rank 0 always does the work here */ if (jstate->mynode_idx == 0) { if (!suppress_busy_log) @@ -3155,9 +3524,16 @@ static void _start_bcast(void *ptr) if (jstate->create_mcast) { /* first call (only) initiates CURL request */ if (!jstate->creating_mcast) { + struct timespec expires = { + cxip_env.coll_fm_timeout_msec/1000, + (cxip_env.coll_fm_timeout_msec%1000)*1000000}; + TRACE_JOIN("%s create mcast\n", __func__); jstate->creating_mcast = true; - _start_curl(jstate); + + _tsset(&jstate->curlexpires, &expires); + _create_mcast_addr(jstate); + TRACE_JOIN("%s create mcast initiated\n", __func__); } /* every retry call checks to see if CURL is complete */ if (!jstate->finished_mcast) { @@ -3165,16 +3541,17 @@ static void _start_bcast(void *ptr) suppress_busy_log++; goto quit; } + TRACE_JOIN("%s create mcast completed\n", __func__); suppress_busy_log = 0; /* bcast_data.valid is set by curl callback */ } else { /* static bcast data is presumed correct */ + TRACE_JOIN("%s static multicast accepted\n", __func__); jstate->bcast_data.valid = true; } } - /* speculative prov_errno for trap */ - jstate->prov_errno = CXIP_PROV_ERRNO_CURL; - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_BCAST, &zb->error)) + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_BCAST, &zb->error, + &jstate->prov_errno)) goto quit; /* rank > 0 endpoints overwritten by rank = 0 data */ /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ @@ -3192,10 +3569,13 @@ static void _finish_bcast(void *ptr) bool is_hwroot; int ret; - TRACE_JOIN("%s: mc addr=%d hw_root=%d valid=%d\n", __func__, + TRACE_JOIN("%s: mc addr=%d hw_root=%d valid=%d\n", + __func__, jstate->bcast_data.mcast_addr, jstate->bcast_data.hwroot_idx, 
jstate->bcast_data.valid); + TRACE_JOIN("%s: jstate->prov_errno %d\n", __func__, + jstate->prov_errno); /* all NICs now have same mc_addr data, if invalid, fail */ /* jstate->prov_errno is presumed set if not valid */ if (!jstate->bcast_data.valid) @@ -3207,7 +3587,7 @@ static void _finish_bcast(void *ptr) if (jstate->bcast_data.hwroot_idx >= jstate->av_set_obj->fi_addr_cnt) { TRACE_JOIN("%s: reject invalid hwroot_idx\n", __func__); - jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INVALID; ret = -FI_EINVAL; goto quit; } @@ -3216,7 +3596,7 @@ static void _finish_bcast(void *ptr) is_hwroot = (jstate->bcast_data.hwroot_idx == jstate->mynode_idx); if (is_hwroot && jstate->ep_obj->coll.is_hwroot) { TRACE_JOIN("%s: reject join, hwroot in use\n", __func__); - jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INUSE; + jstate->prov_errno = FI_CXI_ERRNO_JOIN_HWROOT_INUSE; ret = -FI_EINVAL; goto quit; @@ -3228,15 +3608,16 @@ static void _finish_bcast(void *ptr) jstate->bcast_data.mcast_addr)) { TRACE_JOIN("%s: reject join, mcast %d in use\n", __func__, jstate->bcast_data.mcast_addr); - jstate->prov_errno = CXIP_PROV_ERRNO_MCAST_INUSE; + jstate->prov_errno = FI_CXI_ERRNO_JOIN_MCAST_INUSE; ret = -FI_EINVAL; goto quit; } - /* speculative prov_errno for trap */ - jstate->prov_errno = CXIP_PROV_ERRNO_PTE; - if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_INITPTE, &ret)) + jstate->prov_errno = 0; + + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_INITPTE, &ret, + &jstate->prov_errno)) goto quit; - TRACE_JOIN("%s: continuing to configure\n", __func__); + /* all endpoints initialize with same mcast addr and hwroot */ ret = _initialize_mc(jstate); quit: /* if initialization fails, invalidate bcast_data */ @@ -3253,11 +3634,13 @@ static void _start_reduce(void *ptr) struct cxip_join_state *jstate = ptr; struct cxip_zbcoll_obj *zb = jstate->zb; - /* reduce ANDs inverted bcast_data, if any invalid, all become invalid */ - if 
(cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_REDUCE, &zb->error)) + /* Create an error bitmask from the prov_errno */ + _proverr_to_bits(jstate); + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_REDUCE, &zb->error, + &jstate->prov_errno)) goto quit; /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ - zb->error = cxip_zbcoll_reduce(zb, &jstate->bcast_data.uint64); + zb->error = cxip_zbcoll_reduce(zb, &jstate->reduce_err.uint64); quit: if (zb->error) _append_sched(zb, jstate); @@ -3293,10 +3676,10 @@ static void _start_cleanup(void *ptr) &jstate->mc_obj->mc_fid.fid : NULL; entry.context = jstate->context; - if (jstate->prov_errno != CXIP_PROV_ERRNO_OK) { + if (jstate->prov_errno >= FI_CXI_ERRNO_JOIN_FIRST) { size = sizeof(struct fi_eq_err_entry); entry.data = FI_JOIN_COMPLETE; - entry.err = -FI_EAVAIL; + entry.err = -FI_ECONNREFUSED; entry.prov_errno = jstate->prov_errno; flags |= UTIL_FLAG_ERROR; } @@ -3756,6 +4139,18 @@ void cxip_coll_reset_mc_ctrs(struct fid_mc *mc) ofi_atomic_set32(&mc_obj->tmout_cnt, 0); } +void cxip_coll_get_mc_ctrs(struct fid_mc *mc, struct coll_counters *counters) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + + counters->coll_recv_cnt = ofi_atomic_get32(&mc_obj->coll_pte->recv_cnt); + counters->send_cnt = ofi_atomic_get32(&mc_obj->send_cnt); + counters->recv_cnt = ofi_atomic_get32(&mc_obj->recv_cnt); + counters->pkt_cnt = ofi_atomic_get32(&mc_obj->pkt_cnt); + counters->seq_err_cnt = ofi_atomic_get32(&mc_obj->seq_err_cnt); + counters->tmout_cnt = ofi_atomic_get32(&mc_obj->tmout_cnt); +} + /**************************************************************************** * Manage the static coll structure in the EP. Because of its specialized * nature, it made sense to manage it here, rather than in the EP module. 
@@ -3788,7 +4183,16 @@ struct fi_ops_collective cxip_collective_no_ops = { .msg = fi_coll_no_msg, }; -/* Close collectives - call during EP close, ep_obj->lock is held */ +/* Close collectives - called during EP close, ep_obj->lock is held. + * This does not issue CURL requests to delete multicast addresses. + * + * This is called as part of an endpoint shutdown, which is part of an + * application shutdown, and the SLURM cleanup handler will destroy all + * multicast addresses with an efficient method that deletes all per-job + * addresses. The concern is that if there is a large count of multicast + * addresses, deleting them individually in this code will create a delay, + * and could clog the REST API. + */ void cxip_coll_close(struct cxip_ep_obj *ep_obj) { struct cxip_coll_mc *mc_obj; @@ -3796,7 +4200,7 @@ void cxip_coll_close(struct cxip_ep_obj *ep_obj) while (!dlist_empty(&ep_obj->coll.mc_list)) { dlist_pop_front(&ep_obj->coll.mc_list, struct cxip_coll_mc, mc_obj, entry); - _close_mc(mc_obj); + _close_mc(mc_obj, false); } } @@ -3867,6 +4271,7 @@ int cxip_coll_enable(struct cxip_ep *ep) ep->ep.collective = &cxip_collective_ops; ep_obj->coll.enabled = true; + cxip_coll_init_metrics(); cxip_coll_trace_init(); return FI_SUCCESS; } diff --git a/prov/cxi/src/cxip_coll_trace.c b/prov/cxi/src/cxip_coll_trace.c index 276fa83498e..05bb10a2630 100644 --- a/prov/cxi/src/cxip_coll_trace.c +++ b/prov/cxi/src/cxip_coll_trace.c @@ -1,6 +1,7 @@ /* - * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP */ /** @@ -106,7 +107,9 @@ char *cxip_coll_trace_pathname; FILE *cxip_coll_trace_fid; uint64_t cxip_coll_trace_mask; -/* Get environment variable as string representation of int */ +/* Get environment variable as string representation of int + * Return -1 if undefined, or not-a-number. 
+ */ static int getenv_int(const char *name) { char *env; @@ -119,6 +122,21 @@ static int getenv_int(const char *name) return value; } +/* Get environment variable + * Return 0 if undefined, or defined as zero. + */ +static int getenv_is_set(const char *name) +{ + char *env; + + env = getenv(name); + if (!env) + return 0; + if (strcmp(env, "0") == 0) + return 0; + return 1; +} + void cxip_coll_trace_init(void) { const char *fpath; @@ -139,19 +157,19 @@ void cxip_coll_trace_init(void) fpath = getenv("CXIP_TRC_PATHNAME"); /* set bits in cxip_coll_trace_mask */ - if (getenv("CXIP_TRC_CTRL")) + if (getenv_is_set("CXIP_TRC_CTRL")) cxip_coll_trace_set(CXIP_TRC_CTRL); - if (getenv("CXIP_TRC_ZBCOLL")) + if (getenv_is_set("CXIP_TRC_ZBCOLL")) cxip_coll_trace_set(CXIP_TRC_ZBCOLL); - if (getenv("CXIP_TRC_COLL_CURL")) + if (getenv_is_set("CXIP_TRC_COLL_CURL")) cxip_coll_trace_set(CXIP_TRC_COLL_CURL); - if (getenv("CXIP_TRC_COLL_PKT")) + if (getenv_is_set("CXIP_TRC_COLL_PKT")) cxip_coll_trace_set(CXIP_TRC_COLL_PKT); - if (getenv("CXIP_TRC_COLL_JOIN")) + if (getenv_is_set("CXIP_TRC_COLL_JOIN")) cxip_coll_trace_set(CXIP_TRC_COLL_JOIN); - if (getenv("CXIP_TRC_COLL_DEBUG")) + if (getenv_is_set("CXIP_TRC_COLL_DEBUG")) cxip_coll_trace_set(CXIP_TRC_COLL_DEBUG); - if (getenv("CXIP_TRC_TEST_CODE")) + if (getenv_is_set("CXIP_TRC_TEST_CODE")) cxip_coll_trace_set(CXIP_TRC_TEST_CODE); /* if no trace masks set, do nothing */ diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c index 675d91eeb56..5ca5f41abff 100644 --- a/prov/cxi/src/cxip_cq.c +++ b/prov/cxi/src/cxip_cq.c @@ -125,31 +125,63 @@ void cxip_util_cq_progress(struct util_cq *util_cq) ofi_genlock_unlock(&cq->ep_list_lock); } +/* common function for both eq and cq strerror function */ +const char *cxip_strerror(int prov_errno) +{ + /* both CXI driver error and collective errors share this function */ + if (prov_errno < FI_CXI_ERRNO_RED_FIRST) + return cxi_rc_to_str(prov_errno); + + switch (prov_errno) { + /* EQ JOIN error 
codes */ + case FI_CXI_ERRNO_JOIN_MCAST_INUSE: + return "coll join multicast address in-use"; + case FI_CXI_ERRNO_JOIN_HWROOT_INUSE: + return "coll join hwroot in-use"; + case FI_CXI_ERRNO_JOIN_MCAST_INVALID: + return "coll join multicast address invalid"; + case FI_CXI_ERRNO_JOIN_HWROOT_INVALID: + return "coll join hwroot invalid"; + case FI_CXI_ERRNO_JOIN_CURL_FAILED: + return "coll join FM REST CURL failed"; + case FI_CXI_ERRNO_JOIN_CURL_TIMEOUT: + return "coll join FM REST CURL timed out"; + case FI_CXI_ERRNO_JOIN_FAIL_PTE: + return "coll join PTE setup failed"; + case FI_CXI_ERRNO_JOIN_OTHER: + return "coll join unknown error"; + + /* CQ REDUCE error codes */ + case FI_CXI_ERRNO_RED_FLT_OVERFLOW: + return "coll reduce FLT overflow"; + case FI_CXI_ERRNO_RED_FLT_INVALID: + return "coll reduce FLT invalid"; + case FI_CXI_ERRNO_RED_INT_OVERFLOW: + return "coll reduce INT overflow"; + case FI_CXI_ERRNO_RED_CONTR_OVERFLOW: + return "coll reduce contribution overflow"; + case FI_CXI_ERRNO_RED_OP_MISMATCH: + return "coll reduce opcode mismatch"; + case FI_CXI_ERRNO_RED_MC_FAILURE: + return "coll reduce multicast timeout"; + + /* Unknown error */ + default: + return "coll unspecified error"; + } +} + /* * cxip_cq_strerror() - Converts provider specific error information into a * printable string. 
*/ static const char *cxip_cq_strerror(struct fid_cq *cq, int prov_errno, - const void *err_data, char *buf, - size_t len) + const void *err_data, char *buf, size_t len) { - switch (prov_errno) { - case CXIP_PROV_ERRNO_OK: - return "CXIP_COLL_OK"; - case CXIP_PROV_ERRNO_PTE: - return "CXIP_COLL_PTE_ERROR"; - case CXIP_PROV_ERRNO_MCAST_INUSE: - return "CXIP_COLL_MCAST_IN_USE"; - case CXIP_PROV_ERRNO_HWROOT_INUSE: - return "CXIP_COLL_HWROOT_IN_USE"; - case CXIP_PROV_ERRNO_MCAST_INVALID: - return "CXIP_COLL_MCAST_INVALID"; - case CXIP_PROV_ERRNO_HWROOT_INVALID: - return "CXIP_COLL_HWROOT_INVALID"; - case CXIP_PROV_ERRNO_CURL: - return "CXIP_COLL_CURL_ERROR"; - } - return cxi_rc_to_str(prov_errno); + const char *errmsg = cxip_strerror(prov_errno); + if (buf && len > 0) + strncpy(buf, errmsg, len); + return errmsg; } /* diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c index b60858742b7..e54572fd53e 100644 --- a/prov/cxi/src/cxip_ctrl.c +++ b/prov/cxi/src/cxip_ctrl.c @@ -638,7 +638,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } } - ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * s_page_size, + ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * sc_page_size, &ep_obj->ctrl.tx_evtq_buf, &ep_obj->ctrl.tx_evtq_buf_md, &ep_obj->ctrl.tx_evtq); diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c index 97fcfcfac10..e849ebad806 100644 --- a/prov/cxi/src/cxip_curl.c +++ b/prov/cxi/src/cxip_curl.c @@ -1,24 +1,31 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP */ #include +#include #include #include #include #include #include #include +#include +#include #include #include "cxip.h" +static void *cxip_curlhandle; +static CURLM *cxip_curlm; +static int cxip_curl_count; #define TRACE_CURL(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_CURL, fmt, \ ##__VA_ARGS__) #define CXIP_DBG(...) 
_CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_FABRIC, __VA_ARGS__) #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) #define CHUNK_SIZE 4096 @@ -117,28 +124,139 @@ static size_t write_callback(void *curl_rcvd, size_t size, size_t nmemb, * The CURL library must be explicitly initialized. It is application-global, * and the initialization is not thread-safe, according to the documentation. We * do not protect this call, because it is running under CXI_INIT (see - * cxip_info.c), which is single-threaded. The curl_global_init() call can be + * cxip_info.c), which is single-threaded. The (*dl_curl_global_init)() call can be * issued multiple times (non-concurrently) and has the same end result as * calling it once. */ -static CURLM *cxip_curlm; -static int cxip_curl_count; /** * Initialize CURL globally for the application, enabling multi-curl * (concurrent calls). */ + +/* Each of these should be referenced in curlary[] below */ +CURLcode (*dl_curl_global_init)(long); +void (*dl_curl_global_cleanup)(void); +CURL * (*dl_curl_easy_init)(void); +void (*dl_curl_easy_cleanup)(CURL *); +CURLcode (*dl_curl_easy_getinfo)(CURL *, CURLINFO, ...); +CURLcode (*dl_curl_easy_setopt)(CURL *, CURLoption, ...); +const char *(*dl_curl_easy_strerror)(CURLcode); +CURLcode (*dl_curl_easy_perform)(CURL *); +CURLM * (*dl_curl_multi_init)(void); +CURLMcode (*dl_curl_multi_cleanup)(CURLM *); +CURLMcode (*dl_curl_multi_add_handle)(CURLM *multi_handle, CURL *); +CURLMsg * (*dl_curl_multi_info_read)(CURLM *multi_handle, int *); +CURLMcode (*dl_curl_multi_perform)(CURLM *multi_handle, int *); +const char *(*dl_curl_multi_strerror)(CURLMcode); +struct curl_slist *(*dl_curl_slist_append)(struct curl_slist *, const char *); +void (*dl_curl_slist_free_all)(struct curl_slist *); + +struct curlfunc { + void **fptr; + char *name; +}; + +struct curlfunc curlary[] = { + {(void **)&dl_curl_global_init, "curl_global_init"}, + {(void **)&dl_curl_global_cleanup, 
"curl_global_cleanup"}, + {(void **)&dl_curl_easy_init, "curl_easy_init"}, + {(void **)&dl_curl_easy_cleanup, "curl_easy_cleanup"}, + {(void **)&dl_curl_easy_getinfo, "curl_easy_getinfo"}, + {(void **)&dl_curl_easy_setopt, "curl_easy_setopt"}, + {(void **)&dl_curl_easy_strerror, "curl_easy_strerror"}, + {(void **)&dl_curl_easy_perform, "curl_easy_perform"}, + {(void **)&dl_curl_multi_init, "curl_multi_init"}, + {(void **)&dl_curl_multi_cleanup, "curl_multi_cleanup"}, + {(void **)&dl_curl_multi_add_handle, "curl_multi_add_handle"}, + {(void **)&dl_curl_multi_info_read, "curl_multi_info_read"}, + {(void **)&dl_curl_multi_perform, "curl_multi_perform"}, + {(void **)&dl_curl_multi_strerror, "curl_multi_strerror"}, + {(void **)&dl_curl_slist_append, "curl_slist_append"}, + {(void **)&dl_curl_slist_free_all, "curl_slist_free_all"}, + {NULL, NULL} +}; + +int cxip_curl_load_symbols(void) +{ + struct curlfunc *funcptr; + char libfile[256], *libpath; + int version; + int errcnt; + void *h; + + /* load successfully only once */ + if (cxip_curlhandle) + return 0; + + /* Try to find latest usable version */ + // TODO test earlier versions + for (version = 4; version >= 4; version--) { + sprintf(libfile, "/usr/lib64/libcurl.so.%d", version); + libpath = realpath(libfile, NULL); + if (!libpath) { + TRACE_CURL("could not expand '%s'\n", libfile); + CXIP_INFO("could not expand '%s'\n", libfile); + continue; + } + TRACE_CURL("dlopen '%s'\n", libpath); + h = dlopen(libpath, RTLD_NOW); + if (!h) { + TRACE_CURL("%s not found\n", libpath); + CXIP_INFO("%s not found\n", libpath); + free(libpath); + continue; + } + TRACE_CURL("%s found\n", libpath); + free(libpath); + break; + } + if (!h) { + TRACE_CURL("libcurl not supported\n"); + CXIP_WARN("libcurl not supported\n"); + CXIP_WARN("Accelerated collectives cannot be enabled\n"); + return -FI_EOPNOTSUPP; + } + /* Load all the necessary functions, or none */ + errcnt = 0; + funcptr = curlary; + while (funcptr->fptr) { + *funcptr->fptr = 
dlsym(h, funcptr->name); + if (!(*funcptr->fptr)) { + CXIP_WARN("curl function '%s' not found\n", + funcptr->name); + errcnt++; + } + funcptr++; + } + if (errcnt) { + funcptr = curlary; + while (funcptr->fptr) + *funcptr->fptr = NULL; + CXIP_WARN("libcurl incomplete support\n"); + return -FI_EOPNOTSUPP; + } + /* record handle to prevent reloading */ + cxip_curlhandle = h; + return 0; +} + int cxip_curl_init(void) { - int ret = FI_SUCCESS; CURLcode res; + int ret; + + /* can be safely called multiple times */ + ret = cxip_curl_load_symbols(); + if (ret) + return ret; if (!cxip_curlm) { - res = curl_global_init(CURL_GLOBAL_DEFAULT); + res = (*dl_curl_global_init)(CURL_GLOBAL_DEFAULT); if (res == CURLE_OK) { - cxip_curlm = curl_multi_init(); + cxip_curlm = (*dl_curl_multi_init)(); if (!cxip_curlm) { - curl_global_cleanup(); + (*dl_curl_global_cleanup)(); ret = -FI_EINVAL; } } else @@ -154,8 +272,8 @@ void cxip_curl_fini(void) { cxip_curl_count = 0; if (cxip_curlm) { - curl_multi_cleanup(cxip_curlm); - curl_global_cleanup(); + (*dl_curl_multi_cleanup)(cxip_curlm); + (*dl_curl_global_cleanup)(); cxip_curlm = NULL; } } @@ -207,7 +325,11 @@ void cxip_curl_free(struct cxip_curl_handle *handle) * The usrfunc is called in cxip_curl_progress() when the request completes, * and receives the handle as its sole argument. The handle also contains an * arbitrary usrptr supplied by the caller. This usrptr can contain specific - * information to identify which of multiple concurrent requests has completed. + * user information to identify which of multiple concurrent requests has + * completed. + * + * An error return indicates that the dispatch was unsuccessful. All memory + * cleanup is done here. * * There are no "normal" REST errors from this call. REST errors are instead * returned on attempts to progress the dispatched operation. 
@@ -220,7 +342,9 @@ void cxip_curl_free(struct cxip_curl_handle *handle) * @param userfunc : user-defined completion function * @param usrptr : user-defined data pointer * - * @return int : 0 on success, -1 on failure + * @return int : 0 on success, -errno on failure + * -FI_ENOMEM : out-of-memory + * -FI_ECONNREFUSED : CURL easy/multi init failed */ int cxip_curl_perform(const char *endpoint, const char *request, const char *sessionToken, size_t rsp_init_size, @@ -230,125 +354,177 @@ int cxip_curl_perform(const char *endpoint, const char *request, struct cxip_curl_handle *handle; struct curl_slist *headers; char *token; - char *verify_peer_str; - int verify_peer; + char *cert_env_var; + bool verify = true; + bool isdir = false; + bool isfile = false; + struct stat buf; CURLMcode mres; CURL *curl; int running; int ret; - ret = -FI_ENOMEM; handle = calloc(1, sizeof(*handle)); - if (!handle) + if (!handle) { + ret = -FI_ENOMEM; goto fail; + } /* libcurl is fussy about NULL requests */ handle->endpoint = strdup(endpoint); - if (!handle->endpoint) + if (!handle->endpoint) { + ret = -FI_ENOMEM; goto fail; + } handle->request = strdup(request ? 
request : ""); - if (!handle->request) + if (!handle->request) { + ret = -FI_ENOMEM; goto fail; + } handle->response = NULL; handle->recv = (void *)init_curl_buffer(rsp_init_size); - if (!handle->recv) + if (!handle->recv) { + ret = -FI_ENOMEM; goto fail; + } + /* add user completion function and pointer */ handle->usrfunc = usrfunc; handle->usrptr = usrptr; - ret = -FI_EACCES; - curl = curl_easy_init(); + curl = (*dl_curl_easy_init)(); if (!curl) { - CXIP_WARN("curl_easy_init() failed\n"); + CXIP_WARN("(*dl_curl_easy_init)() failed\n"); + ret = -FI_ECONNREFUSED; goto fail; } /* HTTP 1.1 assumed */ headers = NULL; - headers = curl_slist_append(headers, "Expect:"); - headers = curl_slist_append(headers, "Accept: application/json"); - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, "charset: utf-8"); + headers = (*dl_curl_slist_append)(headers, "Expect:"); + headers = (*dl_curl_slist_append)(headers, "Accept: application/json"); + headers = (*dl_curl_slist_append)(headers, "Content-Type: application/json"); + headers = (*dl_curl_slist_append)(headers, "charset: utf-8"); token = NULL; if (sessionToken) { ret = asprintf(&token, "Authorization: Bearer %s", sessionToken); if (ret < 0) { CXIP_WARN("token string create failed\n"); + ret = -FI_ENOMEM; goto fail; } - headers = curl_slist_append(headers, token); + headers = (*dl_curl_slist_append)(headers, token); } handle->headers = (void *)headers; - curl_easy_setopt(curl, CURLOPT_URL, handle->endpoint); + (*dl_curl_easy_setopt)(curl, CURLOPT_URL, handle->endpoint); if (op == CURL_GET) { - curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L); + (*dl_curl_easy_setopt)(curl, CURLOPT_HTTPGET, 1L); + } else if (op == CURL_DELETE) { + (*dl_curl_easy_setopt)(curl, CURLOPT_CUSTOMREQUEST, "DELETE"); } else { - curl_easy_setopt(curl, CURLOPT_POST, 1L); - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, handle->request); - curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, + 
(*dl_curl_easy_setopt)(curl, CURLOPT_POST, 1L); + (*dl_curl_easy_setopt)(curl, CURLOPT_POSTFIELDS, handle->request); + (*dl_curl_easy_setopt)(curl, CURLOPT_POSTFIELDSIZE, strlen(handle->request)); } - curl_easy_setopt(curl, CURLOPT_STDERR, stderr); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, handle->recv); - curl_easy_setopt(curl, CURLOPT_PRIVATE, (void *)handle); - curl_easy_setopt(curl, CURLOPT_VERBOSE, (long)verbose); - curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op)); - - verify_peer_str = getenv("CURLOPT_SSL_VERIFYPEER"); - if (verify_peer_str) - verify_peer = atoi(verify_peer_str); - else - verify_peer = 0; - curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, verify_peer); - - curl_multi_add_handle(cxip_curlm, curl); - mres = curl_multi_perform(cxip_curlm, &running); + (*dl_curl_easy_setopt)(curl, CURLOPT_STDERR, stderr); + (*dl_curl_easy_setopt)(curl, CURLOPT_HTTPHEADER, headers); + (*dl_curl_easy_setopt)(curl, CURLOPT_WRITEFUNCTION, write_callback); + (*dl_curl_easy_setopt)(curl, CURLOPT_WRITEDATA, handle->recv); + (*dl_curl_easy_setopt)(curl, CURLOPT_PRIVATE, (void *)handle); + (*dl_curl_easy_setopt)(curl, CURLOPT_VERBOSE, (long)verbose); + (*dl_curl_easy_setopt)(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op)); + + /* Value of fm_cacert variable in slurmctld configuration */ + /* If set to 'yes' or a path, the CACERT will be validated and used for the connection */ + cert_env_var = getenv("FI_CXI_COLL_FABRIC_MGR_CACERT"); + + if (!cert_env_var || !strcmp(cert_env_var, "no")) + verify = false; + else if (!strcmp(cert_env_var, "yes")) + verify = true; + else { + if (stat(cert_env_var, &buf) == -1) { + ret = FI_ENOENT; + goto fail; + } + if (S_ISDIR(buf.st_mode)) + isdir = true; + else if (S_ISREG(buf.st_mode)) + isfile = true; + else { + ret = FI_EINVAL; + goto fail; + } + } + + if (!verify) { + /* These are needed to work 
with self-signed certificates */ + (*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYPEER, 0L); + (*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYHOST, 0L); + } else { + /* FI_CXI_COLL_FABRIC_MGR_CACERT is "yes" or a pathname */ + (*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYPEER, 1L); + (*dl_curl_easy_setopt)(curl, CURLOPT_SSL_VERIFYHOST, 2L); + } + + /* If certificate file/dir specified, use it */ + if (isdir) + (*dl_curl_easy_setopt)(curl, CURLOPT_CAPATH, cert_env_var); + else if (isfile) + (*dl_curl_easy_setopt)(curl, CURLOPT_CAINFO, cert_env_var); + + (*dl_curl_multi_add_handle)(cxip_curlm, curl); + mres = (*dl_curl_multi_perform)(cxip_curlm, &running); if (mres != CURLM_OK) { - CXIP_WARN("curl_multi_perform() failed: %s\n", - curl_multi_strerror(mres)); + CXIP_WARN("(*dl_curl_multi_perform)() failed: %s\n", + (*dl_curl_multi_strerror)(mres)); + ret = -FI_ECONNREFUSED; goto fail; } cxip_curl_count += 1; return FI_SUCCESS; fail: - CXIP_WARN("%s failed %d\n", __func__, ret); + CXIP_WARN("%s failed %d (%s)\n", __func__, ret, fi_strerror(ret)); cxip_curl_free(handle); return ret; } /** - * Progress the CURL requests. + * Progress the pending CURL requests. * * This progresses concurrent CURL requests, and returns the following: * - * - 0 indicates an operation completed - * - -FI_EAGAIN indicates operations are pending, none completed - * - -FI_ENODATA indicates no operations are pending - * - -errorcode a fatal error + * - 0 success + * - -FI_EAGAIN indicates operations are pending, none completed + * - -FI_ENODATA indicates no operations are pending + * - -FI_ECONNREFUSED fatal error, CURL is not functioning properly + * + * Note that -FI_ECONNREFUSED should be treated as a fatal CURL error. It + * indicates that CURL is behaving in an abnormal fashion, and cannot be + * relied upon. In normal use, it should not happen. * - * Repeated calls will return additional completions, until there are no more - * pending and -FI_ENODATA is returned. 
+ * All other error handling is performed by the usrfunc function (supplied + * during cxip_curl_perform() call), see below. * - * Note that a CURL request will succeed if the server is not reachable. It will - * return a handle->status value of 0, which is an invalid HTTP status, and - * indicates that it could not connect to a server. + * A CURL request will complete if the server is not reachable. It will return a + * handle->status value of 0, which is an invalid HTTP status, and indicates + * that it could not connect to a server. * - * For unit testing, it is useful for the test to be able to inspect the handle - * directly, and it can be obtained by specifying a non-null handleptr value. If - * handleptr is supplied, the caller is responsible for calling cxip_curl_free() - * on the returned handle. In normal usage, handleptr is NULL, and this routine - * will clean up the handle after the operation completes. + * In normal use, handleptr is NULL. the caller has passed a a usrfunc callback + * routine when dispatching the CURL request to process the returned errors and + * data: see cxip_curl_perform(). This usrfunc callback is called after + * completion of the request, before the handle is destroyed, and is expected to + * know enough about CURL operations to interpret the results. This routine will + * delete the handle after the callback has processed it. * - * The user should provide a callback routine to examine the final state of the - * CURL request, as well as any data it returns: see cxip_curl_perform(). This - * user callback is called after completion of the request, before the handle is - * destroyed. + * For unit testing, it can be useful for the test to be able to inspect the + * handle and the error return, and it can be obtained by specifying a non-null + * handleptr. If handleptr is supplied, the caller is responsible for + * calling cxip_curl_free() on the returned handle. 
* * The callback routine has read-only access to the handle, and read-write * access to its own data area, available as handle->usrptr. @@ -356,7 +532,7 @@ int cxip_curl_perform(const char *endpoint, const char *request, * The handle contains the following documented fields: * * - status = HTTP status of the op, or 0 if the endpoint could not be reached - * - endpoint = copy of the endpoint address supplied for the post + * - endpoint = copy of the endpoint address (URL) supplied for the post * - request = copy of the JSON request data supplied for the post * - response = pointer to the JSON response returned by the endpoint * - usrptr = arbitrary user pointer supplied during CURL request @@ -379,55 +555,57 @@ int cxip_curl_progress(struct cxip_curl_handle **handleptr) if (!cxip_curl_count) return -FI_ENODATA; - handle = NULL; - /* running returns the number of curls running */ - mres = curl_multi_perform(cxip_curlm, &running); + mres = (*dl_curl_multi_perform)(cxip_curlm, &running); if (mres != CURLM_OK) { - CXIP_WARN("curl_multi_perform() failed: %s\n", - curl_multi_strerror(mres)); - return -FI_EOTHER; + CXIP_WARN("(*dl_curl_multi_perform)() failed: %s\n", + (*dl_curl_multi_strerror)(mres)); + return -FI_ECONNREFUSED; } /* messages returns the number of additional curls finished */ - msg = curl_multi_info_read(cxip_curlm, &messages); + msg = (*dl_curl_multi_info_read)(cxip_curlm, &messages); if (!msg || msg->msg != CURLMSG_DONE) { return (running) ? -FI_EAGAIN : -FI_ENODATA; } + /* These should not occur, but if (*dl_curl_easy_getinfo)() succeeds, we + * don't really care. Just post a warning. 
+ */ if (msg->data.result >= CURL_LAST) { CXIP_WARN("CURL unknown result %d\n", msg->data.result); - } - else if (msg->data.result > CURLE_OK) { + } else if (msg->data.result > CURLE_OK) { CXIP_WARN("CURL error '%s'\n", - curl_easy_strerror(msg->data.result)); + (*dl_curl_easy_strerror)(msg->data.result)); } + /* retrieve our handle from the private pointer */ - res = curl_easy_getinfo(msg->easy_handle, + handle = NULL; + res = (*dl_curl_easy_getinfo)(msg->easy_handle, CURLINFO_PRIVATE, (char **)&handle); if (res != CURLE_OK) { - TRACE_CURL("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_PRIVATE", curl_easy_strerror(res)); - CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_PRIVATE", curl_easy_strerror(res)); - return -FI_EOTHER; + TRACE_CURL("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_PRIVATE", (*dl_curl_easy_strerror)(res)); + CXIP_WARN("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_PRIVATE", (*dl_curl_easy_strerror)(res)); + return -FI_ECONNREFUSED; } /* handle is now valid, must eventually be freed */ /* retrieve the status code, should not fail */ - res = curl_easy_getinfo(msg->easy_handle, + res = (*dl_curl_easy_getinfo)(msg->easy_handle, CURLINFO_RESPONSE_CODE, &status); if (res != CURLE_OK) { - TRACE_CURL("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_RESPONSE_CODE", curl_easy_strerror(res)); - CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_RESPONSE_CODE", curl_easy_strerror(res)); + TRACE_CURL("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", (*dl_curl_easy_strerror)(res)); + CXIP_WARN("(*dl_curl_easy_getinfo)(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", (*dl_curl_easy_strerror)(res)); /* continue, handle->status should show zero */ } - TRACE_CURL("curl_easy_getinfo() success\n"); + TRACE_CURL("(*dl_curl_easy_getinfo)() success\n"); /* we can recover resources now */ - curl_slist_free_all((struct curl_slist *)handle->headers); - curl_easy_cleanup(msg->easy_handle); + 
(*dl_curl_slist_free_all)((struct curl_slist *)handle->headers); + (*dl_curl_easy_cleanup)(msg->easy_handle); handle->headers = NULL; /* make sure response string is terminated */ diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c index 4a928018679..07cdb4a498a 100644 --- a/prov/cxi/src/cxip_dom.c +++ b/prov/cxi/src/cxip_dom.c @@ -395,7 +395,7 @@ int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom, */ key.events = mr->count_events || mr->rma_events || mr->cntr; - key.opt = cxip_env.optimized_mrs && + key.opt = dom->optimized_mrs && key.id < CXIP_PTL_IDX_PROV_MR_OPT_CNT; mr->key = key.raw; ofi_spin_unlock(&dom->ctrl_id_lock); @@ -596,7 +596,7 @@ static int cxip_dom_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return -FI_EINVAL; dom->eq = eq; - if (flags & OFI_REG_MR) + if (flags & FI_REG_MR) dom->mr_eq = eq; return 0; diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index fabdea22be3..50ff7b9bd96 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -477,11 +477,18 @@ ssize_t cxip_ep_cancel(fid_t fid, void *context) if (!ofi_recv_allowed(ep->ep_obj->caps)) return -FI_ENOENT; + ofi_genlock_lock(&ep->ep_obj->lock); + ret = cxip_rxc_cancel(ep->ep_obj->rxc, context); if (ret != -FI_ENOENT) - return ret; + goto out_unlock; + + ret = cxip_txc_cancel(ep->ep_obj->txc, context); - return cxip_txc_cancel(ep->ep_obj->txc, context); +out_unlock: + ofi_genlock_unlock(&ep->ep_obj->lock); + + return ret; } /* @@ -963,7 +970,7 @@ static inline int cxip_ep_set_val(struct cxip_ep *cxi_ep, uint64_t *req_order; uint64_t *req_rnr_max_time; uint32_t *req_tclass; - uint32_t new_tclass; + uint32_t new_tclass = FI_TC_UNSPEC; if (!val->val) return -FI_EINVAL; @@ -1185,7 +1192,7 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, { int ret; struct cxip_ep_obj *ep_obj; - uint32_t txc_tclass; + uint32_t txc_tclass = FI_TC_UNSPEC; uint32_t nic; uint32_t pid; int i; diff --git a/prov/cxi/src/cxip_eq.c 
b/prov/cxi/src/cxip_eq.c index 61aad506663..010fdcc7183 100644 --- a/prov/cxi/src/cxip_eq.c +++ b/prov/cxi/src/cxip_eq.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2024 Cray Inc. All rights reserved. */ /* @@ -58,6 +58,18 @@ static void cxip_eq_progress(struct cxip_eq *eq) ofi_mutex_unlock(&eq->list_lock); } +/* cxip_cq_strerror() - Converts provider specific error information into a + * printable string. Not eq-specific. + */ +static const char *cxip_eq_strerror(struct fid_eq *eq, int prov_errno, + const void *err_data, char *buf, size_t len) +{ + const char *errmsg = cxip_strerror(prov_errno); + if (buf && len > 0) + strncpy(buf, errmsg, len); + return errmsg; +} + ssize_t cxip_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags) { @@ -78,7 +90,7 @@ static struct fi_ops_eq cxi_eq_ops = { .readerr = ofi_eq_readerr, .sread = ofi_eq_sread, .write = ofi_eq_write, - .strerror = ofi_eq_strerror, + .strerror = cxip_eq_strerror, // customized }; static struct fi_ops cxi_eq_fi_ops = { diff --git a/prov/cxi/src/cxip_evtq.c b/prov/cxi/src/cxip_evtq.c index c40dd7e7c2f..e4c90c31980 100644 --- a/prov/cxi/src/cxip_evtq.c +++ b/prov/cxi/src/cxip_evtq.c @@ -155,7 +155,7 @@ void cxip_evtq_flush_trig_reqs(struct cxip_evtq *evtq) req->type); } - ofi_atomic_dec32(&txc->otx_reqs); + cxip_txc_otx_reqs_dec(txc); cxip_evtq_req_free_no_lock(req); } diff --git a/prov/cxi/src/cxip_faults.c b/prov/cxi/src/cxip_faults.c index 04564b1bd04..8c273d7a203 100644 --- a/prov/cxi/src/cxip_faults.c +++ b/prov/cxi/src/cxip_faults.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + * Copyright (c) 2019-2024 Hewlett Packard Enterprise Development LP */ /* Fault injection. 
*/ @@ -64,7 +64,91 @@ void cxip_fault_inject_fini(void) fault_fini(&malloc_fault); } +/****************************************************/ + +/* structure used to simulate failures */ +struct cxip_trap { + struct dlist_entry link; + int index; + int trap; + int err; + int prov_errno; +}; + +struct dlist_entry trap_list; +bool trap_initialized; + +void cxip_trap_close(void) +{ + struct cxip_trap *trap_obj; + + if (!trap_initialized) + return; + while (!dlist_empty(&trap_list)) { + dlist_pop_front(&trap_list, struct cxip_trap, trap_obj, link); + free(trap_obj); + } +} + +void cxip_trap_set(int index, int trap, int err, int prov_errno) +{ + struct cxip_trap *trap_obj; + + if (!trap_initialized) { + dlist_init(&trap_list); + trap_initialized = true; + } + trap_obj = calloc(1, sizeof(*trap_obj)); + if (!trap_obj) { + return; + } + dlist_init(&trap_obj->link); + trap_obj->index = index; + trap_obj->trap = trap; + trap_obj->err = err; + trap_obj->prov_errno = prov_errno; + dlist_insert_tail(&trap_list, &trap_obj->link); +} + +bool cxip_trap_search(int index, int trap, int *err, int *prov_errno) +{ + struct cxip_trap *trap_obj; + struct dlist_entry *item; + + if (!trap_initialized) { + return false; + } + + int cnt = 0; + dlist_foreach(&trap_list, item) { + cnt++; + trap_obj = container_of(item, struct cxip_trap, link); + if (trap_obj->index != index) + continue; + if (trap_obj->trap != trap) + continue; + dlist_remove(item); + if (err) + *err = trap_obj->err; + if (prov_errno) { + if (trap_obj->err == -FI_EAVAIL) + *prov_errno = trap_obj->prov_errno; + else + *prov_errno = 0; + } + free(trap_obj); + return true; + } + return false; +} #else void cxip_fault_inject_init(void) {} void cxip_fault_inject_fini(void) {} + +void cxip_trap_close(void) {} +void cxip_trap_set(int index, int trap, int err, int prov_errno) {} +bool cxip_trap_search(int index, int trap, int *err, int *prov_errno) +{ + return false; +} #endif diff --git a/prov/cxi/src/cxip_if.c 
b/prov/cxi/src/cxip_if.c index 1d14aecf470..62ebb4f86b7 100644 --- a/prov/cxi/src/cxip_if.c +++ b/prov/cxi/src/cxip_if.c @@ -247,7 +247,7 @@ int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, } lni->iface = iface; - ofi_spin_init(&lni->lock); + pthread_rwlock_init(&lni->cp_lock, NULL); dlist_init(&lni->remap_cps); CXIP_DBG("Allocated LNI, %s RGID: %u\n", diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 5c6e34ac1a1..7273d5bc2dc 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2019,2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2019,2022-2024 Hewlett Packard Enterprise Development LP */ /* CXI fabric discovery implementation. */ @@ -270,6 +270,8 @@ struct fi_rx_attr cxip_rx_attr = { .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS, .op_flags = CXIP_RX_OP_FLAGS, .msg_order = CXIP_MSG_ORDER, + .comp_order = FI_ORDER_NONE, + .total_buffered_recv = CXIP_UX_BUFFER_SIZE, .size = CXIP_MAX_RX_SIZE, .iov_limit = 1, }; @@ -288,6 +290,8 @@ struct fi_rx_attr cxip_multi_auth_key_rx_attr = { .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS & ~FI_DIRECTED_RECV, .op_flags = CXIP_RX_OP_FLAGS, .msg_order = CXIP_MSG_ORDER, + .comp_order = FI_ORDER_NONE, + .total_buffered_recv = CXIP_UX_BUFFER_SIZE, .size = CXIP_MAX_RX_SIZE, .iov_limit = 1, }; @@ -386,13 +390,13 @@ struct util_prov cxip_util_prov = { .flags = 0, }; -int s_page_size; +int sc_page_size; /* Get _SC_PAGESIZE */ static void set_system_page_size(void) { - if (!s_page_size) - s_page_size = sysconf(_SC_PAGESIZE); + if (!sc_page_size) + sc_page_size = sysconf(_SC_PAGESIZE); } /* @@ -510,6 +514,7 @@ static int cxip_info_init(void) fi->tx_attr->inject_size = 0; fi->rx_attr->msg_order = CXIP_MSG_ORDER & ~FI_ORDER_SAS; fi->rx_attr->caps |= FI_DIRECTED_RECV; + fi->rx_attr->total_buffered_recv = 0; CXIP_DBG("%s RNR info created\n", nic_if->info->device_name); @@ -660,12 +665,17 @@ 
struct cxip_environment cxip_env = { .coll_fabric_mgr_url = NULL, .coll_retry_usec = CXIP_COLL_MAX_RETRY_USEC, .coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC, + .coll_fm_timeout_msec = CXIP_COLL_DFL_FM_TIMEOUT_MSEC, .coll_use_dma_put = false, .telemetry_rgid = -1, .disable_hmem_dev_register = 0, .ze_hmem_supported = 0, .rdzv_proto = CXIP_RDZV_PROTO_DEFAULT, .enable_trig_op_limit = false, + .mr_cache_events_disable_poll_nsecs = + CXIP_MR_CACHE_EVENTS_DISABLE_POLL_NSECS, + .mr_cache_events_disable_le_poll_nsecs = + CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS, }; static void cxip_env_init(void) @@ -1246,6 +1256,17 @@ static void cxip_env_init(void) if (cxip_env.coll_timeout_usec > CXIP_COLL_MAX_TIMEOUT_USEC) cxip_env.coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC; + fi_param_define(&cxip_prov, "coll_fm_timeout_msec", FI_PARAM_SIZE_T, + "FM API timeout (msec) (default %d, min %d, max %d).", + cxip_env.coll_fm_timeout_msec, CXIP_COLL_MIN_FM_TIMEOUT_MSEC, + CXIP_COLL_MAX_FM_TIMEOUT_MSEC); + fi_param_get_size_t(&cxip_prov, "coll_fm_timeout_msec", + &cxip_env.coll_fm_timeout_msec); + if (cxip_env.coll_fm_timeout_msec < CXIP_COLL_MIN_FM_TIMEOUT_MSEC) + cxip_env.coll_fm_timeout_msec = CXIP_COLL_MIN_FM_TIMEOUT_MSEC; + if (cxip_env.coll_fm_timeout_msec > CXIP_COLL_MAX_FM_TIMEOUT_MSEC) + cxip_env.coll_fm_timeout_msec = CXIP_COLL_MAX_FM_TIMEOUT_MSEC; + fi_param_define(&cxip_prov, "default_tx_size", FI_PARAM_SIZE_T, "Default provider tx_attr.size (default: %lu).", cxip_env.default_tx_size); @@ -1328,6 +1349,18 @@ static void cxip_env_init(void) param_str = NULL; } + fi_param_define(&cxip_prov, "mr_cache_events_disable_poll_nsecs", FI_PARAM_SIZE_T, + "Max amount of time to poll when disabling an MR configured with MR match events (default: %lu).", + cxip_env.mr_cache_events_disable_poll_nsecs); + fi_param_get_size_t(&cxip_prov, "mr_cache_events_disable_poll_nsecs", + &cxip_env.mr_cache_events_disable_poll_nsecs); + + fi_param_define(&cxip_prov, 
"mr_cache_events_disable_le_poll_nsecs", FI_PARAM_SIZE_T, + "Max amount of time to poll when LE invalidate disabling an MR configured with MR match events (default: %lu).", + cxip_env.mr_cache_events_disable_le_poll_nsecs); + fi_param_get_size_t(&cxip_prov, "mr_cache_events_disable_le_poll_nsecs", + &cxip_env.mr_cache_events_disable_le_poll_nsecs); + set_system_page_size(); } diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c index 14f4d955978..69975cfb06a 100644 --- a/prov/cxi/src/cxip_iomm.c +++ b/prov/cxi/src/cxip_iomm.c @@ -115,6 +115,17 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) goto err; } + /* If the md len is larger than the iov_len, the VA and len have + * been aligned to a larger page size. Update the cache memory + * region registered by returning -FI_EAGAIN. Note, that GPU memory + * cannot be aligned since the aligned iov_base may fall outside the + * valid device address. + */ + if (entry->info.iface == FI_HMEM_SYSTEM) { + entry->info.iov.iov_base = (void *)md->md->va; + entry->info.iov.iov_len = md->md->len; + } + /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be * registered with ofi_hmem_dev_register(). Thus skip it. 
*/ @@ -475,10 +486,15 @@ static void cxip_map_get_mem_region_size(const void *buf, unsigned long len, { int ret; - ret = ofi_hmem_get_base_addr(iface, buf, len, out_buf, out_len); - if (ret) { + if (iface == FI_HMEM_SYSTEM) { *out_buf = (void *)buf; *out_len = len; + } else { + ret = ofi_hmem_get_base_addr(iface, buf, len, out_buf, out_len); + if (ret) { + *out_buf = (void *)buf; + *out_len = len; + } } CXIP_DBG("%s: User addr=%p User len=%lu Region addr=%p Region len=0x%lx\n", diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index 6d088e21262..6a53ea87af5 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -198,6 +198,29 @@ static int cxip_mr_enable_std(struct cxip_mr *mr) return FI_SUCCESS; } +/* If MR event counts are recorded then we can check event counts to determine + * if invalidate can be skipped. + */ +static bool cxip_mr_disable_check_count_events(struct cxip_mr *mr, + uint64_t timeout) +{ + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + uint64_t end = ofi_gettime_ns() + timeout; + + while (true) { + + if (ofi_atomic_get32(&mr->match_events) == + ofi_atomic_get32(&mr->access_events)) + return true; + + if (ofi_gettime_ns() >= end) + return false; + + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } +} + /* * cxip_mr_disable_std() - Free HW resources from the standard MR. * @@ -207,35 +230,45 @@ static int cxip_mr_disable_std(struct cxip_mr *mr) { int ret; struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + bool count_events_disabled; /* TODO: Handle -FI_EAGAIN. */ ret = cxip_pte_unlink(ep_obj->ctrl.pte, C_PTL_LIST_PRIORITY, mr->req.req_id, ep_obj->ctrl.tgq); - assert(ret == FI_SUCCESS); + if (ret != FI_SUCCESS) + CXIP_FATAL("Unable to queue unlink command: %d\n", ret); do { sched_yield(); cxip_ep_tgt_ctrl_progress_locked(ep_obj); } while (mr->mr_state != CXIP_MR_UNLINKED); - /* If MR event counts are recorded then we can check event counts - * to determine if invalidate can be skipped. 
- */ - if (!mr->count_events || ofi_atomic_get32(&mr->match_events) != - ofi_atomic_get32(&mr->access_events)) { - /* TODO: Temporary debug helper for DAOS to track if - * Match events detect a need to flush. - */ - if (mr->count_events) - CXIP_WARN("Match events required pte LE invalidate\n"); + if (mr->count_events) { + count_events_disabled = cxip_mr_disable_check_count_events(mr, cxip_env.mr_cache_events_disable_poll_nsecs); + if (count_events_disabled) + goto disabled_success; - ret = cxil_invalidate_pte_le(ep_obj->ctrl.pte->pte, mr->key, - C_PTL_LIST_PRIORITY); - if (ret) - CXIP_WARN("MR %p key 0x%016lX invalidate failed %d\n", - mr, mr->key, ret); + CXIP_WARN("Match events required pte LE invalidate: match_events=%u access_events=%u\n", + ofi_atomic_get32(&mr->match_events), + ofi_atomic_get32(&mr->access_events)); + } + + ret = cxil_invalidate_pte_le(ep_obj->ctrl.pte->pte, mr->key, + C_PTL_LIST_PRIORITY); + if (ret) + CXIP_FATAL("MR %p key 0x%016lX invalidate failed %d\n", mr, + mr->key, ret); + + /* For LE invalidate and MR events, need to flush event queues until + * access equals match. 
+ */ + if (mr->count_events) { + count_events_disabled = cxip_mr_disable_check_count_events(mr, cxip_env.mr_cache_events_disable_le_poll_nsecs); + if (!count_events_disabled) + CXIP_FATAL("Failed LE MR invalidation\n"); } +disabled_success: mr->enabled = false; CXIP_DBG("Standard MR disabled: %p (key: 0x%016lX)\n", mr, mr->key); diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index 4d3830dc18f..29c9589baa0 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -104,7 +104,7 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, dlist_init(&req->recv.children); dlist_init(&req->recv.rxc_entry); - ofi_atomic_inc32(&rxc->orx_reqs); + cxip_rxc_orx_reqs_inc(rxc); *cxip_req = req; return FI_SUCCESS; @@ -123,7 +123,7 @@ void cxip_recv_req_free(struct cxip_req *req) assert(dlist_empty(&req->recv.children)); assert(dlist_empty(&req->recv.rxc_entry)); - ofi_atomic_dec32(&rxc->orx_reqs); + cxip_rxc_orx_reqs_dec(rxc); if (req->recv.recv_md && !req->recv.hybrid_md) cxip_unmap(req->recv.recv_md); @@ -217,7 +217,12 @@ void cxip_recv_req_report(struct cxip_req *req) parent->recv.mrecv_bytes == parent->recv.mrecv_unlink_bytes) unlinked = true; } else { - if ((parent->recv.ulen - parent->recv.mrecv_bytes) < rxc->min_multi_recv) + parent->recv.multirecv_inflight--; + assert(parent->recv.multirecv_inflight >= 0); + + if (!parent->recv.multirecv_inflight && + ((parent->recv.ulen - parent->recv.mrecv_bytes) < + rxc->min_multi_recv)) unlinked = true; } @@ -314,6 +319,9 @@ struct cxip_req *cxip_mrecv_req_dup(struct cxip_req *mrecv_req) /* Update fields specific to this Send */ req->recv.parent = mrecv_req; + /* Parent keeps track of operations in flight */ + mrecv_req->recv.multirecv_inflight++; + /* Start pointer and data_len must be set elsewhere! */ return req; @@ -460,7 +468,7 @@ int cxip_recv_cancel(struct cxip_req *req) /* In hybrid mode requests could be on priority list * or software receive list. 
*/ - if (req->recv.software_list) { + if (!req->recv.hw_offloaded) { dlist_remove_init(&req->recv.rxc_entry); req->recv.canceled = true; req->recv.unlinked = true; @@ -526,7 +534,7 @@ int cxip_flush_appends(struct cxip_rxc_hpc *rxc, ret = -FI_EAGAIN; goto err; } - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); rxc->base.rx_evtq.ack_batch_size = 1; @@ -553,7 +561,7 @@ int cxip_flush_appends(struct cxip_rxc_hpc *rxc, return FI_SUCCESS; err_dec_free_cq_req: - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); err: return ret; @@ -575,6 +583,7 @@ int cxip_recv_req_dropped(struct cxip_req *req) assert(rxc->base.protocol == FI_PROTO_CXI); assert(dlist_empty(&req->recv.rxc_entry)); + req->recv.hw_offloaded = false; dlist_insert_tail(&req->recv.rxc_entry, &rxc->replay_queue); RXC_DBG(rxc, "Receive dropped: %p\n", req); diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index 5d68d40c51a..89de6e4f01b 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -1066,7 +1066,7 @@ int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event) */ cxip_report_send_completion(put_req, true); - ofi_atomic_dec32(&put_req->send.txc->otx_reqs); + cxip_txc_otx_reqs_dec(put_req->send.txc); cxip_evtq_req_free(put_req); return FI_SUCCESS; @@ -1337,12 +1337,13 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) if (req->recv.multi_recv && !req->recv.rdzv_events) { dlist_remove(&req->recv.children); + req->recv.parent->recv.multirecv_inflight--; cxip_evtq_req_free(req); } return -FI_EAGAIN; } - RXC_DBG(rxc, "Software issued Get, req: %p\n", req); + RXC_DBG(rxc, "Software issued RGet, req: %p\n", req); } /* Count the rendezvous event. */ @@ -1357,17 +1358,22 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) } /* If a rendezvous operation requires a done notification - * send it. 
Must wait for the ACK from the notify to be returned - * before completing the target operation. + * it was initiated by software. Re-use the existing + * rendezvous get TX credit. Need to wait for the ACK from + * the done notify to be returned before releasing the + * TX credit and completing the target operation. */ - if (req->recv.done_notify) { - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > - rxc->base.max_tx || cxip_rdzv_done_notify(req)) { + if (req->recv.done_notify && cxip_rdzv_done_notify(req)) + return -FI_EAGAIN; - /* Could not issue notify, will be retried */ - ofi_atomic_dec32(&rxc->orx_tx_reqs); - return -FI_EAGAIN; - } + /* If RGet initiated by software return the TX credit unless + * it will be used for sending an alt_read done_notify message. + */ + if (!event->init_short.rendezvous && + !req->recv.done_notify) { + ofi_atomic_dec32(&req->recv.rxc_hpc->orx_tx_reqs); + assert(ofi_atomic_get32(&req->recv.rxc_hpc->orx_tx_reqs) + >= 0); } /* Rendezvous Get completed, update event counts and @@ -1376,13 +1382,6 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) req->recv.rc = cxi_init_event_rc(event); rdzv_recv_req_event(req, event->hdr.event_type); - /* If RGet initiated by software return the TX credit */ - if (!event->init_short.rendezvous) { - ofi_atomic_dec32(&req->recv.rxc_hpc->orx_tx_reqs); - assert(ofi_atomic_get32(&req->recv.rxc_hpc->orx_tx_reqs) - >= 0); - } - return FI_SUCCESS; case C_EVENT_ACK: @@ -1394,7 +1393,7 @@ static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) /* Special case of the ZBP destination EQ being full and ZBP * could not complete. This must be retried, we use the TX - * credit already allocated. + * credit already allocated for the done notify. 
*/ if (event_rc == C_RC_ENTRY_NOT_FOUND) { usleep(CXIP_DONE_NOTIFY_RETRY_DELAY_US); @@ -2066,7 +2065,7 @@ static void cxip_ux_onload_complete(struct cxip_req *req) else cxip_post_ux_onload_fc(rxc); - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); } @@ -2253,7 +2252,7 @@ static int cxip_ux_onload(struct cxip_rxc_hpc *rxc) ret = -FI_EAGAIN; goto err_free_onload_offset; } - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); req->cb = cxip_ux_onload_cb; req->type = CXIP_REQ_SEARCH; @@ -2279,7 +2278,7 @@ static int cxip_ux_onload(struct cxip_rxc_hpc *rxc) return FI_SUCCESS; err_dec_free_cq_req: - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); err_free_onload_offset: free(rxc->ule_offsets); @@ -2304,7 +2303,7 @@ static int cxip_flush_appends_cb(struct cxip_req *req, ret = cxip_ux_onload(rxc); if (ret == FI_SUCCESS) { - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); cxip_evtq_req_free(req); } @@ -3179,19 +3178,23 @@ static int cxip_recv_sw_matched(struct cxip_req *req, /* Make sure we can issue the RGet; if not we stall * and TX event queue progress will free up credits. 
*/ - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->base.max_tx) { - ofi_atomic_dec32(&rxc->orx_tx_reqs); - return -FI_EAGAIN; - } + do { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) <= + rxc->base.max_tx) + break; - ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, - mrecv_start, mrecv_len, req_done); - if (ret != FI_SUCCESS) { - req->recv.start_offset -= mrecv_len; ofi_atomic_dec32(&rxc->orx_tx_reqs); + cxip_evtq_progress(&rxc->base.ep_obj->txc->tx_evtq); + } while (true); - return ret; - } + do { + ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, + mrecv_start, mrecv_len, req_done); + if (ret == FI_SUCCESS) + break; + + cxip_evtq_progress(&rxc->base.ep_obj->txc->tx_evtq); + } while (true); /* If multi-recv, a child request was created from * cxip_ux_send(). Need to lookup this request. @@ -3243,7 +3246,7 @@ static int cxip_recv_sw_matched(struct cxip_req *req, if (ret != FI_SUCCESS) { /* undo mrecv_req_put_bytes() */ - req->recv.start_offset -= mrecv_len; + req->recv.start_offset = mrecv_start; return ret; } } @@ -3464,8 +3467,7 @@ static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq) if (ret) goto err_dequeue_req; } else { - - req->recv.software_list = true; + req->recv.hw_offloaded = false; dlist_insert_tail(&req->recv.rxc_entry, &rxc->sw_recv_queue); } @@ -3866,7 +3868,7 @@ static int cxip_rxc_check_recv_count_hybrid_preempt(struct cxip_rxc *rxc) if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && cxip_env.hybrid_posted_recv_preemptive == 1) { - count = ofi_atomic_get32(&rxc->orx_reqs); + count = cxip_rxc_orx_reqs_get(rxc); if (count > rxc->attr.size) { assert(rxc->state == RXC_ENABLED); @@ -4125,7 +4127,7 @@ static void rdzv_send_req_complete(struct cxip_req *req) cxip_report_send_completion(req, true); - ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->send.txc); cxip_evtq_req_free(req); } @@ -4460,7 +4462,7 @@ static int cxip_send_eager_cb(struct cxip_req *req, /* If MATCH_COMPLETE was requested, 
software must manage counters. */ cxip_report_send_completion(req, match_complete); - ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->send.txc); cxip_evtq_req_free(req); return FI_SUCCESS; @@ -4885,7 +4887,7 @@ int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid, * a TXC credit for replay. _cxip_send_req() will take the * credit again. */ - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); /* -FI_EAGAIN can be return if the command queue is full. Loop * until this goes through. @@ -5159,7 +5161,7 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, } /* Restrict outstanding success event requests to queue size */ - if (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size) { + if (cxip_txc_otx_reqs_get(txc) >= txc->attr.size) { ret = -FI_EAGAIN; goto err_req_free; } diff --git a/prov/cxi/src/cxip_msg_rnr.c b/prov/cxi/src/cxip_msg_rnr.c index b5ae1410e7d..ec5064a4fe5 100644 --- a/prov/cxi/src/cxip_msg_rnr.c +++ b/prov/cxi/src/cxip_msg_rnr.c @@ -339,7 +339,7 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) dlist_init(&req->recv.rxc_entry); /* Selective does not count toward outstanding RX operations */ - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); ret = cxip_recv_req_alloc(&rxc->base, NULL, 0, NULL, &req, cxip_rnr_recv_selective_comp_cb); @@ -359,7 +359,7 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) dlist_init(&req->recv.rxc_entry); /* Selective does not count toward outstanding RX operations */ - ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_dec(&rxc->base); rxc->hybrid_mr_desc = true; } @@ -400,12 +400,12 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) cxip_pte_free(rxc->base.rx_pte); free_req_tag: if (rxc->req_selective_comp_tag) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_tag); } free_req_msg: if 
(rxc->req_selective_comp_msg) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_msg); } @@ -423,11 +423,11 @@ static int cxip_rxc_rnr_msg_fini(struct cxip_rxc *rxc_base) * back before freeing. */ if (rxc->req_selective_comp_msg) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_msg); } if (rxc->req_selective_comp_tag) { - ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_rxc_orx_reqs_inc(&rxc->base); cxip_recv_req_free(rxc->req_selective_comp_tag); } @@ -827,7 +827,7 @@ static int cxip_process_rnr_time_wait(struct cxip_txc_rnr *txc) ofi_atomic_dec32(&txc->time_wait_reqs); cxip_send_buf_fini(req); cxip_report_send_completion(req, true); - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); cxip_evtq_req_free(req); continue; @@ -836,10 +836,10 @@ static int cxip_process_rnr_time_wait(struct cxip_txc_rnr *txc) /* Must TX return credit, will take it back if * we could not send. */ - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); ret = cxip_rnr_msg_send(req); if (ret != FI_SUCCESS) { - ofi_atomic_inc32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_inc(&txc->base); goto reset_min_time_wait; } @@ -1031,7 +1031,7 @@ static int cxip_rnr_send_cb(struct cxip_req *req, const union c_event *event) req->send.caddr.nic, req->send.caddr.pid, req->send.tagged ? 
'*' : '-', req->send.tag, req->send.retries, - ofi_atomic_get32(&txc->base.otx_reqs)); + cxip_txc_otx_reqs_get(&txc->base)); } cxip_rnr_send_req_dequeue(req); @@ -1054,7 +1054,7 @@ static int cxip_rnr_send_cb(struct cxip_req *req, const union c_event *event) cxip_report_send_completion(req, req->send.canceled); - ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_txc_otx_reqs_dec(&txc->base); cxip_evtq_req_free(req); return FI_SUCCESS; @@ -1147,7 +1147,7 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, } /* Restrict outstanding success event requests to queue size */ - if (ofi_atomic_get32(&txc->otx_reqs) > txc->attr.size) { + if (cxip_txc_otx_reqs_get(txc) > txc->attr.size) { ret = -FI_EAGAIN; goto free_req; } diff --git a/prov/cxi/src/cxip_repsum.c b/prov/cxi/src/cxip_repsum.c index 6c0f5c93186..56ffe342a41 100644 --- a/prov/cxi/src/cxip_repsum.c +++ b/prov/cxi/src/cxip_repsum.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2022-2024 Cray Inc. All rights reserved. */ /* Notes: @@ -123,7 +123,7 @@ void cxip_dbl_to_rep(struct cxip_repsum *x, double d) // Subnormal values, zero x->M = BIN(1); w = OFF(1); - } + } /** * Copy the mantissa into the correct locations within T[]. diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c index 9c36addddd3..7b691c17e20 100644 --- a/prov/cxi/src/cxip_rma.c +++ b/prov/cxi/src/cxip_rma.c @@ -150,12 +150,46 @@ static int cxip_rma_cb(struct cxip_req *req, const union c_event *event) TXC_WARN(txc, "Failed to report error: %d\n", ret); } - ofi_atomic_dec32(&req->rma.txc->otx_reqs); + cxip_txc_otx_reqs_dec(req->rma.txc); cxip_evtq_req_free(req); return FI_SUCCESS; } +static bool cxip_rma_emit_dma_need_req(size_t len, uint64_t flags, + struct cxip_mr *mr) +{ + /* DMA commands with FI_INJECT always require a request structure to + * track the bounce buffer. 
+ */ + if (len && (flags & FI_INJECT)) + return true; + + /* If the user requests FI_COMPLETION, need request structure to return + * user context back. + * + * TODO: This can be optimized for zero byte operations. Specifically, + * the user context can be associated with the DMA command. But, this + * requires reworking of event queue processing to support. + */ + if (flags & FI_COMPLETION) + return true; + + /* If the user has provided their own MR, internal memory registration + * is not needed. Thus, no request structure is needed. + */ + if (mr) + return false; + + /* If the initiator buffer length is zero, no memory registration is + * needed. Thus, no request structure is needed. + */ + if (!len) + return false; + + return true; +} + static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, struct cxip_mr *mr, union c_fab_addr *dfa, uint8_t *idx_ext, uint16_t vni, uint64_t addr, @@ -169,7 +203,7 @@ { struct cxip_req *req = NULL; struct cxip_md *dma_md = NULL; - void *dma_buf; + void *dma_buf = NULL; struct c_full_dma_cmd dma_cmd = {}; int ret; struct cxip_domain *dom = txc->domain; @@ -180,12 +214,7 @@ if (!dom->hybrid_mr_desc) mr = NULL; - /* DMA commands always require a request structure regardless if - * FI_COMPLETION is set. This is due to the provider doing internally - * memory registration and having to clean up the registration on DMA - * operation completion. - */ - if ((len && (flags & FI_INJECT)) || (flags & FI_COMPLETION) || !mr) { + if (cxip_rma_emit_dma_need_req(len, flags, mr)) { req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); if (!req) { ret = -FI_EAGAIN; goto err; } @@ -503,10 +532,6 @@ static bool cxip_rma_is_idc(struct cxip_txc *txc, uint64_t key, size_t len, { size_t max_idc_size = unr ? 
CXIP_INJECT_SIZE : C_MAX_IDC_PAYLOAD_RES; - /* DISABLE_NON_INJECT_MSG_IDC disables the IDC - */ - if (cxip_env.disable_non_inject_msg_idc) - return false; /* IDC commands are not supported for unoptimized MR since the IDC * small message format does not support remote offset which is needed * for RMA commands. @@ -608,7 +633,7 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, /* Select the correct traffic class type within a traffic class. */ if (!unr && (flags & FI_CXI_HRP)) tc_type = CXI_TC_TYPE_HRP; - else if (!unr) + else if (!unr && !triggered) tc_type = CXI_TC_TYPE_RESTRICTED; else tc_type = CXI_TC_TYPE_DEFAULT; diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c index cdcaed39a59..72b1ed92c43 100644 --- a/prov/cxi/src/cxip_rxc.c +++ b/prov/cxi/src/cxip_rxc.c @@ -227,7 +227,7 @@ void cxip_rxc_recv_req_cleanup(struct cxip_rxc *rxc) uint64_t start; int canceled = 0; - if (!ofi_atomic_get32(&rxc->orx_reqs)) + if (!cxip_rxc_orx_reqs_get(rxc)) return; cxip_evtq_req_discard(&rxc->rx_evtq, rxc); @@ -242,7 +242,7 @@ void cxip_rxc_recv_req_cleanup(struct cxip_rxc *rxc) CXIP_DBG("Canceled %d Receives: %p\n", canceled, rxc); start = ofi_gettime_ms(); - while (ofi_atomic_get32(&rxc->orx_reqs)) { + while (cxip_rxc_orx_reqs_get(rxc)) { sched_yield(); cxip_evtq_progress(&rxc->rx_evtq); @@ -436,7 +436,7 @@ struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context) rxc->attr = ep_obj->rx_attr; rxc->hmem = !!(rxc->attr.caps & FI_HMEM); rxc->pid_bits = ep_obj->domain->iface->dev->info.pid_bits; - ofi_atomic_initialize32(&rxc->orx_reqs, 0); + cxip_rxc_orx_reqs_init(rxc); rxc->sw_ep_only = cxip_env.rx_match_mode == CXIP_PTLTE_SOFTWARE_MODE; diff --git a/prov/cxi/src/cxip_telemetry.c b/prov/cxi/src/cxip_telemetry.c index 3bbdb6f48c5..d2b7ecffbfd 100644 --- a/prov/cxi/src/cxip_telemetry.c +++ b/prov/cxi/src/cxip_telemetry.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * Copyright (c) 2022,2024 
Hewlett Packard Enterprise Development LP * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ #include "config.h" @@ -112,8 +112,10 @@ static bool cxip_telemetry_entry_validate_token(struct cxip_telemetry *telemetry, const char *telemetry_token) { - /* The telemetry directory has an ALL-in-binary entry. This file is - * considered invalid for this telemetry implementation. + /* + * Cassini NextGen-Telemetry no longer provides 'ALL-in-binary'. + * Keeping this simple logic in place temporarily allows this + * logic to run with older versions of the driver. */ if (strcmp(telemetry_token, "ALL-in-binary") == 0) return false; @@ -165,54 +167,9 @@ static int cxip_telemetry_entry_alloc(struct cxip_telemetry *telemetry, return ret; } -static int cxip_telemetry_sleep_duration(void) -{ - int ret; - int msec_sleep; - char *path = "/sys/module/cxi_core/parameters/cntr_refresh_interval"; - FILE *f; - - f = fopen(path, "r"); - if (!f) - return -errno; - - ret = fscanf(f, "%d", &msec_sleep); - if (ret != 1) { - if (ret == EOF) - ret = -errno; - else - ret = -FI_EINVAL; - } else { - /* Convert sleep duration to seconds. */ - ret = msec_sleep / 1000; - if (msec_sleep % 1000) - ret++; - ret = MAX(ret, 1); - } - - fclose(f); - - return ret; -} - void cxip_telemetry_dump_delta(struct cxip_telemetry *telemetry) { struct cxip_telemetry_entry *entry; - int sleep_duration; - - /* Since sysfs telemetry entries are refreshed as some interval, we need - * to sleep for a refresh interval to get updates. Else, the application - * could run and telemetry deltas would be zero. 
- */ - sleep_duration = cxip_telemetry_sleep_duration(); - if (sleep_duration < 0) { - DOM_WARN(telemetry->dom, - "Failed to retrieve telemetry sleep duration: %d:%s\n", - sleep_duration, fi_strerror(-sleep_duration)); - return; - } - - sleep(sleep_duration); dlist_foreach_container(&telemetry->telemetry_list, struct cxip_telemetry_entry, entry, diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c index 24564a5ef72..94bc470ba68 100644 --- a/prov/cxi/src/cxip_txc.c +++ b/prov/cxi/src/cxip_txc.c @@ -295,6 +295,9 @@ static size_t cxip_txc_get_num_events(struct cxip_txc *txc) /* Account for internal operations. */ num_events += CXIP_INTERNAL_TX_REQS; + /* ACK batching */ + num_events += cxip_env.eq_ack_batch_size; + return num_events; } @@ -375,13 +378,13 @@ static void txc_cleanup(struct cxip_txc *txc) { uint64_t start; - if (!ofi_atomic_get32(&txc->otx_reqs)) + if (!cxip_txc_otx_reqs_get(txc)) goto proto_cleanup; cxip_evtq_req_discard(&txc->tx_evtq, txc); start = ofi_gettime_ms(); - while (ofi_atomic_get32(&txc->otx_reqs)) { + while (cxip_txc_otx_reqs_get(txc)) { sched_yield(); cxip_evtq_progress(&txc->tx_evtq); @@ -393,7 +396,7 @@ static void txc_cleanup(struct cxip_txc *txc) } } - assert(ofi_atomic_get32(&txc->otx_reqs) == 0); + assert(cxip_txc_otx_reqs_get(txc) == 0); proto_cleanup: txc->ops.cleanup(txc); @@ -434,16 +437,20 @@ void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc) struct cxip_req *req; struct dlist_entry *tmp; + ofi_genlock_lock(&txc->ep_obj->lock); + /* Drain the message queue. 
*/ dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, req, send.txc_entry, tmp) { if (cxip_is_trig_req(req)) { - ofi_atomic_dec32(&txc->otx_reqs); + cxip_txc_otx_reqs_dec(txc); dlist_remove(&req->send.txc_entry); cxip_unmap(req->send.send_md); cxip_evtq_req_free(req); } } + + ofi_genlock_unlock(&txc->ep_obj->lock); } static bool cxip_txc_can_emit_op(struct cxip_txc *txc, @@ -456,7 +463,7 @@ static bool cxip_txc_can_emit_op(struct cxip_txc *txc, /* If taking a successful completion, limit outstanding operations */ if (!event_success_disabled && - (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size)) { + (cxip_txc_otx_reqs_get(txc) >= txc->attr.size)) { TXC_WARN(txc, "TXC attr size saturated\n"); return false; } @@ -483,7 +490,7 @@ int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain idc put: %d\n", ret); else if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -506,10 +513,10 @@ int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. */ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -534,7 +541,7 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, "Failed to emit trigger dma command: %d:%s\n", ret, fi_strerror(-ret)); else if (!dma->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -545,7 +552,7 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain dma command: %d\n", ret); else if (!dma->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -567,10 +574,10 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. 
*/ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!dma->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -594,7 +601,7 @@ int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain idc amo: %d\n", ret); else if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -617,10 +624,10 @@ int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. */ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -647,7 +654,7 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, "Failed to emit trigger amo command: %d:%s\n", ret, fi_strerror(-ret)); else if (!amo->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -659,7 +666,7 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain amo: %d\n", ret); else if (!amo->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -681,10 +688,10 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. 
*/ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!amo->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -708,7 +715,7 @@ int cxip_txc_emit_idc_msg(struct cxip_txc *txc, uint16_t vni, TXC_WARN(txc, "Failed to emit domain idc msg: %d\n", ret); else if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return ret; } @@ -731,10 +738,10 @@ int cxip_txc_emit_idc_msg(struct cxip_txc *txc, uint16_t vni, /* Kick the command queue. */ cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), - ofi_atomic_get32(&txc->otx_reqs)); + cxip_txc_otx_reqs_get(txc)); if (!c_state->event_success_disable) - ofi_atomic_inc32(&txc->otx_reqs); + cxip_txc_otx_reqs_inc(txc); return FI_SUCCESS; } @@ -778,7 +785,7 @@ struct cxip_txc *cxip_txc_calloc(struct cxip_ep_obj *ep_obj, void *context) dlist_init(&txc->msg_queue); dlist_init(&txc->dom_entry); - ofi_atomic_initialize32(&txc->otx_reqs, 0); + cxip_txc_otx_reqs_init(txc); /* Derived initialization/overrides */ txc->ops.init_struct(txc, ep_obj); diff --git a/prov/cxi/test/atomic.c b/prov/cxi/test/atomic.c index d33dfdc455f..6f5ec289579 100644 --- a/prov/cxi/test/atomic.c +++ b/prov/cxi/test/atomic.c @@ -76,13 +76,13 @@ Test(atomic_invalid, invalid_amo) int ret; ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, - FI_UINT64, OFI_ATOMIC_OP_LAST, 0); + FI_UINT64, FI_ATOMIC_OP_LAST, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, -1, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, - OFI_DATATYPE_LAST, FI_SUM, 0); + FI_VOID, FI_SUM, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, -1, FI_SUM, 0); @@ -132,13 +132,13 @@ Test(atomic_invalid, invalid_fetch) ret = fi_fetch_atomic(cxit_ep, &operand1, 
1, 0, &result, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, - OFI_ATOMIC_OP_LAST, 0); + FI_ATOMIC_OP_LAST, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, -1, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, - cxit_ep_fi_addr, 0, 0, OFI_DATATYPE_LAST, FI_SUM, + cxit_ep_fi_addr, 0, 0, FI_VOID, FI_SUM, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, @@ -220,7 +220,7 @@ Test(atomic_invalid, invalid_swap) &compare, 0, &result, 0, cxit_ep_fi_addr, 0, 0, - FI_UINT64, OFI_ATOMIC_OP_LAST, 0); + FI_UINT64, FI_ATOMIC_OP_LAST, 0); cr_assert_eq(ret, -FI_EINVAL); ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, @@ -234,7 +234,7 @@ Test(atomic_invalid, invalid_swap) &compare, 0, &result, 0, cxit_ep_fi_addr, 0, 0, - OFI_DATATYPE_LAST, FI_CSWAP_NE, NULL); + FI_VOID, FI_CSWAP_NE, NULL); cr_assert_eq(ret, -FI_EINVAL); ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, @@ -277,7 +277,7 @@ Test(atomic_invalid, invalid_swap) &result, 0, cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_CSWAP_NE, NULL); - + cr_assert_eq(ret, -FI_EINVAL); ret = fi_compare_atomicv(cxit_ep, &iov, 0, 1, &ciov, 0, 1, @@ -1037,6 +1037,18 @@ struct test_int_parms { uint64_t key; }; +static enum fi_datatype int_datatypes[] = { + FI_UINT8, + FI_INT16, + FI_UINT16, + FI_INT32, + FI_UINT32, + FI_INT64, + FI_UINT64, + FI_INT128, + FI_UINT128, +}; + static struct test_int_parms int_parms[] = { { _AMO|_FAMO, 11, FI_MIN, 0, 0, 123, 120, 120 }, { _AMO|_FAMO, 12, FI_MIN, 0, 0, 120, 123, 120 }, @@ -1128,42 +1140,73 @@ ParameterizedTestParameters(atomic, test_int) tests * 2); } + +/* Don't rely on compiler __int128 support. 
*/ +typedef struct { + uint64_t u64[2]; +} __attribute__ ((aligned (16))) amo128_t; + +#define AMO128_INIT(_v64) { .u64 = { _v64, 0 } } + +static int test_int_expect_err(int err, enum fi_datatype dt, enum fi_op op) +{ + if (!err && op != FI_CSWAP && (dt == FI_INT128 || dt == FI_UINT128)) + err = 1; + + return err; +} + ParameterizedTest(struct test_int_parms *p, atomic, test_int) { struct mem_region mr; enum fi_datatype dt; uint64_t *rma; - uint64_t *loc; - uint64_t lini = -1; + uint64_t *loc = NULL; + int err; + /* Need 128-bit data types for FI_INT128/FI_UINT128. */ + amo128_t o1_128 = AMO128_INIT(p->o1); + void *o1 = &o1_128; + amo128_t comp_128 = AMO128_INIT(p->comp); + void *comp = &comp_128; + amo128_t lini_128 = AMO128_INIT(-1); + void *lini = &lini_128; + amo128_t rini_128 = AMO128_INIT(p->rini); + void *rini = &rini_128; + amo128_t rexp_128 = AMO128_INIT(p->rexp); + void *rexp = &rexp_128; + size_t i; rma = _cxit_create_mr(&mr, &p->key); - loc = calloc(1, RMA_WIN_LEN); - cr_assert_not_null(loc); + err = posix_memalign((void **)&loc, ofi_datatype_size(FI_UINT128), + RMA_WIN_LEN); + cr_assert(err == 0); + memset(loc, 0, RMA_WIN_LEN); if (p->opmask & _AMO) { - for (dt = FI_INT8; dt <= FI_UINT64; dt++) { - _test_amo(p->index, dt, p->op, p->err, &p->o1, - 0, 0, 0, - rma, &p->rini, &p->rexp, - p->key); + for (i = 0; i < ARRAY_SIZE(int_datatypes); i++) { + dt = int_datatypes[i]; + err = test_int_expect_err(p->err, dt, p->op); + _test_amo(p->index, dt, p->op, err, o1, + 0, 0, 0, rma, rini, rexp, p->key); } } if (p->opmask & _FAMO) { - for (dt = FI_INT8; dt <= FI_UINT64; dt++) { - _test_amo(p->index, dt, p->op, p->err, &p->o1, - 0, loc, &lini, rma, &p->rini, &p->rexp, - p->key); + for (i = 0; i < ARRAY_SIZE(int_datatypes); i++) { + dt = int_datatypes[i]; + err = test_int_expect_err(p->err, dt, p->op); + _test_amo(p->index, dt, p->op, err, o1, + 0, loc, lini, rma, rini, rexp, p->key); } } if (p->opmask & _CAMO) { - for (dt = FI_INT8; dt <= FI_UINT64; dt++) { - 
_test_amo(p->index, dt, p->op, p->err, &p->o1, - &p->comp, loc, &lini, rma, &p->rini, - &p->rexp, - p->key); + for (i = 0; i < ARRAY_SIZE(int_datatypes); i++) { + dt = int_datatypes[i]; + err = test_int_expect_err(p->err, dt, p->op); + _test_amo(p->index, dt, p->op, err, o1, + comp, loc, lini, rma, rini, rexp, p->key); } } @@ -1942,6 +1985,214 @@ void cxit_setup_amo_selective_completion_suppress(void) cxit_setup_rma(); } +void cxit_setup_amo_selective_completion_suppress_hybrid_mr_desc(void) +{ + int ret; + + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_setup_rma(); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v2"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL, + "V3 functions returned"); + + ret = dom_ops->enable_hybrid_mr_desc(&cxit_domain->fid, true); + cr_assert(ret == FI_SUCCESS, "enable_hybrid_mr_desc failed"); +} + +Test(atomic_sel, fi_more_amo_stream_optimzied, + .init = cxit_setup_amo_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + size_t rma_len = 1; + struct fi_msg_atomic msg = {}; + struct fi_rma_ioc rma = {}; + struct fi_ioc src_iov = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + char src_buf = 0; + + mr_create(rma_len, FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + src_iov.addr = &src_buf; + src_iov.count = 1; + + rma.count = 1; + rma.key = key_val; + + msg.msg_iov = &src_iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_INT8; + msg.op = FI_SUM; + + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + count++; + } while (ret != -FI_EAGAIN); + + 
cr_assert(count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); +} + +Test(atomic_sel, fi_more_amo_stream_mix_optimzied_unoptimized, + .init = cxit_setup_amo_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region opt_mem_window; + struct mem_region mem_window; + uint64_t opt_key_val = 0x0; + uint64_t key_val = 0x1234; + size_t rma_len = 1; + struct fi_msg_atomic msg = {}; + struct fi_rma_ioc rma = {}; + struct fi_ioc src_iov = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + char src_buf = 0; + + mr_create(rma_len, FI_REMOTE_WRITE, 0, &opt_key_val, &opt_mem_window); + mr_create(rma_len, FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + src_iov.addr = &src_buf; + src_iov.count = 1; + + rma.count = 1; + rma.key = opt_key_val; + + msg.msg_iov = &src_iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_INT8; + msg.op = FI_SUM; + + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + count++; + } while (ret != -FI_EAGAIN); + + cr_assert(count >= cxit_fi_hints->tx_attr->size); + + rma.key = key_val; + do { + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); + mr_destroy(&opt_mem_window); +} + +Test(atomic_sel, fi_more_fetch_amo_stream_optimzied, + .init = 
cxit_setup_amo_selective_completion_suppress_hybrid_mr_desc, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + size_t rma_len = 1; + struct fi_msg_atomic msg = {}; + struct fi_rma_ioc rma = {}; + struct fi_ioc src_iov = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_read_cntr; + char src_buf = 0; + struct fi_ioc result_iov = {}; + void *mr; + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + + mr_create(rma_len, + FI_REMOTE_WRITE | FI_REMOTE_READ | FI_WRITE | FI_READ, 0, + &key_val, &mem_window); + mr = fi_mr_desc(mem_window.mr); + + result_iov.addr = mem_window.mem; + result_iov.count = 1; + + src_iov.addr = &src_buf; + src_iov.count = 1; + + rma.count = 1; + rma.key = key_val; + + msg.msg_iov = &src_iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_INT8; + msg.op = FI_SUM; + + do { + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_iov, &mr, 1, + FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + count++; + } while (ret != -FI_EAGAIN); + + cr_assert(count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_iov, &mr, 1, + FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_iov, &mr, 1, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); +} + /* Test selective completion behavior with RMA. */ Test(atomic_sel, selective_completion_suppress, .init = cxit_setup_amo_selective_completion_suppress, @@ -3634,7 +3885,23 @@ ParameterizedTestParameters(atomic, query_atomic) .valid_atomic_attr = true, .flags = FI_FETCH_ATOMIC, .expected_rc = FI_SUCCESS, - } + }, + /* FI_UINT128 unsupported for FI_MIN. 
*/ + { + .datatype = FI_UINT128, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + }, + /* FI_UINT128 supported for FI_CSWAP. */ + { + .datatype = FI_UINT128, + .op = FI_CSWAP, + .valid_atomic_attr = true, + .flags = FI_COMPARE_ATOMIC, + .expected_rc = FI_SUCCESS, + }, }; size_t param_sz = ARRAY_SIZE(params); diff --git a/prov/cxi/test/cntr.c b/prov/cxi/test/cntr.c index f16655e0fbc..3c4f1c31241 100644 --- a/prov/cxi/test/cntr.c +++ b/prov/cxi/test/cntr.c @@ -844,3 +844,69 @@ Test(cntr, cntr_wait_success_increment) cntr_wait_success_and_error_runner(&args); } + +Test(cntr, verify_sync) +{ + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = { + .wait_obj = FI_WAIT_UNSPEC, + }; + uint64_t success; + int ret; + + ret = fi_cntr_open(cxit_domain, &cntr_attr, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cntr_set(cntr, 2); + cr_assert(ret == FI_SUCCESS, "fi_cntr_set ret %d", ret); + + success = fi_cntr_read(cntr); + cr_assert(success == 2, + "Unexpected counter success count %lu", success); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS); +} + +/* This test is non-deterministic in that the counter write back + * associated with the set can occur before the fi_cntr_read() + * is issued, invalidating the test. Disable the test until another + * approach is implemented. 
+ */ +Test(cntr, verify_no_sync, .disabled = true) +{ + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = { + .wait_obj = FI_WAIT_UNSPEC, + .flags = FI_CXI_CNTR_CACHED, + }; + struct cxip_ep *ep = container_of(cxit_ep, struct cxip_ep, ep); + uint64_t success; + int ret; + + /* Test is only deterministic with netsim */ + if (!is_netsim(ep->ep_obj)) { + cr_assert(1); + return; + } + + ret = fi_cntr_open(cxit_domain, &cntr_attr, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cntr_set(cntr, 2); + cr_assert(ret == FI_SUCCESS, "fi_cntr_set ret %d", ret); + + success = fi_cntr_read(cntr); + /* should have returned cached value */ + cr_assert(success == 0, + "Unexpected counter success count %lu", success); + + do { + success = fi_cntr_read(cntr); + } while (success < 2); + cr_assert(success == 2, + "Unexpected counter success count %lu", success); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS); +} diff --git a/prov/cxi/test/coll.c b/prov/cxi/test/coll.c index 5ffb811567e..4dbcfcf26b8 100644 --- a/prov/cxi/test/coll.c +++ b/prov/cxi/test/coll.c @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. - * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP */ /* @@ -37,7 +37,6 @@ #define MIN(a,b) (((a)<(b))?(a):(b)) -/***************************************/ /** * Sanity tests for proper integration with EP, enable/disable checks. 
*/ @@ -367,7 +366,7 @@ Test(coll_join, retry_getgroup) { TRACE("=========================\n"); TRACE("join retry getgroup\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_GETGRP, -FI_EAGAIN); + cxip_trap_set(node, CXIP_TRAP_GETGRP, -FI_EAGAIN, 0); _create_netsim_collective(5, true, FI_SUCCESS); _wait_for_join(5, FI_SUCCESS, 0); _destroy_netsim_collective(); @@ -381,7 +380,7 @@ Test(coll_join, retry_broadcast) { TRACE("=========================\n"); TRACE("join retry broadcast\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_BCAST, -FI_EAGAIN); + cxip_trap_set(node, CXIP_TRAP_BCAST, -FI_EAGAIN, 0); _create_netsim_collective(5, true, FI_SUCCESS); _wait_for_join(5, FI_SUCCESS, 0); _destroy_netsim_collective(); @@ -395,7 +394,7 @@ Test(coll_join, retry_reduce) { TRACE("=========================\n"); TRACE("join retry reduce\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_REDUCE, -FI_EAGAIN); + cxip_trap_set(node, CXIP_TRAP_REDUCE, -FI_EAGAIN, 0); _create_netsim_collective(5, true, FI_SUCCESS); _wait_for_join(5, FI_SUCCESS, 0); _destroy_netsim_collective(); @@ -409,9 +408,10 @@ Test(coll_join, fail_ptlte) { TRACE("=========================\n"); TRACE("join fail mixed errors\n"); for (node = 0; node < 5; node++) { - cxip_trap_set(node, CXIP_TRAP_INITPTE, -FI_EFAULT); + cxip_trap_set(node, CXIP_TRAP_INITPTE, -FI_EAVAIL, + FI_CXI_ERRNO_JOIN_FAIL_PTE); _create_netsim_collective(5, true, FI_SUCCESS); - _wait_for_join(5, -FI_EAVAIL, CXIP_PROV_ERRNO_PTE); + _wait_for_join(5, -FI_ECONNREFUSED, FI_CXI_ERRNO_JOIN_FAIL_PTE); _destroy_netsim_collective(); cxip_trap_close(); } @@ -1058,7 +1058,7 @@ void _allreduce(int start_node, int bad_node, int concur) uint64_t expval, actval; /* If there was a bad node, all reductions should fail */ - rc_err0 = (bad_node < 0) ? 0 : CXIP_COLL_RC_OP_MISMATCH; + rc_err0 = (bad_node < 0) ? 
0 : FI_CXI_ERRNO_RED_OP_MISMATCH; for (node = 0; node < nodes; node++) { _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node][first]); @@ -1820,8 +1820,6 @@ Test(coll_reduce_ops, bor) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1845,8 +1843,6 @@ Test(coll_reduce_ops, band) cr_assert(!ret, "_allreduceop() failed = %d\n", ret); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1870,8 +1866,6 @@ Test(coll_reduce_ops, bxor) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1895,8 +1889,6 @@ Test(coll_reduce_ops, imin) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1920,8 +1912,6 @@ Test(coll_reduce_ops, imax) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1945,8 +1935,6 @@ Test(coll_reduce_ops, isum) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -1978,8 +1966,6 @@ Test(coll_reduce_ops, iminmaxloc) cr_assert(!ret, "_allreduceop() failed = %d\n", ret); ret = _check_iminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = 
_check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); STDCLEANUP } @@ -2009,8 +1995,6 @@ Test(coll_reduce_ops, fsum) cr_assert(!ret, "_allreduceop() failed\n"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INEXACT); - cr_assert(!ret, "rc failed\n"); /* Note: inexact computation is guaranteed by the small value included * in the data set. There is a hidden trick when performing the @@ -2040,8 +2024,6 @@ Test(coll_reduce_ops, fmin) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); data[1].fval[1] = NAN; _predict_fmin(nodes, data, &check, true); @@ -2049,7 +2031,7 @@ Test(coll_reduce_ops, fmin) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); cr_assert(!ret, "rc failed NAN\n"); data[1].fval[1] = _snan64(); @@ -2058,7 +2040,7 @@ Test(coll_reduce_ops, fmin) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); cr_assert(!ret, "rc failed sNAN\n"); STDCLEANUP } @@ -2080,8 +2062,6 @@ Test(coll_reduce_ops, fmax) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); data[1].fval[1] = NAN; _predict_fmax(nodes, data, &check, true); @@ -2089,7 +2069,7 @@ Test(coll_reduce_ops, fmax) cr_assert(!ret, "_allreduceop failed NAN"); 
ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); cr_assert(!ret, "rc failed NAN\n"); data[1].fval[1] = _snan64(); @@ -2098,7 +2078,7 @@ Test(coll_reduce_ops, fmax) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); cr_assert(!ret, "rc failed sNAN\n"); STDCLEANUP } @@ -2132,8 +2112,6 @@ Test(coll_reduce_ops, fminmaxloc) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* NAN is given preference over number */ data[1].fminval = NAN; @@ -2144,8 +2122,6 @@ Test(coll_reduce_ops, fminmaxloc) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed NAN\n"); /* SNAN is given preference over NAN */ data[1].fminval = NAN; @@ -2157,7 +2133,7 @@ Test(coll_reduce_ops, fminmaxloc) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); cr_assert(!ret, "rc failed sNAN\n"); STDCLEANUP } @@ -2180,8 +2156,6 @@ Test(coll_reduce_ops, fminnum) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* number is given preference 
over NAN */ data[1].fval[1] = NAN; @@ -2191,7 +2165,7 @@ Test(coll_reduce_ops, fminnum) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); cr_assert(!ret, "rc failed NAN\n"); /* number is given preference over NAN */ @@ -2202,7 +2176,7 @@ Test(coll_reduce_ops, fminnum) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); cr_assert(!ret, "rc failed sNAN\n"); STDCLEANUP } @@ -2225,8 +2199,6 @@ Test(coll_reduce_ops, fmaxnum) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* number is given preference over NAN */ data[1].fval[1] = NAN; @@ -2236,7 +2208,7 @@ Test(coll_reduce_ops, fmaxnum) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); cr_assert(!ret, "rc failed NAN\n"); /* SNAN is given preference over number */ @@ -2247,7 +2219,7 @@ Test(coll_reduce_ops, fmaxnum) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); cr_assert(!ret, "rc failed sNAN\n"); STDCLEANUP } @@ -2281,8 +2253,6 @@ Test(coll_reduce_ops, fminmaxnumloc) cr_assert(!ret, "_allreduceop failed normal"); ret = _check_fminmax(nodes, 
rslt, &check); cr_assert(!ret, "compare failed normal\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed normal\n"); /* NAN is given preference over number */ data[1].fminval = NAN; @@ -2293,8 +2263,6 @@ Test(coll_reduce_ops, fminmaxnumloc) cr_assert(!ret, "_allreduceop failed NAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed NAN\n"); /* SNAN is given preference over NAN */ data[1].fminval = NAN; @@ -2306,7 +2274,7 @@ Test(coll_reduce_ops, fminmaxnumloc) cr_assert(!ret, "_allreduceop failed sNAN"); ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); cr_assert(!ret, "rc failed sNAN\n"); STDCLEANUP } @@ -2367,8 +2335,6 @@ Test(coll_reduce_ops, prereduce) /* validate results */ ret = _check_ival(nodes, rslt, &check); cr_assert(!ret, "compare failed\n"); - ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); - cr_assert(!ret, "rc failed\n"); free(accum1); free(mc_obj); diff --git a/prov/cxi/test/cuda.c b/prov/cxi/test/cuda.c index 5398dcd98f3..53338f60fd6 100644 --- a/prov/cxi/test/cuda.c +++ b/prov/cxi/test/cuda.c @@ -31,7 +31,7 @@ static void cuda_init(void) srand(seed); } -TestSuite(cuda, .timeout = CXIT_DEFAULT_TIMEOUT, .init = cuda_init); +TestSuite(cuda, .timeout = 60, .init = cuda_init); static void cuda_message_runner(void *cuda_send_buf, void *cuda_recv_buf, size_t buf_size, bool device_only_mem, @@ -423,3 +423,11 @@ Test(cuda, verify_hmemDevReg) verify_dev_reg_handle(true); } + + +/* Verify that large transfers (4+ GiB) work. 
*/ +#define LARGE_XFER ((4ULL * 1024 * 1024 * 1024) - 1) +Test(cuda, large_transfer) +{ + cuda_dev_memory_test(LARGE_XFER, 2, false, true); +} diff --git a/prov/cxi/test/cxi_vm_commit.sh b/prov/cxi/test/cxi_vm_commit.sh new file mode 100755 index 00000000000..d3bb8807ed9 --- /dev/null +++ b/prov/cxi/test/cxi_vm_commit.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +test_short="" +exclude_commit_subject="" + +while getopts "se:" option; do + case "${option}" in + s) + test_short="-s" + ;; + e) + exclude_commit_subject=${OPTARG} + ;; + *) + exit 1; + esac +done + +# Assumes that the commit subject is unique between all commits in the PR. +head_commit_subject_collapsed=$(git log -1 --pretty=%s | tr -d ' ') +if [[ "$head_commit_subject_collapsed" == "$exclude_commit_subject" ]]; then + echo "Skippping commit \"$(git log -1 --pretty=%s)\"" + exit 0 +fi + +git log -1 + +set -e + +cd ../../../ +./autogen.sh +./configure \ + --prefix=$PWD/install \ + --disable-sockets \ + --disable-udp \ + --disable-verbs \ + --disable-rxm \ + --disable-mrail \ + --disable-rxd \ + --disable-shm \ + --disable-tcp \ + --disable-usnic \ + --disable-rstream \ + --disable-efa \ + --disable-psm2 \ + --disable-psm3 \ + --disable-opx \ + --enable-debug \ + --with-default-monitor=uffd \ + --with-criterion=$(realpath ../Criterion/build/install/) \ + --with-cassini-headers=$(realpath ../cassini-headers/install) \ + --with-cxi-uapi-headers=$(realpath ../cxi-driver) \ + --enable-cxi=$(realpath ../libcxi/install) \ + --with-kdreg2=$(realpath ../kdreg2/include) + + +make clean +make -j 8 install + +test_dir=$(realpath ./prov/cxi/test) +test_result_file="run_tests_vm_output.txt" +ssh -tt localhost "cd ${test_dir}; ./run_tests_vm.sh $test_short" | tee ${test_result_file} + +set +e + +# Search ssh output for the following string. This is a test failure +# which is not reported as a tap test failure. +test_error_code=1 +test_error=$(grep "cxitest return non-zero exit code. 
Possible failures in test teardown" ${test_result_file}) || test_error_code=$(($?^1)) +if [ -z "${test_error}" ] && [ "$test_error_code" -eq "0" ]; then + echo "Zero 'non-zero exit codes' failures in output" +else + echo $test_error + exit 1 +fi + +# Grep all tap out files for "not ok" string. This is a test failure. +test_failures_code=1 +test_failures=$(grep "not ok" ${test_dir}/*.tap) || test_failures_code=$(($?^1)) +if [ -z "${test_failures}" ] && [ "$test_failures_code" -eq "0" ] ; then + echo "Zero 'not ok' failures in tap output" +else + echo $test_failures + exit 1 +fi + +signed_off=$(git log -1 | grep -i "Signed-off-by: ") +if [ -z "${signed_off}" ]; then + echo "Commit not signed off" + exit 1 +else + echo "Commit signed-off check passed" +fi + +echo "Tests passed" +rm ${test_result_file} +exit 0 diff --git a/prov/cxi/test/cxi_vm_pr.sh b/prov/cxi/test/cxi_vm_pr.sh new file mode 100755 index 00000000000..c10c0da1ac6 --- /dev/null +++ b/prov/cxi/test/cxi_vm_pr.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Cache head commit which will be cherry-picked later +head_commit=$(git rev-parse HEAD) + +git checkout -b rebase-test-branch +db=$(git remote show https://github.com/ofiwg/libfabric.git | grep 'HEAD branch' | cut -d' ' -f5) +mb=$(git merge-base origin/${db} HEAD) + +# Run a shorten test suite against each commits except the head commit. +git reset --hard HEAD~1 +git rebase ${mb} --exec "bash ./cxi_vm_commit.sh -s" +if [[ $? -ne 0 ]]; then + exit 1 +fi + +# Run longer test suite against all commits together. +git cherry-pick ${head_commit} +bash ./cxi_vm_commit.sh diff --git a/prov/cxi/test/ep.c b/prov/cxi/test/ep.c index e415c11f018..3292138f049 100644 --- a/prov/cxi/test/ep.c +++ b/prov/cxi/test/ep.c @@ -1,7 +1,8 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + * Copyright (c) 2018 Cray Inc. All rights reserved. 
+ * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP */ #include @@ -1758,8 +1759,8 @@ Test(ep_caps, coll_only) &info); cr_assert(ret == FI_SUCCESS); verify_caps_only(info, FI_COLLECTIVE | FI_MSG); - fi_freeinfo(info); + cxit_teardown_getinfo(); } diff --git a/prov/cxi/test/mr.c b/prov/cxi/test/mr.c index fab3cbab7d7..0c21f1e3c5d 100644 --- a/prov/cxi/test/mr.c +++ b/prov/cxi/test/mr.c @@ -159,7 +159,7 @@ Test(mr, mr_zero_len) /* Validate that unique keys are enforced. */ Test(mr, mr_unique_key) { - char buf[256]; + char buf[256] = {}; struct fid_mr *mr1; struct fid_mr *mr2; int ret; @@ -185,7 +185,7 @@ Test(mr, mr_unique_key) /* Validate not recycling non-cached FI_MR_PROV_KEY */ Test(mr, mr_recycle) { - char buf[256]; + char buf[256] = {}; struct fid_mr *mr1; struct fid_mr *mr2; struct fid_mr *mr3; @@ -273,7 +273,7 @@ Test(mr, mr_recycle) /* Validate that RKEY are not required for local MR */ Test(mr, mr_no_local_rkey) { - char buf[256]; + char buf[256] = {}; struct fid_mr *mr1; struct fid_mr *mr2; uint64_t rkey = 0; diff --git a/prov/cxi/test/multinode/test_coll.c b/prov/cxi/test/multinode/test_coll.c index 3dc80b86278..a8bff9127e6 100644 --- a/prov/cxi/test/multinode/test_coll.c +++ b/prov/cxi/test/multinode/test_coll.c @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only * - * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP */ /* @@ -27,6 +27,7 @@ #include #include #include +#include #include "multinode_frmwk.h" /* If not compiled with DEBUG=1, this is a no-op */ @@ -313,12 +314,12 @@ struct join_item { struct dlist_entry entry; struct fid_av_set *avset; struct fid_mc *mc; + int join_index; int prov_errno; int retval; - int trace_no; }; -/* poll the collective eq once, count of completions (0 or 1) */ +/* poll the collective eq once, return 0 on success, errno on failure */ static int _poll_eq(void) { struct cxip_ep *ep; @@ -333,31 
+334,38 @@ static int _poll_eq(void) jctx = NULL; ret = fi_eq_read(eq, &event, &eqd, sizeof(eqd), 0); + /* silent retry*/ + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + /* simple response */ if (ret >= 0) { TRACE("read EQ = %d\n", ret); if (ret < sizeof(struct fi_eq_entry)) { - TRACE("fi_eq_read()=%d, exp=%ld\n", + TRACE("fi_eq_read()=%d, exp=%ld, too small\n", ret, sizeof(struct fi_eq_entry)); return -FI_EINVAL; } - TRACE("=== EQ SUCCESS\n"); + TRACE("EQ RESPONSE\n"); TRACE(" size = %d\n", ret); TRACE(" event = %d\n", event); TRACE(" fid = %p\n", eqd.fid); TRACE(" context = %p\n", eqd.context); TRACE(" data = %lx\n", eqd.data); - if (eqd.context && event == FI_JOIN_COMPLETE) { - jctx = eqd.context; - jctx->retval = 0; - jctx->prov_errno = 0; - return 1; + if (!eqd.context || event != FI_JOIN_COMPLETE) { + TRACE("Unexpected eqd response\n"); + return -FI_EINVAL; } + TRACE("=== EQ SUCCESS\n"); + jctx = eqd.context; + jctx->retval = 0; + jctx->prov_errno = 0; + return FI_SUCCESS; } if (ret == -FI_EAVAIL) { TRACE("read EQ = %d\n", ret); ret = fi_eq_readerr(eq, &eqd, 0); if (ret < sizeof(struct fi_eq_err_entry)) { - TRACE("fi_eq_readerr()=%d, exp=%ld\n", + TRACE("fi_eq_readerr()=%d, exp=%ld too small\n", ret, sizeof(struct fi_eq_err_entry)); return -FI_EINVAL; } @@ -367,17 +375,18 @@ static int _poll_eq(void) TRACE(" fid = %p\n", eqd.fid); TRACE(" context = %p\n", eqd.context); TRACE(" data = %lx\n", eqd.data); - TRACE(" err = %s (%d)\n", - fi_strerror(-eqd.err), eqd.err); + TRACE(" err = %s (%d)\n", fi_strerror(-eqd.err), eqd.err); TRACE(" prov_err= %d\n", eqd.prov_errno); TRACE(" err_data= %p\n", eqd.err_data); TRACE(" err_size= %ld\n", eqd.err_data_size); - if (eqd.context) { - jctx = eqd.context; - jctx->retval = eqd.err; - jctx->prov_errno = eqd.prov_errno; - return 1; + if (!eqd.context) { + TRACE("Unexpected eqd response\n"); + return -FI_EINVAL; } + jctx = eqd.context; + jctx->retval = eqd.err; + jctx->prov_errno = eqd.prov_errno; + return FI_SUCCESS; } if 
(ret != -FI_EAGAIN) { TRACE("read EQ = %d\n", ret); @@ -392,6 +401,9 @@ static int _poll_eq(void) void coll_multi_release(struct dlist_entry *joinlist) { struct join_item *jctx; + int poll_count = 0; + int count = 0; + int ret; TRACE("coll_multi_release\n"); while (!dlist_empty(joinlist)) { @@ -399,14 +411,33 @@ void coll_multi_release(struct dlist_entry *joinlist) TRACE("close mc, empty = %d\n", dlist_empty(joinlist)); if (jctx->mc) fi_close(&jctx->mc->fid); + TRACE("free jctx\n"); free(jctx); + count++; } - TRACE("return\n"); + while (count > 0) { + ret = cxip_curl_progress(NULL); + if (ret == -FI_EAGAIN) { + poll_count++; + usleep(10); + continue; + } + if (ret < 0 && ret != -FI_ENODATA) { + TRACE("Curl progress failed, count=%d error=%d\n", + count, ret); + break; + } + count--; + } + TRACE("CURL cleanup delay = %d usec\n", 10*poll_count); } -/* initiate join on all sets in setary, and append to joinlist */ -int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) +/* initiate join on all sets in setary, and append to joinlist + * must succeed completely or cleans up and reports failure + */ +int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist, + int limit) { struct join_item *jctx; int i, ret, total, count; @@ -418,9 +449,14 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) count = 0; for (i = 0; i < total; i++) { jctx = calloc(1, sizeof(*jctx)); - jctx->trace_no = i; - jctx->avset = setary->avset[i]; + if (!jctx) { + TRACE("calloc failed on jctx[%d]\n", i); + ret = -FI_ENOMEM; + goto fail; + } dlist_init(&jctx->entry); + jctx->join_index = i; + jctx->avset = setary->avset[i]; TRACE("join %d of %d initiating\n", i, total); ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, setary->avset[i], 0L, &jctx->mc, jctx); @@ -429,16 +465,21 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) free(jctx); continue; } - TRACE("join %d continuing ret=%d\n", i, ret); if (ret != 
FI_SUCCESS) { - TRACE("join %d FAILED\n", ret); + TRACE("join %d FAILED join %d\n", i, ret); + free(jctx); goto fail; } /* wait for join to complete */ do { _poll_cqs(); ret = _poll_eq(); - } while (ret == 0); + } while (ret == -FI_EAGAIN); + if (ret < 0) { + TRACE("join %d FAILED eq poll %d\n", i, ret); + free(jctx); + goto fail; + } dlist_insert_tail(&jctx->entry, joinlist); count++; } @@ -446,7 +487,7 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) return FI_SUCCESS; fail: - TRACE("TEST failed\n"); + TRACE("MULTIJOIN failed\n"); coll_multi_release(joinlist); return ret; } @@ -499,8 +540,8 @@ struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, } dlist_init(joinlist); - ret = coll_multi_join(setary, joinlist); - if (ret) { + ret = coll_multi_join(setary, joinlist, -1); + if (ret < 0) { TRACE("%s JOIN coll_multi_join()=%d\n", msg, ret); goto quit; } @@ -526,46 +567,6 @@ struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, return NULL; } -#if 0 -int _test_multi_barrier(struct avset_ary *setary, struct dlist_entry *joinlist, - int N, long *nsec_delay, int total_secs) -{ - struct timespec *nsec_times, nsec_start; - int i, ret; - - nsec_times = calloc(sizeof(struct timespec), N); - ret = coll_init_multi_join(setary, joinlist); - if (ret) { - TRACE("multicast_join init error = %d\n", ret); - goto quit; - } - ret = coll_wait_multi_join(joinlist); - if (ret) { - TRACE("multicast_join wait error = %d\n", ret); - goto quit; - } - - _nsecs_from_now(&nsec_start, 0L); - nsec_start.tv_sec += total_secs; - - for (i = 0; i < N; i++) - _nsecs_from_now(&nsec_times[i], nsec_delay[i]); - while (!_nsecs_expired(&nsec_start)) { - for (i = 0; i < N; i++) { - if (!_nsecs_expired(&nsec_times[i])) - continue; - for (j = 0; j < ) - } - - } -quit: - free(nsec_times); - coll_multi_releasejoinlist); - avset_ary_destroy(setary); - return ret; -} -#endif - int _simple_join(fi_addr_t *fiaddrs, size_t size, struct avset_ary *setary, 
struct dlist_entry *joinlist) @@ -578,8 +579,8 @@ int _simple_join(fi_addr_t *fiaddrs, size_t size, return ret; dlist_init(joinlist); - ret = coll_multi_join(setary, joinlist); - if (ret) + ret = coll_multi_join(setary, joinlist, -1); + if (ret < 0) return ret; return 0; @@ -590,43 +591,28 @@ uint64_t _simple_get_mc(struct dlist_entry *joinlist) struct join_item *jctx; jctx = dlist_first_entry_or_null(joinlist, struct join_item, entry); + if (jctx == NULL) { + TRACE("Join item is NULL\n"); + return 0; + } return (uint64_t)jctx->mc; } void _simple_join_release(struct avset_ary *setary, struct dlist_entry *joinlist) { - coll_multi_release(joinlist); - avset_ary_destroy(setary); -} - -/** - * @brief Simple test of join/delete returns a count of errors. - * - * This creates a single avset_ary from the supplied addresses, with hwroot - * of zero, and performs a single join, tests errors, and cleans up. Used to - * probe the basic error conditions. - */ -int _test_join(fi_addr_t *fiaddrs, size_t size) -{ - struct avset_ary setary; - struct dlist_entry joinlist; - int ret; - - ret = _simple_join(fiaddrs, size, &setary, &joinlist); - _simple_join_release(&setary, &joinlist); - - return ret; + coll_join_cleanup(setary, joinlist); } -/* Simple test of barrier, returns a count of errors. */ -int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) +/* Simple test of count barriers, returns a count of errors. 
*/ +int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) { struct avset_ary setary; struct dlist_entry joinlist; uint64_t context; uint64_t mc; - int i, ret, total; + int i, ret; TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); ret = _simple_join(fiaddrs, size, &setary, &joinlist); @@ -651,7 +637,6 @@ int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) TRACE("spin 1...\n"); _wait_cqs(&context); TRACE("BARRIER COMPLETE #%d\n", i); - total++; } else { TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); goto quit; @@ -661,20 +646,22 @@ int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) quit: TRACE("BARRIER exit\n"); - frmwk_log0("Barrier total=%d\n", total); + if (metrics) + cxip_coll_get_metrics(metrics); _simple_join_release(&setary, &joinlist); return ret; } -/* Simple test of broadcast, returns a count of errors. */ -int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) +/* Simple test of count broadcasts, returns a count of errors. 
*/ +int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) { struct avset_ary setary; struct dlist_entry joinlist; uint64_t data[4], rslt[4]; uint64_t context; uint64_t mc; - int i, ret; + int i, root, ret; TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); @@ -691,49 +678,56 @@ int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) goto quit; } - data[0] = 0x12345678; - data[1] = 0x2468ace0; - data[2] = 0x13579bdf; - data[3] = 0x10101010; - memset(rslt, 0, sizeof(rslt)); - if (frmwk_rank == rootidx) - memcpy(rslt, data, sizeof(rslt)); - do { - _poll_cqs(); - ret = fi_broadcast(cxit_ep, rslt, 4, NULL, - mc, fiaddrs[rootidx], - FI_UINT64, 0L, &context); - } while (ret == -FI_EAGAIN); - if (ret) - goto quit; + for (i = 0; i < count; i++) { + for (root = 0; root < size; root++) { + data[0] = i; + data[1] = root; + data[2] = 0x13579bdf; + data[3] = 0x10101010; + memset(rslt, 0, sizeof(rslt)); + if (frmwk_rank == root) + memcpy(rslt, data, sizeof(rslt)); + do { + _poll_cqs(); + ret = fi_broadcast(cxit_ep, rslt, 4, NULL, mc, + fiaddrs[root], FI_UINT64, 0L, + &context); + } while (ret == -FI_EAGAIN); + if (ret) + goto quit; - TRACE("spin 1...\n"); - _wait_cqs(&context); - TRACE("BROADCAST COMPLETE\n"); - if (memcmp(rslt, data, sizeof(rslt))) { - for (i = 0; i < 4; i++) - TRACE("[%d] %016lx exp %016lx\n", - i, rslt[i], data[i]); - ret = -1; + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BROADCAST COMPLETE\n"); + if (memcmp(rslt, data, sizeof(rslt))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], data[i]); + ret = -1; + } + } } quit: TRACE("BROADCAST exit\n"); + if (metrics) + cxip_coll_get_metrics(metrics); _simple_join_release(&setary, &joinlist); return ret; } const struct timespec usec1 = {.tv_sec = 0, .tv_nsec = 10000}; -/* simple test of allreduce, returns a count of errors. 
*/ -int _test_allreduce(fi_addr_t *fiaddrs, size_t size) +/* simple test of count allreduce int sums, returns a count of errors. */ +int _test_allreduce_isum(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) { struct avset_ary setary; struct dlist_entry joinlist; int64_t *data, *rslt, *comp; uint64_t context; uint64_t mc; - int r, v, ret; + int i, r, v, ret; TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); @@ -755,29 +749,107 @@ int _test_allreduce(fi_addr_t *fiaddrs, size_t size) data = calloc(frmwk_numranks*4, sizeof(int64_t)); comp = calloc(4, sizeof(int64_t)); rslt = calloc(4, sizeof(int64_t)); - for (v = 0; v < 4; v++) - for (r = 0; r < frmwk_numranks; r++) - data[4*r + v] = 4*r + v; - for (v = 0; v < 4; v++) - for (r = 0; r < frmwk_numranks; r++) - comp[v] += data[4*r + v]; - do { - _poll_cqs(); - ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, - rslt, NULL, mc, FI_INT64, - FI_SUM, 0L, &context); - } while (ret == -FI_EAGAIN); - if (ret) + for (i = 0; i < count; i++) { + memset(data, 0, frmwk_numranks * 4 * sizeof(int64_t)); + memset(comp, 0, 4 * sizeof(int64_t)); + memset(rslt, 0, 4 * sizeof(int64_t)); + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + data[4*r + v] = 4*r + v; + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + comp[v] += data[4*r + v]; + do { + _poll_cqs(); + ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, + rslt, NULL, mc, FI_INT64, + FI_SUM, 0L, &context); + } while (ret == -FI_EAGAIN); + if (ret) + goto quit; + + TRACE("spin...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + for (v = 0; v < 4; v++) { + if (rslt[v] != comp[v]) { + TRACE("[%d] %016lx exp %016lx\n", + v, rslt[v], comp[v]); + ret = 1; + } + } + } + free(rslt); + free(comp); + free(data); + +quit: + TRACE("ALLREDUCE exit\n"); + if (metrics) + cxip_coll_get_metrics(metrics); + _simple_join_release(&setary, &joinlist); + return ret; +} + +/* simple test of allreduce 
double sums, returns a count of errors. */ +int _test_allreduce_dsum(fi_addr_t *fiaddrs, size_t size, int count, + struct cxip_coll_metrics *metrics) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + double *data, *rslt, *comp; + uint64_t context; + uint64_t mc; + int i, r, v, ret; + + TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); + + ret = _simple_join(fiaddrs, size, &setary, &joinlist); + if (ret) { + TRACE("join failed\n"); + goto quit; + } + + mc = _simple_get_mc(&joinlist); + if (!mc) { + TRACE("ALLREDUCE MC invalid\n"); + ret = -1; goto quit; + } + if (_is_hwroot(_get_join_jctx(&joinlist, 0))) + nanosleep(&usec1, NULL); - TRACE("spin...\n"); - _wait_cqs(&context); - TRACE("ALLREDUCE COMPLETE\n"); - for (v = 0; v < 4; v++) { - if (rslt[v] != comp[v]) { - TRACE("[%d] %016lx exp %016lx\n", - v, rslt[v], comp[v]); - ret = 1; + data = calloc(frmwk_numranks*4, sizeof(double)); + comp = calloc(4, sizeof(double)); + rslt = calloc(4, sizeof(double)); + ret = 0; + for (i = 0; i < count; i++) { + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + data[4*r + v] = (4*r + v)/1000.0; + for (v = 0; v < 4; v++) { + comp[v] = 0.0; + for (r = 0; r < frmwk_numranks; r++) + comp[v] += data[4*r + v]; + } + do { + _poll_cqs(); + ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, + rslt, NULL, mc, FI_DOUBLE, + FI_SUM, 0L, &context); + } while (ret == -FI_EAGAIN); + if (ret) + goto quit; + + TRACE("spin...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + for (v = 0; v < 4; v++) { + if (fabs(rslt[v] - comp[v]) > 0.00000001) { + TRACE("[%d] %f exp %f\n", + v, rslt[v], comp[v]); + ret = 1; + } } } free(rslt); @@ -786,6 +858,8 @@ int _test_allreduce(fi_addr_t *fiaddrs, size_t size) quit: TRACE("ALLREDUCE exit\n"); + if (metrics) + cxip_coll_get_metrics(metrics); _simple_join_release(&setary, &joinlist); return ret; } @@ -823,8 +897,39 @@ static uint64_t testmask = 0L; if (skip) break; \ ret = 0 +static uint64_t 
get_range_mask(char *str) +{ + uint64_t mask = 0L; + char *s, *p; + int i, j; + + while (*str) { + while (*str == ' ') + str++; + s = str; + while (*str && *str != ',') + str++; + if (*str) + *str++ = 0; + p = s; + while (*p && *p != '-') + p++; + if (*p) + *p++ = 0; + i = (*s) ? atoi(s) : 0; + j = (*p) ? atoi(p) : i; + if (j > 63) + j = 63; + while (i <= j) { + mask |= (1L << i++); + } + } + return mask; +} + int main(int argc, char **argv) { + struct cxip_coll_metrics metrics; fi_addr_t *fiaddrs = NULL; fi_addr_t myaddr; struct cxip_addr mycaddr; @@ -832,10 +937,11 @@ int main(int argc, char **argv) size_t size = 0; int errcnt = 0; int tstcnt = 0; + int skpcnt = 0; int tstnum = 0; int ret = 0; - int N = 0; - int S = 1; + int trees = 1; + int opcount = 1; bool help = false; bool trace_muted = true; struct join_item *jctx; @@ -843,53 +949,31 @@ int main(int argc, char **argv) struct dlist_entry joinlist; const char *testname; char opt; - int i, j; + int i; /* by default, perform all tests */ testmask = -1L; testname = NULL; setvbuf(stdout, NULL, _IONBF, 0); - while ((opt = getopt(argc, argv, "hvVS:Mt:N:")) != -1) { - char *str, *s, *p; - + while ((opt = getopt(argc, argv, "hvVS:Mt:N:n:")) != -1) { switch (opt) { case 't': /* perform only selected tests */ - str = optarg; - i = j = 0; - testmask = 0L; - while (*str) { - while (*str == ' ') - str++; - s = str; - while (*str && *str != ',') - str++; - if (*str) - *str++ = 0; - p = s; - while (*p && *p != '-') - p++; - if (*p) - *p++ = 0; - i = (*s) ? atoi(s) : 0; - j = (*p) ? 
atoi(p) : i; - if (j > 63) - j = 63; - while (i <= j) { - testmask |= (1L << i++); - } - } + testmask = get_range_mask(optarg); + break; + case 'x': + /* exclude all selected tests */ + testmask = ~get_range_mask(optarg); break; case 'M': create_multicast = true; break; case 'N': - N = atoi(optarg); + trees = atoi(optarg); break; - case 'S': - S = atoi(optarg); - printf("S = %d\n", S); + case 'n': + opcount = atoi(optarg); break; case 'V': /* tracing is enabled below */ @@ -916,11 +1000,17 @@ int main(int argc, char **argv) do { if (help) { frmwk_log0( - "Usage: t est_coll [-hvV] -M -Ncount [-t testno[-testno][,...]]\n" - " -h generate help and quit.\n" + "Usage: test_coll [-hvV] -M -N treecnt -n opcnt\n" + " [-t testno[-testno][,...]\n" + " [-x testno[-testno][,...]]...\n" + " -h generate help and quit\n" + " -v verbose to stdout\n" + " -V verbose to trace files\n" " -M use multicast model (default unicast model)\n" - " -N iterations (default 1)\n" - " -t test list (default all)\n"); + " -N concurrent trees (default 1)\n" + " -n operation count (default 1)\n" + " -t set test list (default all)\n" + " -x exclude from test list (can be repeated)\n"); break; } @@ -964,8 +1054,8 @@ int main(int argc, char **argv) ret = 0; tstcnt += 1; errcnt += !!ret; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -981,8 +1071,8 @@ int main(int argc, char **argv) avset_ary_destroy(&setary); tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -1007,12 +1097,12 @@ int main(int argc, char **argv) errcnt += !!(setary.avset_cnt != 0); errcnt += !!(setary.avset_siz != 0); tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; - /* Sanity test for _test_join() utility function. + /* Sanity test for coll_single_join(). 
*/ do { PREAMBLE(0, tstnum, "test join (simple)"); @@ -1025,8 +1115,8 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -1042,8 +1132,8 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; @@ -1059,15 +1149,17 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test zbcoll transient failure to acquire a group ID. + */ do { PREAMBLE(0, tstnum, "force -FI_EAGAIN on root getgroup"); // cause zbcoll root (rank 0) to reject getgroup requests once - cxip_trap_set(0, CXIP_TRAP_GETGRP, -FI_EAGAIN); + cxip_trap_set(0, CXIP_TRAP_GETGRP, -FI_EAGAIN, 0); // cause non-root ranks attempt zbcoll getgroup first if (frmwk_rank == 0) usleep(10000); @@ -1076,285 +1168,161 @@ int main(int argc, char **argv) coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test zbcoll transient failure to perform a broadcast. 
+ */ do { PREAMBLE(0, tstnum, "force -FI_EAGAIN on root broadcast"); // cause zbcoll root (rank 0) to reject broadcast requests once - cxip_trap_set(0, CXIP_TRAP_BCAST, -FI_EAGAIN); + cxip_trap_set(0, CXIP_TRAP_BCAST, -FI_EAGAIN, 0); jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, &setary, &joinlist, "FI_EAGAIN root bcast"); coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test zbcoll transient failure to perform a reduce. + */ do { PREAMBLE(0, tstnum, "force -FI_EAGAIN on root reduce"); // cause zbcoll root (rank 0) to reject join reduce once - cxip_trap_set(0, CXIP_TRAP_REDUCE, -FI_EAGAIN); + cxip_trap_set(0, CXIP_TRAP_REDUCE, -FI_EAGAIN, 0); jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, &setary, &joinlist, "FI_EAGAIN root reduce"); coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; -#if 0 + /* Test failure to acquire a PTE. 
+ */ do { - PREAMBLE(0, tstnum, "force -FI_EFAULT on PTE alloc"); + PREAMBLE(0, tstnum, "force -FI_EAVAIL on PTE alloc"); // cause zbcoll root (rank 0) to simulate PTE alloc failure - cxip_trap_set(0, CXIP_TRAP_INITPTE, -FI_EFAULT); - ret = _test_join(fiaddrs, size, -FI_EAVAIL, - CXIP_PROV_ERRNO_PTE); + cxip_trap_set(0, CXIP_TRAP_INITPTE, -FI_EAVAIL, + FI_CXI_ERRNO_JOIN_FAIL_PTE); + jctx = coll_single_join(fiaddrs, size, 0, 0, + 0, 0, + &setary, &joinlist, + "fail PTE alloc"); + TRACE("Aborting\n"); tstcnt += 1; + ret = 0; errcnt += !!ret; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); - } while (0); - tstnum++; -#endif - - do { - struct cxip_coll_mc *mc_obj; - struct cxip_coll_reduction *reduction; - struct cxip_coll_data coll_data; - int ret; - - PREAMBLE(0, tstnum, "test single packet send"); - // Create multicast and send packet through HWRoot - TRACE("======= %s\n", testname); - TRACE("starting join\n"); - - /* root is index 0, others are leaves */ - jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, - &setary, &joinlist, "simple"); - TRACE("completed join jctx = %p\n", jctx); - mc_obj = (struct cxip_coll_mc *)jctx->mc; - mc_obj->arm_disable = true; - mc_obj->retry_disable = true; - TRACE("S=%d rank=%d hwroot=%d\n", S, frmwk_rank, - mc_obj->hwroot_idx); - reduction = &mc_obj->reduction[0]; - coll_data.red_cnt = 1; - coll_data.intval.ival[0] = 1234; - coll_data.intval.ival[1] = frmwk_rank; - memset(&reduction->accum, 0, sizeof(reduction->accum)); - if (frmwk_rank == S) { - TRACE("test starting send on %d\n", S); - do { - ret = cxip_coll_send_red_pkt( - reduction, &coll_data, - false, false); - TRACE("send result = %d\n", ret); - } while (ret == -FI_EAGAIN); - TRACE("completed send = %d\n", ret); - } - while (1) - _poll_cqs(); - - coll_join_cleanup(&setary, &joinlist); - errcnt += !!!jctx; - tstcnt += 1; frmwk_log0("%4s\n", STDMSG(ret)); - frmwk_barrier(); } while (0); tstnum++; -/*###############################################################*/ + /* 
Placeholder + */ do { - uint64_t context; - - PREAMBLE(0, tstnum, "test barrier (simple)"); - // Test single join over one array list + PREAMBLE(0, tstnum, "(not implemented)"); + // Placeholder, preserve to keep other numbering the same TRACE("======= %s\n", testname); - TRACE("[%d] starting join\n", frmwk_rank); - jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, - &setary, &joinlist, "simple"); - TRACE("completed join jctx = %p\n", jctx); - TRACE("start barrier\n"); - do { - ret = fi_barrier(cxit_ep, (fi_addr_t )jctx->mc, - &context); - TRACE("barrier = %d\n", ret); - } while (ret == -FI_EAGAIN); - - if (ret == FI_SUCCESS) { - TRACE("spin 1...\n"); - _wait_cqs(&context); - TRACE("BARRIER COMPLETE #%d\n", i); - } else { - TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); - errcnt++; - } - coll_join_cleanup(&setary, &joinlist); - errcnt += !!!jctx; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + ret = 0; frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test opcount barriers. + */ do { - PREAMBLE(0, tstnum, "perform barrier"); - TRACE("Starting barrier\n"); - ret = _test_barrier(fiaddrs, size, 1); + PREAMBLE(0, tstnum, "perform barrier x opcount (default 1)"); + ret = _test_barrier(fiaddrs, size, opcount, &metrics); errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + fprintf(stdout, + "reductions [%2d] %s " + "full:%-4ld none:%-4ld part:%-4ld bad: %-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? "root" : "leaf", + metrics.red_count_full, + metrics.red_count_unreduced, + metrics.red_count_partial, + metrics.red_count_bad); frmwk_barrier(); - } while (0); - tstnum++; - - do { - PREAMBLE(0, tstnum, "perform broadcast"); - for (i = 0; i < frmwk_numranks; i++) { - ret = _test_broadcast(fiaddrs, size, i); - errcnt += !!ret; - } - tstcnt += 1; frmwk_log0("%4s\n", STDMSG(ret)); - frmwk_barrier(); } while (0); tstnum++; + /* Test opcount broadcasts. 
+ */ do { - PREAMBLE(0, tstnum, "perform allreduce sum"); - ret = _test_allreduce(fiaddrs, size); - TRACE("allreduce ret = %d\n", ret); + PREAMBLE(0, tstnum, "perform broadcast x opcount (default 1)"); + ret = _test_broadcast(fiaddrs, size, opcount, &metrics); errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + fprintf(stdout, + "reductions [%2d] %s " + "bad: %-4ld full:%-4ld part:%-4ld none:%-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? "root" : "leaf", + metrics.red_count_bad, + metrics.red_count_full, + metrics.red_count_partial, + metrics.red_count_unreduced); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; + /* Test opcount int64 sum reductions + */ do { - PREAMBLE(0, tstnum, "perform barrier x N"); - ret = _test_barrier(fiaddrs, size, N); + PREAMBLE(0, tstnum, "perform allreduce int64 sum x opcount (default 1)"); + ret = _test_allreduce_isum(fiaddrs, size, opcount, &metrics); + TRACE("allreduce ret = %d\n", ret); errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); + fprintf(stdout, + "reductions [%2d] %s " + "bad: %-4ld full:%-4ld part:%-4ld none:%-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? 
"root" : "leaf", + metrics.red_count_bad, + metrics.red_count_full, + metrics.red_count_partial, + metrics.red_count_unreduced); frmwk_barrier(); - } while (0); - tstnum++; - - do { - PREAMBLE(0, tstnum, "test mcast dup"); - avset_ary_init(&setary); - TRACE("avset initialized\n"); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 1 = %d\n", ret); - ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); - TRACE("avset append 2 = %d\n", ret); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("join = %d\n", ret); - - jctx = _get_join_jctx(&joinlist, 0); - TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval || jctx->prov_errno) { - TRACE("unexpected result on coll 0\n"); - errcnt++; - } - jctx = _get_join_jctx(&joinlist, 1); - TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval != -FI_EAVAIL || - jctx->prov_errno != CXIP_PROV_ERRNO_MCAST_INUSE) { - TRACE("unexpected result on coll 1\n"); - errcnt++; - } - tstcnt += 1; - - frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); - } while (0); - tstnum++; - - do { - PREAMBLE(0, tstnum, "test hwroot dup"); - avset_ary_init(&setary); - TRACE("avset initialized\n"); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 1 = %d\n", ret); - ret = avset_ary_append(fiaddrs, size, 1, 0, &setary); - TRACE("avset append 2 = %d\n", ret); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("join = %d\n", ret); - - jctx = _get_join_jctx(&joinlist, 0); - TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval || jctx->prov_errno) { - TRACE("unexpected result on coll 0\n"); - errcnt++; - } - jctx = _get_join_jctx(&joinlist, 1); - TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, 
jctx->prov_errno); - if (jctx->retval != -FI_EAVAIL || - jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { - TRACE("unexpected result on coll 1\n"); - errcnt++; - } - tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); } while (0); tstnum++; + /* Test opcount double sum reductions + */ do { - PREAMBLE(0, tstnum, "test hwroot and mcast dup"); - avset_ary_init(&setary); - TRACE("avset initialized\n"); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 1 = %d\n", ret); - ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); - TRACE("avset append 2 = %d\n", ret); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("join = %d\n", ret); - - jctx = _get_join_jctx(&joinlist, 0); - TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval || jctx->prov_errno) { - TRACE("unexpected result on coll 0\n"); - errcnt++; - } - jctx = _get_join_jctx(&joinlist, 1); - TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", - jctx->mc, jctx->retval, jctx->prov_errno); - if (jctx->retval != -FI_EAVAIL || - jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { - TRACE("unexpected result on coll 1\n"); - errcnt++; - } + PREAMBLE(0, tstnum, "perform allreduce double sum x opcount (default 1)"); + ret = _test_allreduce_dsum(fiaddrs, size, opcount, &metrics); + TRACE("allreduce ret = %d\n", ret); + errcnt += !!ret; tstcnt += 1; - + fprintf(stdout, + "reductions [%2d] %s " + "bad: %-4ld full:%-4ld part:%-4ld none:%-4ld\n", + metrics.ep_data.myrank, + metrics.ep_data.isroot ? 
"root" : "leaf", + metrics.red_count_bad, + metrics.red_count_full, + metrics.red_count_partial, + metrics.red_count_unreduced); + frmwk_barrier(); frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); } while (0); tstnum++; @@ -1363,18 +1331,18 @@ int main(int argc, char **argv) avset_ary_init(&setary); TRACE("avset initialized\n"); - for (i = 0; i < N; i++) { + for (i = 0; i < trees; i++) { ret = avset_ary_append(fiaddrs, size, i, i, &setary); TRACE("avset append %d = %d\n", i, ret); } dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); + ret = coll_multi_join(&setary, &joinlist, -1); TRACE("multijoin = %d\n", ret); - for (i = 0; i < N; i++) { + for (i = 0; i < trees; i++) { int exp_ret = (i < size) ? 0 : -FI_EAVAIL; - int exp_errno = (i < size) ? 0 : CXIP_PROV_ERRNO_HWROOT_INUSE; + int exp_errno = (i < size) ? 0 : FI_CXI_ERRNO_JOIN_HWROOT_INUSE; int good; jctx = _get_join_jctx(&joinlist, i); @@ -1391,108 +1359,23 @@ int main(int argc, char **argv) } tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); coll_multi_release(&joinlist); avset_ary_destroy(&setary); - } while (0); - tstnum++; - - - do { - PREAMBLE(0, tstnum, "test multiple broadcast"); - - uint64_t **datary, *ctxary, *ctxptr; - int in_progress, tree, root, i, j; - - /* set up maximum number of trees possible */ - avset_ary_init(&setary); - for (tree = 0; tree < size; tree++) { - ret = avset_ary_append(fiaddrs, size, tree, tree, &setary); - TRACE("avset append group %d = %d\n", tree, ret); - } - TRACE("avset initialized\n"); - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - TRACE("multijoin = %d\n", ret); - - /* context and data for each collective tree */ - ctxary = calloc(size, sizeof(uint64_t)); - datary = calloc(size, sizeof(void *)); - for (tree = 0; tree < size; tree++) { - datary[tree] = calloc(4, sizeof(uint64_t)); - ctxary[tree] = tree; - } - - /* repeat the collective N times as requested*/ - for (i = 0; i < N; i++) 
{ - in_progress = 0; - - /* rotate root every time */ - root = i%size; - - /* start a broadcast on every tree */ - for (tree = 0; tree < size; tree++) { - uint64_t id = (uint64_t)tree << 32; - - /* prepare the data */ - memset(datary[tree], 0, 4*sizeof(uint64_t)); - if (frmwk_rank == root) { - for (j = 0; j < 4; j++) - datary[tree][j] = id|root; - } - TRACE("strt=%d tree=%d\n", i, tree); - for (j = 0; j < 4; j++) - TRACE(" %016lx\n", datary[tree][j]); - - } - for (tree = 0; tree < size; tree++) { - int tree2 = (tree + frmwk_rank)%size; - - usleep(rand() % 100); - jctx = _get_join_jctx(&joinlist, tree2); - ret = fi_broadcast(cxit_ep, datary[tree2], 4, NULL, - (fi_addr_t )jctx->mc, - fiaddrs[root], FI_UINT64, - 0L, &ctxary[tree2]); - in_progress++; - TRACE("in_progress=%d\n", in_progress); - if ((ctxptr = _poll_cqs())) { - in_progress--; - TRACE("ctxptr=%ld in_progress=%d\n", - *ctxptr, in_progress); - } - } - while (in_progress > 0) { - if ((ctxptr = _poll_cqs())) { - in_progress--; - TRACE("ctxptr=%ld in_progress=%d\n", - *ctxptr, in_progress); - } - } - for (tree = 0; tree < size; tree++) { - TRACE("rslt=%d tree=%d\n", i, tree); - for (j = 0; j < 4; j++) - TRACE(" %016lx\n", datary[tree][j]); - - } - } - tstcnt += 1; - + frmwk_barrier(); frmwk_log0("%4s\n", STDMSG(ret)); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); } while (0); tstnum++; + #if 0 + // template for test case // do { PREAMBLE(0, tstnum, "title of test"); ret = 0; // some test errcnt += !!ret; tstcnt += 1; - frmwk_log0("%4s\n", STDMSG(ret)); frmwk_barrier(); + frmwk_log0("%4s\n", STDMSG(ret)); } while (0); tstnum++; #endif @@ -1501,8 +1384,11 @@ int main(int argc, char **argv) return (errcnt); done: - frmwk_log0("%2d tests run, %d failures\n", tstcnt, errcnt); - frmwk_log0(!!errcnt ? 
"ERRORS SEEN\n" : "SUCCESS\n"); + frmwk_log0("\nFinal Report =====================================\n"); + frmwk_barrier(); + frmwk_log("%2d tests skipped, %2d tests run, %d failures\n", + skpcnt, tstcnt, errcnt); + frmwk_log(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); free(fiaddrs); frmwk_free_libfabric(); frmwk_term(); diff --git a/prov/cxi/test/rma.c b/prov/cxi/test/rma.c index 27990b9a8f8..5642e2389cf 100644 --- a/prov/cxi/test/rma.c +++ b/prov/cxi/test/rma.c @@ -14,6 +14,59 @@ #define RMA_WIN_KEY 0x1f +TestSuite(rma_no_init, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma_no_init, xfer_disable_optimized_mrs_disable_prov_key_cache) +{ + int ret; + bool value; + uint64_t key; + struct mem_region mem_window; + size_t len = 16 * 1024; + uint8_t *send_buf; + struct fi_cq_tagged_entry cqe; + struct cxip_mr_key mr_key; + + send_buf = calloc(1, len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + ret = setenv("CXIP_TEST_PROV_KEY", "1", 1); + cr_assert_eq(ret, 0); + + cxit_setup_rma(); + + value = false; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_OPTIMIZED_MRS, &value); + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + + value = false; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_PROV_KEY_CACHE, &value); + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + + ret = mr_create(len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0, &key, + &mem_window); + cr_assert_eq(ret, FI_SUCCESS); + + mr_key.raw = key; + cr_assert(mr_key.opt == 0); + + ret = fi_write(cxit_ep, send_buf, len, NULL, cxit_ep_fi_addr, 0, key, + NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + mr_destroy(&mem_window); + cxit_teardown_rma(); + free(send_buf); +} + TestSuite(rma, .init = cxit_setup_rma, .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); 
@@ -580,9 +633,10 @@ void cxit_rma_setup_no_rma_events(void) } /* Test HRP Put */ -Test(rma_opt, hrp, +Test(rma_opt_hrp, hrp, .init = cxit_rma_setup_no_rma_events, - .fini = cxit_teardown_rma) + .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT) { int ret; uint64_t hrp_acks_start; @@ -1543,6 +1597,143 @@ Test(rma_sel, selective_completion_suppress, free(send_buf); } +Test(rma_sel, fi_more_write_stream_optimized, + .init = cxit_setup_rma_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma = {}; + unsigned int write_count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + + mr_create(0, FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + rma.key = key_val; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + do { + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + write_count++; + } while (ret != -FI_EAGAIN); + + cr_assert(write_count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + write_count++; + + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + write_count++; + + ret = fi_cntr_wait(cntr, write_count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); +} + +Test(rma_sel, fi_more_write_stream_mix_optimzied_unoptimized, + .init = cxit_setup_rma_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region opt_mem_window; + struct mem_region mem_window; + uint64_t opt_key_val = 0x0; + uint64_t key_val = 0x1234; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma = {}; + unsigned int write_count = 0; + struct fid_cntr *cntr = cxit_write_cntr; + + mr_create(0, FI_REMOTE_WRITE, 0, &opt_key_val, &opt_mem_window); + mr_create(0, 
FI_REMOTE_WRITE, 0, &key_val, &mem_window); + + rma.key = opt_key_val; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + do { + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + write_count++; + } while (ret != -FI_EAGAIN); + + cr_assert(write_count >= cxit_fi_hints->tx_attr->size); + + rma.key = key_val; + do { + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + write_count++; + + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + write_count++; + + ret = fi_cntr_wait(cntr, write_count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); + mr_destroy(&opt_mem_window); +} + +Test(rma_sel, fi_more_read_stream, + .init = cxit_setup_rma_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = 0x0; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma = {}; + unsigned int count = 0; + struct fid_cntr *cntr = cxit_read_cntr; + + mr_create(0, FI_REMOTE_READ, 0, &key_val, &mem_window); + + rma.key = key_val; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + do { + ret = fi_readmsg(cxit_ep, &msg, FI_MORE); + cr_assert((ret == FI_SUCCESS) || (ret == -FI_EAGAIN)); + if (ret == FI_SUCCESS) + count++; + } while (ret != -FI_EAGAIN); + + cr_assert(count >= cxit_fi_hints->tx_attr->size); + + do { + ret = fi_readmsg(cxit_ep, &msg, FI_MORE); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_readmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + ret = fi_cntr_wait(cntr, count, 10000); + cr_assert(ret == FI_SUCCESS, "ret=%d", ret); + + mr_destroy(&mem_window); +} + /* Test remote counter events with RMA */ Test(rma, rem_cntr) { @@ -1796,6 +1987,82 @@ Test(rma, invalid_read_target_opt_mr_key) 
rma_invalid_read_target_mr_key(0x10); } +/* Tests to verify FI_RM_ENABLED */ + +static void mr_overrun(bool write) +{ + int ret; + uint8_t *local; + size_t good_len = 4096; + uint64_t key_val = 0xa; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + + /* Create over-sized local buffer */ + local = calloc(1, good_len * 2); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(good_len, write ? FI_REMOTE_WRITE : FI_REMOTE_READ, 0xc0, + &key_val, &remote); + + /* Perform good length data transfer first */ + if(write) { + ret = fi_write(cxit_ep, local, good_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write() failed (%d)", ret); + } + else { + ret = fi_read(cxit_ep, local, good_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); + } + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | (write ? 
FI_WRITE : FI_READ), NULL); + + /* Validate read data */ + for (int i = 0; i < good_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + /* Perform overrun data transfer */ + if (write) { + ret = fi_write(cxit_ep, local, good_len*2, NULL, cxit_ep_fi_addr, + 0, key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write() failed (%d)", ret); + } + else { + ret = fi_read(cxit_ep, local, good_len*2, NULL, cxit_ep_fi_addr, + 0, key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); + } + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + + mr_destroy(&remote); + free(local); +} + +Test(rma, read_mr_overrun) +{ + mr_overrun(false); +} + +Test(rma, write_mr_overrun) +{ + mr_overrun(true); +} + static void rma_hybrid_mr_desc_test_runner(bool write, bool cq_events) { struct mem_region source_window; diff --git a/prov/cxi/test/startvm-setup.sh b/prov/cxi/test/startvm-setup.sh index 0e79f611a86..ea660279f03 100755 --- a/prov/cxi/test/startvm-setup.sh +++ b/prov/cxi/test/startvm-setup.sh @@ -17,7 +17,7 @@ modprobe ptp modprobe iommu_v2 || modprobe amd_iommu_v2 insmod $DBS_DIR/slingshot_base_link/cxi-sbl.ko insmod $DBS_DIR/sl-driver/knl/cxi-sl.ko -insmod $DBS_DIR/cxi-driver/cxi/cxi-core.ko disable_default_svc=0 +insmod $DBS_DIR/cxi-driver/cxi/cxi-ss1.ko disable_default_svc=0 insmod $DBS_DIR/cxi-driver/cxi/cxi-user.ko insmod $DBS_DIR/cxi-driver/cxi/cxi-eth.ko insmod $DBS_DIR/kdreg2/kdreg2.ko diff --git a/prov/cxi/test/startvm.sh b/prov/cxi/test/startvm.sh index 933bd082fed..97271732d4d 100755 --- a/prov/cxi/test/startvm.sh +++ b/prov/cxi/test/startvm.sh @@ -67,7 +67,7 @@ else DEVICE=$(cat 
/sys/class/cxi/cxi0/device/virtfn0/device) # Unbind VF from cxi core driver. cxi1 no longer exists - echo $PCIFN > /sys/bus/pci/drivers/cxi_core/unbind + echo $PCIFN > /sys/bus/pci/drivers/cxi_ss1/unbind # Bind the VF to vfio driver modprobe vfio_pci diff --git a/prov/cxi/test/test.sh b/prov/cxi/test/test.sh old mode 100644 new mode 100755 From 8d1c7d07b400fae539b1900ba9a20d7d2937b389 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 7 Oct 2024 11:28:18 -0500 Subject: [PATCH 135/393] prov/cxi: Remove setting total_buffered_recv This is a libfabric 2.0 depreciated value. Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 2 -- prov/cxi/src/cxip_info.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index f1b38ed7ea9..9eae29d960d 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -120,8 +120,6 @@ #define CXIP_REQ_BUF_SIZE (2*1024*1024) #define CXIP_REQ_BUF_MIN_POSTED 4 #define CXIP_REQ_BUF_MAX_CACHED 0 -#define CXIP_UX_BUFFER_SIZE (CXIP_OFLOW_BUF_MIN_POSTED * \ - CXIP_OFLOW_BUF_SIZE) #define CXIP_MR_CACHE_EVENTS_DISABLE_POLL_NSECS 100000U #define CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS 1000000000U diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 7273d5bc2dc..6681aae0480 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -271,7 +271,6 @@ struct fi_rx_attr cxip_rx_attr = { .op_flags = CXIP_RX_OP_FLAGS, .msg_order = CXIP_MSG_ORDER, .comp_order = FI_ORDER_NONE, - .total_buffered_recv = CXIP_UX_BUFFER_SIZE, .size = CXIP_MAX_RX_SIZE, .iov_limit = 1, }; @@ -291,7 +290,6 @@ struct fi_rx_attr cxip_multi_auth_key_rx_attr = { .op_flags = CXIP_RX_OP_FLAGS, .msg_order = CXIP_MSG_ORDER, .comp_order = FI_ORDER_NONE, - .total_buffered_recv = CXIP_UX_BUFFER_SIZE, .size = CXIP_MAX_RX_SIZE, .iov_limit = 1, }; From 826bce9b29800adcb4e316a0102966f04b6ec169 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 7 Oct 2024 09:23:23 -0500 Subject: [PATCH 136/393] 
prov/cxi: Update to version to 2.0 Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 9eae29d960d..ed549d556f4 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -177,7 +177,7 @@ #define CXIP_MINOR_VERSION 1 #define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ CXIP_MINOR_VERSION) -#define CXIP_FI_VERSION FI_VERSION(1, 22) +#define CXIP_FI_VERSION FI_VERSION(2, 0) #define CXIP_WIRE_PROTO_VERSION 1 #define CXIP_COLL_MAX_CONCUR 8 From e23da5b081ed2fd2aabd21c2e4f6c5a3d3bc748d Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 14 Oct 2024 19:44:57 -0500 Subject: [PATCH 137/393] man/fi_cxi: Update documentation Signed-off-by: Ian Ziemba --- man/fi_cxi.7.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 0a32850cee8..3be247ae462 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -1294,6 +1294,13 @@ The CXI provider checks for the following environment variables: : Enable enforcement of triggered operation limit. Doing this can prevent fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance. +*FI_CXI_MR_CACHE_EVENTS_DISABLE_POLL_NSECS* +: Max amount of time to poll when disabling an MR configured with MR match events. + +*FI_CXI_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS* +: Max amount of time to poll when LE invalidate disabling an MR configured with MR + match events. + Note: Use the fi_info utility to query provider environment variables: fi_info -p cxi -e @@ -1373,10 +1380,9 @@ struct fi_cxi_dom_ops { }; ``` -*cntr_read* extension is used to read hardware counter values. Valid values -of the cntr argument are found in the Cassini-specific header file -cassini_cntr_defs.h. Note that Counter accesses by applications may be -rate-limited to 1HZ. 
+*cntr_read* extension is used to read Cassini Telemetry items that consists of +counters and gauges. The items available and their content are dependent upon +the Cassini ASIC version and Cassini Driver version. *topology* extension is used to return CXI NIC address topology information for the domain. Currently only a dragonfly fabric topology is reported. @@ -1578,7 +1584,7 @@ To enable PCIe fetch add for libfabric, the following CXI driver kernel module parameter must be set to non-zero. ``` -/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +/sys/module/cxi_ss1/parameters/amo_remap_to_pcie_fadd ``` The following are the possible values for this kernel module and the impact of From 0f7cc83b5e4449afa4115b2a402c3ae58fcd099e Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Thu, 17 Oct 2024 18:01:35 +0000 Subject: [PATCH 138/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- fabtests/man/man7/fabtests.7 | 28 +++-------- man/man1/fi_info.1 | 28 +++-------- man/man1/fi_pingpong.1 | 48 +++++++----------- man/man1/fi_strerror.1 | 26 +++------- man/man3/fi_atomic.3 | 30 +++-------- man/man3/fi_av.3 | 74 +++++++++++---------------- man/man3/fi_av_set.3 | 24 ++------- man/man3/fi_cm.3 | 24 ++------- man/man3/fi_cntr.3 | 28 +++-------- man/man3/fi_collective.3 | 38 +++++--------- man/man3/fi_control.3 | 22 ++------- man/man3/fi_cq.3 | 44 ++++++----------- man/man3/fi_domain.3 | 48 +++++++----------- man/man3/fi_endpoint.3 | 78 ++++++++++++----------------- man/man3/fi_eq.3 | 34 ++++--------- man/man3/fi_errno.3 | 20 ++------ man/man3/fi_fabric.3 | 32 ++++-------- man/man3/fi_getinfo.3 | 52 +++++++------------ man/man3/fi_mr.3 | 36 +++++--------- man/man3/fi_msg.3 | 32 ++++-------- man/man3/fi_nic.3 | 28 +++-------- man/man3/fi_peer.3 | 30 +++-------- man/man3/fi_poll.3 | 24 ++------- man/man3/fi_profile.3 | 20 ++------ man/man3/fi_provider.3 | 28 +++-------- man/man3/fi_rma.3 | 30 +++-------- man/man3/fi_tagged.3 | 32 ++++-------- man/man3/fi_trigger.3 
| 22 ++------- man/man3/fi_version.3 | 20 ++------ man/man7/fabric.7 | 52 +++++++------------ man/man7/fi_arch.7 | 18 +------ man/man7/fi_cxi.7 | 96 +++++++++++++++++------------------- man/man7/fi_direct.7 | 22 ++------- man/man7/fi_efa.7 | 50 +++++++------------ man/man7/fi_guide.7 | 24 ++------- man/man7/fi_hook.7 | 20 ++------ man/man7/fi_intro.7 | 28 +++-------- man/man7/fi_lpp.7 | 20 ++------ man/man7/fi_mrail.7 | 30 +++-------- man/man7/fi_opx.7 | 78 ++++++++++++----------------- man/man7/fi_provider.7 | 50 +++++++------------ man/man7/fi_psm2.7 | 28 +++-------- man/man7/fi_psm3.7 | 30 +++-------- man/man7/fi_rxd.7 | 20 ++------ man/man7/fi_rxm.7 | 22 ++------- man/man7/fi_setup.7 | 23 ++------- man/man7/fi_shm.7 | 22 ++------- man/man7/fi_sockets.7 | 20 ++------ man/man7/fi_tcp.7 | 20 ++------ man/man7/fi_ucx.7 | 20 ++------ man/man7/fi_udp.7 | 20 ++------ man/man7/fi_usnic.7 | 72 +++++++++++---------------- man/man7/fi_verbs.7 | 22 ++------- 53 files changed, 525 insertions(+), 1262 deletions(-) diff --git a/fabtests/man/man7/fabtests.7 b/fabtests/man/man7/fabtests.7 index ec2d2c07aae..567ef27e978 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fabtests" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fabtests" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -282,7 +268,7 @@ FI_ENORX) can be read by the application, if RNR happens. 
\f[I]fi_efa_rnr_queue_resend\f[R] This test modifies the RNR retry count (rnr_retry) to 0 via fi_setopt, and then tests RNR queue/re-send logic for different packet types. -To run the test, one needs to use \f[V]-c\f[R] option to specify the +To run the test, one needs to use \f[C]-c\f[R] option to specify the category of packet types. .SS Component tests .PP @@ -462,9 +448,9 @@ The default endpoint type is rdm. Allocate data buffers on the specified device, rather than in host memory. Valid options are ze, cuda and synapseai. -.TP *-a -The name of a shared address vector. +.IP \[bu] 2 +: The name of a shared address vector. This option only applies to tests that support shared address vectors. .TP \f[I]-B \f[R] @@ -476,9 +462,9 @@ endpoints to the server. .TP \f[I]-P \f[R] Specifies the port number of the peer endpoint, overriding the default. -.TP *-s -Specifies the address of the local endpoint. +.IP \[bu] 2 +: Specifies the address of the local endpoint. .TP *-F Specifies the address format. diff --git a/man/man1/fi_info.1 b/man/man1/fi_info.1 index ac741a93b27..b0d5f5aa8c9 100644 --- a/man/man1/fi_info.1 +++ b/man/man1/fi_info.1 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_info" "1" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_info" "1" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -70,7 +56,7 @@ For more information on address formats, see fi_getinfo(3). .TP \f[I]-p, \[en]provider=\f[R] Filter fabric interfaces by the provider implementation. -For a list of providers, see the \f[V]--list\f[R] option. +For a list of providers, see the \f[C]--list\f[R] option. 
.TP \f[I]-d, \[en]domain=\f[R] Filter interfaces to only those with the given domain name. @@ -144,7 +130,7 @@ provider: tcp \f[R] .fi .PP -To see the full fi_info structure, specify the \f[V]-v\f[R] option. +To see the full fi_info structure, specify the \f[C]-v\f[R] option. .IP .nf \f[C] @@ -237,7 +223,7 @@ fi_info: \f[R] .fi .PP -To see libfabric related environment variables \f[V]-e\f[R] option. +To see libfabric related environment variables \f[C]-e\f[R] option. .IP .nf \f[C] @@ -257,7 +243,7 @@ $ ./fi_info -e .fi .PP To see libfabric related environment variables with substring use -\f[V]-g\f[R] option. +\f[C]-g\f[R] option. .IP .nf \f[C] @@ -295,6 +281,6 @@ $ ./fi_info -g tcp .fi .SH SEE ALSO .PP -\f[V]fi_getinfo(3)\f[R], \f[V]fi_endpoint(3)\f[R] +\f[C]fi_getinfo(3)\f[R], \f[C]fi_endpoint(3)\f[R] .SH AUTHORS OpenFabrics. diff --git a/man/man1/fi_pingpong.1 b/man/man1/fi_pingpong.1 index 419269ebf71..671ec872133 100644 --- a/man/man1/fi_pingpong.1 +++ b/man/man1/fi_pingpong.1 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_pingpong" "1" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_pingpong" "1" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -35,7 +21,7 @@ fi_pingpong also displays aggregated statistics after each test run, and can additionally verify data integrity upon receipt. .PP By default, the datagram (FI_EP_DGRAM) endpoint is used for the test, -unless otherwise specified via \f[V]-e\f[R]. +unless otherwise specified via \f[C]-e\f[R]. 
.SH HOW TO RUN TESTS .PP Two copies of the program must be launched: first, one copy must be @@ -61,15 +47,15 @@ client$ fi_pingpong .PP The server and client must be able to communicate properly for the fi_pingpong utility to function. -If any of the \f[V]-e\f[R], \f[V]-I\f[R], \f[V]-S\f[R], or \f[V]-p\f[R] +If any of the \f[C]-e\f[R], \f[C]-I\f[R], \f[C]-S\f[R], or \f[C]-p\f[R] options are used, then they must be specified on the invocation for both the server and the client process. -If the \f[V]-d\f[R] option is specified on the server, then the client +If the \f[C]-d\f[R] option is specified on the server, then the client will select the appropriate domain if no hint is provided on the client side. -If the \f[V]-d\f[R] option is specified on the client, then it must also +If the \f[C]-d\f[R] option is specified on the client, then it must also be specified on the server. -If both the server and client specify the \f[V]-d\f[R] option and the +If both the server and client specify the \f[C]-d\f[R] option and the given domains cannot communicate, then the application will fail. .SS Control Messaging .TP @@ -124,19 +110,19 @@ Activate output debugging (warning: highly verbose) Displays help output for the pingpong test. 
.SH USAGE EXAMPLES .SS A simple example -.SS Server: \f[V]fi_pingpong -p \f[R] +.SS Server: \f[C]fi_pingpong -p \f[R] .PP -\f[V]server$ fi_pingpong -p sockets\f[R] -.SS Client: \f[V]fi_pingpong -p \f[R] +\f[C]server$ fi_pingpong -p sockets\f[R] +.SS Client: \f[C]fi_pingpong -p \f[R] .PP -\f[V]client$ fi_pingpong -p sockets 192.168.0.123\f[R] +\f[C]client$ fi_pingpong -p sockets 192.168.0.123\f[R] .SS An example with various options .SS Server: .PP -\f[V]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] +\f[C]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] .SS Client: .PP -\f[V]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] +\f[C]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] .PP Specifically, this will run a pingpong test with: .IP \[bu] 2 @@ -150,14 +136,14 @@ server node as 192.168.0.123 .SS A longer test .SS Server: .PP -\f[V]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] +\f[C]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] .SS Client: .PP -\f[V]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] +\f[C]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] .SH DEFAULTS .PP There is no default provider; if a provider is not specified via the -\f[V]-p\f[R] switch, the test will pick one from the list of available +\f[C]-p\f[R] switch, the test will pick one from the list of available providers (as returned by fi_getinfo(3)). .PP If no endpoint type is specified, `dgram' is used. @@ -192,6 +178,6 @@ client per second .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3) \f[V]fabric\f[R](7), +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3) \f[C]fabric\f[R](7), .SH AUTHORS OpenFabrics. 
diff --git a/man/man1/fi_strerror.1 b/man/man1/fi_strerror.1 index 70605dabbd6..80cb05cd760 100644 --- a/man/man1/fi_strerror.1 +++ b/man/man1/fi_strerror.1 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_strerror" "1" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_strerror" "1" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -28,16 +14,16 @@ fi_strerror FI_ERROR_CODE .fi .SH DESCRIPTION .PP -Display the error string for the given numeric \f[V]FI_ERROR_CODE\f[R]. -\f[V]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal +Display the error string for the given numeric \f[C]FI_ERROR_CODE\f[R]. +\f[C]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal constant. -Although the \f[V]fi_strerror\f[R](3) library function only accepts +Although the \f[C]fi_strerror\f[R](3) library function only accepts positive error values, for convenience this utility accepts both positive and negative error values. .PP This is primarily a convenience tool for developers. .SH SEE ALSO .PP -\f[V]fabric\f[R](7) \f[V]fi_errno\f[R](3) +\f[C]fabric\f[R](7) \f[C]fi_errno\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_atomic.3 b/man/man3/fi_atomic.3 index a1b0bd7c716..2054cbbfe1c 100644 --- a/man/man3/fi_atomic.3 +++ b/man/man3/fi_atomic.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_atomic" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_atomic" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -143,7 +129,7 @@ Local data buffer to store initial value of remote buffer \f[I]desc / compare_desc / result_desc\f[R] Data descriptor associated with the local data buffer, local compare buffer, and local result buffer, respectively. -See \f[V]fi_mr\f[R](3). +See \f[C]fi_mr\f[R](3). .TP \f[I]dest_addr\f[R] Destination address for connectionless atomic operations. @@ -707,11 +693,11 @@ parameter specifying the tag. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[V]fi_msg\f[R](3) for a detailed description of handling +See \f[C]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -755,11 +741,11 @@ assigned to the transmitting and receiving endpoints. Both message and data ordering are required if the results of two atomic operations to the same memory buffers are to reflect the second operation acting on the results of the first. -See \f[V]fi_endpoint\f[R](3) for further details and message size +See \f[C]fi_endpoint\f[R](3) for further details and message size restrictions. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_rma\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_rma\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_av.3 b/man/man3/fi_av.3 index 866d0d635cc..41870d477c5 100644 --- a/man/man3/fi_av.3 +++ b/man/man3/fi_av.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_av" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_av" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -163,14 +149,14 @@ been deprecated, see below). See the NOTES section for AV restrictions on duplicate addresses. .PP \f[B]Deprecated\f[R]: AV operations may be set to operate asynchronously -by specifying the FI_EVENT flag to \f[V]fi_av_open\f[R]. +by specifying the FI_EVENT flag to \f[C]fi_av_open\f[R]. When requesting asynchronous operation, the application must first bind an event queue to the AV before inserting addresses. .SS fi_av_open .PP fi_av_open allocates or opens an address vector. The properties and behavior of the address vector are defined by -\f[V]struct fi_av_attr\f[R]. +\f[C]struct fi_av_attr\f[R]. .IP .nf \f[C] @@ -301,7 +287,7 @@ The context field in all completions will be the context specified to the insert call, and the data field in the final completion entry will report the number of addresses successfully inserted. If an error occurs during the asynchronous insertion, an error -completion entry is returned (see \f[V]fi_eq\f[R](3) for a discussion of +completion entry is returned (see \f[C]fi_eq\f[R](3) for a discussion of the fi_eq_err_entry error completion struct). The context field of the error completion will be the context that was specified in the insert call; the data field will contain the index of @@ -361,10 +347,10 @@ the call will return -FI_EBUSY. 
.SS fi_av_bind (deprecated) .PP Associates an event queue with the AV. -If an AV has been opened with \f[V]FI_EVENT\f[R], then an event queue +If an AV has been opened with \f[C]FI_EVENT\f[R], then an event queue must be bound to the AV before any insertion calls are attempted. Any calls to insert addresses before an event queue has been bound will -fail with \f[V]-FI_ENOEQ\f[R]. +fail with \f[C]-FI_ENOEQ\f[R]. Flags are reserved for future use and must be 0. .SS fi_av_insert .PP @@ -375,7 +361,7 @@ AV. Addresses inserted into an address vector must be in the same format as specified in the addr_format field of the fi_info struct provided when opening the corresponding domain. -When using the \f[V]FI_ADDR_STR\f[R] format, the \f[V]addr\f[R] +When using the \f[C]FI_ADDR_STR\f[R] format, the \f[C]addr\f[R] parameter should reference an array of strings (char **). .PP \f[B]Deprecated\f[R]: For AV\[cq]s of type FI_AV_MAP, once inserted @@ -409,14 +395,14 @@ buffer must remain valid until the insertion operation completes. Note that if fi_addr is NULL and synchronous operation is requested without using FI_SYNC_ERR flag, individual insertion failures cannot be reported and the application must use other calls, such as -\f[V]fi_av_lookup\f[R] to learn which specific addresses failed to +\f[C]fi_av_lookup\f[R] to learn which specific addresses failed to insert. .PP If the address vector is configured with authorization keys, the fi_addr parameter may be used as input to define the authorization keys associated with the endpoint addresses being inserted. This is done by setting the fi_addr to an authorization key fi_addr_t -generated from \f[V]fi_av_insert_auth_key\f[R] and setting the +generated from \f[C]fi_av_insert_auth_key\f[R] and setting the FI_AUTH_KEY flag. If the FI_AUTH_KEY flag is not set, addresses being inserted will not be associated with any authorization keys. @@ -430,8 +416,8 @@ authorization keys. 
These fi_addr_t\[cq]s can be used as the target for local data transfer operations. .PP -If the endpoint supports \f[V]FI_DIRECTED_RECV\f[R] or -\f[V]FI_TAGGED_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to +If the endpoint supports \f[C]FI_DIRECTED_RECV\f[R] or +\f[C]FI_TAGGED_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to restrict receive buffers to a specific endpoint address and authorization key. .PP @@ -494,10 +480,10 @@ Node should be a string that corresponds to a hostname or network address. The service string corresponds to a textual representation of a transport address. -Applications may also pass in an \f[V]FI_ADDR_STR\f[R] formatted address +Applications may also pass in an \f[C]FI_ADDR_STR\f[R] formatted address as the node parameter. In such cases, the service parameter must be NULL. -See fi_getinfo.3 for details on using \f[V]FI_ADDR_STR\f[R]. +See fi_getinfo.3 for details on using \f[C]FI_ADDR_STR\f[R]. Supported flags are the same as for fi_av_insert. .SS fi_av_insertsym .PP @@ -541,7 +527,7 @@ Note that removing an address may not disable receiving data from the peer endpoint. fi_av_close will automatically cleanup any associated resource. .PP -If the address being removed came from \f[V]fi_av_insert_auth_key\f[R], +If the address being removed came from \f[C]fi_av_insert_auth_key\f[R], the address will only be removed if all endpoints, which have been enabled against the corresponding authorization key, have been closed. If all endpoints are not closed, -FI_EBUSY will be returned. @@ -591,8 +577,8 @@ fi_av_straddr returns a pointer to buf. .SS fi_av_insert_auth_key .PP This function associates authorization keys with an address vector. -This requires the domain to be opened with \f[V]FI_AV_AUTH_KEY\f[R]. -\f[V]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be +This requires the domain to be opened with \f[C]FI_AV_AUTH_KEY\f[R]. 
+\f[C]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be associated with authorization keys from the address vector. This behavior enables a single endpoint or memory region to be associated with multiple authorization keys. @@ -602,38 +588,38 @@ address vector authorization keys at that point in time. Later authorization key insertions will not propagate to already enabled endpoints and memory regions. .PP -The \f[V]auth_key\f[R] and \f[V]auth_key_size\f[R] parameters are used +The \f[C]auth_key\f[R] and \f[C]auth_key_size\f[R] parameters are used to input the authorization key into the address vector. The structure of the authorization key is provider specific. -If the \f[V]auth_key_size\f[R] does not align with provider specific +If the \f[C]auth_key_size\f[R] does not align with provider specific structure, -FI_EINVAL will be returned. .PP -The output of \f[V]fi_av_insert_auth_key\f[R] is an authorization key +The output of \f[C]fi_av_insert_auth_key\f[R] is an authorization key fi_addr_t handle representing all endpoint addresses against this specific authorization key. For all operations, including address vector, memory registration, and data transfers, which may accept an authorization key fi_addr_t as input, the FI_AUTH_KEY flag must be specified. Otherwise, the fi_addr_t will be treated as an fi_addr_t returned from -the \f[V]fi_av_insert\f[R] and related functions. +the \f[C]fi_av_insert\f[R] and related functions. .PP For endpoints enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict incoming messages to only endpoint addresses within the authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[V]fi_recvmsg\f[R] and \f[V]fi_trecvmsg\f[R]. +\f[C]fi_recvmsg\f[R] and \f[C]fi_trecvmsg\f[R]. .PP For domains enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict memory region access to only endpoint addresses within the authorization key. 
This will require passing in the FI_AUTH_KEY flag to -\f[V]fi_mr_regattr\f[R]. +\f[C]fi_mr_regattr\f[R]. .PP These authorization key fi_addr_t\[cq]s can later be used an input for endpoint address insertion functions to generate an fi_addr_t for a specific endpoint address and authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[V]fi_av_insert\f[R] and related functions. +\f[C]fi_av_insert\f[R] and related functions. .PP For address vectors configured with FI_AV_USER_ID and endpoints with FI_SOURCE_ERR, all subsequent FI_EADDRNOTAVAIL error events will return @@ -651,7 +637,7 @@ Flags are reserved for future use and must be 0. This functions returns the authorization key associated with a fi_addr_t. Acceptable fi_addr_t\[cq]s input are the output of -\f[V]fi_av_insert_auth_key\f[R] and AV address insertion functions. +\f[C]fi_av_insert_auth_key\f[R] and AV address insertion functions. The returned authorization key is in a provider specific format. On input, the auth_key_size parameter should indicate the size of the auth_key buffer. @@ -760,14 +746,14 @@ function. This function is used to set the group ID portion of an fi_addr_t. .SH RETURN VALUES .PP -Insertion calls, excluding \f[V]fi_av_insert_auth_key\f[R], for an AV +Insertion calls, excluding \f[C]fi_av_insert_auth_key\f[R], for an AV opened for synchronous operation will return the number of addresses that were successfully inserted. In the case of failure, the return value will be less than the number of addresses that was specified. .PP \f[B]Deprecated\f[R]: Insertion calls, excluding -\f[V]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous +\f[C]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous operation (with FI_EVENT flag specified) will return FI_SUCCESS if the operation was successfully initiated. In the case of failure, a negative fabric errno will be returned. @@ -782,10 +768,10 @@ FI_ADDR_NOTAVAIL. 
.PP All other calls return FI_SUCCESS on success, or a negative value corresponding to fabric errno on error. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_av_set.3 b/man/man3/fi_av_set.3 index 71cff544452..04742ab2629 100644 --- a/man/man3/fi_av_set.3 +++ b/man/man3/fi_av_set.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_av_set" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_av_set" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -106,7 +92,7 @@ The creation and manipulation of an AV set is a local operation. No fabric traffic is exchanged between peers. As a result, each peer is responsible for creating matching AV sets as part of their collective membership definition. -See \f[V]fi_collective\f[R](3) for a discussion of membership models. +See \f[C]fi_collective\f[R](3) for a discussion of membership models. .SS fi_av_set .PP The fi_av_set call creates a new AV set. @@ -277,9 +263,9 @@ It is an error for a user to request an unsupported collective. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. 
.SH SEE ALSO .PP -\f[V]fi_av\f[R](3), \f[V]fi_collective\f[R](3) +\f[C]fi_av\f[R](3), \f[C]fi_collective\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cm.3 b/man/man3/fi_cm.3 index 36fff256d99..7723f65eb7e 100644 --- a/man/man3/fi_cm.3 +++ b/man/man3/fi_cm.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_cm" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cm" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -275,7 +261,7 @@ or an error will occur. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .SH NOTES .PP @@ -293,7 +279,7 @@ events, or as additional err_data to fi_eq_err_entry, in the case of a rejected connection. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cntr.3 b/man/man3/fi_cntr.3 index 53c35f5b5fe..2dcdb911498 100644 --- a/man/man3/fi_cntr.3 +++ b/man/man3/fi_cntr.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_cntr" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cntr" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -105,14 +91,14 @@ That is, a counter actually stores two distinct values, with error completions updating an error specific value. .PP Counters are updated following the completion event semantics defined in -\f[V]fi_cq\f[R](3). +\f[C]fi_cq\f[R](3). The timing of the update is based on the type of transfer and any specified operation flags. .SS fi_cntr_open .PP fi_cntr_open allocates a new fabric counter. The properties and behavior of the counter are defined by -\f[V]struct fi_cntr_attr\f[R]. +\f[C]struct fi_cntr_attr\f[R]. .IP .nf \f[C] @@ -292,7 +278,7 @@ On error, a negative value corresponding to fabric errno is returned. fi_cntr_read / fi_cntr_readerr Returns the current value of the counter. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH NOTES .PP In order to support a variety of counter implementations, updates made @@ -314,7 +300,7 @@ fi_cntr_set / fi_cntr_seterr and results of related operations are reflected in the observed value of the counter. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_poll\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_collective.3 b/man/man3/fi_collective.3 index 13627bb8a9c..58e3121c6b2 100644 --- a/man/man3/fi_collective.3 +++ b/man/man3/fi_collective.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. 
ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_collective" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_collective" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -166,7 +152,7 @@ be used for required input. .PP In general collective operations can be thought of as coordinated atomic operations between a set of peer endpoints. -Readers should refer to the \f[V]fi_atomic\f[R](3) man page for details +Readers should refer to the \f[C]fi_atomic\f[R](3) man page for details on the atomic operations and datatypes defined by libfabric. .PP A collective operation is a group communication exchange. @@ -213,7 +199,7 @@ provider by creating and configuring an address vector set (AV set). An AV set represents an ordered subset of addresses in an address vector (AV). Details on creating and configuring an AV set are available in -\f[V]fi_av_set\f[R](3). +\f[C]fi_av_set\f[R](3). .PP Once an AV set has been programmed with the collective membership information, an endpoint is joined to the set. @@ -272,7 +258,7 @@ Applications must call fi_close on the collective group to disconnect the endpoint from the group. After a join operation has completed, the fi_mc_addr call may be used to retrieve the address associated with the multicast group. -See \f[V]fi_cm\f[R](3) for additional details on fi_mc_addr(). +See \f[C]fi_cm\f[R](3) for additional details on fi_mc_addr(). .SS Barrier (fi_barrier) .PP The fi_barrier operation provides a mechanism to synchronize peers. @@ -523,7 +509,7 @@ struct fi_collective_attr { \f[R] .fi .PP -For a description of struct fi_atomic_attr, see \f[V]fi_atomic\f[R](3). +For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[R](3). .TP \f[I]op\f[R] On input, this specifies the atomic operation involved with the @@ -566,7 +552,7 @@ collective operation through the provider. 
.PP Collective operations map to underlying fi_atomic operations. For a discussion of atomic completion semantics, see -\f[V]fi_atomic\f[R](3). +\f[C]fi_atomic\f[R](3). The completion, ordering, and atomicity of collective operations match those defined for point to point atomic operations. .SH FLAGS @@ -581,11 +567,11 @@ collective operation. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[V]fi_msg\f[R](3) for a detailed description of handling +See \f[C]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -601,11 +587,11 @@ As such, they follow most of the conventions and restrictions as peer to peer atomic operations. This includes data atomicity, data alignment, and message ordering semantics. -See \f[V]fi_atomic\f[R](3) for additional information on the datatypes +See \f[C]fi_atomic\f[R](3) for additional information on the datatypes and operations defined for atomic and collective operations. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_av\f[R](3), \f[V]fi_atomic\f[R](3), -\f[V]fi_cm\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_av\f[R](3), \f[C]fi_atomic\f[R](3), +\f[C]fi_cm\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_control.3 b/man/man3/fi_control.3 index 27d8a52afd9..65b0890e0f8 100644 --- a/man/man3/fi_control.3 +++ b/man/man3/fi_control.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_control" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_control" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -64,7 +50,7 @@ header files (\[cq]rdma/fi_ext_*.h\[cq]). Please refer to the provider man pages for details. .SH SEE ALSO .PP -\f[V]fi_endpoint\f[R](3), \f[V]fi_cm\f[R](3), \f[V]fi_cntr\f[R](3), -\f[V]fi_cq\f[R](3), \f[V]fi_eq\f[R](3), +\f[C]fi_endpoint\f[R](3), \f[C]fi_cm\f[R](3), \f[C]fi_cntr\f[R](3), +\f[C]fi_cq\f[R](3), \f[C]fi_eq\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cq.3 b/man/man3/fi_cq.3 index d788b5af76f..f3bce7ce489 100644 --- a/man/man3/fi_cq.3 +++ b/man/man3/fi_cq.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_cq" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cq" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -137,7 +123,7 @@ Unlike event queues, completion queues are associated with a resource domain and may be offloaded entirely in provider hardware. .PP The properties and behavior of a completion queue are defined by -\f[V]struct fi_cq_attr\f[R]. +\f[C]struct fi_cq_attr\f[R]. .IP .nf \f[C] @@ -368,9 +354,8 @@ Multiple completions may be retrieved from a CQ in a single call. The maximum number of entries to return is limited to the specified count parameter, with the number of entries successfully read from the CQ returned by the call. -(See return values section below.) -A count value of 0 may be used to drive progress on associated endpoints -when manual progress is enabled. +(See return values section below.) 
A count value of 0 may be used to +drive progress on associated endpoints when manual progress is enabled. .PP CQs are optimized to report operations which have completed successfully. @@ -444,7 +429,7 @@ fi_cq_readerr is a non-blocking call, returning immediately whether an error completion was found or not. .PP Error information is reported to the user through -\f[V]struct fi_cq_err_entry\f[R]. +\f[C]struct fi_cq_err_entry\f[R]. The format of this structure is defined below. .IP .nf @@ -537,9 +522,8 @@ Flags are set for all relevant completions. .TP \f[I]len\f[R] This len field applies to completed receive operations (e.g.\ fi_recv, -fi_trecv, etc.) -and the completed write with remote cq data on the responder side -(e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). +fi_trecv, etc.) and the completed write with remote cq data on the +responder side (e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). It indicates the size of transferred \f[I]message\f[R] data \[en] i.e.\ how many data bytes were placed into the associated receive/target buffer by a corresponding fi_send/fi_tsend/fi_write et al call. @@ -970,7 +954,7 @@ When heterogenous memory is involved, the concept of memory domains come into play. Memory domains identify the physical separation of memory, which may or may not be accessible through the same virtual address space. -See the \f[V]fi_mr\f[R](3) man page for further details on memory +See the \f[C]fi_mr\f[R](3) man page for further details on memory domains. .PP Completion ordering and data visibility are only well-defined for @@ -1030,7 +1014,7 @@ As a result, applications can request a lower completion semantic when posting receives. That indicates to the provider that the application will be responsible for handling any device specific flush operations that might be needed. -See \f[V]fi_msg\f[R](3) FLAGS. +See \f[C]fi_msg\f[R](3) FLAGS. 
.PP For data transfers that do not generate a completion at the target, such as RMA or atomics, it is the responsibility of the application to ensure @@ -1133,11 +1117,11 @@ returns -FI_EAGAIN. : Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_cntr\f[R](3), -\f[V]fi_poll\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_cntr\f[R](3), +\f[C]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_domain.3 b/man/man3/fi_domain.3 index c3a1b104a28..c8f9721e562 100644 --- a/man/man3/fi_domain.3 +++ b/man/man3/fi_domain.3 @@ -1,21 +1,7 @@ -'\" t -.\" Automatically generated by Pandoc 3.1.3 +.\"t +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_domain" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_domain" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -90,7 +76,7 @@ parameter. .PP Similar to fi_domain, but accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening peer domain. -See \f[V]fi_peer\f[R](3). +See \f[C]fi_peer\f[R](3). .SS fi_open_ops .PP fi_open_ops is used to open provider specific interfaces. @@ -187,9 +173,9 @@ through the event queue. If an event queue is not bound to the domain with the FI_REG_MR flag, then memory registration requests complete synchronously. 
.PP -See \f[V]fi_av_bind\f[R](3), \f[V]fi_ep_bind\f[R](3), -\f[V]fi_mr_bind\f[R](3), \f[V]fi_pep_bind\f[R](3), and -\f[V]fi_scalable_ep_bind\f[R](3) for more information. +See \f[C]fi_av_bind\f[R](3), \f[C]fi_ep_bind\f[R](3), +\f[C]fi_mr_bind\f[R](3), \f[C]fi_pep_bind\f[R](3), and +\f[C]fi_scalable_ep_bind\f[R](3) for more information. .SS fi_close .PP The fi_close call is used to release all resources associated with a @@ -198,7 +184,7 @@ All objects associated with the opened domain must be released prior to calling fi_close, otherwise the call will return -FI_EBUSY. .SH DOMAIN ATTRIBUTES .PP -The \f[V]fi_domain_attr\f[R] structure defines the set of attributes +The \f[C]fi_domain_attr\f[R] structure defines the set of attributes associated with a domain. .IP .nf @@ -663,7 +649,7 @@ size as the endpoint queue(s) that are bound to it. .SS AV Type (av_type) .PP Specifies the type of address vectors that are usable with this domain. -For additional details on AV type, see \f[V]fi_av\f[R](3). +For additional details on AV type, see \f[C]fi_av\f[R](3). The following values may be specified. .TP \f[I]FI_AV_MAP\f[R] (deprecated) @@ -687,7 +673,7 @@ optimal AV type supported by this domain. .SS Memory Registration Mode (mr_mode) .PP Defines memory registration specific mode bits used with this domain. -Full details on MR mode options are available in \f[V]fi_mr\f[R](3). +Full details on MR mode options are available in \f[C]fi_mr\f[R](3). The following values may be specified. .TP \f[I]FI_MR_ALLOCATED\f[R] @@ -868,7 +854,7 @@ If this domain capability is not set, address vectors cannot be opened with FI_AV_USER_ID. Note that FI_AV_USER_ID can still be supported through the AV insert calls without this domain capability set. -See \f[V]fi_av\f[R](3). +See \f[C]fi_av\f[R](3). .TP \f[I]FI_PEER\f[R] Specifies that the domain must support importing resources to be used in @@ -899,7 +885,7 @@ provider, for example. 
Indicates that the domain supports the ability to share address vectors among multiple processes using the named address vector feature. .PP -See \f[V]fi_getinfo\f[R](3) for a discussion on primary versus secondary +See \f[C]fi_getinfo\f[R](3) for a discussion on primary versus secondary capabilities. .SS Default authorization key (auth_key) .PP @@ -946,7 +932,7 @@ cache or lookup tables. .PP This specifies the default traffic class that will be associated any endpoints created within the domain. -See \f[V]fi_endpoint\f[R](3) for additional information. +See \f[C]fi_endpoint\f[R](3) for additional information. .SS Max Authorization Keys per Endpoint (max_ep_auth_key) .PP The maximum number of authorization keys which can be supported per @@ -955,7 +941,7 @@ connectionless endpoint. .PP The maximum value that a peer group may be assigned, inclusive. Valid peer group id\[cq]s must be between 0 and max_group_id. -See \f[V]fi_av\f[R](3) for additional information on peer groups and +See \f[C]fi_av\f[R](3) for additional information on peer groups and their use. Users may request support for peer groups by setting this to a non-zero value. @@ -967,7 +953,7 @@ the application. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -986,7 +972,7 @@ lightly loaded systems, without an administrator configuring system resources appropriately for the installed provider(s). .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_av\f[R](3), -\f[V]fi_eq\f[R](3), \f[V]fi_mr\f[R](3) \f[V]fi_peer\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_av\f[R](3), +\f[C]fi_eq\f[R](3), \f[C]fi_mr\f[R](3) \f[C]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index ea3b4076c9f..cbc4a28a885 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_endpoint" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_endpoint" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -277,7 +263,7 @@ been used. .PP Similar to fi_endpoint, buf accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening endpoints that use peer transfer feature. -See \f[V]fi_peer\f[R](3) +See \f[C]fi_peer\f[R](3) .SS fi_close .PP Closes an endpoint and release all resources associated with it. @@ -590,7 +576,7 @@ FI_HMEM_P2P_DISABLED: Peer to peer support should not be used. fi_setopt() will return -FI_EOPNOTSUPP if the mode requested cannot be supported by the provider. The FI_HMEM_DISABLE_P2P environment variable discussed in -\f[V]fi_mr\f[R](3) takes precedence over this setopt option. +\f[C]fi_mr\f[R](3) takes precedence over this setopt option. .RE \[bu] .RS 2 .TP @@ -623,10 +609,10 @@ Define the maximum message size that can be transferred by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]ep_attr->max_msg_size\f[R]. +\f[C]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. 
.RE \[bu] .RS 2 .TP @@ -635,10 +621,10 @@ Define the maximum message size that can be transferred by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]ep_attr->max_msg_size\f[R]. +\f[C]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -647,10 +633,10 @@ Define the maximum message size that can be transferred by the endpoint via a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]ep_attr->max_msg_size\f[R]. +\f[C]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -659,10 +645,10 @@ Define the maximum data size that can be transferred by the endpoint via a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]ep_attr->max_msg_size\f[R]. +\f[C]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -671,10 +657,10 @@ Define the maximum message size that can be injected by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]tx_attr->inject_size\f[R]. +\f[C]tx_attr->inject_size\f[R]. 
Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]tx_attr->inject_size\f[R] should be used. +In that case, \f[C]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -683,10 +669,10 @@ Define the maximum message size that can be injected by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]tx_attr->inject_size\f[R]. +\f[C]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]tx_attr->inject_size\f[R] should be used. +In that case, \f[C]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -695,10 +681,10 @@ Define the maximum data size that can be injected by the endpoint in a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]tx_attr->inject_size\f[R]. +\f[C]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]tx_attr->inject_size\f[R] should be used. +In that case, \f[C]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -707,10 +693,10 @@ Define the maximum data size that can be injected by the endpoint in a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[V]tx_attr->inject_size\f[R]. +\f[C]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[V]tx_attr->inject_size\f[R] should be used. +In that case, \f[C]tx_attr->inject_size\f[R] should be used. .RE .SS fi_tc_dscp_set .PP @@ -1793,7 +1779,7 @@ value of transmit or receive context attributes of an endpoint. 
\f[I]FI_COMMIT_COMPLETE\f[R] Indicates that a completion should not be generated (locally or at the peer) until the result of an operation have been made persistent. -See \f[V]fi_cq\f[R](3) for additional details on completion semantics. +See \f[C]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_COMPLETION\f[R] Indicates that a completion queue entry should be written for data @@ -1806,7 +1792,7 @@ See the fi_ep_bind section above for more detail. \f[I]FI_DELIVERY_COMPLETE\f[R] Indicates that a completion should be generated when the operation has been processed by the destination endpoint(s). -See \f[V]fi_cq\f[R](3) for additional details on completion semantics. +See \f[C]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_INJECT\f[R] Indicates that all outbound data buffers should be returned to the @@ -1821,7 +1807,7 @@ This limit is indicated using inject_size (see inject_size above). \f[I]FI_INJECT_COMPLETE\f[R] Indicates that a completion should be generated when the source buffer(s) may be reused. -See \f[V]fi_cq\f[R](3) for additional details on completion semantics. +See \f[C]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_MULTICAST\f[R] Indicates that data transfers will target multicast addresses by @@ -1845,7 +1831,7 @@ space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). \f[I]FI_TRANSMIT_COMPLETE\f[R] Indicates that a completion should be generated when the transmit operation has completed relative to the local provider. -See \f[V]fi_cq\f[R](3) for additional details on completion semantics. +See \f[C]fi_cq\f[R](3) for additional details on completion semantics. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -1854,10 +1840,10 @@ fabric endpoint. Endpoints allocated with the FI_CONTEXT or FI_CONTEXT2 mode bits set must typically provide struct fi_context(2) as their per operation context parameter. 
-(See fi_getinfo.3 for details.) -However, when FI_SELECTIVE_COMPLETION is enabled to suppress CQ -completion entries, and an operation is initiated without the -FI_COMPLETION flag set, then the context parameter is ignored. +(See fi_getinfo.3 for details.) However, when FI_SELECTIVE_COMPLETION is +enabled to suppress CQ completion entries, and an operation is initiated +without the FI_COMPLETION flag set, then the context parameter is +ignored. An application does not need to pass in a valid struct fi_context(2) into such data transfers. .PP @@ -1896,7 +1882,7 @@ submitted for processing. For fi_setopt/fi_getopt, a return value of -FI_ENOPROTOOPT indicates the provider does not support the requested option. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EDOMAIN\f[R] @@ -1910,8 +1896,8 @@ The endpoint has not been configured with necessary completion queue. The endpoint\[cq]s state does not permit the requested operation. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) -\f[V]fi_msg\f[R](3), \f[V]fi_tagged\f[R](3), \f[V]fi_rma\f[R](3) -\f[V]fi_peer\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[C]fi_msg\f[R](3), \f[C]fi_tagged\f[R](3), \f[C]fi_rma\f[R](3) +\f[C]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_eq.3 b/man/man3/fi_eq.3 index 359feed69b9..0622ffbbf62 100644 --- a/man/man3/fi_eq.3 +++ b/man/man3/fi_eq.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_eq" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_eq" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -123,7 +109,7 @@ as listening for connection requests. fi_eq_open allocates a new event queue. .PP The properties and behavior of an event queue are defined by -\f[V]struct fi_eq_attr\f[R]. +\f[C]struct fi_eq_attr\f[R]. .IP .nf \f[C] @@ -273,7 +259,7 @@ These include the following types of events: memory registration, address vector resolution, and multicast joins. .PP Control requests report their completion by inserting a -\f[V]struct fi_eq_entry\f[R] into the EQ. +\f[C]struct fi_eq_entry\f[R] into the EQ. The format of this structure is: .IP .nf @@ -297,7 +283,7 @@ The context field will be set to the context specified as part of the operation, if available, otherwise the context will be associated with the fabric descriptor. The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .TP @@ -307,7 +293,7 @@ setup or tear down connections between endpoints. There are three connection notification events: FI_CONNREQ, FI_CONNECTED, and FI_SHUTDOWN. Connection notifications are reported using -\f[V]struct fi_eq_cm_entry\f[R]: +\f[C]struct fi_eq_cm_entry\f[R]: .IP .nf \f[C] @@ -446,7 +432,7 @@ The context field will be set to the context specified as part of the operation. .PP The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). 
.PP @@ -572,10 +558,10 @@ fi_eq_strerror Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), \f[V]fi_poll\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), \f[C]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_errno.3 b/man/man3/fi_errno.3 index e73772c435f..be13272dd97 100644 --- a/man/man3/fi_errno.3 +++ b/man/man3/fi_errno.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_errno" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_errno" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -198,6 +184,6 @@ Receiver not ready, no receive buffers available Memory registration limit exceeded .SH SEE ALSO .PP -\f[V]fabric\f[R](7) +\f[C]fabric\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_fabric.3 b/man/man3/fi_fabric.3 index 205a8d96691..cc2a0e63219 100644 --- a/man/man3/fi_fabric.3 +++ b/man/man3/fi_fabric.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_fabric" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_fabric" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -160,11 +146,11 @@ The data parameter is ignored. .TP \f[I]FI_TYPE_EQ_EVENT\f[R] uint32_t event parameter returned from fi_eq_read(). -See \f[V]fi_eq(3)\f[R] for a list of known values. +See \f[C]fi_eq(3)\f[R] for a list of known values. .TP \f[I]FI_TYPE_CQ_EVENT_FLAGS\f[R] uint64_t flags field in fi_cq_xxx_entry structures. -See \f[V]fi_cq(3)\f[R] for valid flags. +See \f[C]fi_cq(3)\f[R] for valid flags. .TP \f[I]FI_TYPE_MR_MODE\f[R] struct fi_domain_attr::mr_mode flags @@ -259,7 +245,7 @@ these environment variables in a production setting. Version information for the fabric provider, in a major.minor format. The use of the FI_MAJOR() and FI_MINOR() version macros may be used to extract the major and minor version data. -See \f[V]fi_version(3)\f[R]. +See \f[C]fi_version(3)\f[R]. .PP In case of an utility provider layered over a core provider, the version would always refer to that of the utility provider. @@ -267,16 +253,16 @@ would always refer to that of the utility provider. .PP The interface version requested by the application. This value corresponds to the version parameter passed into -\f[V]fi_getinfo(3)\f[R]. +\f[C]fi_getinfo(3)\f[R]. .SH RETURN VALUE .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), -\f[V]fi_eq\f[R](3), \f[V]fi_endpoint\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), +\f[C]fi_eq\f[R](3), \f[C]fi_endpoint\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index c4ee256d526..8c531815b94 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_getinfo" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_getinfo" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -232,8 +218,8 @@ manner. The fi_info::handle field is also used by fi_endpoint() and fi_reject() calls when processing connection requests or to inherit another endpoint\[cq]s attributes. -See \f[V]fi_eq\f[R](3), \f[V]fi_reject\f[R](3), and -\f[V]fi_endpoint\f[R](3). +See \f[C]fi_eq\f[R](3), \f[C]fi_reject\f[R](3), and +\f[C]fi_endpoint\f[R](3). The info->handle field will be ignored by fi_dupinfo and fi_freeinfo. .TP \f[I]tx_attr - transmit context attributes\f[R] @@ -266,7 +252,7 @@ set. On output, the actual endpoint attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[V]fi_endpoint\f[R](3) for details. +See \f[C]fi_endpoint\f[R](3) for details. .TP \f[I]domain_attr - domain attributes\f[R] Optionally supplied domain attributes. @@ -276,7 +262,7 @@ be set. On output, the actual domain attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[V]fi_domain\f[R](3) for details. +See \f[C]fi_domain\f[R](3) for details. .TP \f[I]fabric_attr - fabric attributes\f[R] Optionally supplied fabric attributes. @@ -285,14 +271,14 @@ When provided as hints, requested values of struct fi_fabric_attr should be set. 
On output, the actual fabric attributes that can be provided will be returned. -See \f[V]fi_fabric\f[R](3) for details. +See \f[C]fi_fabric\f[R](3) for details. .TP \f[I]nic - network interface details\f[R] Optional attributes related to the hardware NIC associated with the specified fabric, domain, and endpoint data. This field is only valid for providers where the corresponding attributes are closely associated with a hardware NIC. -See \f[V]fi_nic\f[R](3) for details. +See \f[C]fi_nic\f[R](3) for details. .SH CAPABILITIES .PP Interface capabilities are obtained by OR-ing the following flags @@ -324,12 +310,12 @@ Requests that the provider support the association of a user specified identifier with each address vector (AV) address. User identifiers are returned with completion data in place of the AV address. -See \f[V]fi_domain\f[R](3) and \f[V]fi_av\f[R](3) for more details. +See \f[C]fi_domain\f[R](3) and \f[C]fi_av\f[R](3) for more details. .TP \f[I]FI_COLLECTIVE\f[R] Requests support for collective operations. Endpoints that support this capability support the collective operations -defined in \f[V]fi_collective\f[R](3). +defined in \f[C]fi_collective\f[R](3). .TP \f[I]FI_DIRECTED_RECV\f[R] Requests that the communication endpoint use the source address of an @@ -511,7 +497,7 @@ endpoint as send-only or receive-only. \f[I]FI_TRIGGER\f[R] Indicates that the endpoint should support triggered operations. Endpoints support this capability must meet the usage model as described -by \f[V]fi_trigger\f[R](3). +by \f[C]fi_trigger\f[R](3). .TP \f[I]FI_WRITE\f[R] Indicates that the user requires an endpoint capable of initiating @@ -522,7 +508,7 @@ This flag requires that FI_RMA and/or FI_ATOMIC be set. Specifies that the endpoint should support transfers that may be initiated from heterogenous computation devices, such as GPUs. This flag requires that FI_TRIGGER be set. -For additional details on XPU triggers see \f[V]fi_trigger\f[R](3). 
+For additional details on XPU triggers see \f[C]fi_trigger\f[R](3). .PP Capabilities may be grouped into three general categories: primary, secondary, and primary modifiers. @@ -626,8 +612,8 @@ application for access domains opened with this capability. This flag is defined for compatibility and is ignored if the application version is 1.5 or later and the domain mr_mode is set to anything other than FI_MR_BASIC or FI_MR_SCALABLE. -See the domain attribute mr_mode \f[V]fi_domain\f[R](3) and -\f[V]fi_mr\f[R](3). +See the domain attribute mr_mode \f[C]fi_domain\f[R](3) and +\f[C]fi_mr\f[R](3). .TP \f[I]FI_MSG_PREFIX\f[R] Message prefix mode indicates that an application will provide buffer @@ -687,7 +673,7 @@ these operations. A provider may support one or more of the following addressing formats. In some cases, a selected addressing format may need to be translated or mapped into an address which is native to the fabric. -See \f[V]fi_av\f[R](3). +See \f[C]fi_av\f[R](3). .TP \f[I]FI_ADDR_EFA\f[R] Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format. @@ -775,7 +761,7 @@ This flag is often used with passive endpoints. fi_getinfo() returns 0 on success. On error, fi_getinfo() returns a negative value corresponding to fabric errno. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .PP fi_allocinfo() returns a pointer to a new fi_info structure on success, or NULL on error. @@ -830,11 +816,11 @@ by fi_getinfo. If neither node, service or hints are provided, then fi_getinfo simply returns the list all available communication interfaces. .PP -Multiple threads may call \f[V]fi_getinfo\f[R] simultaneously, without +Multiple threads may call \f[C]fi_getinfo\f[R] simultaneously, without any requirement for serialization. 
.SH SEE ALSO .PP -\f[V]fi_open\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), -\f[V]fi_nic\f[R](3) \f[V]fi_trigger\f[R](3) +\f[C]fi_open\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), +\f[C]fi_nic\f[R](3) \f[C]fi_trigger\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index c651e29d73d..3c11b40fe5f 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_mr" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_mr" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -717,7 +703,7 @@ specifying the FI_MR_DMABUF flag. The number of entries in the mr_iov array. The maximum number of memory buffers that may be associated with a single memory region is specified as the mr_iov_limit domain attribute. -See \f[V]fi_domain(3)\f[R]. +See \f[C]fi_domain(3)\f[R]. .SS access .PP Indicates the type of \f[I]operations\f[R] that the local or a peer @@ -799,7 +785,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key_size must equal -\f[V]sizeof(struct fi_mr_auth_key)\f[R]. +\f[C]sizeof(struct fi_mr_auth_key)\f[R]. .SS auth_key .PP Indicates the key to associate with this memory registration. @@ -812,7 +798,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key must point to a -user-defined \f[V]struct fi_mr_auth_key\f[R]. +user-defined \f[C]struct fi_mr_auth_key\f[R]. 
.SS iface .PP Indicates the software interfaces used by the application to allocate @@ -941,7 +927,7 @@ keys in the AV. .PP If the domain was opened with FI_DIRECTED_RECV, addr can be used to limit the memory region to a specific fi_addr_t, including -fi_addr_t\[cq]s return from \f[V]fi_av_insert_auth_key\f[R]. +fi_addr_t\[cq]s return from \f[C]fi_av_insert_auth_key\f[R]. .SH NOTES .PP Direct access to an application\[cq]s memory by a remote peer requires @@ -1071,7 +1057,7 @@ For example, the physical pages referenced by a virtual address range could migrate between host memory and GPU memory, depending on which computational unit is actively using it. .PP -See the \f[V]fi_endpoint\f[R](3) and \f[V]fi_cq\f[R](3) man pages for +See the \f[C]fi_endpoint\f[R](3) and \f[C]fi_cq\f[R](3) man pages for addition discussion on message, data, and completion ordering semantics, including the impact of memory domains. .SH RETURN VALUES @@ -1079,7 +1065,7 @@ including the impact of memory domains. Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_ENOKEY\f[R] @@ -1185,8 +1171,8 @@ Some level of control over the cache is possible through the above mentioned environment variables. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_rma\f[R](3), \f[V]fi_msg\f[R](3), -\f[V]fi_atomic\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_rma\f[R](3), \f[C]fi_msg\f[R](3), +\f[C]fi_atomic\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index 4343f68e1ad..5919225afc4 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_msg" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_msg" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -79,7 +65,7 @@ Count of vectored data entries. .TP \f[I]desc\f[R] Descriptor associated with the data buffer. -See \f[V]fi_mr\f[R](3). +See \f[C]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent message. @@ -156,7 +142,7 @@ parameter to a remote endpoint as a single message. The fi_sendmsg call supports data transfers over both connected and connectionless endpoints, with the ability to control the send operation per call through the use of flags. -The fi_sendmsg function takes a \f[V]struct fi_msg\f[R] as input. +The fi_sendmsg function takes a \f[C]struct fi_msg\f[R] as input. .IP .nf \f[C] @@ -279,7 +265,7 @@ Note that an entry to the associated receive completion queue will always be generated when the buffer has been consumed, even if other receive completions have been suppressed (i.e.\ the Rx context has been configured for FI_SELECTIVE_COMPLETION). -See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). +See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3). .TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_sendmsg. @@ -294,7 +280,7 @@ tracked by the provider. 
For receive operations, indicates that a completion may be generated as soon as the message has been processed by the local provider, even if the message data may not be visible to all processing elements. -See \f[V]fi_cq\f[R](3) for target side completion semantics. +See \f[C]fi_cq\f[R](3) for target side completion semantics. .TP \f[I]FI_DELIVERY_COMPLETE\f[R] Applies to fi_sendmsg. @@ -340,7 +326,7 @@ buffer length. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .PP See the discussion below for details handling FI_EAGAIN. .SH ERRORS @@ -373,7 +359,7 @@ acknowledgements or flow control messages may need to be processed in order to resume execution. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_nic.3 b/man/man3/fi_nic.3 index c04a55d30e1..e64dfe31473 100644 --- a/man/man3/fi_nic.3 +++ b/man/man3/fi_nic.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_nic" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_nic" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -23,7 +9,7 @@ fi_nic - Fabric network interface card attributes .PP The fid_nic structure defines attributes for a struct fi_info that is directly associated with underlying networking hardware and may be -returned directly from calling \f[V]fi_getinfo\f[R](3). +returned directly from calling \f[C]fi_getinfo\f[R](3). The format of fid_nic and the related substructures are defined below. .PP Note that not all fields of all structures may be available. @@ -149,7 +135,7 @@ Ethernet or InfiniBand. .PP Provider attributes reference provider specific details of the device. These attributes are both provider and device specific. -The attributes can be interpreted by \f[V]fi_tostr\f[R](3). +The attributes can be interpreted by \f[C]fi_tostr\f[R](3). Applications may also use the other attribute fields, such as related fi_fabric_attr: prov_name field, to determine an appropriate structure to cast the attributes. @@ -159,10 +145,10 @@ specific header file included with libfabric package. .SH NOTES .PP The fid_nic structure is returned as part of a call to -\f[V]fi_getinfo\f[R](3). -It is automatically freed as part of calling \f[V]fi_freeinfo\f[R](3) +\f[C]fi_getinfo\f[R](3). +It is automatically freed as part of calling \f[C]fi_freeinfo\f[R](3) .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3) +\f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_peer.3 b/man/man3/fi_peer.3 index 1279c23dcd2..24b6464f9f3 100644 --- a/man/man3/fi_peer.3 +++ b/man/man3/fi_peer.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. 
ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_peer" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_peer" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -287,8 +273,8 @@ If manual progress is needed on the peer CQ, the owner should drive progress by using the fi_cq_read() function with the buf parameter set to NULL and count equal 0. The peer provider should set other functions that attempt to read the -peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) -to return -FI_ENOSYS. +peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) to return +-FI_ENOSYS. .SS fi_ops_cq_owner::write() .PP This call directs the owner to insert new completions into the CQ. @@ -379,8 +365,8 @@ Similar to the peer CQ, if manual progress is needed on the peer counter, the owner should drive progress by using the fi_cntr_read() and the fi_cntr_read() should do nothing but progress the peer cntr. The peer provider should set other functions that attempt to access the -peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) -to return -FI_ENOSYS. +peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) to return +-FI_ENOSYS. .SS fi_ops_cntr_owner::inc() .PP This call directs the owner to increment the value of the cntr. @@ -815,9 +801,9 @@ callbacks. .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[V]fi_provider\f[R](7), \f[V]fi_provider\f[R](3), \f[V]fi_cq\f[R](3), +\f[C]fi_provider\f[R](7), \f[C]fi_provider\f[R](3), \f[C]fi_cq\f[R](3), .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_poll.3 b/man/man3/fi_poll.3 index b49b9d35174..a3ee5cbbcb0 100644 --- a/man/man3/fi_poll.3 +++ b/man/man3/fi_poll.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_poll" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_poll" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -329,7 +315,7 @@ or fid. Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .TP fi_poll On success, if events are available, returns the number of entries @@ -406,7 +392,7 @@ The use of the fi_trywait() function is still required if accessing wait objects directly. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), -\f[V]fi_eq\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), +\f[C]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_profile.3 b/man/man3/fi_profile.3 index f24305abf82..234a5a10f54 100644 --- a/man/man3/fi_profile.3 +++ b/man/man3/fi_profile.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_profile" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_profile" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -337,6 +323,6 @@ be returned. For fi_profile_query_vars and fi_profile_query_events, a positive return value indicates the number of variables or events returned in the list. .PP -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_provider.3 b/man/man3/fi_provider.3 index 0f123ba3246..4814ef7b6d8 100644 --- a/man/man3/fi_provider.3 +++ b/man/man3/fi_provider.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_provider" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_provider" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -197,7 +183,7 @@ the service or resource to which they correspond. The mr_cache object references the internal memory registration cache used by the different providers. Additional information on the cache is available in the -\f[V]fi_mr(3)\f[R] man page. +\f[C]fi_mr(3)\f[R] man page. .TP \f[I]logging\f[R] The logging object references the internal logging subsystem used by the @@ -207,8 +193,8 @@ Can be opened only once and only the last import is used if imported multiple times. .SS fi_import .PP -This helper function is a combination of \f[V]fi_open\f[R] and -\f[V]fi_import_fid\f[R]. +This helper function is a combination of \f[C]fi_open\f[R] and +\f[C]fi_import_fid\f[R]. 
It may be used to import a fabric object created and owned by the libfabric user. This allows the upper level libraries or the application to override or @@ -278,9 +264,9 @@ For integrated providers .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) \f[V]fi_mr\f[R](3), +\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) \f[C]fi_mr\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_rma.3 b/man/man3/fi_rma.3 index dea50502f69..236b922f418 100644 --- a/man/man3/fi_rma.3 +++ b/man/man3/fi_rma.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_rma" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_rma" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -92,7 +78,7 @@ FI_MR_SCALABLE. Protection key associated with the remote memory. .TP \f[I]desc\f[R] -Descriptor associated with the local data buffer See \f[V]fi_mr\f[R](3). +Descriptor associated with the local data buffer See \f[C]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the operation. @@ -189,7 +175,7 @@ struct fi_rma_iov { .PP The write inject call is an optimized version of fi_write. It provides similar completion semantics as fi_inject -\f[V]fi_msg\f[R](3). +\f[C]fi_msg\f[R](3). 
.SS fi_writedata .PP The write data call is similar to fi_write, but allows for the sending @@ -290,15 +276,15 @@ operation (inclusive) to the posting of a subsequent fenced operation .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[V]fi_msg\f[R](3) for a detailed description of handling +See \f[C]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index 399c4d278f9..7e11b18b037 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_tagged" "3" "2024\-10\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_tagged" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -88,7 +74,7 @@ Mask of bits to ignore applied to the tag for receive operations. .TP \f[I]desc\f[R] Memory descriptor associated with the data buffer. -See \f[V]fi_mr\f[R](3). +See \f[C]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent data. @@ -213,7 +199,7 @@ struct fi_msg_tagged { .PP The tagged inject call is an optimized version of fi_tsend. It provides similar completion semantics as fi_inject -\f[V]fi_msg\f[R](3). +\f[C]fi_msg\f[R](3). 
.SS fi_tsenddata .PP The tagged send data call is similar to fi_tsend, but allows for the @@ -301,7 +287,7 @@ Note that an entry to the associated receive completion queue will always be generated when the buffer has been consumed, even if other receive completions have been suppressed (i.e.\ the Rx context has been configured for FI_SELECTIVE_COMPLETION). -See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). +See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3). .TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_tsendmsg. @@ -395,11 +381,11 @@ ignored. The tagged send and receive calls return 0 on success. On error, a negative value corresponding to fabric \f[I]errno \f[R] is returned. -Fabric errno values are defined in \f[V]fi_errno.h\f[R]. +Fabric errno values are defined in \f[C]fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[V]fi_msg\f[R](3) for a detailed description of handling +See \f[C]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EINVAL\f[R] @@ -409,7 +395,7 @@ Indicates that an invalid argument was supplied by the user. Indicates that an unspecified error occurred. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_trigger.3 b/man/man3/fi_trigger.3 index fea5833b591..38e85f16bfb 100644 --- a/man/man3/fi_trigger.3 +++ b/man/man3/fi_trigger.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_trigger" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_trigger" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -213,7 +199,7 @@ If a specific request is not supported by the provider, it will fail the operation with -FI_ENOSYS. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_mr\f[R](3), -\f[V]fi_alias\f[R](3), \f[V]fi_cntr\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_mr\f[R](3), +\f[C]fi_alias\f[R](3), \f[C]fi_cntr\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_version.3 b/man/man3/fi_version.3 index 5352c05255e..b046adf9132 100644 --- a/man/man3/fi_version.3 +++ b/man/man3/fi_version.3 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_version" "3" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_version" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -48,6 +34,6 @@ The upper 16-bits of the version correspond to the major number, and the lower 16-bits correspond with the minor number. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index 73ed2109b19..b928c04ed89 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. 
ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fabric" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fabric" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -30,7 +16,7 @@ fabric - Fabric Interface Library Libfabric is a high-performance fabric software library designed to provide low-latency interfaces to fabric hardware. For an in-depth discussion of the motivation and design see -\f[V]fi_guide\f[R](7). +\f[C]fi_guide\f[R](7). .SH OVERVIEW .PP Libfabric provides `process direct I/O' to application software @@ -51,7 +37,7 @@ All fabric hardware devices and their software drivers are required to support this framework. Devices and the drivers that plug into the libfabric framework are referred to as fabric providers, or simply providers. -Provider details may be found in \f[V]fi_provider\f[R](7). +Provider details may be found in \f[C]fi_provider\f[R](7). .TP \f[I]Fabric Interfaces\f[R] The second component is a set of communication operations. @@ -296,18 +282,18 @@ If the list begins with the `\[ha]' symbol, then the list will be negated. .PP Example: To enable the udp and tcp providers only, set: -\f[V]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] +\f[C]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] .PP When libfabric is installed, DL providers are put under the \f[I]default provider path\f[R], which is determined by how libfabric is built and installed. Usually the default provider path is -\f[V]/lib/libfabric\f[R] or -\f[V]/lib64/libfabric\f[R]. +\f[C]/lib/libfabric\f[R] or +\f[C]/lib64/libfabric\f[R]. By default, libfabric tries to find DL providers in the following order: .IP "1." 3 Use `dlopen' to load provider libraries named -\f[V]lib-fi.so\f[R] for all providers enabled at build time. +\f[C]lib-fi.so\f[R] for all providers enabled at build time. The search path of `ld.so' is used to locate the files. 
This step is skipped if libfabric is configured with the option `\[en]enable-restricted-dl'. @@ -377,7 +363,7 @@ can be used to retrieve information about which providers are available in the system. Additionally, it can retrieve a list of all environment variables that may be used to configure libfabric and each provider. -See \f[V]fi_info\f[R](1) for more details. +See \f[C]fi_info\f[R](1) for more details. .SH ENVIRONMENT VARIABLE CONTROLS .PP Core features of libfabric and its providers may be configured by an @@ -414,22 +400,22 @@ may not be available in a child process because of copy on write restrictions. .SS CUDA deadlock .PP -In some cases, calls to \f[V]cudaMemcpy()\f[R] within libfabric may +In some cases, calls to \f[C]cudaMemcpy()\f[R] within libfabric may result in a deadlock. This typically occurs when a CUDA kernel blocks until a -\f[V]cudaMemcpy\f[R] on the host completes. +\f[C]cudaMemcpy\f[R] on the host completes. Applications which can cause such behavior can restrict Libfabric\[cq]s ability to invoke CUDA API operations with the endpoint option -\f[V]FI_OPT_CUDA_API_PERMITTED\f[R]. -See \f[V]fi_endpoint\f[R](3) for more details. +\f[C]FI_OPT_CUDA_API_PERMITTED\f[R]. +See \f[C]fi_endpoint\f[R](3) for more details. .PP Another mechanism which can be used to avoid deadlock is Nvidia\[cq]s GDRCopy. Using GDRCopy requires an external library and kernel module available at https://github.com/NVIDIA/gdrcopy. Libfabric must be configured with GDRCopy support using the -\f[V]--with-gdrcopy\f[R] option, and be run with -\f[V]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. +\f[C]--with-gdrcopy\f[R] option, and be run with +\f[C]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. This may not be supported by all providers. 
.SH ABI CHANGES .PP @@ -523,9 +509,9 @@ Added new fields to the following attributes: Added max_group_id .SH SEE ALSO .PP -\f[V]fi_info\f[R](1), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3), -\f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_av\f[R](3), -\f[V]fi_eq\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_cntr\f[R](3), -\f[V]fi_mr\f[R](3) +\f[C]fi_info\f[R](1), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3), +\f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_av\f[R](3), +\f[C]fi_eq\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_cntr\f[R](3), +\f[C]fi_mr\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_arch.7 b/man/man7/fi_arch.7 index 6bd4ad0abc5..fe62ebd155b 100644 --- a/man/man7/fi_arch.7 +++ b/man/man7/fi_arch.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_arch" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_arch" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .IP .nf diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index 0360e4cb466..a81b6dd18b7 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,21 +1,7 @@ -'\" t -.\" Automatically generated by Pandoc 3.1.3 +.\"t +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_cxi" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cxi" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -190,7 +176,7 @@ Classes. .PP While a libfabric user provided authorization key is optional, it is highly encouraged that libfabric users provide an authorization key -through the domain attribute hints during \f[V]fi_getinfo()\f[R]. +through the domain attribute hints during \f[C]fi_getinfo()\f[R]. How libfabric users acquire the authorization key may vary between the users and is outside the scope of this document. .PP @@ -206,18 +192,18 @@ authorization key using them. .IP \[bu] 2 \f[I]SLINGSHOT_VNIS\f[R]: Comma separated list of VNIs. The CXI provider will only use the first VNI if multiple are provide. -Example: \f[V]SLINGSHOT_VNIS=234\f[R]. +Example: \f[C]SLINGSHOT_VNIS=234\f[R]. .IP \[bu] 2 \f[I]SLINGSHOT_DEVICES\f[R]: Comma separated list of device names. Each device index will use the same index to lookup the service ID in \f[I]SLINGSHOT_SVC_IDS\f[R]. -Example: \f[V]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. +Example: \f[C]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. .IP \[bu] 2 \f[I]SLINGSHOT_SVC_IDS\f[R]: Comma separated list of pre-configured CXI service IDs. Each service ID index will use the same index to lookup the CXI device in \f[I]SLINGSHOT_DEVICES\f[R]. -Example: \f[V]SLINGSHOT_SVC_IDS=5,6\f[R]. +Example: \f[C]SLINGSHOT_SVC_IDS=5,6\f[R]. .PP \f[B]Note:\f[R] How valid VNIs and device services are configured is outside the responsibility of the CXI provider. @@ -622,7 +608,7 @@ into the fi_control(FI_QUEUE_WORK) critical path. The following subsections outline the CXI provider fork support. 
.SS RDMA and Fork Overview .PP -Under Linux, \f[V]fork()\f[R] is implemented using copy-on-write (COW) +Under Linux, \f[C]fork()\f[R] is implemented using copy-on-write (COW) pages, so the only penalty that it incurs is the time and memory required to duplicate the parent\[cq]s page tables, mark all of the process\[cq]s page structs as read only and COW, and create a unique @@ -665,22 +651,22 @@ The crux of the issue is the parent issuing forks while trying to do RDMA operations to registered memory regions. Excluding software RDMA emulation, two options exist for RDMA NIC vendors to resolve this data corruption issue. -- Linux \f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC +- Linux \f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC support for on-demand paging (ODP) .SS Linux madvise() MADV_DONTFORK and MADV_DOFORK .PP The generic (i.e.\ non-vendor specific) RDMA NIC solution to the Linux COW fork policy and RDMA problem is to use the following -\f[V]madvise()\f[R] operations during memory registration and +\f[C]madvise()\f[R] operations during memory registration and deregistration: - MADV_DONTFORK: Do not make the pages in this range -available to the child after a \f[V]fork()\f[R]. +available to the child after a \f[C]fork()\f[R]. This is useful to prevent copy-on-write semantics from changing the physical location of a page if the parent writes to it after a -\f[V]fork()\f[R]. +\f[C]fork()\f[R]. (Such page relocations cause problems for hardware that DMAs into the -page.) -- MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the default -behavior, whereby a mapping is inherited across \f[V]fork()\f[R]. +page.) - MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the +default behavior, whereby a mapping is inherited across +\f[C]fork()\f[R]. .PP In the Linux kernel, MADV_DONTFORK will result in the virtual memory area struct (VMA) being marked with the VM_DONTCOPY flag. 
@@ -691,14 +677,14 @@ Should the child reference the virtual address corresponding to the VMA which was not duplicated, it will segfault. .PP In the previous example, if Process A issued -\f[V]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing +\f[C]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing RDMA memory registration, the physical address 0x1000 would have remained with Process A. This would prevent the Process A data corruption as well. If Process B were to reference virtual address 0xffff0000, it will segfault due to the hole in the virtual address space. .PP -Using \f[V]madvise()\f[R] with MADV_DONTFORK may be problematic for +Using \f[C]madvise()\f[R] with MADV_DONTFORK may be problematic for applications performing RDMA and page aliasing. Paging aliasing is where the parent process uses part or all of a page to share information with the child process. @@ -752,7 +738,7 @@ The CXI provider is subjected to the Linux COW fork policy and RDMA issues described in section \f[I]RDMA and Fork Overview\f[R]. To prevent data corruption with fork, the CXI provider supports the following options: - CXI specific fork environment variables to enable -\f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* +\f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* .PP **Formal ODP support pending.* .SS CXI Specific Fork Environment Variables @@ -760,27 +746,27 @@ following options: - CXI specific fork environment variables to enable The CXI software stack has two environment variables related to fork: 0 CXI_FORK_SAFE: Enables base fork safe support. With this environment variable set, regardless of value, libcxi will -issue \f[V]madvise()\f[R] with MADV_DONTFORK on the virtual address +issue \f[C]madvise()\f[R] with MADV_DONTFORK on the virtual address range being registered for RDMA. 
-In addition, libcxi always align the \f[V]madvise()\f[R] to the system +In addition, libcxi always align the \f[C]madvise()\f[R] to the system default page size. On x86, this is 4 KiB. -To prevent redundant \f[V]madvise()\f[R] calls with MADV_DONTFORK +To prevent redundant \f[C]madvise()\f[R] calls with MADV_DONTFORK against the same virtual address region, reference counting is used -against each tracked \f[V]madvise()\f[R] region. -In addition, libcxi will spilt and merge tracked \f[V]madvise()\f[R] +against each tracked \f[C]madvise()\f[R] region. +In addition, libcxi will spilt and merge tracked \f[C]madvise()\f[R] regions if needed. Once the reference count reaches zero, libcxi will call -\f[V]madvise()\f[R] with MADV_DOFORK, and no longer track the region. +\f[C]madvise()\f[R] with MADV_DOFORK, and no longer track the region. - CXI_FORK_SAFE_HP: With this environment variable set, in conjunction with CXI_FORK_SAFE, libcxi will not assume the page size is system default page size. -Instead, libcxi will walk \f[V]/proc//smaps\f[R] to determine the -correct page size and align the \f[V]madvise()\f[R] calls accordingly. +Instead, libcxi will walk \f[C]/proc//smaps\f[R] to determine the +correct page size and align the \f[C]madvise()\f[R] calls accordingly. This environment variable should be set if huge pages are being used for RDMA. To amortize the per memory registration walk of -\f[V]/proc//smaps\f[R], the libfabric MR cache should be used. +\f[C]/proc//smaps\f[R], the libfabric MR cache should be used. .PP Setting these environment variables will prevent data corruption when the parent issues a fork. @@ -814,7 +800,7 @@ transfer. The following is the CXI provider fork support guidance: - Enable CXI_FORK_SAFE. If huge pages are also used, CXI_FORK_SAFE_HP should be enabled as well. 
-Since enabling this will result in \f[V]madvice()\f[R] with +Since enabling this will result in \f[C]madvice()\f[R] with MADV_DONTFORK, the following steps should be taken to prevent a child process segfault: - Avoid using stack memory for RDMA - Avoid child process having to access a virtual address range the parent process is @@ -1565,6 +1551,14 @@ GPU direct RDMA may or may not work in this case. Enable enforcement of triggered operation limit. Doing this can prevent fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance. +.TP +\f[I]FI_CXI_MR_CACHE_EVENTS_DISABLE_POLL_NSECS\f[R] +Max amount of time to poll when disabling an MR configured with MR match +events. +.TP +\f[I]FI_CXI_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS\f[R] +Max amount of time to poll when LE invalidate disabling an MR configured +with MR match events. .PP Note: Use the fi_info utility to query provider environment variables: fi_info -p cxi -e @@ -1630,7 +1624,7 @@ It can only be changed prior to any MR being created. .PP CXI domain extensions have been named \f[I]FI_CXI_DOM_OPS_6\f[R]. The flags parameter is ignored. -The fi_open_ops function takes a \f[V]struct fi_cxi_dom_ops\f[R]. +The fi_open_ops function takes a \f[C]struct fi_cxi_dom_ops\f[R]. See an example of usage below: .IP .nf @@ -1660,10 +1654,10 @@ struct fi_cxi_dom_ops { \f[R] .fi .PP -\f[I]cntr_read\f[R] extension is used to read hardware counter values. -Valid values of the cntr argument are found in the Cassini-specific -header file cassini_cntr_defs.h. -Note that Counter accesses by applications may be rate-limited to 1HZ. +\f[I]cntr_read\f[R] extension is used to read Cassini Telemetry items +that consists of counters and gauges. +The items available and their content are dependent upon the Cassini +ASIC version and Cassini Driver version. .PP \f[I]topology\f[R] extension is used to return CXI NIC address topology information for the domain. 
@@ -1723,7 +1717,7 @@ removed from the domain opts prior to software release 2.2. .PP CXI counter extensions have been named \f[I]FI_CXI_COUNTER_OPS\f[R]. The flags parameter is ignored. -The fi_open_ops function takes a \f[V]struct fi_cxi_cntr_ops\f[R]. +The fi_open_ops function takes a \f[C]struct fi_cxi_cntr_ops\f[R]. See an example of usage below. .IP .nf @@ -1852,7 +1846,7 @@ memory operation as a PCIe operation as compared to a NIC operation. The CXI provider extension flag FI_CXI_PCIE_AMO is used to signify this. .PP Since not all libfabric atomic memory operations can be executed as a -PCIe atomic memory operation, \f[V]fi_query_atomic()\f[R] could be used +PCIe atomic memory operation, \f[C]fi_query_atomic()\f[R] could be used to query if a given libfabric atomic memory operation could be executed as PCIe atomic memory operation. .PP @@ -1902,7 +1896,7 @@ module parameter must be set to non-zero. .IP .nf \f[C] -/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +/sys/module/cxi_ss1/parameters/amo_remap_to_pcie_fadd \f[R] .fi .PP @@ -2170,6 +2164,6 @@ In this case, the target NIC is reachable. FI_EIO: Catch all errno. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_direct.7 b/man/man7/fi_direct.7 index c014a10b104..3f9c6a34870 100644 --- a/man/man7/fi_direct.7 +++ b/man/man7/fi_direct.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_direct" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_direct" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -85,7 +71,7 @@ The provider sets FI_LOCAL_MR for fi_info:mode. See fi_getinfo for additional details. .SH SEE ALSO .PP -\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), -\f[V]fi_domain\f[R](3) +\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), +\f[C]fi_domain\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index 24809adde15..ed99b5a3e8a 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_efa" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_efa" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -109,7 +95,7 @@ No support for counters for the DGRAM endpoint. No support for inject. .SS zero-copy receive mode .IP \[bu] 2 -The receive operation cannot be cancelled via \f[V]fi_cancel()\f[R]. +The receive operation cannot be cancelled via \f[C]fi_cancel()\f[R]. .IP \[bu] 2 Zero-copy receive mode can be enabled only if SHM transfer is disabled. .IP \[bu] 2 @@ -180,12 +166,12 @@ If endpoint is not able to support this feature, it will return .PP The efa provider exports extensions for operations that are not provided by the standard libfabric interface. -These extensions are available via the \[lq]\f[V]fi_ext_efa.h\f[R]\[rq] +These extensions are available via the \[lq]\f[C]fi_ext_efa.h\f[R]\[rq] header file. 
.SS Domain Operation Extension .PP -Domain operation extension is obtained by calling \f[V]fi_open_ops\f[R] -(see \f[V]fi_domain(3)\f[R]) +Domain operation extension is obtained by calling \f[C]fi_open_ops\f[R] +(see \f[C]fi_domain(3)\f[R]) .IP .nf \f[C] @@ -194,9 +180,9 @@ int fi_open_ops(struct fid *domain, const char *name, uint64_t flags, \f[R] .fi .PP -and requesting \f[V]FI_EFA_DOMAIN_OPS\f[R] in \f[V]name\f[R]. -\f[V]fi_open_ops\f[R] returns \f[V]ops\f[R] as the pointer to the -function table \f[V]fi_efa_ops_domain\f[R] defined as follows: +and requesting \f[C]FI_EFA_DOMAIN_OPS\f[R] in \f[C]name\f[R]. +\f[C]fi_open_ops\f[R] returns \f[C]ops\f[R] as the pointer to the +function table \f[C]fi_efa_ops_domain\f[R] defined as follows: .IP .nf \f[C] @@ -238,20 +224,20 @@ FI_EFA_MR_ATTR_RDMA_RECV_IC_ID: rdma_recv_ic_id has a valid value. \f[I]recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for receive operation. -It is only valid when \f[V]ic_id_validity\f[R] has the -\f[V]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. +It is only valid when \f[C]ic_id_validity\f[R] has the +\f[C]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. .TP \f[I]rdma_read_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA read operation. -It is only valid when \f[V]ic_id_validity\f[R] has the -\f[V]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. +It is only valid when \f[C]ic_id_validity\f[R] has the +\f[C]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. .TP \f[I]rdma_recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA write receive. -It is only valid when \f[V]ic_id_validity\f[R] has the -\f[V]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. +It is only valid when \f[C]ic_id_validity\f[R] has the +\f[C]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. .SS Return value .PP \f[B]query_mr()\f[R] returns 0 on success, or the value of errno on @@ -259,7 +245,7 @@ failure (which indicates the failure reason). 
.SH Traffic Class (tclass) in EFA .PP To prioritize the messages from a given endpoint, user can specify -\f[V]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the +\f[C]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the fi_endpoint() call to set the service level in rdma-core. All other tclass values will be ignored. .SH RUNTIME PARAMETERS @@ -342,7 +328,7 @@ to a peer after a receiver not ready error. Enable SHM provider to provide the communication across all intra-node processes. SHM transfer will be disabled in the case where -\f[V]ptrace protection\f[R] is turned on. +\f[C]ptrace protection\f[R] is turned on. You can turn it off to enable shm transfer. .PP FI_EFA_ENABLE_SHM_TRANSFER is parsed during the fi_domain call and is @@ -437,6 +423,6 @@ available. Setting this environment variable to 0 can disable this feature. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_guide.7 b/man/man7/fi_guide.7 index 00befca6279..3917b17b1ea 100644 --- a/man/man7/fi_guide.7 +++ b/man/man7/fi_guide.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_guide" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_guide" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -34,16 +20,16 @@ This guide describes the libfabric architecture and interfaces. Due to the length of the guide, it has been broken into multiple pages. 
These sections are: .TP -\f[I]Introduction \f[VI]fi_intro\f[I](7)\f[R] +\f[I]Introduction \f[BI]\f[CBI]fi_intro\f[BI]\f[I](7)\f[R] This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. .TP -\f[I]Architecture \f[VI]fi_arch\f[I](7)\f[R] +\f[I]Architecture \f[BI]\f[CBI]fi_arch\f[BI]\f[I](7)\f[R] This describes the exposed architecture of libfabric, including the object-model and their related operations .TP -\f[I]Setup \f[VI]fi_setup\f[I](7)\f[R] +\f[I]Setup \f[BI]\f[CBI]fi_setup\f[BI]\f[I](7)\f[R] This provides basic bootstrapping and setup for using the libfabric API. .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_hook.7 b/man/man7/fi_hook.7 index 6e52f716321..714c8a1f46b 100644 --- a/man/man7/fi_hook.7 +++ b/man/man7/fi_hook.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_hook" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_hook" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -180,6 +166,6 @@ Application that use FI_TRIGGER operations that attempt to hook calls will likely crash. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_intro.7 b/man/man7/fi_intro.7 index e15ccf34126..c6965739ae0 100644 --- a/man/man7/fi_intro.7 +++ b/man/man7/fi_intro.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. 
-.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_intro" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_intro" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -22,7 +8,7 @@ fi_intro - libfabric introduction .SH OVERVIEW .PP This introduction is part of the libfabric\[cq]s programmer\[cq]s guide. -See \f[V]fi_guide\f[R](7). +See \f[C]fi_guide\f[R](7). This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. @@ -1138,9 +1124,9 @@ If an application is using 1000 endpoints and posts 100 buffers, each 4 KB, that results in 400 MB of memory space being consumed to receive data. (We can start to realize that by eliminating memory copies, one of the -trade offs is increased memory consumption.) -While 400 MB seems like a lot of memory, there is less than half a -megabyte allocated to a single receive queue. +trade offs is increased memory consumption.) While 400 MB seems like a +lot of memory, there is less than half a megabyte allocated to a single +receive queue. At today\[cq]s networking speeds, that amount of space can be consumed within milliseconds. The result is that if only a few endpoints are in use, the application @@ -1429,6 +1415,6 @@ but it does allow for optimizing network utilization. Libfabric is well architected to support the previously discussed features. For further information on the libfabric architecture, see the next -programmer\[cq]s guide section: \f[V]fi_arch\f[R](7). +programmer\[cq]s guide section: \f[C]fi_arch\f[R](7). .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_lpp.7 b/man/man7/fi_lpp.7 index 5b8626464bc..05e0d9a3f03 100644 --- a/man/man7/fi_lpp.7 +++ b/man/man7/fi_lpp.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_lpp" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_lpp" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -88,6 +74,6 @@ Use the memcpy implementation in the system libc rather than provider-specific memcpy. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_mrail.7 b/man/man7/fi_mrail.7 index cd608b1e134..2939f3b6f30 100644 --- a/man/man7/fi_mrail.7 +++ b/man/man7/fi_mrail.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_mrail" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_mrail" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -97,18 +83,18 @@ Deprecated. Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[R]. .TP \f[I]FI_OFI_MRAIL_CONFIG\f[R] -Comma separated list of \f[V]:\f[R] pairs, sorted in -ascending order of \f[V]\f[R]. +Comma separated list of \f[C]:\f[R] pairs, sorted in +ascending order of \f[C]\f[R]. 
Each pair indicated the rail sharing policy to be used for messages up -to the size \f[V]\f[R] and not covered by all previous pairs. -The value of \f[V]\f[R] can be \f[I]fixed\f[R] (a fixed rail is +to the size \f[C]\f[R] and not covered by all previous pairs. +The value of \f[C]\f[R] can be \f[I]fixed\f[R] (a fixed rail is used), \f[I]round-robin\f[R] (one rail per message, selected in round-robin fashion), or \f[I]striping\f[R] (striping across all the rails). -The default configuration is \f[V]16384:fixed,ULONG_MAX:striping\f[R]. +The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[R]. The value ULONG_MAX can be input as -1. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index 170ed2bdccc..2a5fbf57c42 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_opx" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_opx" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -172,35 +158,35 @@ Defaults to \[lq]No\[rq] \f[I]FI_OPX_HFI_SELECT\f[R] String. Controls how OPX chooses which HFI to use when opening a context. -Has two forms: - \f[V]\f[R] Force OPX provider to use -\f[V]hfi-unit\f[R]. -- \f[V][,[,...,]]\f[R] Select HFI based -on first matching \f[V]selector\f[R] +Has two forms: - \f[C]\f[R] Force OPX provider to use +\f[C]hfi-unit\f[R]. 
+- \f[C][,[,...,]]\f[R] Select HFI based +on first matching \f[C]selector\f[R] .PP -Where \f[V]selector\f[R] is one of the following forms: - -\f[V]default\f[R] to use the default logic - \f[V]fixed:\f[R] -to fix to one \f[V]hfi-unit\f[R] - -\f[V]::\f[R] +Where \f[C]selector\f[R] is one of the following forms: - +\f[C]default\f[R] to use the default logic - \f[C]fixed:\f[R] +to fix to one \f[C]hfi-unit\f[R] - +\f[C]::\f[R] .PP -The above fields have the following meaning: - \f[V]selector-type\f[R] +The above fields have the following meaning: - \f[C]selector-type\f[R] The selector criteria the caller opening the context is evaluated against. -- \f[V]hfi-unit\f[R] The HFI to use if the caller matches the selector. -- \f[V]selector-data\f[R] Data the caller must match (e.g.\ NUMA node +- \f[C]hfi-unit\f[R] The HFI to use if the caller matches the selector. +- \f[C]selector-data\f[R] Data the caller must match (e.g.\ NUMA node ID). .PP -Where \f[V]selector-type\f[R] is one of the following: - \f[V]numa\f[R] +Where \f[C]selector-type\f[R] is one of the following: - \f[C]numa\f[R] True when caller is local to the NUMA node ID given by -\f[V]selector-data\f[R]. -- \f[V]core\f[R] True when caller is local to the CPU core given by -\f[V]selector-data\f[R]. +\f[C]selector-data\f[R]. +- \f[C]core\f[R] True when caller is local to the CPU core given by +\f[C]selector-data\f[R]. .PP -And \f[V]selector-data\f[R] is one of the following: - \f[V]value\f[R] -The specific value to match - \f[V]-\f[R] +And \f[C]selector-data\f[R] is one of the following: - \f[C]value\f[R] +The specific value to match - \f[C]-\f[R] Matches with any value in that range .PP In the second form, when opening a context, OPX uses the -\f[V]hfi-unit\f[R] of the first-matching selector. +\f[C]hfi-unit\f[R] of the first-matching selector. Selectors are evaluated left-to-right. OPX will return an error if the caller does not match any selector. 
.PP @@ -216,27 +202,27 @@ For the second form, as which HFI is selected depends on properties of the caller, deterministic HFI selection requires deterministic caller properties. E.g. -for the \f[V]numa\f[R] selector, if the caller can migrate between NUMA +for the \f[C]numa\f[R] selector, if the caller can migrate between NUMA domains, then HFI selection will not be deterministic. .PP The logic used will always be the first valid in a selector list. -For example, \f[V]default\f[R] and \f[V]fixed\f[R] will match all +For example, \f[C]default\f[R] and \f[C]fixed\f[R] will match all callers, so if either are in the beginning of a selector list, you will -only use \f[V]fixed\f[R] or \f[V]default\f[R] regardles of if there are +only use \f[C]fixed\f[R] or \f[C]default\f[R] regardles of if there are any more selectors. .PP -Examples: - \f[V]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts +Examples: - \f[C]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts on HFI 0. -- \f[V]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. -- \f[V]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] +- \f[C]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. +- \f[C]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] callers local to NUMA nodes 0 and 2 will use HFI 0, callers local to NUMA domains 1 and 3 will use HFI 1. -- \f[V]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA +- \f[C]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA nodes 0 thru 3 (including 0 and 3) will use HFI 0, and all else will use default selection logic. -- \f[V]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core +- \f[C]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core 0 will use HFI 1, and all others will use HFI 0. -- \f[V]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use +- \f[C]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use default HFI selection logic. 
.TP \f[I]FI_OPX_DELIVERY_COMPLETION_THRESHOLD\f[R] @@ -288,9 +274,9 @@ This feature is not currently supported. \f[I]FI_OPX_PROG_AFFINITY\f[R] String. This sets the affinity to be used for any progress threads. -Set as a colon-separated triplet as \f[V]start:end:stride\f[R], where +Set as a colon-separated triplet as \f[C]start:end:stride\f[R], where stride controls the interval between selected cores. -For example, \f[V]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores +For example, \f[C]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores for progress threads. By default no affinity is set. .TP @@ -334,6 +320,6 @@ Needs to be set to 1 in case of mixed network. Default is 0. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_provider.7 b/man/man7/fi_provider.7 index 7d71f23f586..571ec75e21f 100644 --- a/man/man7/fi_provider.7 +++ b/man/man7/fi_provider.7 @@ -1,27 +1,13 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_provider" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_provider" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP fi_provider - Fabric Interface Providers .SH OVERVIEW .PP -See \f[V]fi_arch\f[R](7) for a brief description of how providers fit +See \f[C]fi_arch\f[R](7) for a brief description of how providers fit into the libfabric architecture. .PP Conceptually, a fabric provider implements and maps the libfabric API @@ -88,52 +74,52 @@ This list is not exhaustive. 
.TP \f[I]CXI\f[R] Provider for Cray\[cq]s Slingshot network. -See \f[V]fi_cxi\f[R](7) for more information. +See \f[C]fi_cxi\f[R](7) for more information. .TP \f[I]EFA\f[R] A provider for the Amazon EC2 Elastic Fabric Adapter (EFA) (https://aws.amazon.com/hpc/efa/), a custom-built OS bypass hardware interface for inter-instance communication on EC2. -See \f[V]fi_efa\f[R](7) for more information. +See \f[C]fi_efa\f[R](7) for more information. .TP \f[I]OPX\f[R] Supports Omni-Path networking from Cornelis Networks. -See \f[V]fi_opx\f[R](7) for more information. +See \f[C]fi_opx\f[R](7) for more information. .TP \f[I]PSM2\f[R] Older provider for Omni-Path networks. -See \f[V]fi_psm2\f[R](7) for more information. +See \f[C]fi_psm2\f[R](7) for more information. .TP \f[I]PSM3\f[R] Provider for Ethernet networking from Intel. -See \f[V]fi_psm3\f[R](7) for more information. +See \f[C]fi_psm3\f[R](7) for more information. .TP \f[I]SHM\f[R] A provider for intra-node communication using shared memory. -See \f[V]fi_shm\f[R](7) for more information. +See \f[C]fi_shm\f[R](7) for more information. .TP \f[I]TCP\f[R] A provider which runs over the TCP/IP protocol and is available on multiple operating systems. This provider enables develop of libfabric applications on most platforms. -See \f[V]fi_tcp\f[R](7) for more information. +See \f[C]fi_tcp\f[R](7) for more information. .TP \f[I]UCX\f[R] A provider which runs over the UCX library which is currently supported by Infiniband fabrics from NVIDIA. -See \f[V]fi_ucx\f[R](7) for more information. +See \f[C]fi_ucx\f[R](7) for more information. .TP \f[I]UDP\f[R] A provider which runs over the UDP/IP protocol and is available on multiple operating systems. This provider enables develop of libfabric applications on most platforms. -See \f[V]fi_udp\f[R](7) for more information. +See \f[C]fi_udp\f[R](7) for more information. .TP \f[I]Verbs\f[R] This provider targets RDMA NICs for both Linux and Windows platforms. 
-See \f[V]fi_verbs\f[R](7) for more information. +See \f[C]fi_verbs\f[R](7) for more information. .SH Utility Providers .PP Utility providers are named with a starting prefix of \[lq]ofi_\[rq]. @@ -146,17 +132,17 @@ simpler endpoint type. .PP Utility providers show up as part of the return\[cq]s provider\[cq]s name. -See \f[V]fi_fabric\f[R](3). +See \f[C]fi_fabric\f[R](3). Utility providers are enabled automatically for core providers that do not support the feature set requested by an application. .TP \f[I]RxM\f[R] Implements RDM endpoint semantics over MSG endpoints. -See \f[V]fi_rxm\f[R](7) for more information. +See \f[C]fi_rxm\f[R](7) for more information. .TP \f[I]RxD\f[R] Implements RDM endpoint semantis over DGRAM endpoints. -See \f[V]fi_rxd\f[R](7) for more information. +See \f[C]fi_rxd\f[R](7) for more information. .SH Hooking Providers .PP Hooking providers are mostly used for debugging purposes. @@ -167,7 +153,7 @@ Hooking providers can layer over all other providers and intercept, or hook, their calls in order to perform some dedicated task, such as gathering performance data on call paths or providing debug output. .PP -See \f[V]fi_hook\f[R](7) for more information. +See \f[C]fi_hook\f[R](7) for more information. .SH Offload Providers .PP Offload providers start with the naming prefix \[lq]off_\[rq]. @@ -179,6 +165,6 @@ have been offloaded into hardware, though actual hardware offload support is not a requirement. .SH SEE ALSO .PP -\f[V]fabric\f[R](7) \f[V]fi_provider\f[R](3) +\f[C]fabric\f[R](7) \f[C]fi_provider\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm2.7 b/man/man7/fi_psm2.7 index a64cc54a270..8233bc835ab 100644 --- a/man/man7/fi_psm2.7 +++ b/man/man7/fi_psm2.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. 
ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_psm2" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_psm2" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -228,11 +214,11 @@ See \f[I]FI_PSM2_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[V][:[:]][,[:[:]]]*\f[R], -where each triplet \f[V]::\f[R] defines a block of +\f[C][:[:]][,[:[:]]]*\f[R], +where each triplet \f[C]::\f[R] defines a block of core_ids. -Both \f[V]\f[R] and \f[V]\f[R] can be either the -\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). +Both \f[C]\f[R] and \f[C]\f[R] can be either the +\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -338,6 +324,6 @@ Valid parameter names are defined in the header file \f[I]rdma/fi_ext_psm2.h\f[R]. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm3\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm3\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 index 2b95e16ebc9..892fea3f805 100644 --- a/man/man7/fi_psm3.7 +++ b/man/man7/fi_psm3.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_psm3" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_psm3" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -25,7 +11,7 @@ The \f[I]psm3\f[R] provider implements a Performance Scaled Messaging capability which supports most verbs UD and sockets devices. Additional features and optimizations can be enabled when running over Intel\[cq]s E810 Ethernet NICs and/or using Intel\[cq]s rendezvous -kernel module (\f[V]rv\f[R]). +kernel module (\f[C]rv\f[R]). PSM 3.x fully integrates the OFI provider and the underlying PSM3 protocols/implementation and only exports the OFI APIs. .SH SUPPORTED FEATURES @@ -223,11 +209,11 @@ See \f[I]FI_PSM3_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[V][:[:]][,[:[:]]]*\f[R], -where each triplet \f[V]::\f[R] defines a block of +\f[C][:[:]][,[:[:]]]*\f[R], +where each triplet \f[C]::\f[R] defines a block of core_ids. -Both \f[V]\f[R] and \f[V]\f[R] can be either the -\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). +Both \f[C]\f[R] and \f[C]\f[R] can be either the +\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -318,6 +304,6 @@ Notice that if the provider is compiled with macro runtime option will be disabled. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm2\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm2\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxd.7 b/man/man7/fi_rxd.7 index a87b61ac96b..1bf65d50151 100644 --- a/man/man7/fi_rxd.7 +++ b/man/man7/fi_rxd.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. 
ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_rxd" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_rxd" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -75,6 +61,6 @@ Maximum number of packets (per peer) to send at a time. Default: 128 .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxm.7 b/man/man7/fi_rxm.7 index 33c293ebb01..c683dde87b1 100644 --- a/man/man7/fi_rxm.7 +++ b/man/man7/fi_rxm.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_rxm" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_rxm" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -226,7 +212,7 @@ to only required values. .PP The data transfer API may return -FI_EAGAIN during on-demand connection setup of the core provider FI_MSG_EP. -See \f[V]fi_msg\f[R](3) for a detailed description of handling +See \f[C]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH Troubleshooting / Known issues .PP @@ -243,6 +229,6 @@ The workaround is to use shared receive contexts for the MSG provider (FI_OFI_RXM_MSG_TX_SIZE / FI_OFI_RXM_MSG_RX_SIZE). .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_setup.7 b/man/man7/fi_setup.7 index a7126fbdd89..44cad0029c6 100644 --- a/man/man7/fi_setup.7 +++ b/man/man7/fi_setup.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_setup" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_setup" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -473,9 +459,8 @@ libfabric defines a unique threading model. The libfabric design is heavily influenced by object-oriented programming concepts. A multi-threaded application must determine how libfabric objects -(domains, endpoints, completion queues, etc.) -will be allocated among its threads, or if any thread can access any -object. +(domains, endpoints, completion queues, etc.) will be allocated among +its threads, or if any thread can access any object. For example, an application may spawn a new thread to handle each new connected endpoint. The domain threading field provides a mechanism for an application to diff --git a/man/man7/fi_shm.7 b/man/man7/fi_shm.7 index 7b9d24db54d..ff7c5f241bb 100644 --- a/man/man7/fi_shm.7 +++ b/man/man7/fi_shm.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_shm" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_shm" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -103,7 +89,7 @@ was provided by the application), no supplemental information is required to make it unique and it will remain with only the application-defined address. Note that the actual endpoint name will not include the FI_ADDR_STR -\[lq]*://\[rq] prefix since it cannot be included in any shared memory +\[dq]*://\[dq] prefix since it cannot be included in any shared memory region names. The provider will strip off the prefix before setting the endpoint name. As a result, the addresses \[lq]fi_prefix1://my_node:my_service\[rq] and @@ -218,6 +204,6 @@ different systems. Default 262144 .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_sockets.7 b/man/man7/fi_sockets.7 index bd9b568ce93..1af06fc6eba 100644 --- a/man/man7/fi_sockets.7 +++ b/man/man7/fi_sockets.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_sockets" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_sockets" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -142,6 +128,6 @@ The recommended parameters for large scale runs are \f[I]FI_SOCKETS_DEF_CQ_SZ\f[R], \f[I]FI_SOCKETS_DEF_EQ_SZ\f[R]. 
.SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_tcp.7 b/man/man7/fi_tcp.7 index 3f661e6ec0b..7d291ccae3e 100644 --- a/man/man7/fi_tcp.7 +++ b/man/man7/fi_tcp.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_tcp" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_tcp" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -128,6 +114,6 @@ from the tcp provider. This will provide the best performance. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_ucx.7 b/man/man7/fi_ucx.7 index d160b304a13..a4928235e12 100644 --- a/man/man7/fi_ucx.7 +++ b/man/man7/fi_ucx.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_ucx" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_ucx" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -66,6 +52,6 @@ any). Check request leak (default: disabled). 
.SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_udp.7 b/man/man7/fi_udp.7 index 5c5a5915e0f..8da3619cb8d 100644 --- a/man/man7/fi_udp.7 +++ b/man/man7/fi_udp.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_udp" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_udp" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -65,6 +51,6 @@ No support for counters. No runtime parameters are currently defined. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_usnic.7 b/man/man7/fi_usnic.7 index 01f035652b5..c351d23f01b 100644 --- a/man/man7/fi_usnic.7 +++ b/man/man7/fi_usnic.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. 
ftr VBI CBI -.\} -.TH "fi_usnic" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_usnic" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -43,7 +29,7 @@ installing libnl from RPM or other packaging system, install the If you have libnl (either v1 or v3) installed in a non-standard location (e.g., not in /usr/lib or /usr/lib64), you may need to tell libfabric\[cq]s configure where to find libnl via the -\f[V]--with-libnl=DIR\f[R] command line option (where DIR is the +\f[C]--with-libnl=DIR\f[R] command line option (where DIR is the installation prefix of the libnl package). .RE .IP \[bu] 2 @@ -70,7 +56,7 @@ In particular, there are known bugs in RDM support in the presence of congestion or packet loss (issue 1621). RMA is not yet supported. .IP \[bu] 2 -\f[V]fi_provider\f[R](7) lists requirements for all providers. +\f[C]fi_provider\f[R](7) lists requirements for all providers. The following limitations exist in the \f[I]usnic\f[R] provider: .RS 2 .IP \[bu] 2 @@ -83,13 +69,13 @@ CM operations. Passive endpoints only support listen, setname, and getname CM operations. .IP \[bu] 2 -\f[I]FI_EP_DGRAM\f[R] endpoints support \f[V]fi_sendmsg()\f[R] and -\f[V]fi_recvmsg()\f[R], but some flags are ignored. -\f[V]fi_sendmsg()\f[R] supports \f[V]FI_INJECT\f[R] and -\f[V]FI_COMPLETION\f[R]. -\f[V]fi_recvmsg()\f[R] supports \f[V]FI_MORE\f[R]. +\f[I]FI_EP_DGRAM\f[R] endpoints support \f[C]fi_sendmsg()\f[R] and +\f[C]fi_recvmsg()\f[R], but some flags are ignored. +\f[C]fi_sendmsg()\f[R] supports \f[C]FI_INJECT\f[R] and +\f[C]FI_COMPLETION\f[R]. +\f[C]fi_recvmsg()\f[R] supports \f[C]FI_MORE\f[R]. .IP \[bu] 2 -Address vectors only support \f[V]FI_AV_MAP\f[R]. +Address vectors only support \f[C]FI_AV_MAP\f[R]. .IP \[bu] 2 No counters are supported. .IP \[bu] 2 @@ -133,19 +119,19 @@ file. 
Version 2 of the \[lq]fabric getinfo\[rq] extension was introduced in Libfabric release v1.3.0 and can be used to retrieve IP and SR-IOV information about a usNIC device obtained from the -\f[V]fi_getinfo\f[R](3) function. +\f[C]fi_getinfo\f[R](3) function. .PP The \[lq]fabric getinfo\[rq] extension is obtained by calling -\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_FABRIC_OPS_1\f[R] to +\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[R] to get the usNIC fabric extension operations. -The \f[V]getinfo\f[R] function accepts a version parameter that can be +The \f[C]getinfo\f[R] function accepts a version parameter that can be used to select different versions of the extension. The information returned by the \[lq]fabric getinfo\[rq] extension is -accessible through a \f[V]fi_usnic_info\f[R] struct that uses a version +accessible through a \f[C]fi_usnic_info\f[R] struct that uses a version tagged union. The accessed union member must correspond with the requested version. It is recommended that applications explicitly request a version rather -than using the header provided \f[V]FI_EXT_USNIC_INFO_VERSION\f[R]. +than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[R]. Although there is a version 1 of the extension, its use is discouraged, and it may not be available in future releases. .SS Compatibility issues @@ -258,8 +244,8 @@ struct fi_usnic_info_v1 { .fi .PP Version 1 of the \[lq]fabric getinfo\[rq] extension can be used by -explicitly requesting it in the call to \f[V]getinfo\f[R] and accessing -the \f[V]v1\f[R] portion of the \f[V]fi_usnic_info.ui\f[R] union. +explicitly requesting it in the call to \f[C]getinfo\f[R] and accessing +the \f[C]v1\f[R] portion of the \f[C]fi_usnic_info.ui\f[R] union. Use of version 1 is not recommended and it may be removed from future releases. .PP @@ -341,7 +327,7 @@ Libfabric release v1.0.0 and can be used to retrieve the network distance of an address. 
.PP The \[lq]get_distance\[rq] extension is obtained by calling -\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_AV_OPS_1\f[R] to get +\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_AV_OPS_1\f[R] to get the usNIC address vector extension operations. .IP .nf @@ -357,9 +343,9 @@ Address vector Destination address .TP \f[I]metric\f[R] -On output this will contain \f[V]-1\f[R] if the destination host is -unreachable, \f[V]0\f[R] is the destination host is locally connected, -and \f[V]1\f[R] otherwise. +On output this will contain \f[C]-1\f[R] if the destination host is +unreachable, \f[C]0\f[R] is the destination host is locally connected, +and \f[C]1\f[R] otherwise. .PP See fi_ext_usnic.h for more details. .SH VERSION DIFFERENCES @@ -369,28 +355,28 @@ The release of libfabric v1.4 introduced a new naming convention for fabric and domain. However the usNIC provider remains backward compatible with applications supporting the old scheme and decides which one to use based on the -version passed to \f[V]fi_getinfo\f[R]: +version passed to \f[C]fi_getinfo\f[R]: .IP \[bu] 2 -When \f[V]FI_VERSION(1,4)\f[R] or higher is used: +When \f[C]FI_VERSION(1,4)\f[R] or higher is used: .RS 2 .IP \[bu] 2 fabric name is the network address with the CIDR notation (i.e., -\f[V]a.b.c.d/e\f[R]) +\f[C]a.b.c.d/e\f[R]) .IP \[bu] 2 -domain name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) +domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) .RE .IP \[bu] 2 -When a lower version number is used, like \f[V]FI_VERSION(1, 3)\f[R], it +When a lower version number is used, like \f[C]FI_VERSION(1, 3)\f[R], it follows the same behavior the usNIC provider exhibited in libfabric <= v1.3: .RS 2 .IP \[bu] 2 -fabric name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) +fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) .IP \[bu] 2 -domain name is \f[V]NULL\f[R] +domain name is \f[C]NULL\f[R] .RE .SH SEE ALSO .PP -\f[V]fabric\f[R](7), 
\f[V]fi_open_ops\f[R](3), \f[V]fi_provider\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_verbs.7 b/man/man7/fi_verbs.7 index 866829ed8e0..c5ffb67d719 100644 --- a/man/man7/fi_verbs.7 +++ b/man/man7/fi_verbs.7 @@ -1,20 +1,6 @@ -.\" Automatically generated by Pandoc 3.1.3 +.\" Automatically generated by Pandoc 2.9.2.1 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} -.TH "fi_verbs" "7" "2024\-10\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_verbs" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -161,7 +147,7 @@ to be re-mapped when the process is forked (MADV_DONTFORK). .PP The XRC transport is intended to be used when layered with the RXM provider and requires the use of shared receive contexts. -See \f[V]fi_rxm\f[R](7). +See \f[C]fi_rxm\f[R](7). To enable XRC, the following environment variables must usually be set: FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX. .SH RUNTIME PARAMETERS @@ -294,6 +280,6 @@ post excess receives without draining the CQ. CQ overruns can make the MSG endpoints unusable. .SH SEE ALSO .PP -\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. From ecb2b3f64bd49e0fe4d2ca9969089187a44a2dde Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 18 Oct 2024 02:46:31 +0000 Subject: [PATCH 139/393] fabtests/pytest/efa: Loose assertion for read request counters The current assertion check for send read wrs and bytes are based on the exact calculation of benchmark data size + fabtests control message size, which is too strict. Fabtests may change control message sizes which should not impact this test. 
This patch looses the assertion for read request counters which make it have enough coverage for the benchmark data size. Signed-off-by: Shi Jin --- fabtests/pytest/efa/test_runt.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fabtests/pytest/efa/test_runt.py b/fabtests/pytest/efa/test_runt.py index 8735298ad49..701406be26d 100644 --- a/fabtests/pytest/efa/test_runt.py +++ b/fabtests/pytest/efa/test_runt.py @@ -74,16 +74,15 @@ def test_runt_read_functional(cmdline_args, memory_type, copy_method): if copy_method == "localread": # when local read copy is used, server issue RDMA requests to copy received data # - # so in this case, total read wr is 11, which is + # so in this case, total read wr is at least 9, which is # 1 remote read of 192k # 8 local read for the 64k data transfer by send - # 2 local read for 2 fabtests control messages + # More local reads for fabtests control messages # - # and total read_bytes will be 262149, which is: - # 256k message + 2 fabtests control messages (1 byte and 4 byte each) + # and total read_bytes will be >= 256K including the control messages # - assert server_read_wrs == 11 - assert server_read_bytes == 262149 + assert server_read_wrs >= 9 + assert server_read_bytes >= 262144 else: # The other 192 KB is transfer by RDMA read # for which the server (receiver) will issue 1 read request. From 066a32a2c94be302fb91ec3cc67b22782458f1c1 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 16 Oct 2024 16:19:10 -0700 Subject: [PATCH 140/393] xpmem: Change the log level to info We build libfabric with xpmem support but we may run it on a OS without the kernel module loaded. Change FI_WARN to FI_INFO to reduce the noise in the log following the same model as cuda dlopen. 
Signed-off-by: Jessie Yang --- src/xpmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpmem.c b/src/xpmem.c index 82b9811e01b..9f73db3f1d1 100644 --- a/src/xpmem.c +++ b/src/xpmem.c @@ -108,7 +108,7 @@ int ofi_xpmem_init(void) xpmem->pinfo.seg_id = xpmem_make(0, XPMEM_MAXADDR_SIZE, XPMEM_PERMIT_MODE, (void *) 0666); if (xpmem->pinfo.seg_id == -1) { - FI_WARN(&core_prov, FI_LOG_CORE, + FI_INFO(&core_prov, FI_LOG_CORE, "Failed to export process virtual address space for use with xpmem\n"); ret = -FI_ENODATA; goto fail; From 44a7cfbaeed15cc79d07e8ddb65159f091b1458f Mon Sep 17 00:00:00 2001 From: Cody Mann Date: Fri, 30 Aug 2024 15:05:48 -0400 Subject: [PATCH 141/393] prov/opx: Only posting one completion for rzv truncation receives. Signed-off-by: Cody Mann --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 32 +- .../rdma/opx/fi_opx_fabric_transport.h | 17 +- .../include/rdma/opx/fi_opx_hfi1_transport.h | 9 + prov/opx/src/fi_opx_hfi1.c | 364 ++++++++++++++++++ 4 files changed, 392 insertions(+), 30 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 2421c12596a..b5814148b2a 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -1296,28 +1296,15 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, const uint8_t u8_rx = hdr->rendezvous.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - uint8_t * rbuf = (uint8_t *)recv_buf; - - FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, - hdr, - payload, - u8_rx, 1, - origin_byte_counter_vaddr, - context, - (uintptr_t)(rbuf), /* receive buffer virtual address */ - FI_HMEM_SYSTEM, /* receive buffer iface */ - 0UL, /* receive buffer device */ - 0UL, /* immediate_data */ - 0UL, /* immediate_end_block_count */ - src_dst_iov, - FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC, - is_intranode, - reliability, /* compile-time 
constant expression */ - u32_ext_rx, - hfi1_type); + assert(payload != NULL); - if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr); + FI_OPX_FABRIC_RX_RZV_RTS_ETRUNC(opx_ep, + (const void * const)hdr, + u8_rx, + origin_byte_counter_vaddr, + is_intranode, + reliability, /* compile-time constant expression */ + u32_ext_rx, hfi1_type); /* Post a E_TRUNC to our local RX error queue because a client called receive with too small a buffer. Tell them about it via the error cq */ @@ -1993,6 +1980,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, .len = (p->rendezvous.contiguous.src_blocks << 6), .device = p->rendezvous.contiguous.src_device_id, .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface + }; const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; @@ -2099,7 +2087,7 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep * opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- %s RENDEZVOUS CTS (begin)\n", is_intranode ? 
"SHM":"HFI"); - assert(payload != NULL); + assert(payload != NULL || hdr->cts.target.opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC); const uint8_t u8_rx = hdr->cts.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->cts.origin_rx); diff --git a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h index c821caebc15..fd7c339a36a 100644 --- a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h @@ -38,14 +38,15 @@ #ifdef FI_OPX_FABRIC_HFI1 #include "rdma/opx/fi_opx_hfi1_transport.h" -#define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject -#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr_select -#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr_select -#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv_select -#define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv -#define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts -#define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts -#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput +#define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject +#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr_select +#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr_select +#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv_select +#define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv +#define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts +#define FI_OPX_FABRIC_RX_RZV_RTS_ETRUNC fi_opx_hfi1_rx_rzv_rts_etrunc +#define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts +#define FI_OPX_FABRIC_TX_DO_PUT fi_opx_hfi1_do_dput #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 63077754d6e..ab3efa4c511 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -640,6 +640,15 @@ void 
fi_opx_store_inject_and_copy_scb2_16B(volatile uint64_t scb[8], local[8] = d8; } +void fi_opx_hfi1_rx_rzv_rts_etrunc (struct fi_opx_ep *opx_ep, + const union opx_hfi1_packet_hdr * const hdr, + const uint8_t u8_rx, + uintptr_t origin_byte_counter_vaddr, + const unsigned is_intranode, + const enum ofi_reliability_kind reliability, + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type); + void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr * const hdr, const void * const payload, const uint8_t u8_rx, const uint64_t niov, diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index d15a9cd9d3b..b697f0c095b 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1727,6 +1727,370 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) return -FI_EAGAIN; } +int opx_hfi1_rx_rzv_rts_send_etrunc_intranode(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + + struct fi_opx_ep * opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, SHM -- RENDEZVOUS RTS ETRUNC (begin)\n"); + uint64_t pos; + /* Possible SHM connections required for certain applications (i.e., DAOS) + * exceeds the max value of the legacy u8_rx field. Use u32_extended field. 
+ */ + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, + params->u32_extended_rx, params->target_hfi_unit); + + if (OFI_UNLIKELY(rc)) { + return -FI_EAGAIN; + } + + union opx_hfi1_packet_hdr * const tx_hdr = + opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, + opx_ep->daos_info.rank_inst, &rc); + + if(!tx_hdr) return rc; + + /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), + because this is intranode and since it's a CTS packet, lrh.pktlen + isn't used/needed */ + tx_hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid; + tx_hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + tx_hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; + tx_hdr->qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + tx_hdr->qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode; + tx_hdr->qw_9B[5] = params->origin_byte_counter_vaddr; + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, SHM -- RENDEZVOUS RTS ETRUNC (end)\n"); + + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_etrunc_intranode_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep * opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + const uint64_t lrh_dlid_16B = htons(lrh_dlid >> 16); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV 16B, SHM -- RENDEZVOUS RTS ETRUNC (begin)\n"); + uint64_t pos; + /* Possible SHM connections required for certain applications (i.e., DAOS) + * exceeds the max value of the legacy u8_rx field. Use u32_extended field. 
+ */ + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, + params->u32_extended_rx, params->target_hfi_unit); + + if (OFI_UNLIKELY(rc)) { + return -FI_EAGAIN; + } + + union opx_hfi1_packet_hdr * const tx_hdr = + opx_shm_tx_next(&opx_ep->tx->shm, params->target_hfi_unit, params->u8_rx, &pos, + opx_ep->daos_info.hfi_rank_enabled, params->u32_extended_rx, + opx_ep->daos_info.rank_inst, &rc); + + if(!tx_hdr) return rc; + + /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), + because this is intranode and since it's a CTS packet, lrh.pktlen + isn't used/needed */ + tx_hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B)); + tx_hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + tx_hdr->qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + tx_hdr->qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3]; + tx_hdr->qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + tx_hdr->qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode; + tx_hdr->qw_16B[6] = params->origin_byte_counter_vaddr; + + opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, SHM -- RENDEZVOUS RTS ETRUNC (end)\n"); + + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_etrunc(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (begin)\n"); + + const uint64_t pbc_dws = + 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 
9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + const uint16_t lrh_dws = htons(pbc_dws - 1); + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1)) { + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN credits)\n"); + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability, + OPX_HFI1_TYPE); + if(OFI_UNLIKELY(psn == -1)) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN psn/replay)\n"); + return -FI_EAGAIN; + } + + volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + fi_opx_store_and_copy_scb_9B(scb, &replay->scb_9B, + opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | + ((uint64_t) lrh_dws << 32), + opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, + opx_ep->rx->tx.cts_9B.hdr.qw_9B[3], + opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | params->opcode, + params->origin_byte_counter_vaddr, 0); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + /* consume one credit */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + /* save the updated txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; 
+ + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->origin_rs, + params->origin_rx, + psn_ptr, + replay, + params->reliability, + OPX_HFI1_TYPE); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (end)"); + + return FI_SUCCESS; +} + +int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t lrh_dlid_16B = htons(params->lrh_dlid >> 16); + const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (begin)\n"); + + const uint64_t pbc_dws = + 2 + /* pbc */ + 4 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2; + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + + // Note: Only need 1 credit here for the message truncation error case. Just + // the opcode and origin_byte_counter_vaddr is needed for replaying back to the + // sender. 
+ if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 2) < 2)) { + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 2) < 2) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN credits)\n"); + return -FI_EAGAIN; + } + } + + struct fi_opx_reliability_tx_replay *replay; + union fi_opx_reliability_tx_psn *psn_ptr; + int64_t psn; + + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability, + OPX_HFI1_TYPE); + if(OFI_UNLIKELY(psn == -1)) { + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (EAGAIN psn/replay)\n"); + return -FI_EAGAIN; + } + + volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); + + fi_opx_store_and_copy_scb_16B(scb, &replay->scb_16B, + opx_ep->rx->tx.cts_16B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, OPX_HFI1_JKR), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t) lrh_qws << 20), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn, + opx_ep->rx->tx.cts_16B.hdr.qw_16B[4], + opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | params->opcode, + params->origin_byte_counter_vaddr); + + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + // 2nd cacheline + volatile 
uint64_t * const scb2 = + FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + + fi_opx_store_and_copy_qw(scb2, &replay->scb_16B.hdr.qw_16B[7], + 0, 0, 0, 0, 0, 0, 0, 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + + FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + + /* save the updated txe state */ + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->origin_rs, + params->origin_rx, + psn_ptr, + replay, + params->reliability, + OPX_HFI1_TYPE); + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS ETRUNC (end)"); + + return FI_SUCCESS; +} + +void fi_opx_hfi1_rx_rzv_rts_etrunc (struct fi_opx_ep *opx_ep, + const union opx_hfi1_packet_hdr * const hdr, + const uint8_t u8_rx, + uintptr_t origin_byte_counter_vaddr, + const unsigned is_intranode, + const enum ofi_reliability_kind reliability, + const uint32_t u32_extended_rx, + const enum opx_hfi1_type hfi1_type) +{ + + union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); + assert(work != NULL); + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; + params->opx_ep = opx_ep; + params->work_elem.slist_entry.next = NULL; + + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u, opcode=%u\n", + is_intranode, FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC); + + if (is_intranode) { + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc_intranode; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc_intranode_16B; + } + params->work_elem.work_type = OPX_WORK_TYPE_SHM; + + uint32_t lid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + lid = hdr->lrh_9B.slid; + else + lid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + + if (lid == opx_ep->rx->self.uid.lid) { + params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; + } else { + 
struct fi_opx_hfi_local_lookup *hfi_lookup = fi_opx_hfi1_get_lid_local(lid); + assert(hfi_lookup); + params->target_hfi_unit = hfi_lookup->hfi_unit; + } + } else { + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_etrunc_16B; + } + params->work_elem.work_type = OPX_WORK_TYPE_PIO; + params->target_hfi_unit = 0xFF; + } + + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->slid = hdr->lrh_9B.slid; + if (hfi1_type & OPX_HFI1_WFR) + params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; + else + params->lrh_dlid = hdr->lrh_9B.slid << 16; + } else { + params->slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); + params->lrh_dlid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid) << 16; // Send CTS to the SLID that sent RTS + } + + params->pbc_dlid = OPX_PBC_LRH_DLID_TO_PBC_DLID(params->lrh_dlid, hfi1_type); + params->origin_rx = hdr->rendezvous.origin_rx; + params->origin_rs = hdr->rendezvous.origin_rs; + params->u8_rx = u8_rx; + params->u32_extended_rx = u32_extended_rx; + params->origin_byte_counter_vaddr = origin_byte_counter_vaddr; + params->is_intranode = is_intranode; + params->reliability = reliability; + params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC; + + int rc = params->work_elem.work_fn(work); + if(rc == FI_SUCCESS) { + OPX_BUF_FREE(work); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_SUCCESS\n"); + return; + } + assert(rc == -FI_EAGAIN); + /* Try again later*/ + assert(work->work_elem.slist_entry.next == NULL); + slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); +} + void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr * const hdr, const void * const payload, From 790f61c94145a3d22467c27a490f4e7f6e78acb8 Mon Sep 17 00:00:00 2001 From: 
Ben Lynam Date: Mon, 2 Sep 2024 19:04:04 -0500 Subject: [PATCH 142/393] prov/opx: Remove FI_CONTEXT2 requirement Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx.h | 2 +- .../include/rdma/opx/fi_opx_cq_ops_table.h | 13 +- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 762 ++++++------------ prov/opx/include/rdma/opx/fi_opx_eq.h | 90 ++- .../opx/include/rdma/opx/fi_opx_hfi1_packet.h | 71 +- .../include/rdma/opx/fi_opx_hfi1_progress.h | 28 +- .../include/rdma/opx/fi_opx_hfi1_transport.h | 216 ++--- prov/opx/include/rdma/opx/fi_opx_internal.h | 68 +- prov/opx/include/rdma/opx/fi_opx_match.h | 2 +- .../opx/include/rdma/opx/fi_opx_reliability.h | 14 +- prov/opx/include/rdma/opx/fi_opx_rma.h | 38 +- prov/opx/src/fi_opx_atomic.c | 198 +++-- prov/opx/src/fi_opx_cq.c | 19 +- prov/opx/src/fi_opx_ep.c | 179 ++-- prov/opx/src/fi_opx_hfi1.c | 288 +++++-- prov/opx/src/fi_opx_init.c | 17 +- prov/opx/src/fi_opx_rma.c | 230 ++++-- prov/opx/src/fi_opx_tagged.c | 127 ++- 18 files changed, 1162 insertions(+), 1200 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx.h b/prov/opx/include/rdma/opx/fi_opx.h index 5de50e4e66f..c5e1f3a9167 100644 --- a/prov/opx/include/rdma/opx/fi_opx.h +++ b/prov/opx/include/rdma/opx/fi_opx.h @@ -250,7 +250,7 @@ static const uint64_t FI_OPX_HDRQ_MASK_8192 = 0X000000000003FFE0UL; (FI_OPX_BASE_CAPS | FI_OPX_RXONLY_CAPS) #define FI_OPX_DEFAULT_MODE \ - (FI_CONTEXT2 | FI_ASYNC_IOV) + (FI_ASYNC_IOV) diff --git a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h index 8554faa85c4..66388eb8d8a 100644 --- a/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h +++ b/prov/opx/include/rdma/opx/fi_opx_cq_ops_table.h @@ -60,26 +60,23 @@ fi_opx_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, uint64_t flags if (IS_PROGRESS_MANUAL(opx_cq->domain)) { - struct fi_opx_context_ext * ext = - (struct fi_opx_context_ext *) opx_cq->err.head; + struct opx_context *context = + (struct opx_context *) 
opx_cq->err.head; - if ((ext == NULL) || (ext->opx_context.byte_counter != 0)) { + if ((context == NULL) || (context->byte_counter != 0)) { /* perhaps an in-progress truncated rendezvous receive? */ errno = FI_EAGAIN; return -errno; } - assert(ext->opx_context.flags & FI_OPX_CQ_CONTEXT_EXT); /* DEBUG */ - const enum fi_threading threading = opx_cq->domain->threading; const int lock_required = fi_opx_threading_lock_required(threading, fi_opx_global.progress); fi_opx_lock_if_required(&opx_cq->lock, lock_required); ofi_cq_err_memcpy(opx_cq->domain->fabric->fabric_fid.api_version, - buf, &ext->err_entry); + buf, &context->err_entry); slist_remove_head((struct slist *)&opx_cq->err); - OPX_BUF_FREE(ext); - ext = NULL; + OPX_BUF_FREE(context); fi_opx_unlock_if_required(&opx_cq->lock, lock_required); } else { diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index b5814148b2a..1429a393484 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -66,17 +66,12 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); #define OPX_FLAGS_OVERRIDE_TRUE (1) #define OPX_FLAGS_OVERRIDE_FALSE (0) -#define OPX_CONTEXT_EXTENDED_TRUE (1) -#define OPX_CONTEXT_EXTENDED_FALSE (0) - #define OPX_MULTI_RECV_TRUE (1) #define OPX_MULTI_RECV_FALSE (0) #define OPX_HMEM_TRUE (1) #define OPX_HMEM_FALSE (0) -#define OPX_CANCEL_CONTEXT_TRUE (1) -#define OPX_CANCEL_CONTEXT_FALSE (0) // #define FI_OPX_TRACE 1 // #define FI_OPX_REMOTE_COMPLETION @@ -402,7 +397,7 @@ struct fi_opx_ep_rx { struct slist * cq_pending_ptr; struct slist * cq_completed_ptr; struct ofi_bufpool * ue_packet_pool; - struct ofi_bufpool * ctx_ext_pool; + struct ofi_bufpool * ctx_pool; uint64_t unused_cacheline_3[4]; /* == CACHE LINE 4 == */ @@ -611,22 +606,19 @@ struct fi_opx_sep { struct fi_opx_av *av; struct fi_info *info; void *memptr; - struct fi_opx_ep *ep[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_hfi1_context 
*hfi1[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_ep_reliability *reliability[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_ep_tx *tx[FI_OPX_ADDR_SEP_RX_MAX]; - struct fi_opx_ep_rx *rx[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep *ep[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_hfi1_context *hfi1[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep_reliability *reliability[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep_tx *tx[FI_OPX_ADDR_SEP_RX_MAX]; + struct fi_opx_ep_rx *rx[FI_OPX_ADDR_SEP_RX_MAX]; - int64_t ref_cnt; + int64_t ref_cnt; } __attribute((aligned(L2_CACHE_LINE_SIZE))); struct fi_opx_rzv_completion { - union { - union fi_opx_context *context; - struct fi_opx_context_ext *extended_context; - }; + struct opx_context *context; uint64_t tid_length; uint64_t tid_vaddr; uint64_t tid_byte_counter; @@ -647,8 +639,8 @@ struct fi_opx_rma_request { __attribute__((noinline)) void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const uint64_t static_flags, - union fi_opx_context * context, - const uint64_t rx_op_flags, const uint64_t is_context_ext, + struct opx_context *context, + const uint64_t rx_op_flags, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, const enum ofi_reliability_kind reliability, @@ -726,6 +718,7 @@ void fi_opx_ep_clear_credit_return(struct fi_opx_ep *opx_ep) { #define FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep) fi_opx_ep_clear_credit_return(opx_ep) + #include "rdma/opx/fi_opx_fabric_transport.h" #ifdef OPX_DAOS_DEBUG @@ -851,7 +844,7 @@ uint64_t fi_opx_ep_is_matching_packet(const uint64_t origin_tag, __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *opx_ep, - union fi_opx_context * context, + struct opx_context *context, const uint64_t kind, const enum opx_hfi1_type hfi1_type) { @@ -891,7 +884,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_ep_find_matching_packet(struct fi_opx_ep *o __OPX_FORCE_INLINE__ uint64_t is_match (struct fi_opx_ep * opx_ep, const union 
opx_hfi1_packet_hdr * const hdr, - union fi_opx_context * context, + struct opx_context *context, uint32_t rank, uint32_t rank_inst, unsigned is_intranode, const uint64_t slid) @@ -966,50 +959,34 @@ uint32_t fi_opx_ep_get_u32_extended_rx (struct fi_opx_ep * opx_ep, } __OPX_FORCE_INLINE__ -void fi_opx_enqueue_completed(struct slist *queue, - void *context, - const uint64_t is_context_ext, - const int lock_required) +void fi_opx_enqueue_completed(struct slist *queue, struct opx_context *context, const int lock_required) { assert(!lock_required); - - union fi_opx_context *real_context; - - if (is_context_ext) { - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) context; - real_context = (union fi_opx_context *) ext->msg.op_context; - *real_context = ext->opx_context; - real_context->flags &= ~(FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM); - real_context->next = NULL; - OPX_BUF_FREE(ext); - } else { - real_context = (union fi_opx_context *) context; - } - - slist_insert_tail((struct slist_entry *) real_context, queue); + assert(context); + context->flags &= ~FI_OPX_CQ_CONTEXT_HMEM; + slist_insert_tail((struct slist_entry *) context, queue); } __OPX_FORCE_INLINE__ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, - const union fi_opx_hfi1_packet_payload * const payload, - struct fi_opx_ep * opx_ep, - const uint64_t origin_tag, - const uint8_t opcode, - union fi_opx_context *context, - const uint64_t is_context_ext, - const uint64_t is_multi_receive, - const unsigned is_intranode, - const uint64_t is_hmem, - const int lock_required, - const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type, - const uintptr_t origin_byte_counter_vaddr, - const struct fi_opx_hmem_iov *iov, - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info, - const struct fi_opx_hmem_iov *src_dst_iov, - const uint8_t * const immediate_byte, - const uint64_t * const immediate_qw, - const union cacheline * const immediate_block) + 
const union fi_opx_hfi1_packet_payload * const payload, + struct fi_opx_ep * opx_ep, + const uint64_t origin_tag, + const uint8_t opcode, + struct opx_context *context, + const uint64_t is_multi_receive, + const unsigned is_intranode, + const uint64_t is_hmem, + const int lock_required, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type, + const uintptr_t origin_byte_counter_vaddr, + const struct fi_opx_hmem_iov *iov, + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info, + const struct fi_opx_hmem_iov *src_dst_iov, + const uint8_t * const immediate_byte, + const uint64_t * const immediate_qw, + const union cacheline * const immediate_block) { assert( (opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) || (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); @@ -1030,8 +1007,8 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, assert(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); const uint8_t u8_rx = hdr->rendezvous.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); - union fi_opx_context * original_multi_recv_context = context; - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + struct opx_context * original_multi_recv_context = context; + context = (struct opx_context *)((uintptr_t)recv_buf - sizeof(struct opx_context)); assert((((uintptr_t)context) & 0x07) == 0); context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; @@ -1134,7 +1111,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, } } - uint64_t bytes_consumed = ((xfer_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); + uint64_t bytes_consumed = ((xfer_len + 8) & (~0x07ull)) + sizeof(struct opx_context); original_multi_recv_context->len -= bytes_consumed; original_multi_recv_context->byte_counter++; // re-using the byte counter as a "pending flag" original_multi_recv_context->tag = (uintptr_t)opx_ep; // 
re-using tag to store the ep @@ -1162,8 +1139,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, enum fi_hmem_iface rbuf_iface; uint64_t hmem_handle; if (is_hmem) { /* Branch should compile out */ - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; rbuf_device = hmem_info->device; rbuf_iface = hmem_info->iface; hmem_handle = hmem_info->hmem_dev_reg_handle; @@ -1309,38 +1285,23 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, /* Post a E_TRUNC to our local RX error queue because a client called receive with too small a buffer. Tell them about it via the error cq */ - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = xfer_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = xfer_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + 
context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; + context->byte_counter = 0; + context->next = NULL; /* post an 'error' completion event */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-RTS"); @@ -1357,13 +1318,12 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, * \param[in,out] entry Completion entry */ __OPX_FORCE_INLINE__ -void complete_receive_operation_internal (struct fid_ep *ep, +void opx_ep_complete_receive_operation (struct fid_ep *ep, const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, const uint64_t origin_tag, - union fi_opx_context ** context_ptr, + struct opx_context *context, const uint8_t opcode, - const uint64_t is_context_ext, const uint64_t is_multi_receive, const unsigned is_intranode, const uint64_t is_hmem, @@ -1372,12 +1332,10 @@ void complete_receive_operation_internal (struct fid_ep *ep, const enum opx_hfi1_type hfi1_type) { - assert((is_hmem && is_context_ext) || !is_hmem); assert((is_multi_receive && !is_hmem) || !is_multi_receive); struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); struct fi_opx_ep_rx * const rx = opx_ep->rx; - union fi_opx_context *context = *context_ptr; const uint64_t recv_len = context->len; /* @@ -1407,8 +1365,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, if (send_len) memcpy(recv_buf, (void*)&hdr->inject.app_data_u8[0], send_len); - union fi_opx_context * original_multi_recv_context = context; - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + struct opx_context * original_multi_recv_context = context; + context = (struct opx_context 
*)((uintptr_t)recv_buf - sizeof(struct opx_context)); assert((((uintptr_t)context) & 0x07) == 0); context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; @@ -1421,7 +1379,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; /* the next 'fi_opx_context' must be 8-byte aligned */ - uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); + uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(struct opx_context); original_multi_recv_context->len -= bytes_consumed; original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); @@ -1431,8 +1389,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, } else if (OFI_LIKELY(send_len <= recv_len)) { if (is_hmem && send_len) { - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, recv_buf, hdr->inject.app_data_u8, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); @@ -1497,45 +1454,30 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; /* post a completion event for the individual receive */ - fi_opx_enqueue_completed(rx->cq_completed_ptr, context, is_context_ext, lock_required); + fi_opx_enqueue_completed(rx->cq_completed_ptr, context, lock_required); } else { /* truncation - unlikely */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "INJECT truncation - send_len %lu > recv_len %lu posting error\n", send_len, recv_len); - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if 
(OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = send_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = send_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; + context->byte_counter = 0; + context->next = NULL; /* post an 'error' completion event for the receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-INJECT"); @@ -1561,9 +1503,9 @@ void complete_receive_operation_internal (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER is_multi_recv\n"); - union fi_opx_context * original_multi_recv_context = context; - //assert(original_multi_recv_context->next == NULL); - context = (union fi_opx_context *)((uintptr_t)recv_buf - sizeof(union fi_opx_context)); + struct opx_context *original_multi_recv_context = context; + + context = (struct opx_context *)((uintptr_t)recv_buf - sizeof(struct opx_context)); assert((((uintptr_t)context) & 0x07) == 0); 
context->flags = FI_RECV | FI_MSG | FI_OPX_CQ_CONTEXT_MULTIRECV; context->buf = recv_buf; @@ -1589,7 +1531,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, } /* the next 'fi_opx_context' must be 8-byte aligned */ - uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(union fi_opx_context); + uint64_t bytes_consumed = ((send_len + 8) & (~0x07ull)) + sizeof(struct opx_context); original_multi_recv_context->len -= bytes_consumed; original_multi_recv_context->buf = (void*)((uintptr_t)(original_multi_recv_context->buf) + bytes_consumed); @@ -1623,9 +1565,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, } if (is_hmem) { - assert(is_context_ext); - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, context->buf, opx_ep->hmem_copy_buf, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); @@ -1653,45 +1593,30 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; /* post a completion event for the individual receive */ - fi_opx_enqueue_completed(rx->cq_completed_ptr, context, is_context_ext, lock_required); + fi_opx_enqueue_completed(rx->cq_completed_ptr, context, lock_required); } else { /* truncation - unlikely */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER truncation - send_len %lu > recv_len %lu posting error\n", send_len, recv_len); - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error.\n"); - abort(); - } - 
ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } - - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = send_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = send_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; + context->byte_counter = 0; + context->next = NULL; /* post an 'error' completion event for the receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, rx->cq_err_ptr); } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-EAGER"); @@ -1762,8 +1687,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, context->next = NULL; if (is_hmem) { - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, recv_buf, opx_ep->hmem_copy_buf, packet_payload_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); @@ -1779,36 +1703,19 @@ void complete_receive_operation_internal (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER 
truncation - xfer_len %lu > recv_len %lu posting error\n", payload_total_len, recv_len); - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - ext->err_entry.op_context = ext->msg.op_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory error."); - abort(); - } - ext->opx_context = *context; - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - ext->err_entry.op_context = context; - } + context->err_entry.flags = context->flags; + context->err_entry.len = recv_len; + context->err_entry.buf = recv_buf; + context->err_entry.data = ofi_data; + context->err_entry.tag = origin_tag; + context->err_entry.olen = payload_total_len - recv_len; + context->err_entry.err = FI_ETRUNC; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; - ext->err_entry.flags = context->flags; - ext->err_entry.len = recv_len; - ext->err_entry.buf = recv_buf; - ext->err_entry.data = ofi_data; - ext->err_entry.tag = origin_tag; - ext->err_entry.olen = payload_total_len - recv_len; - ext->err_entry.err = FI_ETRUNC; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; - - ext->opx_context.byte_counter = payload_total_len - packet_payload_len; - ext->opx_context.next = NULL; - *context_ptr = (union fi_opx_context*)ext; + context->byte_counter = payload_total_len - packet_payload_len; + context->next = NULL; } #ifndef NDEBUG if (context->byte_counter == 0) { @@ -1838,8 +1745,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, /* If we flagged this context w/ an error, just decrement the byte counter that this * nth packet would have filled in */ - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { + if 
(OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { context->byte_counter -= send_len; return; } @@ -1906,8 +1812,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, if (is_hmem) { recv_buf = (void*)((uint8_t*) context->buf + hdr->mp_eager_nth.payload_offset); - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *)context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, recv_buf, opx_ep->hmem_copy_buf, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); @@ -1961,7 +1866,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, - context, is_context_ext, is_multi_receive, is_intranode, is_hmem, + context, is_multi_receive, is_intranode, is_hmem, lock_required, reliability, hfi1_type, origin_byte_counter_vaddr, iov, immediate_info, &src_dst_iov, immediate_byte, immediate_qw, immediate_block); @@ -1988,44 +1893,13 @@ void complete_receive_operation_internal (struct fid_ep *ep, const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, - context, is_context_ext, is_multi_receive, is_intranode, is_hmem, + context, is_multi_receive, is_intranode, is_hmem, lock_required, reliability, hfi1_type, origin_byte_counter_vaddr, iov, immediate_info, &src_dst_iov, immediate_byte, immediate_qw, immediate_block); } FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); } -/** - * \brief Complete a receive operation that has matched the packet header with - * the match information - * - * \param[in] rx Receive endoint - * \param[in] hdr MU packet header that matched - 
* \param[in,out] entry Completion entry - */ -__OPX_FORCE_INLINE__ -void complete_receive_operation(struct fid_ep *ep, - const union opx_hfi1_packet_hdr * const hdr, - const union fi_opx_hfi1_packet_payload * const payload, - const uint64_t origin_tag, - union fi_opx_context * context, - const uint8_t opcode, - const uint64_t is_context_ext, - const uint64_t is_multi_receive, - const unsigned is_intranode, - const uint64_t is_hmem, - const int lock_required, - const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) -{ - union fi_opx_context * original_context = context; - (void) original_context; - complete_receive_operation_internal(ep, hdr, payload, origin_tag, &context, - opcode, is_context_ext, is_multi_receive, - is_intranode, is_hmem, lock_required, reliability, hfi1_type); - assert(context == original_context); -} - __OPX_FORCE_INLINE__ ssize_t fi_opx_shm_dynamic_tx_connect(const unsigned is_intranode, struct fi_opx_ep * opx_ep, @@ -2222,7 +2096,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, { struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-DATA-HFI-DPUT:%p", rzv_comp); - union fi_opx_context *target_context = rzv_comp->context; + struct opx_context *target_context = rzv_comp->context; assert(target_context); uint64_t* rbuf_qws = (uint64_t *) fi_opx_dput_rbuf_in(hdr->dput.target.rzv.rbuf); @@ -2239,9 +2113,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, const uint64_t *sbuf_qws = (uint64_t*)&payload->byte[0]; #ifdef OPX_HMEM if (target_context->flags & FI_OPX_CQ_CONTEXT_HMEM) { - assert(target_context->flags & FI_OPX_CQ_CONTEXT_EXT); - struct fi_opx_context_ext *ext = rzv_comp->extended_context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) 
target_context->hmem_info_qws; assert(hmem_info->iface > FI_HMEM_SYSTEM); opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, rbuf_qws, sbuf_qws, bytes, @@ -2272,7 +2144,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RX_PROCESS_HEADER_RZV_TID"); FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts); struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr); - union fi_opx_context *target_context = rzv_comp->context; + struct opx_context *target_context = rzv_comp->context; assert(target_context); /* TID packets are mixed 4k/8k packets and length adjusted, @@ -2314,8 +2186,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, "TID REPLAY rbuf_qws %p, sbuf_qws %p, bytes %u/%#x, target_context->byte_counter %p\n", (void*)rbuf_qws, (void*)sbuf_qws, bytes, bytes, &target_context->byte_counter); if (target_context->flags & FI_OPX_CQ_CONTEXT_HMEM) { - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) target_context; - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) target_context->hmem_info_qws; assert(hmem_info->iface > FI_HMEM_SYSTEM); opx_copy_to_hmem(hmem_info->iface, hmem_info->device, hmem_info->hmem_dev_reg_handle, rbuf_qws, sbuf_qws, bytes, @@ -2695,7 +2566,7 @@ uint64_t fi_opx_mp_egr_id_from_nth_packet(const union opx_hfi1_packet_hdr *hdr, __OPX_FORCE_INLINE__ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, - union fi_opx_context *context, + struct opx_context *context, union fi_opx_mp_egr_id mp_egr_id, const unsigned is_intranode, const int lock_required, @@ -2703,7 +2574,6 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep * opx_ep = container_of(ep, struct 
fi_opx_ep, ep_fid); - const uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; const uint64_t is_hmem = context->flags & FI_OPX_CQ_CONTEXT_HMEM; struct fi_opx_hfi1_ue_packet *uepkt = opx_ep->rx->mp_egr_queue.ue.head; @@ -2719,13 +2589,12 @@ void fi_opx_ep_rx_process_pending_mp_eager_ue(struct fid_ep *ep, if (fi_opx_mp_egr_id_from_nth_packet(&uepkt->hdr, slid) == mp_egr_id.id) { - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &uepkt->hdr, &uepkt->payload, 0, /* OFI Tag, N/A for multi-packet eager nth */ context, FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH, - is_context_ext, OPX_MULTI_RECV_FALSE, OPX_INTRANODE_FALSE, is_hmem, @@ -2768,8 +2637,8 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; assert((kind == FI_OPX_KIND_TAG && opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) || (kind == FI_OPX_KIND_MSG && opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST)); - union fi_opx_context * context = (union fi_opx_context *) opx_ep->rx->queue[kind].mq.head; - union fi_opx_context * prev = NULL; + struct opx_context *context = (struct opx_context *) opx_ep->rx->queue[kind].mq.head; + struct opx_context *prev = NULL; while ( context && @@ -2809,13 +2678,12 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, (struct slist_entry *) context, (struct slist_entry *) prev); - uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; uint64_t is_hmem = context->flags & FI_OPX_CQ_CONTEXT_HMEM; + /* Copy this packet's payload to the context's buffer. 
*/ - complete_receive_operation_internal(ep, hdr, payload, - hdr->match.ofi_tag, &context, + opx_ep_complete_receive_operation(ep, hdr, payload, + hdr->match.ofi_tag, context, opcode, - is_context_ext, OPX_MULTI_RECV_FALSE, OPX_INTRANODE_FALSE, /* Should always be false for mp_eager */ is_hmem, @@ -2839,12 +2707,11 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, } else { context->next = NULL; - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + lock_required); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_eager_first); } @@ -2871,8 +2738,8 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, /* Search mp-eager queue for the context w/ matching mp-eager ID */ const uint64_t mp_egr_id = fi_opx_mp_egr_id_from_nth_packet(hdr, slid); - union fi_opx_context *context = (union fi_opx_context *) opx_ep->rx->mp_egr_queue.mq.head; - union fi_opx_context *prev = NULL; + struct opx_context *context = (struct opx_context *) opx_ep->rx->mp_egr_queue.mq.head; + struct opx_context *prev = NULL; FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(length); @@ -2899,14 +2766,12 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_nth_match); /* We found a match! 
*/ - const uint64_t is_context_ext = context->flags & FI_OPX_CQ_CONTEXT_EXT; - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, hdr, payload, 0, /* OFI Tag, N/A for multi-packet eager nth */ context, opcode, // FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH - is_context_ext, OPX_MULTI_RECV_FALSE, is_intranode, context->flags & FI_OPX_CQ_CONTEXT_HMEM, @@ -2920,12 +2785,11 @@ void fi_opx_ep_rx_process_header_mp_eager_nth(struct fid_ep *ep, (struct slist_entry *) context, (struct slist_entry *) prev); - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + lock_required); } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_eager_nth); @@ -2982,8 +2846,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, assert(static_flags & (FI_TAGGED | FI_MSG)); const uint64_t kind = (static_flags & FI_TAGGED) ? 
FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; - union fi_opx_context * context = (union fi_opx_context *) opx_ep->rx->queue[kind].mq.head; - union fi_opx_context * prev = NULL; + struct opx_context *context = (struct opx_context *) opx_ep->rx->queue[kind].mq.head; + struct opx_context *prev = NULL; while (OFI_LIKELY(context != NULL) && !is_match(opx_ep, @@ -3038,9 +2902,8 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, context->next = NULL; - complete_receive_operation(ep, hdr, payload, + opx_ep_complete_receive_operation(ep, hdr, payload, hdr->match.ofi_tag, context, opcode, - rx_op_flags & FI_OPX_CQ_CONTEXT_EXT, OPX_MULTI_RECV_FALSE, is_intranode, rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM, @@ -3049,7 +2912,6 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, hfi1_type); return; - } /* @@ -3059,12 +2921,10 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, const uint64_t recv_len = context->len; const uint64_t send_len = fi_opx_hfi1_packet_hdr_message_length(hdr); - assert(!(context->flags & FI_OPX_CQ_CONTEXT_EXT)); assert(!(context->flags & FI_OPX_CQ_CONTEXT_HMEM)); if (OFI_LIKELY(send_len <= recv_len)) { - complete_receive_operation(ep, hdr, payload, + opx_ep_complete_receive_operation(ep, hdr, payload, 0, context, opcode, - OPX_CONTEXT_EXTENDED_FALSE, OPX_MULTI_RECV_TRUE, is_intranode, OPX_HMEM_FALSE, @@ -3309,48 +3169,33 @@ void fi_opx_ep_rx_poll (struct fid_ep *ep, } __OPX_FORCE_INLINE__ -int fi_opx_ep_cancel_context(struct fi_opx_ep * opx_ep, +int fi_opx_ep_cancel_context(struct fi_opx_ep *opx_ep, const uint64_t cancel_context, - union fi_opx_context * context, + struct opx_context *context, const uint64_t rx_op_flags, - const uint64_t is_context_ext, const int lock_required) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "unimplemented; abort\n"); abort(); - const uint64_t compare_context = is_context_ext ? 
- (uint64_t)(((struct fi_opx_context_ext *)context)->msg.op_context) : - (uint64_t)context; + const uint64_t compare_context = (uint64_t) context->err_entry.op_context; if (compare_context == cancel_context) { - struct fi_opx_context_ext * ext; - if (is_context_ext) { - ext = (struct fi_opx_context_ext *)context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - return -FI_ENOMEM; - } - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - } - - ext->opx_context.byte_counter = 0; - ext->err_entry.op_context = (void *)cancel_context; - ext->err_entry.flags = rx_op_flags; - ext->err_entry.len = 0; - ext->err_entry.buf = 0; - ext->err_entry.data = 0; - ext->err_entry.tag = context->tag; - ext->err_entry.olen = 0; - ext->err_entry.err = FI_ECANCELED; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + context->byte_counter = 0; + context->err_entry.flags = rx_op_flags; + context->err_entry.len = 0; + context->err_entry.buf = 0; + context->err_entry.data = 0; + context->err_entry.tag = context->tag; + context->err_entry.olen = 0; + context->err_entry.err = FI_ECANCELED; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; /* post an 'error' completion event for the canceled receive */ if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - slist_insert_tail((struct slist_entry *) ext, opx_ep->rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); return FI_ECANCELED; } @@ -3361,8 +3206,7 @@ int fi_opx_ep_cancel_context(struct fi_opx_ep * opx_ep, __OPX_FORCE_INLINE__ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, const uint64_t static_flags, - union fi_opx_context * context, - const uint64_t is_context_ext, + struct opx_context * context, const uint64_t is_hmem, const int 
lock_required, const enum ofi_reliability_kind reliability, @@ -3398,19 +3242,18 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, const unsigned is_intranode = opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type); if (is_mp_eager) { - complete_receive_operation_internal(ep, - &uepkt->hdr, - &uepkt->payload, - uepkt->hdr.match.ofi_tag, - &context, - uepkt->hdr.bth.opcode, - is_context_ext, - OPX_MULTI_RECV_FALSE, - is_intranode, - is_hmem, - lock_required, - reliability, - hfi1_type); + opx_ep_complete_receive_operation(ep, + &uepkt->hdr, + &uepkt->payload, + uepkt->hdr.match.ofi_tag, + context, + uepkt->hdr.bth.opcode, + OPX_MULTI_RECV_FALSE, + is_intranode, + is_hmem, + lock_required, + reliability, + hfi1_type); /* Since this is the first multi-packet eager packet, the uid portion of the mp_egr_id will be this packet's PSN */ @@ -3437,22 +3280,20 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.recv_completed_process_context); context->next = NULL; - if (OFI_UNLIKELY(is_context_ext && - ((struct fi_opx_context_ext *)context)->err_entry.err == FI_ETRUNC)) { + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); } else { fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + lock_required); } } } else { - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &uepkt->hdr, &uepkt->payload, uepkt->hdr.match.ofi_tag, context, uepkt->hdr.bth.opcode, - is_context_ext, OPX_MULTI_RECV_FALSE, is_intranode, is_hmem, @@ -3487,12 +3328,10 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, } /* rx_op_flags is only checked for FI_PEEK | FI_CLAIM | FI_MULTI_RECV - * rx_op_flags is only used if FI_PEEK | FI_CLAIM | cancel_context - * is_context_ext is only used if FI_PEEK | cancel_context | iovec + * 
rx_op_flags is only used if FI_PEEK | FI_CLAIM * * The "normal" data movement functions, such as fi_[t]recv(), can safely - * specify '0' for cancel_context, rx_op_flags, and is_context_ext, in - * order to reduce code path. + * specify '0' for rx_op_flags in order to reduce code path. * * TODO - use payload pointer? keep data in hfi eager buffer as long * as possible to avoid memcpy? @@ -3501,34 +3340,24 @@ __OPX_FORCE_INLINE__ int fi_opx_ep_rx_process_context ( struct fi_opx_ep * opx_ep, const uint64_t static_flags, - const uint64_t cancel_context, union fi_opx_context * context, - const uint64_t rx_op_flags, const uint64_t is_context_ext, + struct opx_context *context, + const uint64_t rx_op_flags, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { - if (cancel_context) { /* branch should compile out */ - int rc = fi_opx_ep_cancel_context(opx_ep, cancel_context, context, - rx_op_flags, is_context_ext, lock_required); - - if (rc != FI_SUCCESS) return rc; - } - if (OFI_LIKELY((rx_op_flags & (FI_PEEK | FI_CLAIM | FI_MULTI_RECV)) == 0)) { if (is_hmem) { /* branch should compile out */ - assert(is_context_ext); return fi_opx_ep_process_context_match_ue_packets(opx_ep, static_flags, context, - OPX_CONTEXT_EXTENDED_TRUE, - OPX_HMEM_TRUE, - lock_required, reliability, hfi1_type); + OPX_HMEM_TRUE, lock_required, + reliability, hfi1_type); } return fi_opx_ep_process_context_match_ue_packets(opx_ep, static_flags, context, - OPX_CONTEXT_EXTENDED_FALSE, - OPX_HMEM_FALSE, - lock_required, reliability, hfi1_type); + OPX_HMEM_FALSE, lock_required, + reliability, hfi1_type); } else { /* @@ -3539,7 +3368,8 @@ int fi_opx_ep_rx_process_context ( "process peek, claim, or multi-receive context\n"); fi_opx_ep_rx_process_context_noinline(opx_ep, static_flags, - context, rx_op_flags, is_context_ext, is_hmem, lock_required, av_type, reliability, hfi1_type); + context, 
rx_op_flags, is_hmem, lock_required, av_type, + reliability, hfi1_type); } return 0; @@ -3579,7 +3409,8 @@ fi_addr_t fi_opx_ep_get_src_addr(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *user_context, const int lock_required, const enum fi_av_type av_type, const uint64_t static_flags, const enum ofi_reliability_kind reliability, @@ -3592,40 +3423,44 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_TAGGED, opx_ep->debug_counters.recv.posted_recv_tag); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== POST RECV: context = %p\n", context); + "===================================== POST RECV: context = %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "POST-RECV"); + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + const uint64_t rx_op_flags = opx_ep->rx->op_flags; uint64_t rx_caps = opx_ep->rx->caps; - assert(context); - assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ - union fi_opx_context * opx_context = (union fi_opx_context *)context; - opx_context->flags = rx_op_flags; - opx_context->len = len; - opx_context->buf = buf; + context->next = NULL; + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->flags = rx_op_flags; + context->len = len; + context->buf = buf; + context->src_addr = (rx_caps & FI_DIRECTED_RECV) + ? 
fi_opx_ep_get_src_addr(opx_ep, av_type, src_addr) + : FI_ADDR_UNSPEC; + context->tag = tag; + context->ignore = ignore; + context->byte_counter = (uint64_t)-1; - if (rx_caps & FI_DIRECTED_RECV) { - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, src_addr); - } else { - opx_context->src_addr = FI_ADDR_UNSPEC; - } #ifdef FI_OPX_TRACE fprintf(stderr,"fi_opx_recv_generic from source addr:\n"); - FI_OPX_ADDR_DUMP(&opx_context->src_addr); + FI_OPX_ADDR_DUMP(&context->src_addr); #endif - opx_context->tag = tag; - opx_context->ignore = ignore; - opx_context->byte_counter = (uint64_t)-1; - assert(IS_PROGRESS_MANUAL(opx_ep->domain)); if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "process context (check unexpected queue, append match queue)\n"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "process context (check unexpected queue, append match queue)\n"); #ifdef OPX_HMEM uint64_t hmem_device; @@ -3633,29 +3468,17 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, if (hmem_iface != FI_HMEM_SYSTEM) { FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_MSG, opx_ep->debug_counters.hmem.posted_recv_msg); FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_TAGGED, opx_ep->debug_counters.hmem.posted_recv_tag); - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== POST RECV RETURN FI_ENOMEM\n"); - OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECV"); - return -FI_ENOMEM; - } - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &ext->hmem_info_qws[0]; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &context->hmem_info_qws[0]; hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; hmem_info->hmem_dev_reg_handle = ((struct fi_opx_mr 
*)desc)->hmem_dev_reg_handle; - ext->err_entry.err = 0; - ext->opx_context = *opx_context; - ext->opx_context.flags = rx_op_flags | FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM; - ext->msg.op_context = (struct fi_context2 *) context; + context->flags |= FI_OPX_CQ_CONTEXT_HMEM; fi_opx_ep_rx_process_context(opx_ep, static_flags, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, + context, 0, // rx_op_flags - OPX_CONTEXT_EXTENDED_TRUE, OPX_HMEM_TRUE, lock_required, av_type, @@ -3666,10 +3489,8 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, { fi_opx_ep_rx_process_context(opx_ep, static_flags, - OPX_CANCEL_CONTEXT_FALSE, context, 0, // rx_op_flags - OPX_CONTEXT_EXTENDED_FALSE, OPX_HMEM_FALSE, lock_required, av_type, @@ -3678,7 +3499,8 @@ ssize_t fi_opx_ep_rx_recv_internal (struct fi_opx_ep *opx_ep, } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECV"); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECV RETURN\n"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== POST RECV RETURN\n"); return 0; } @@ -3706,12 +3528,17 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, FI_OPX_DEBUG_COUNTERS_INC_COND(!(flags & FI_MULTI_RECV), opx_ep->debug_counters.recv.posted_recv_msg); FI_OPX_DEBUG_COUNTERS_INC_COND((flags & FI_MULTI_RECV), opx_ep->debug_counters.recv.posted_multi_recv); assert(!lock_required); - assert(msg->context); - assert(((uintptr_t)msg->context & 0x07ull) == 0); /* must be 8 byte aligned */ - if (OFI_LIKELY(flags & FI_MULTI_RECV)) { - union fi_opx_context * opx_context = (union fi_opx_context *) msg->context; + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->next = NULL; + context->err_entry.err = 0; + context->err_entry.op_context = msg->context; 
+ if (OFI_LIKELY(flags & FI_MULTI_RECV)) { uint64_t len = msg->msg_iov[0].iov_len; void * base = msg->msg_iov[0].iov_base; @@ -3723,20 +3550,16 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, base = (void *)new_base; } assert(((uintptr_t)base & 0x07ull) == 0); - assert(len >= (sizeof(union fi_opx_context) + opx_ep->rx->min_multi_recv)); - opx_context->flags = FI_MULTI_RECV; - opx_context->len = len - sizeof(union fi_opx_context); - opx_context->buf = (void *)((uintptr_t)base + sizeof(union fi_opx_context)); - opx_context->next = NULL; - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - opx_context->byte_counter = 0; - opx_context->multi_recv_next = (union fi_opx_context *)base; - opx_context->ignore = (uint64_t)-1; + assert(len >= (sizeof(struct opx_context) + opx_ep->rx->min_multi_recv)); + context->flags = FI_MULTI_RECV; + context->len = len - sizeof(struct opx_context); + context->buf = (void *)((uintptr_t)base + sizeof(struct opx_context)); + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->byte_counter = 0; + context->ignore = (uint64_t)-1; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, @@ -3746,20 +3569,16 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, return rc; } else if (msg->iov_count == 0) { - union fi_opx_context * opx_context = (union fi_opx_context *) msg->context; - opx_context->flags = flags; - opx_context->len = 0; - opx_context->buf = NULL; - opx_context->next = NULL; - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - opx_context->tag = 0; - opx_context->ignore = (uint64_t)-1; - opx_context->byte_counter = (uint64_t)-1; + context->flags = flags; + context->len = 0; + context->buf = NULL; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, 
msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->byte_counter = (uint64_t)-1; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, @@ -3790,34 +3609,22 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, #endif if (hmem_iface != FI_HMEM_SYSTEM) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.posted_recv_msg); - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== POST RECVMSG (HMEM) RETURN FI_ENOMEM\n"); - OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECVMSG"); - return -FI_ENOMEM; - } - - ext->err_entry.err = 0; - ext->opx_context.flags = flags | FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM; - ext->opx_context.len = msg->msg_iov[0].iov_len; - ext->opx_context.buf = msg->msg_iov[0].iov_base; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->opx_context.src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - ext->opx_context.tag = 0; - ext->opx_context.ignore = (uint64_t)-1; - ext->msg.op_context = (struct fi_context2 *)msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; - - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &ext->hmem_info_qws[0]; + context->flags = flags | FI_OPX_CQ_CONTEXT_HMEM; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; + context->byte_counter = (uint64_t)-1; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; + + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) 
&context->hmem_info_qws[0]; hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, ext->opx_context.flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, context->flags, OPX_HMEM_TRUE, lock_required, av_type, reliability, @@ -3829,23 +3636,16 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, #endif if (msg->iov_count == 1) { - assert(msg->context); - assert(((uintptr_t)msg->context & 0x07ull) == 0); /* must be 8 byte aligned */ - - union fi_opx_context * opx_context = - (union fi_opx_context *) msg->context; - opx_context->flags = flags; - opx_context->len = msg->msg_iov[0].iov_len; - opx_context->buf = msg->msg_iov[0].iov_base; - opx_context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - opx_context->tag = 0; - opx_context->ignore = (uint64_t)-1; - opx_context->byte_counter = (uint64_t)-1; + context->flags = flags; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->byte_counter = (uint64_t)-1; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, @@ -3857,27 +3657,16 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, /* msg->iov_count > 1 */ - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG RETURN FI_ENOMEM\n"); - OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECVMSG"); - return -FI_ENOMEM; - } - - ext->opx_context.flags = flags | FI_OPX_CQ_CONTEXT_EXT; - ext->opx_context.byte_counter = 
(uint64_t)-1; - ext->opx_context.src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); - ext->opx_context.tag = 0; - ext->opx_context.ignore = (uint64_t)-1; - ext->msg.op_context = (struct fi_context2 *)msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; + context->flags = flags; + context->byte_counter = (uint64_t)-1; + context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); + context->tag = 0; + context->ignore = (uint64_t)-1; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; ssize_t rc = fi_opx_ep_rx_process_context(opx_ep, FI_MSG, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, - ext->opx_context.flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, @@ -3940,7 +3729,7 @@ void fi_opx_ep_tx_cq_completion_rzv(struct fid_ep *ep, struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); assert(context); assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ - union fi_opx_context * opx_context = (union fi_opx_context *)context; + struct opx_context *opx_context = (struct opx_context *) context; opx_context->flags = FI_SEND | (caps & (FI_TAGGED | FI_MSG)); opx_context->len = len; opx_context->buf = NULL; /* receive data buffer */ @@ -4213,42 +4002,24 @@ ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - union fi_opx_context * opx_context = (union fi_opx_context *)context; - uintptr_t byte_counter_ptr; - uint64_t *byte_counter; - uint64_t fake_cntr; ssize_t rc; - if (OFI_LIKELY(do_cq_completion != 0)) { - assert(context); - assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ - byte_counter_ptr = (uintptr_t) &opx_context->byte_counter; - byte_counter = (uint64_t *) &opx_context->byte_counter; - } else { - // Give a 'fake' counter 
here to 'value' part of the SEND_RZV. This - // does look a bit weird, but it saves from a few if checks in - // SEND_RZV and won't store a pointer to the stack variable - // fake_cntr in the RZV protocol headers - byte_counter_ptr = (uintptr_t) NULL; - byte_counter = (uint64_t *) &fake_cntr; - } - do { if (is_contiguous) { rc = FI_OPX_FABRIC_TX_SEND_RZV( ep, buf, len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device, hfi1_type); + caps, reliability, + do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } else { rc = FI_OPX_FABRIC_TX_SENDV_RZV( ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, lock_required, override_flags, tx_op_flags, addr.hfi1_rx, - byte_counter_ptr, - byte_counter, - caps, reliability, hmem_iface, hmem_device, hfi1_type); + caps, reliability, + do_cq_completion, + hmem_iface, hmem_device, hfi1_type); } if (OFI_UNLIKELY(rc == -EAGAIN)) { @@ -4256,11 +4027,6 @@ ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, } } while (rc == -EAGAIN); - if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_completion_rzv(ep, context, len, - lock_required, tag, caps); - } - return rc; } @@ -4533,7 +4299,7 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -4555,7 +4321,7 @@ ssize_t fi_opx_recv_generic(struct fid_ep *ep, const int lock_required, const enum fi_av_type av_type, const uint64_t static_flags, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -4573,7 +4339,7 @@ ssize_t fi_opx_recvmsg_generic(struct fid_ep *ep, const 
struct fi_msg *msg, uint64_t flags, const int lock_required, const enum fi_av_type av_type, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type ) + const enum opx_hfi1_type hfi1_type ) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); diff --git a/prov/opx/include/rdma/opx/fi_opx_eq.h b/prov/opx/include/rdma/opx/fi_opx_eq.h index a3c353f654b..f386f05a2f8 100644 --- a/prov/opx/include/rdma/opx/fi_opx_eq.h +++ b/prov/opx/include/rdma/opx/fi_opx_eq.h @@ -130,7 +130,7 @@ struct fi_opx_cq { struct slist pending; struct slist completed; - struct slist err; /* 'struct fi_opx_context_ext' element linked list */ + struct slist err; struct { uint64_t ep_count; @@ -139,7 +139,6 @@ struct fi_opx_cq { struct fi_opx_progress_track *progress_track; -// struct fi_opx_context_ext *err_tail; uint64_t pad_1[9]; struct fi_opx_domain *domain; @@ -174,8 +173,8 @@ int fi_opx_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, fprintf(stderr,"%s:%s():%d entry_id = %u\n", __FILE__, __func__, __LINE__, (entry)->recv.entry_id); \ }) -int fi_opx_cq_enqueue_err (struct fi_opx_cq * opx_cq, - struct fi_opx_context_ext * ext, +int fi_opx_cq_enqueue_err (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required); struct fi_ops_cq * fi_opx_cq_select_non_locking_2048_ops(const enum fi_cq_format format, @@ -211,8 +210,8 @@ struct fi_ops_cq * fi_opx_cq_select_locking_runtime_ops(const enum fi_cq_format void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); static inline -int fi_opx_cq_enqueue_pending (struct fi_opx_cq * opx_cq, - union fi_opx_context * context, +int fi_opx_cq_enqueue_pending (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required) { @@ -225,8 +224,8 @@ int fi_opx_cq_enqueue_pending (struct fi_opx_cq * opx_cq, static inline -int fi_opx_cq_enqueue_completed (struct fi_opx_cq * opx_cq, - union fi_opx_context * context, +int fi_opx_cq_enqueue_completed (struct 
fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required) { assert(0 == context->byte_counter); @@ -244,11 +243,10 @@ int fi_opx_cq_enqueue_completed (struct fi_opx_cq * opx_cq, static inline size_t fi_opx_cq_fill(uintptr_t output, - union fi_opx_context * context, + struct opx_context *context, const enum fi_cq_format format) { assert(!(context->flags & FI_OPX_CQ_CONTEXT_HMEM)); - assert(!(context->flags & FI_OPX_CQ_CONTEXT_EXT)); const uint64_t is_multi_recv = context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV; size_t return_size; @@ -276,9 +274,9 @@ static inline size_t fi_opx_cq_fill(uintptr_t output, } if (OFI_LIKELY(!is_multi_recv)) { - entry->op_context = (void *)context; + entry->op_context = context->err_entry.op_context; } else { - entry->op_context = (void *)context->multi_recv_context; + entry->op_context = ((struct opx_context *)context->multi_recv_context)->err_entry.op_context; } return return_size; @@ -301,58 +299,64 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, /* examine each context in the pending completion queue and, if the * operation is complete, initialize the cq entry in the application * buffer and remove the context from the queue. 
*/ - union fi_opx_context * pending_head = (union fi_opx_context *) opx_cq->pending.head; - union fi_opx_context * pending_tail = (union fi_opx_context *) opx_cq->pending.tail; + struct opx_context *pending_head = (struct opx_context *) opx_cq->pending.head; + struct opx_context *pending_tail = (struct opx_context *) opx_cq->pending.tail; if (NULL != pending_head) { - union fi_opx_context * context = pending_head; - union fi_opx_context * prev = NULL; + struct opx_context *context = pending_head; + struct opx_context *prev = NULL; while ((count - num_entries) > 0 && context != NULL) { const uint64_t byte_counter = context->byte_counter; if (byte_counter == 0) { + bool free_context; if (context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV) { assert(!(context->flags & FI_OPX_CQ_CONTEXT_HMEM)); - assert(!(context->flags & FI_OPX_CQ_CONTEXT_EXT)); - union fi_opx_context *multi_recv_context = context->multi_recv_context; + struct opx_context *multi_recv_context = context->multi_recv_context; assert(multi_recv_context != NULL); multi_recv_context->byte_counter-=1; assert(((int64_t)multi_recv_context->byte_counter) >= 0); // Reusing byte counter as pending flag // re-using tag to store the min multi_receive struct fi_opx_ep * opx_ep = (struct fi_opx_ep *)multi_recv_context->tag; - if(multi_recv_context->len < opx_ep->rx->min_multi_recv && - multi_recv_context->byte_counter == 0) { + if (multi_recv_context->len < opx_ep->rx->min_multi_recv && + multi_recv_context->byte_counter == 0) { /* Signal the user to repost their buffers */ assert(multi_recv_context->next == NULL); slist_insert_tail((struct slist_entry *) multi_recv_context, opx_ep->rx->cq_completed_ptr); } - } else if (context->flags & FI_OPX_CQ_CONTEXT_EXT) { - struct fi_opx_context_ext *ext = (struct fi_opx_context_ext *) context; - context = (union fi_opx_context *) ext->msg.op_context; - *context = ext->opx_context; - context->flags &= ~(FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM); - OPX_BUF_FREE(ext); + 
free_context = false; + } else { + free_context = true; } + context->flags &= ~FI_OPX_CQ_CONTEXT_HMEM; output += fi_opx_cq_fill(output, context, format); - ++ num_entries; + ++num_entries; - if (prev) + if (prev) { prev->next = context->next; - else + } else { /* remove the head */ pending_head = context->next; + } + + struct opx_context *next = context->next; - if (!(context->next)) + if (!next) { /* remove the tail */ pending_tail = prev; - } - else + } + if (free_context) { + OPX_BUF_FREE(context); + } + context = next; + } else { prev = context; - context = context->next; + context = context->next; + } } /* save the updated pending head and pending tail pointers */ @@ -361,13 +365,17 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, } - union fi_opx_context * head = (union fi_opx_context *) opx_cq->completed.head; + struct opx_context *head = (struct opx_context *) opx_cq->completed.head; if (head) { - union fi_opx_context * context = head; + struct opx_context *context = head; while ((count - num_entries) > 0 && context != NULL) { output += fi_opx_cq_fill(output, context, format); - ++ num_entries; - context = context->next; + ++num_entries; + struct opx_context *next = context->next; + if (!(context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV)) { + OPX_BUF_FREE(context); + } + context = next; } opx_cq->completed.head = (struct slist_entry *) context; if (!context) opx_cq->completed.tail = NULL; @@ -464,11 +472,15 @@ ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, if (0 == (tmp_eh | tmp_ph)) { uintptr_t output = (uintptr_t) buf; - union fi_opx_context * context = (union fi_opx_context *)tmp_ch; + struct opx_context *context = (struct opx_context *) tmp_ch; while ((count - num_entries) > 0 && context != NULL) { output += fi_opx_cq_fill(output, context, format); ++ num_entries; - context = context->next; + struct opx_context *next = context->next; + if (!(context->flags & FI_OPX_CQ_CONTEXT_MULTIRECV)) { + 
OPX_BUF_FREE(context); + } + context = next; } opx_cq->completed.head = (struct slist_entry *) context; if (!context) opx_cq->completed.tail = NULL; diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 353b1db6399..746ba96aa2c 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -288,19 +288,19 @@ struct fi_opx_hfi1_stl_packet_hdr_16B { formats, but in units of DWs for 9B formats.*/ __le32 pktlen:11; __le32 b:1; - + __le32 dlid:20; /* dw[1] */ __le32 sc:5; __le32 rc:3; __le32 f:1; __le32 l2:2; __le32 lt:1; - + __le32 l4:8; /* dw[2] qw[1] */ __le32 slid20:4; __le32 dlid20:4; __le32 pkey:16; - + __le32 entropy:16; /* dw[3] */ __le32 age:3; __le32 cspec:5; @@ -1290,7 +1290,7 @@ union opx_hfi1_packet_hdr { static inline -fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union opx_hfi1_packet_hdr * const hdr, +fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union opx_hfi1_packet_hdr * const hdr, const uint64_t slid) { const union fi_opx_uid uid = { @@ -1394,7 +1394,7 @@ void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, const pid_t pid = getpid(); //fi_opx_hfi1_dump_stl_packet_hdr (hdr, hfi1_type, fn, ln); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u ==== dump packet header @ %p [%016lx %016lx %016lx %016lx]\n", pid, fn, ln, hdr, qw[0], qw[1], qw[2], qw[3]); - if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.flags ........... 0x%04hx\n", pid, fn, ln, hdr->lrh_9B.flags); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.dlid ............ 0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.dlid, hdr->lrh_9B.dlid, ntohs(hdr->lrh_9B.dlid)); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .lrh.pktlen .......... 
0x%04hx (be: %5hu, le: %5hu)\n", pid, fn, ln, hdr->lrh_9B.pktlen, hdr->lrh_9B.pktlen, ntohs(hdr->lrh_9B.pktlen)); @@ -1466,26 +1466,47 @@ void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, return; } -#define OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) \ - if (__hfi1_type & OPX_HFI1_JKR) { \ - OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ - __hdr->qw_16B[1]); \ - OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ - __hdr->qw_16B[3]); \ - } else { \ - fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ - __func__, __LINE__); \ +#else +// Disable the macros +#define OPX_JKR_PRINT_16B_PBC(a) +#define OPX_JKR_PRINT_16B_LRH(a,b) +#define OPX_JKR_PRINT_16B_BTH(a,b) + +void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func); +void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func); +void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func); + +static inline +void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, + const enum opx_hfi1_type hfi1_type, + const char * fn, const unsigned ln) +{ + return; +} + +#endif + +#ifdef OPX_JKR_DEBUG +#define OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) \ + if (__hfi1_type & OPX_HFI1_JKR) { \ + OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ + __hdr->qw_16B[1]); \ + OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ + __hdr->qw_16B[3]); \ + } else { \ + fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ + __func__, __LINE__);\ } #define OPX_DEBUG_PRINT_PBC_HDR(__pbc,__hdr,__hfi1_type) \ if (__hfi1_type & OPX_HFI1_JKR) { \ OPX_JKR_PRINT_16B_PBC(__pbc); \ - OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ + OPX_JKR_PRINT_16B_LRH(__hdr->qw_16B[0], \ __hdr->qw_16B[1]); \ - OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ + OPX_JKR_PRINT_16B_BTH(__hdr->qw_16B[2], \ __hdr->qw_16B[3]); \ } else { \ - fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ + fi_opx_hfi1_dump_packet_hdr(__hdr, __hfi1_type, \ __func__, __LINE__);\ } @@ -1497,22 +1518,6 @@ void fi_opx_hfi1_dump_packet_hdr (const union 
opx_hfi1_packet_hdr * const hdr, } #else -// Disable the macros -#define OPX_JKR_PRINT_16B_PBC(a) -#define OPX_JKR_PRINT_16B_LRH(a,b) -#define OPX_JKR_PRINT_16B_BTH(a,b) - -void opx_jkr_print_16B_pbc(uint64_t pbc1, const char* func); -void opx_jkr_print_16B_lrh(uint64_t lrh1, uint64_t lrh2, const char* func); -void opx_jkr_print_16B_bth(uint64_t bth1, uint64_t bth2, const char* func); - -static inline -void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, - const enum opx_hfi1_type hfi1_type, - const char * fn, const unsigned ln) -{ - return; -} #define OPX_DEBUG_PRINT_HDR(__hdr,__hfi1_type) #define OPX_DEBUG_PRINT_PBC_HDR(__pbc,__hdr,__hfi1_type) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h index 370f5306e33..9e445a4e415 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h @@ -359,10 +359,10 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, const int lock_required, const enum ofi_reliability_kind reliability, const uint8_t origin_rx, - const uint64_t rhf, - const enum opx_hfi1_type hfi1_type, - const uint64_t slid, - const uint16_t pktlen) + const uint64_t rhf, + const enum opx_hfi1_type hfi1_type, + const uint64_t slid, + const uint16_t pktlen) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "================ received a packet from the fabric\n"); @@ -374,8 +374,8 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, FI_OPX_HFI_BTH_OPCODE_TAG_INJECT, origin_rx, OPX_INTRANODE_FALSE, - lock_required, reliability, - hfi1_type, slid); + lock_required, reliability, + hfi1_type, slid); } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { /* all other "tag" packets */ fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, NULL, 0, opcode, @@ -402,7 +402,7 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, uint16_t 
lrh_pktlen_le; size_t total_bytes_to_copy; size_t payload_bytes_to_copy; - + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { lrh_pktlen_le = ntohs(pktlen); total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ @@ -522,8 +522,10 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, */ if (OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type)) { const uint32_t rhf_msb = rhf_rcvd >> 32; +#ifdef OPX_JKR_DEBUG FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "OPX_RHF_SEQ_MATCH = %d rhf_rcvd = %#lx rhf_seq = %#lx\n", - OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type), rhf_rcvd, rhf_seq); + OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type), rhf_rcvd, rhf_seq); +#endif const uint64_t hdrq_offset_dws = (rhf_msb >> 12) & 0x01FFu; @@ -566,7 +568,7 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, slid = htons((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid)); /* BE for lower layers */ pktlen = (uint16_t) hdr->lrh_16B.pktlen; /* pass it down unchanged. lower layers handle BE/LE */ dlid = htons(((hdr->lrh_16B.dlid20 << 20) | (hdr->lrh_16B.dlid))); /* BE for lower layers */ - } + } if (OFI_UNLIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_UD)) { @@ -575,7 +577,7 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, * process "unreliable datagram" packets first - before all the * software reliability protocol checks. 
*/ - return fi_opx_hfi1_handle_ud_packet(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd, + return fi_opx_hfi1_handle_ud_packet(opx_ep, hdr, rhf_seq, hdrq_offset, rhf_rcvd, slid, dlid, pktlen, hfi1_type); } @@ -596,7 +598,7 @@ unsigned fi_opx_hfi1_poll_once(struct fid_ep *ep, const int lock_required, } fi_opx_hfi1_handle_packet(opx_ep, opcode, hdr, rhf_seq, - hdrq_offset, lock_required, reliability, origin_rx, rhf_rcvd, + hdrq_offset, lock_required, reliability, origin_rx, rhf_rcvd, hfi1_type, slid, pktlen); return 1; /* one packet was processed */ } @@ -631,7 +633,7 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, dlid = hdr->lrh_9B.dlid; } else { dlid = htons((hdr->lrh_16B.dlid20 << 20) | (hdr->lrh_16B.dlid)); - } + } assert(dlid == opx_ep->rx->self.uid.lid); assert(hdr->bth.rx == opx_ep->rx->self.hfi1_rx || @@ -718,7 +720,7 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ uint16_t lrh_pktlen_le; - size_t total_bytes_to_copy; + size_t total_bytes_to_copy; size_t payload_bytes_to_copy; if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index ab3efa4c511..fcbdbf1a3f9 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -75,8 +75,8 @@ // Function for performing FI_INJECT_COMPLETIONs. 
__OPX_FORCE_INLINE__ -void fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, - void *context, +ssize_t fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, + void *user_context, const size_t len, const int lock_required, const uint64_t tag, @@ -91,20 +91,28 @@ void fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, /* initialize the completion entry */ struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - assert(context); - assert(((uintptr_t)context & 0x07ull) == 0); /* must be 8 byte aligned */ assert((caps & (FI_TAGGED | FI_MSG)) != (FI_TAGGED | FI_MSG)); - union fi_opx_context * opx_context = (union fi_opx_context *)context; - opx_context->flags = FI_SEND | (caps & (FI_TAGGED | FI_MSG)); - opx_context->len = len; - opx_context->buf = NULL; /* receive data buffer */ - opx_context->byte_counter = 0; - opx_context->tag = tag; - opx_context->next = NULL; + + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->flags = FI_SEND | (caps & (FI_TAGGED | FI_MSG)); + context->len = len; + context->buf = NULL; /* receive data buffer */ + context->byte_counter = 0; + context->tag = tag; + context->next = NULL; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== TX CQ COMPLETION QUEUED\n"); - slist_insert_tail((struct slist_entry *) opx_context, opx_ep->tx->cq_completed_ptr); + slist_insert_tail((struct slist_entry *) context, opx_ep->tx->cq_completed_ptr); + + return FI_SUCCESS; } // faster than memcpy() for this amount of data. 
@@ -653,7 +661,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr * const hdr, const void * const payload, const uint8_t u8_rx, const uint64_t niov, uintptr_t origin_byte_counter_vaddr, - union fi_opx_context *const target_context, + struct opx_context *const target_context, const uintptr_t dst_vaddr, const enum fi_hmem_iface dst_iface, const uint64_t dst_device, @@ -1398,14 +1406,13 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, if (xfer_bytes_tail) { ssize_t tail_len = xfer_bytes_tail; remain = total_len - tail_len; - while (false == - fi_opx_hfi1_fill_from_iov8( - iov_ptr, /* In: iovec array */ - *niov_ptr, /* In: total iovecs */ - buf, /* In: target buffer to fill */ - &tail_len, /* In/Out: buffer length to fill */ - &iov_idx, /* In/Out: start index, returns end */ - &iov_base_offset)) { /* In/Out: start offset, returns offset */ + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + buf, /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ // copy until done; } assert(tail_len == 0); @@ -1416,8 +1423,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, (union fi_opx_hfi1_packet_payload *)(hdr + 1); buf = payload->byte; - while (false == - fi_opx_hfi1_fill_from_iov8( + while (false == fi_opx_hfi1_fill_from_iov8( iov_ptr, /* In: iovec array */ *niov_ptr, /* In: total iovecs */ buf, /* In: target buffer to fill */ @@ -1431,15 +1437,17 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, - lock_required, tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = 
FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SENDV, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } + __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, @@ -1558,28 +1566,27 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz OPX_NO_16B_SUPPORT(hfi1_type); - replay->scb_9B.qw0 = opx_ep->tx->send_9B.qw0 | - OPX_PBC_LEN(pbc_dws, hfi1_type) | - OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); + replay->scb_9B.qw0 = opx_ep->tx->send_9B.qw0 | + OPX_PBC_LEN(pbc_dws, hfi1_type) | + OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); replay->scb_9B.hdr.qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); replay->scb_9B.hdr.qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER + : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); replay->scb_9B.hdr.qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2] | psn; replay->scb_9B.hdr.qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); replay->scb_9B.hdr.qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); if (xfer_bytes_tail) { ssize_t tail_len = xfer_bytes_tail; remain = total_len - tail_len; - while (false == - fi_opx_hfi1_fill_from_iov8( - iov_ptr, /* In: iovec array */ - *niov_ptr, /* In: total iovecs */ - &replay->scb_9B.hdr.qw_9B[5], /* In: target buffer to fill */ - &tail_len, /* In/Out: buffer length to fill */ - &iov_idx, /* In/Out: start index, returns end */ - &iov_base_offset)) { /* In/Out: start offset, returns offset */ + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + &replay->scb_9B.hdr.qw_9B[5], /* In: target buffer to fill */ + &tail_len, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ // copy until done; } assert(tail_len == 0); @@ -1588,14 +1595,13 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz remain = total_len - xfer_bytes_tail; uint64_t *payload = replay->payload; - while (false == - fi_opx_hfi1_fill_from_iov8( - iov_ptr, /* In: iovec array */ - *niov_ptr, /* In: total iovecs */ - payload, /* In: target buffer to fill */ - &remain, /* In/Out: buffer length to fill */ - &iov_idx, /* In/Out: start index, returns end */ - &iov_base_offset)) { /* In/Out: start offset, returns offset */ + while (false == fi_opx_hfi1_fill_from_iov8( + iov_ptr, /* In: iovec array */ + *niov_ptr, /* In: total iovecs */ + payload, /* In: target buffer to fill */ + &remain, /* In/Out: buffer length to fill */ + &iov_idx, /* In/Out: start index, returns end */ + &iov_base_offset)) { /* In/Out: start offset, returns offset */ // 
copy until done; } @@ -1603,24 +1609,24 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz &opx_ep->reliability->state, addr.reliability_rx, dest_rx, psn_ptr, replay, reliability, hfi1_type); - if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, - lock_required, tag, caps); - } - fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + ssize_t rc; + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SENDV, HFI -- EAGER (end)\n"); - - return FI_SUCCESS; + return rc; } - __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, @@ -1744,17 +1750,17 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, - lock_required, tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-SHM"); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SENDV 16B, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } - __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, @@ -1924,24 +1930,24 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, &opx_ep->reliability->state, addr.reliability_rx, dest_rx, psn_ptr, replay, reliability, hfi1_type); - 
if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, - lock_required, tag, caps); - } - fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + ssize_t rc; + if (OFI_LIKELY(do_cq_completion)) { + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, total_len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SENDV 16B, HFI -- EAGER (end)\n"); - - return FI_SUCCESS; + return rc; } - __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_sendv_egr_select(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, @@ -2094,14 +2100,15 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); if (do_cq_completion) { - fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, - tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } @@ -2205,14 +2212,15 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); if (do_cq_completion) { - fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, - tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND 16B, SHM -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } @@ -2363,8 +2371,6 @@ ssize_t 
fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep } - - __OPX_FORCE_INLINE__ ssize_t fi_opx_hfi1_tx_egr_store_full_payload_blocks(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, @@ -2466,7 +2472,6 @@ ssize_t fi_opx_hfi1_tx_egr_store_payload_tail(struct fi_opx_ep *opx_ep, return 1; /* Consumed 1 credit */ } - __OPX_FORCE_INLINE__ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, const union fi_opx_addr addr, @@ -2632,16 +2637,18 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, xfer_bytes_tail, local_temp, buf, payload_qws_total, reliability, hfi1_type); + ssize_t rc; if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, len, - lock_required, tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, HFI -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } __OPX_FORCE_INLINE__ @@ -2802,16 +2809,18 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, xfer_bytes_tail, local_temp, buf, payload_qws_total, reliability, hfi1_type); + ssize_t rc; if (OFI_LIKELY(do_cq_completion)) { - fi_opx_ep_tx_cq_inject_completion(ep, context, len, - lock_required, tag, caps); + rc = fi_opx_ep_tx_cq_inject_completion(ep, context, len, lock_required, tag, caps); + } else { + rc = FI_SUCCESS; } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-EAGER-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND 16B, HFI -- EAGER (end)\n"); - return FI_SUCCESS; + return rc; } __OPX_FORCE_INLINE__ @@ -3770,33 +3779,36 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, 
ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context, const uint32_t data, int lock_required, + void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *context, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *context, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); 
@@ -3806,9 +3818,10 @@ ssize_t fi_opx_hfi1_tx_send_rzv_select(struct fid_ep *ep, const void *buf, size_ fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) @@ -3816,21 +3829,18 @@ ssize_t fi_opx_hfi1_tx_send_rzv_select(struct fid_ep *ep, const void *buf, size_ if (hfi1_type & OPX_HFI1_WFR) { return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, override_flags, tx_op_flags, dest_rx, - origin_byte_counter_vaddr, - origin_byte_counter_value, - caps, reliability, hmem_iface, hmem_device, OPX_HFI1_WFR); + caps, reliability, do_cq_completion, hmem_iface, + hmem_device, OPX_HFI1_WFR); } else if (hfi1_type & OPX_HFI1_JKR) { return fi_opx_hfi1_tx_send_rzv_16B(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, override_flags, tx_op_flags, dest_rx, - origin_byte_counter_vaddr, - origin_byte_counter_value, - caps, reliability, hmem_iface, hmem_device, OPX_HFI1_JKR); + caps, reliability, do_cq_completion, hmem_iface, + hmem_device, OPX_HFI1_JKR); } else if (hfi1_type & OPX_HFI1_JKR_9B) { return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, override_flags, tx_op_flags, dest_rx, - origin_byte_counter_vaddr, - origin_byte_counter_value, - caps, reliability, hmem_iface, hmem_device, OPX_HFI1_JKR_9B); + caps, reliability, do_cq_completion, hmem_iface, + hmem_device, OPX_HFI1_JKR_9B); } abort(); return (ssize_t)-1L; diff --git a/prov/opx/include/rdma/opx/fi_opx_internal.h b/prov/opx/include/rdma/opx/fi_opx_internal.h index 
b8ea3eefd94..138de08bbbb 100644 --- a/prov/opx/include/rdma/opx/fi_opx_internal.h +++ b/prov/opx/include/rdma/opx/fi_opx_internal.h @@ -45,9 +45,8 @@ #define FI_OPX_CACHE_LINE_SIZE (64) -#define FI_OPX_CQ_CONTEXT_EXT (0x8000000000000000ull) +#define FI_OPX_CQ_CONTEXT_HMEM (0x8000000000000000ull) #define FI_OPX_CQ_CONTEXT_MULTIRECV (0x4000000000000000ull) -#define FI_OPX_CQ_CONTEXT_HMEM (0x2000000000000000ull) #define OPX_HMEM_SIZE_QWS (3) @@ -62,53 +61,42 @@ union fi_opx_mp_egr_id { }; } __attribute__((__packed__)); -union fi_opx_context { - struct fi_context2 context; - struct { - //struct slist_entry entry; /* fi_cq_entry::op_context */ - union fi_opx_context * next; /* fi_cq_entry::op_context */ - uint64_t flags; /* fi_cq_msg_entry::flags */ - size_t len; /* fi_cq_msg_entry::len */ - void *buf; /* fi_cq_data_entry::buf (unused for tagged cq's and non-multi-receive message cq's) */ - - union { - uint64_t data; /* fi_cq_data_entry::data; only used _after_ a message is matched */ - fi_addr_t src_addr; /* only used _before_ a message is matched ('FI_DIRECTED_RECEIVE') */ - }; - - union { - uint64_t tag; /* fi_cq_tagged_entry::tag */ - union fi_opx_context *multi_recv_next; /* only for multi-receives; which is not tagged */ - }; - union { - uint64_t ignore; /* only for tagged receive */ - void *claim; /* only for peek/claim */ - void *multi_recv_context; /* only for individual FI_MULTI_RECV's */ - union fi_opx_mp_egr_id mp_egr_id; - }; - - volatile uint64_t byte_counter; +struct opx_context { + /**** CACHELINE 0 ****/ + struct opx_context *next; /* fi_cq_entry::op_context */ + uint64_t flags; /* fi_cq_msg_entry::flags */ + size_t len; /* fi_cq_msg_entry::len */ + void *buf; /* fi_cq_data_entry::buf (unused for tagged cq's and non-multi-receive message cq's) */ + + union { + uint64_t data; /* fi_cq_data_entry::data; only used _after_ a message is matched */ + fi_addr_t src_addr; /* only used _before_ a message is matched ('FI_DIRECTED_RECEIVE') */ + }; + + uint64_t 
tag; /* fi_cq_tagged_entry::tag */ + union { + uint64_t ignore; /* only for tagged receive */ + void *claim; /* only for peek/claim */ + void *multi_recv_context; /* only for individual FI_MULTI_RECV's */ + union fi_opx_mp_egr_id mp_egr_id; }; -}; -struct fi_opx_context_ext { - union fi_opx_context opx_context; - struct fi_cq_err_entry err_entry; + volatile uint64_t byte_counter; - // offset 144 bytes + /**** CACHELINE 1 & 2 ****/ + uint64_t hmem_info_qws[OPX_HMEM_SIZE_QWS]; struct { - struct fi_context2 *op_context; size_t iov_count; struct iovec *iov; } msg; - // offset 168 bytes - uint64_t hmem_info_qws[OPX_HMEM_SIZE_QWS]; - - // 184 bytes - uint64_t unused; -} __attribute__((__aligned__(32))); + struct fi_cq_err_entry err_entry; // 88 bytes +} __attribute__((__packed__)) __attribute__((__aligned__(64))); +static_assert(offsetof(struct opx_context, hmem_info_qws) == FI_OPX_CACHE_LINE_SIZE, + "struct opx_context.hmem_info_qws offset should start at Cacheline 1!"); +static_assert(sizeof(struct opx_context) == (FI_OPX_CACHE_LINE_SIZE * 3), + "sizeof(struct opx_context) should be equal to 3 cachelines!"); struct opx_sdma_queue { struct slist list; diff --git a/prov/opx/include/rdma/opx/fi_opx_match.h b/prov/opx/include/rdma/opx/fi_opx_match.h index 0df7929e64e..3bc62703896 100644 --- a/prov/opx/include/rdma/opx/fi_opx_match.h +++ b/prov/opx/include/rdma/opx/fi_opx_match.h @@ -240,7 +240,7 @@ struct fi_opx_hfi1_ue_packet *fi_opx_match_find_uepkt_by_tag(struct fi_opx_match __OPX_FORCE_INLINE__ struct fi_opx_hfi1_ue_packet *fi_opx_match_find_uepkt(struct fi_opx_match_ue_hash *ue_hash, - const union fi_opx_context *context, + const struct opx_context *context, struct fi_opx_debug_counters *debug_counters) { if (!ue_hash->ue.head) { diff --git a/prov/opx/include/rdma/opx/fi_opx_reliability.h b/prov/opx/include/rdma/opx/fi_opx_reliability.h index 5e690377dd8..c43d13fe053 100644 --- a/prov/opx/include/rdma/opx/fi_opx_reliability.h +++ 
b/prov/opx/include/rdma/opx/fi_opx_reliability.h @@ -83,7 +83,7 @@ struct fi_opx_completion_counter { struct fi_opx_cntr *cntr; struct fi_opx_cq *cq; union { - union fi_opx_context *context; + struct opx_context *context; void *container; }; void (*hit_zero)(struct fi_opx_completion_counter*); @@ -447,7 +447,7 @@ union fi_opx_reliability_tx_psn { uint64_t bytes_outstanding:24; } psn; } __attribute__((__packed__)); - + // TODO - make these tunable. #define FI_OPX_RELIABILITY_TX_REPLAY_BLOCKS (2048) #define FI_OPX_RELIABILITY_TX_REPLAY_IOV_BLOCKS (8192) @@ -689,7 +689,7 @@ uint16_t fi_opx_reliability_rx_drop_packet (struct fi_opx_reliability_client_sta const uint16_t tmp = state->drop_count & state->drop_mask; if (tmp == 0) - FI_WARN(fi_opx_global.prov,FI_LOG_EP_DATA, + FI_WARN(fi_opx_global.prov,FI_LOG_EP_DATA, "DEBUG: discarding packet %hu\n", state->drop_count); state->drop_count = tmp + 1; @@ -744,7 +744,7 @@ size_t fi_opx_reliability_replay_get_payload_size(struct fi_opx_reliability_tx_r const uint16_t lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; const size_t total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ return (total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B)); - } + } } __OPX_FORCE_INLINE__ @@ -1137,7 +1137,7 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { - + union fi_opx_reliability_service_flow_key key = { .slid = (uint32_t) state->lid_be, .tx = (uint32_t) state->tx, @@ -1153,7 +1153,7 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, opx_reliability_handshake_init(ep, key, target_reliability_rx, hfi1_type); return -1; } - + *psn_ptr = (union fi_opx_reliability_tx_psn *)fi_opx_rbt_value_ptr(state->tx_flow_rbtree, itr); union fi_opx_reliability_tx_psn psn_value = **psn_ptr; @@ -1180,7 +1180,7 @@ int32_t fi_opx_reliability_get_replay (struct fid_ep *ep, fi_opx_reliability_inc_throttle_maxo(ep); return -1; } 
- + *replay = fi_opx_reliability_client_replay_allocate(state, false); if (*replay == NULL) { return -1; diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index d04bf19fad7..b1da57de82e 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -57,7 +57,6 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const union fi_opx_addr opx_target_addr, const uint64_t *addr_offset, const uint64_t *key, - union fi_opx_context *opx_context, const uint64_t tx_op_flags, const struct fi_opx_cq *opx_cq, const struct fi_opx_cntr *opx_cntr, @@ -170,7 +169,6 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const size_t niov, const union fi_opx_addr opx_dst_addr, uint64_t addr_offset, const uint64_t key, - union fi_opx_context *opx_context, struct fi_opx_completion_counter *cc, enum fi_datatype dt, enum fi_op op, const uint64_t tx_op_flags, @@ -268,26 +266,50 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); } +__OPX_FORCE_INLINE__ +ssize_t opx_rma_get_context(struct fi_opx_ep *opx_ep, const void *user_context, + const void *cq, const uint64_t flags, + struct opx_context **context) +{ + if (!cq || !user_context) { + *context = NULL; + return FI_SUCCESS; + } + + struct opx_context *ctx = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(ctx == NULL)) { + *context = NULL; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + + ctx->next = NULL; + ctx->flags = (uint64_t) flags; + ctx->err_entry.err = 0; + ctx->err_entry.op_context = (void *) user_context; + *context = ctx; + return FI_SUCCESS; +} ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum 
ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type); + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type); + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_writev_generic(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type); + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, @@ -298,18 +320,18 @@ ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, void *desc fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type); + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type); + const enum opx_hfi1_type hfi1_type); ssize_t fi_opx_readmsg_generic(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type); + const enum opx_hfi1_type 
hfi1_type); #ifdef __cplusplus } diff --git a/prov/opx/src/fi_opx_atomic.c b/prov/opx/src/fi_opx_atomic.c index 596b899df2f..32069d853e5 100644 --- a/prov/opx/src/fi_opx_atomic.c +++ b/prov/opx/src/fi_opx_atomic.c @@ -113,7 +113,6 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const uint64_t key, const struct fi_opx_hmem_iov *fetch_iov, const struct fi_opx_hmem_iov *compare_iov, - union fi_opx_context *opx_context, const uint64_t tx_op_flags, const struct fi_opx_cq *opx_cq, const struct fi_opx_cntr *opx_cntr, @@ -252,7 +251,7 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const union fi_opx_addr opx_dst_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, - void *context, struct fi_opx_completion_counter *cc, + struct fi_opx_completion_counter *cc, const unsigned is_fetch, const void *fetch_vaddr, const unsigned is_compare, const void *compare_vaddr, const uint64_t tx_op_flags, const int lock_required, @@ -274,7 +273,7 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, cc->cntr = opx_ep->read_cntr; fi_opx_readv_internal(opx_ep, &fetch_iov, 1, opx_dst_addr, &addr, &key, - (union fi_opx_context *)context, opx_ep->tx->op_flags, + opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); @@ -308,10 +307,10 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, fi_opx_atomic_op_internal(opx_ep, FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH, &buf_iov, opx_dst_addr, addr, key, &fetch_iov, - NULL, (union fi_opx_context *)context, - opx_ep->tx->op_flags, opx_ep->rx->cq, + NULL, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, - lock_required, caps, reliability, is_hmem, is_intranode, hfi1_type); + lock_required, caps, reliability, is_hmem, + is_intranode, hfi1_type); } else { struct fi_opx_hmem_iov compare_iov; @@ -325,10 +324,10 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, 
fi_opx_atomic_op_internal(opx_ep, FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH, &buf_iov, opx_dst_addr, addr, key, &fetch_iov, - &compare_iov, (union fi_opx_context *)context, - opx_ep->tx->op_flags, opx_ep->rx->cq, + &compare_iov, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, - lock_required, caps, reliability, is_hmem, is_intranode, hfi1_type); + lock_required, caps, reliability, is_hmem, + is_intranode, hfi1_type); } FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC %s (end)\n", @@ -347,10 +346,8 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, uint64_t is_hmem = fi_opx_hmem_iov_init(buf, buf_len, NULL, &buf_iov); fi_opx_write_internal(opx_ep, &buf_iov, 1, opx_dst_addr, addr, key, - (union fi_opx_context *)NULL, cc, - datatype, op, opx_ep->tx->op_flags, - is_hmem, lock_required, caps, - reliability, + cc, datatype, op, opx_ep->tx->op_flags, + is_hmem, lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC WRITE (end)\n"); @@ -361,9 +358,9 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, fi_addr_t dst_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, - void *context, const int lock_required, const enum fi_av_type av_type, + void *user_context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; @@ -387,23 +384,31 @@ ssize_t fi_opx_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, return -FI_EAGAIN; } + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_ATOMIC | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = sizeofdt(datatype) * count; cc->initial_byte_count = cc->byte_counter; - cc->cq = (((opx_ep->tx->op_flags & FI_COMPLETION) == FI_COMPLETION) || - ((opx_ep->tx->op_flags & FI_DELIVERY_COMPLETE) == FI_DELIVERY_COMPLETE)) ? - opx_ep->rx->cq : - NULL; + cc->cq = cq; cc->context = context; cc->hit_zero = fi_opx_hit_zero; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_WRITE; - size_t xfer __attribute__((unused)); xfer = fi_opx_atomic_internal(opx_ep, buf, count, opx_addr, addr, key, datatype, op, - context, cc, 0, NULL, 0, NULL, opx_ep->tx->op_flags, + cc, 0, NULL, 0, NULL, opx_ep->tx->op_flags, lock_required, av_type, caps, reliability, hfi1_type); assert(xfer == count); @@ -451,7 +456,21 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, return -FI_EAGAIN; } + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_ATOMIC | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -460,11 +479,8 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, } cc->initial_byte_count = cc->byte_counter; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_WRITE; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; const size_t dtsize = sizeofdt(datatype); @@ -486,7 +502,7 @@ ssize_t fi_opx_atomic_writemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, - op, NULL, cc, 0, NULL, 0, NULL, flags, lock_required, + op, cc, 0, NULL, 0, NULL, flags, lock_required, av_type, caps, reliability, hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -570,7 +586,21 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, uint64_t rst_iov_dtcount = resultv[rst_iov_index].count; uintptr_t rst_iov_vaddr = (uintptr_t)resultv[rst_iov_index].addr; + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_ATOMIC| FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = 0; ssize_t index = 0; @@ -578,12 +608,8 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, cc->byte_counter += sizeofdt(datatype) * msg->msg_iov[index].count; } cc->initial_byte_count = cc->byte_counter; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_READ; - - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; if (op != FI_ATOMIC_READ) { /* likely */ @@ -599,10 +625,10 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, - rma_iov_key, datatype, op, NULL, cc, 1, + rma_iov_key, datatype, op, cc, 1, (const void *)rst_iov_vaddr, 0, NULL, flags, lock_required, av_type, caps, reliability, - hfi1_type); + hfi1_type); const size_t bytes_transfered = dtsize * count_transfered; @@ -643,7 +669,7 @@ ssize_t fi_opx_atomic_readwritemsg_generic(struct fid_ep *ep, while (rma_iov_dtcount != 0 && rst_iov_dtcount != 0) { const size_t count_transfered = fi_opx_atomic_internal( opx_ep, NULL, count_requested, opx_dst_addr, rma_iov_addr, - rma_iov_key, datatype, op, NULL, cc, 1, (const void *)rst_iov_vaddr, + rma_iov_key, datatype, op, cc, 1, (const void *)rst_iov_vaddr, 0, NULL, flags, lock_required, av_type, caps, reliability, hfi1_type); 
const size_t bytes_transfered = dtsize * count_transfered; @@ -742,7 +768,21 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, uint64_t cmp_iov_dtcount = comparev[cmp_iov_index].count; uintptr_t cmp_iov_vaddr = (uintptr_t)comparev[cmp_iov_index].addr; + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_ATOMIC | FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = 0; ssize_t index; @@ -750,11 +790,8 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, cc->byte_counter += sizeofdt(datatype)* msg->msg_iov[index].count; } cc->initial_byte_count = cc->byte_counter; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_READ; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; while (msg_iov_dtcount != 0 && rma_iov_dtcount != 0 && rst_iov_dtcount != 0 && @@ -765,7 +802,7 @@ ssize_t fi_opx_atomic_compwritemsg_generic(struct fid_ep *ep, const size_t count_transfered = fi_opx_atomic_internal(opx_ep, (void *)msg_iov_vaddr, count_requested, opx_dst_addr, rma_iov_addr, rma_iov_key, datatype, - op, NULL, cc, 1, (const void *)rst_iov_vaddr, 1, + op, cc, 1, (const void *)rst_iov_vaddr, 1, (const void *)cmp_iov_vaddr, flags, lock_required, av_type, caps, reliability, hfi1_type); @@ -821,7 +858,7 @@ __OPX_FORCE_INLINE__ ssize_t fi_opx_fetch_compare_atomic_generic( struct fid_ep *ep, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, int lock_required, + uint64_t key, enum fi_datatype datatype, enum fi_op op, void *user_context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) @@ -848,35 +885,44 @@ ssize_t fi_opx_fetch_compare_atomic_generic( return -FI_EAGAIN; } + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_ATOMIC | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = sizeofdt(datatype) * count; cc->initial_byte_count = cc->byte_counter; - cc->cq = (((opx_ep->tx->op_flags & FI_COMPLETION) == FI_COMPLETION) || - ((opx_ep->tx->op_flags & FI_DELIVERY_COMPLETE) == FI_DELIVERY_COMPLETE)) ? - opx_ep->rx->cq : - NULL; + cc->cq = cq; cc->context = context; cc->hit_zero = fi_opx_hit_zero; - union fi_opx_context *opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_ATOMIC | FI_WRITE; - size_t xfer __attribute__((unused)); xfer = fi_opx_atomic_internal(opx_ep, buf, count, opx_addr, addr, key, datatype, op, - context, cc, 1, result, compare!=NULL, compare, opx_ep->tx->op_flags, + cc, 1, result, compare!=NULL, compare, opx_ep->tx->op_flags, lock_required, av_type, caps, reliability, hfi1_type); assert(xfer == count); return 0; } + ssize_t fi_opx_fetch_atomic_generic(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context, const int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { return fi_opx_fetch_compare_atomic_generic(ep, buf, count, desc, NULL, NULL, result, result_desc, dest_addr, addr, key, datatype, op, @@ -892,7 +938,7 @@ ssize_t fi_opx_compare_atomic_generic(struct fid_ep *ep, const void *buf, size_t void *context, const int 
lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { return fi_opx_fetch_compare_atomic_generic(ep, buf, count, desc, compare, compare_desc, result, result_desc, dest_addr, addr, key, @@ -906,7 +952,7 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t const int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -933,6 +979,11 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t } struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = sizeofdt(datatype) * count; cc->initial_byte_count = cc->byte_counter; @@ -948,7 +999,7 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t const uint64_t is_hmem = (const uint64_t) fi_opx_hmem_iov_init(buf, count * sizeofdt(datatype), NULL, &iov); - fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr, key, NULL, cc, + fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr, key, cc, datatype, op, opx_ep->tx->op_flags | FI_INJECT, is_hmem, lock_required, caps, reliability, hfi1_type); @@ -976,22 +1027,19 @@ ssize_t fi_opx_atomic(struct fid_ep *ep, const void *buf, size_t count, void *de /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ if (OPX_HFI1_TYPE & OPX_HFI1_WFR) { rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, - context, FI_OPX_LOCK_NOT_REQUIRED, - opx_ep->av_type, 0x0018000000000000ull, - OPX_RELIABILITY, - OPX_HFI1_WFR); + 
context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_WFR); } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, - context, FI_OPX_LOCK_NOT_REQUIRED, - opx_ep->av_type, 0x0018000000000000ull, - OPX_RELIABILITY, - OPX_HFI1_JKR_9B); + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR_9B); } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { rc = fi_opx_atomic_generic(ep, buf, count, dst_addr, addr, key, datatype, op, - context, FI_OPX_LOCK_NOT_REQUIRED, - opx_ep->av_type, 0x0018000000000000ull, - OPX_RELIABILITY, - OPX_HFI1_JKR); + context, FI_OPX_LOCK_NOT_REQUIRED, + opx_ep->av_type, 0x0018000000000000ull, + OPX_RELIABILITY, OPX_HFI1_JKR); } else { rc = -FI_EPERM; FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); @@ -1233,13 +1281,13 @@ ssize_t fi_opx_atomic_writemsg(struct fid_ep *ep, const struct fi_msg_atomic *ms FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, - OPX_HFI1_WFR); + OPX_HFI1_WFR); } else { rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, - OPX_HFI1_WFR); + OPX_HFI1_WFR); } } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR_9B) { if (opx_ep->av_type == FI_AV_MAP) { @@ -1247,13 +1295,13 @@ ssize_t fi_opx_atomic_writemsg(struct fid_ep *ep, const struct fi_msg_atomic *ms FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, - OPX_HFI1_JKR_9B); + OPX_HFI1_JKR_9B); } else { rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, - OPX_HFI1_JKR_9B); + OPX_HFI1_JKR_9B); } } else if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { if (opx_ep->av_type == FI_AV_MAP) { @@ -1261,13 +1309,13 @@ ssize_t fi_opx_atomic_writemsg(struct fid_ep *ep, const struct fi_msg_atomic *ms FI_AV_MAP, 0x0018000000000000ull, OPX_RELIABILITY, - OPX_HFI1_JKR); + 
OPX_HFI1_JKR); } else { rc = fi_opx_atomic_writemsg_generic(ep, msg, flags, FI_OPX_LOCK_NOT_REQUIRED, FI_AV_TABLE, 0x0018000000000000ull, OPX_RELIABILITY, - OPX_HFI1_JKR); + OPX_HFI1_JKR); } } else { rc = -FI_EPERM; diff --git a/prov/opx/src/fi_opx_cq.c b/prov/opx/src/fi_opx_cq.c index f1e5e098d5f..7b831d2ffd9 100644 --- a/prov/opx/src/fi_opx_cq.c +++ b/prov/opx/src/fi_opx_cq.c @@ -47,18 +47,19 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { char *s = str; size_t len = 2047; int n = 0; - union fi_opx_context * context = NULL;; + struct opx_context *context = NULL;; struct fi_opx_cq *opx_cq = (struct fi_opx_cq *)cq; if (!func) func = "undef"; - n = snprintf(s, len, "%s():%d [%p] completed(%p,%p)", func, line, opx_cq, opx_cq->completed.head, opx_cq->completed.tail); + n = snprintf(s, len, "%s():%d [%p] completed(%p,%p)", func, line, + opx_cq, opx_cq->completed.head, opx_cq->completed.tail); s += n; len -= n; if (opx_cq->completed.head != NULL) { - context = (union fi_opx_context *) opx_cq->completed.head; + context = (struct opx_context *) opx_cq->completed.head; n = snprintf(s, len, " = { %p", context); s += n; len -= n; context = context->next; @@ -73,7 +74,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { n = 0; len = 2047; s = str; *s = 0; n = snprintf(s, len, "%s():%d [%p] pending(%p,%p)", func, line, opx_cq, opx_cq->pending.head, opx_cq->pending.tail); s += n; len -= n; if (opx_cq->pending.head != NULL) { - context = (union fi_opx_context *) opx_cq->pending.head; + context = (struct opx_context *) opx_cq->pending.head; n = snprintf(s, len, " = { %p(%lu,0x%016lx)", context, context->byte_counter, context->byte_counter); s += n; len -= n; context = context->next; @@ -89,7 +90,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line) { n = 0; len = 2047; s = str; *s = 0; n = snprintf(s, len, "%s():%d [%p] err(%p,%p)", func, line, opx_cq, opx_cq->err.head, opx_cq->err.tail); s += n; len -= n; if 
(opx_cq->err.head != NULL) { - context = (union fi_opx_context *) opx_cq->err.head; + context = (struct opx_context *) opx_cq->err.head; n = snprintf(s, len, " = { %p(%lu)", context, context->byte_counter); s += n; len -= n; context = context->next; @@ -185,15 +186,13 @@ static struct fi_ops fi_opx_fi_ops = { .ops_open = fi_opx_ops_open_cq }; -int fi_opx_cq_enqueue_err (struct fi_opx_cq * opx_cq, - struct fi_opx_context_ext * ext, +int fi_opx_cq_enqueue_err (struct fi_opx_cq *opx_cq, + struct opx_context *context, const int lock_required) { - assert(ext->opx_context.flags & FI_OPX_CQ_CONTEXT_EXT); /* DEBUG */ assert(!lock_required); - ext->opx_context.next = NULL; - slist_insert_tail((struct slist_entry *) ext, &opx_cq->err); + slist_insert_tail((struct slist_entry *) context, &opx_cq->err); return 0; } diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 5474e6cd9d6..826266422c5 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -57,8 +57,8 @@ #include "rdma/opx/fi_opx_fabric.h" #define FI_OPX_EP_RX_UEPKT_BLOCKSIZE (256) -#define FI_OPX_EP_RX_CTX_EXT_BLOCKSIZE (2048) #define FI_OPX_VER_CHECK_BUF_LEN (512) +#define OPX_EP_RX_CTX_BLOCKSIZE (2048) #define OPX_MODINFO_PATH "/sbin/modinfo" #define OPX_MODINFO_DRV_VERS OPX_MODINFO_PATH " hfi1 -F version" #define OPX_MODINFO_SRC_VERS OPX_MODINFO_PATH " hfi1 -F srcversion" @@ -548,6 +548,31 @@ static void fi_opx_unbind_cq_ep(struct fi_opx_cq *cq, struct fi_opx_ep *ep) } +__OPX_FORCE_INLINE__ +int opx_ep_free_match_queue_list_contexts(struct slist *list) +{ + int count = 0; + + while (!slist_empty(list)) { + struct opx_context *context = (struct opx_context *) slist_remove_head(list); + OPX_BUF_FREE(context); + ++count; + } + + return count; +} + +__OPX_FORCE_INLINE__ +void opx_ep_free_match_queued_contexts(struct fi_opx_ep *opx_ep) +{ + int tag_count = opx_ep_free_match_queue_list_contexts(&opx_ep->rx->queue[0].mq); + int msg_count = 
opx_ep_free_match_queue_list_contexts(&opx_ep->rx->queue[1].mq); + + FI_LOG(fi_opx_global.prov, FI_LOG_DEBUG, FI_LOG_FABRIC, + "Freed %d contexts from tag match queue, %d contexts from msg match queue\n", + tag_count, msg_count); +} + static int fi_opx_close_ep(fid_t fid) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "close ep\n"); @@ -653,6 +678,7 @@ static int fi_opx_close_ep(fid_t fid) } } if (opx_ep->rx && (opx_ep->rx->cq && (fid->fclass == FI_CLASS_EP || fid->fclass == FI_CLASS_RX_CTX))) { + opx_ep_free_match_queued_contexts(opx_ep); ret = fi_opx_ref_dec(&opx_ep->rx->cq->ref_cnt, "completion queue"); if (ret) { errno = -ret; @@ -709,9 +735,9 @@ static int fi_opx_close_ep(fid_t fid) if (opx_ep->rx->match_ue_tag_hash) { fi_opx_match_ue_hash_free(&opx_ep->rx->match_ue_tag_hash); } - if (opx_ep->rx->ctx_ext_pool) { - ofi_bufpool_destroy(opx_ep->rx->ctx_ext_pool); - opx_ep->rx->ctx_ext_pool = NULL; + if (opx_ep->rx->ctx_pool) { + ofi_bufpool_destroy(opx_ep->rx->ctx_pool); + opx_ep->rx->ctx_pool = NULL; } free(opx_ep->rx->mem); } @@ -1165,10 +1191,10 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) goto err; } - opx_ep->rx->ctx_ext_pool = NULL; - if (ofi_bufpool_create(&opx_ep->rx->ctx_ext_pool, - sizeof(struct fi_opx_context_ext), - 8, UINT_MAX, FI_OPX_EP_RX_CTX_EXT_BLOCKSIZE, 0)) { + opx_ep->rx->ctx_pool = NULL; + if (ofi_bufpool_create(&opx_ep->rx->ctx_pool, + sizeof(struct opx_context), + 64, UINT_MAX, OPX_EP_RX_CTX_BLOCKSIZE, 0)) { goto err; } struct fi_opx_domain * opx_domain = opx_ep->domain; @@ -1387,9 +1413,9 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) fi_opx_match_ue_hash_free(&opx_ep->rx->match_ue_tag_hash); - if (opx_ep->rx->ctx_ext_pool) { - ofi_bufpool_destroy(opx_ep->rx->ctx_ext_pool); - opx_ep->rx->ctx_ext_pool = NULL; + if (opx_ep->rx->ctx_pool) { + ofi_bufpool_destroy(opx_ep->rx->ctx_pool); + opx_ep->rx->ctx_pool = NULL; } return -FI_ENOMEM; @@ -1990,9 +2016,9 @@ static int fi_opx_setopt_ep(fid_t fid, int level, int 
optname, return 0; } -int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, +int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx *rx, const uint64_t static_flags, - const union fi_opx_context * cancel_context, + const uintptr_t cancel_context, const int lock_required) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(begin)\n"); @@ -2004,16 +2030,13 @@ int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, * search the match queue for this context */ - union fi_opx_context * prev = NULL; - union fi_opx_context * item = (union fi_opx_context *) rx->queue[kind].mq.head; + struct opx_context *prev = NULL; + struct opx_context *item = (struct opx_context *) rx->queue[kind].mq.head; while (item) { - const uint64_t is_context_ext = item->flags & FI_OPX_CQ_CONTEXT_EXT; - const uint64_t compare_context = is_context_ext ? - (uint64_t)(((struct fi_opx_context_ext *)item)->msg.op_context) : - (uint64_t)item; + const uintptr_t compare_context = (uintptr_t) item->err_entry.op_context; - if ((uintptr_t)cancel_context == compare_context) { + if (cancel_context == compare_context) { if (prev) prev->next = item->next; else @@ -2022,36 +2045,21 @@ int fi_opx_ep_rx_cancel (struct fi_opx_ep_rx * rx, if (!item->next) rx->queue[kind].mq.tail = (struct slist_entry *) prev; - struct fi_opx_context_ext * ext = NULL; - if (cancel_context->flags & FI_OPX_CQ_CONTEXT_EXT) { - ext = (struct fi_opx_context_ext *)cancel_context; - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - return -FI_ENOMEM; - } - - ext->opx_context.flags = FI_OPX_CQ_CONTEXT_EXT; - } - - ext->opx_context.byte_counter = 0; - ext->opx_context.next = NULL; - ext->err_entry.op_context = (void *)cancel_context; - ext->err_entry.flags = cancel_context->flags; - ext->err_entry.len = 0; - ext->err_entry.buf = 0; - ext->err_entry.data = 0; - ext->err_entry.tag = cancel_context->tag; - ext->err_entry.olen = 0; 
- ext->err_entry.err = FI_ECANCELED; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; + item->byte_counter = 0; + item->next = NULL; + item->err_entry.flags = item->flags; + item->err_entry.len = 0; + item->err_entry.buf = 0; + item->err_entry.data = 0; + item->err_entry.tag = item->tag; + item->err_entry.olen = 0; + item->err_entry.err = FI_ECANCELED; + item->err_entry.prov_errno = 0; + item->err_entry.err_data = NULL; + item->err_entry.err_data_size = 0; if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } - slist_insert_tail((struct slist_entry *) ext, rx->cq_err_ptr); + slist_insert_tail((struct slist_entry *) item, rx->cq_err_ptr); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "(end) canceled\n"); return FI_ECANCELED; @@ -2079,7 +2087,7 @@ ssize_t fi_opx_cancel(fid_t fid, void *context) if (opx_ep->rx->caps & FI_MSG) { fi_opx_ep_rx_cancel(opx_ep->rx, FI_MSG, - (const union fi_opx_context *) context, + (const uintptr_t) context, FI_OPX_LOCK_NOT_REQUIRED); } @@ -2087,7 +2095,7 @@ ssize_t fi_opx_cancel(fid_t fid, void *context) if (opx_ep->rx->caps & FI_TAGGED) { fi_opx_ep_rx_cancel(opx_ep->rx, FI_TAGGED, - (const union fi_opx_context *) context, + (const uintptr_t) context, FI_OPX_LOCK_NOT_REQUIRED); } fi_opx_unlock_if_required(&opx_ep->lock, lock_required); @@ -2127,7 +2135,7 @@ int fi_opx_alloc_default_rx_attr(struct fi_rx_attr **rx_attr) goto err; attr->caps = FI_OPX_DEFAULT_RX_CAPS; - attr->mode = FI_CONTEXT2 | FI_ASYNC_IOV; + attr->mode = FI_ASYNC_IOV; attr->op_flags = 0; attr->msg_order = FI_OPX_DEFAULT_MSG_ORDER; attr->size = SIZE_MAX; //FI_OPX_RX_SIZE; @@ -2157,7 +2165,7 @@ int fi_opx_alloc_default_tx_attr(struct fi_tx_attr **tx_attr) goto err; attr->caps = FI_OPX_DEFAULT_TX_CAPS; - attr->mode = FI_CONTEXT2 | FI_ASYNC_IOV; + attr->mode = FI_ASYNC_IOV; attr->op_flags = FI_TRANSMIT_COMPLETE; attr->msg_order = FI_OPX_DEFAULT_MSG_ORDER; attr->inject_size = 
FI_OPX_HFI1_PACKET_IMM; @@ -2523,7 +2531,7 @@ int fi_opx_endpoint_rx_tx (struct fid_domain *dom, struct fi_info *info, #if defined(OPX_HMEM) && HAVE_CUDA int use_gdrcopy; int gdrcopy_enabled = cuda_is_gdrcopy_enabled(); - + if (fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy", &use_gdrcopy) != FI_SUCCESS) { FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "FI_HMEM_CUDA_USE_GDRCOPY either not specified or invalid. Using default value of 1\n"); use_gdrcopy = 1; /* Set to the libfabric default of FI_HMEM_CUDA_USE_GDRCOPY=1 */ @@ -2612,18 +2620,14 @@ int fi_opx_ep_tx_check (struct fi_opx_ep_tx * tx, enum fi_av_type av_type) /* rx_op_flags is only checked for FI_PEEK | FI_CLAIM | FI_MULTI_RECV; * rx_op_flags is only used if FI_PEEK | FI_CLAIM; - * is_context_ext is only used if FI_PEEK | iovec; - * - * The "normal" data movement functions, such as fi_[t]recv(), can safely - * specify '0' for rx_op_flags, and is_context_ext, in order to reduce code path. * * See `fi_opx_ep_rx_process_context()` */ __attribute__((noinline)) void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const uint64_t static_flags, - union fi_opx_context * context, - const uint64_t rx_op_flags, const uint64_t is_context_ext, + struct opx_context *context, + const uint64_t rx_op_flags, const uint64_t is_hmem, const int lock_required, const enum fi_av_type av_type, const enum ofi_reliability_kind reliability, @@ -2681,8 +2685,8 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, #endif } - fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, - is_context_ext, lock_required); + fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, lock_required); + return; } @@ -2690,43 +2694,23 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, * did not find a match for this "peek"; notify the application * via completion queue error entry */ - - struct fi_opx_context_ext * ext = NULL; - if (is_context_ext) { - ext = (struct fi_opx_context_ext 
*)context; - assert((ext->opx_context.flags & FI_OPX_CQ_CONTEXT_EXT) != 0); - } else { - ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - abort(); - } - ext->opx_context.flags = rx_op_flags | FI_OPX_CQ_CONTEXT_EXT; - } - - ext->err_entry.op_context = context; - ext->err_entry.flags = rx_op_flags; - ext->err_entry.len = 0; - ext->err_entry.buf = 0; - ext->err_entry.data = 0; - ext->err_entry.tag = 0; - ext->err_entry.olen = 0; - ext->err_entry.err = FI_ENOMSG; - ext->err_entry.prov_errno = 0; - ext->err_entry.err_data = NULL; - ext->err_entry.err_data_size = 0; - ext->opx_context.byte_counter = 0; - + context->err_entry.flags = rx_op_flags; + context->err_entry.len = 0; + context->err_entry.buf = 0; + context->err_entry.data = 0; + context->err_entry.tag = 0; + context->err_entry.olen = 0; + context->err_entry.err = FI_ENOMSG; + context->err_entry.prov_errno = 0; + context->err_entry.err_data = NULL; + context->err_entry.err_data_size = 0; + context->byte_counter = 0; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "no match found on unexpected queue posting error\n"); - fi_opx_cq_enqueue_err(opx_ep->rx->cq, ext, lock_required); + fi_opx_cq_enqueue_err(opx_ep->rx->cq, context, lock_required); } else if (rx_op_flags & FI_CLAIM) { - assert((!(rx_op_flags & FI_OPX_CQ_CONTEXT_EXT) && !(rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM)) || - ((rx_op_flags & FI_OPX_CQ_CONTEXT_EXT) && (rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM))); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "rx_op_flags & FI_CLAIM complete receive operation\n"); /* only FI_CLAIM was specified @@ -2742,13 +2726,12 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, const unsigned is_intranode = opx_lrh_is_intranode(&(claimed_pkt->hdr), hfi1_type); - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &claimed_pkt->hdr, (union 
fi_opx_hfi1_packet_payload *)&claimed_pkt->payload, claimed_pkt->hdr.match.ofi_tag, context, claimed_pkt->hdr.bth.opcode, - rx_op_flags & FI_OPX_CQ_CONTEXT_EXT, OPX_MULTI_RECV_FALSE, is_intranode, rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM, @@ -2766,8 +2749,7 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, } else if ((static_flags & FI_MSG) && (rx_op_flags & FI_MULTI_RECV)) { /* TODO: HMEM not supported for multi-receive */ - assert(!(rx_op_flags & FI_OPX_CQ_CONTEXT_EXT) && - !(rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM)); + assert(!(rx_op_flags & FI_OPX_CQ_CONTEXT_HMEM)); context->src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, context->src_addr); @@ -2813,13 +2795,12 @@ void fi_opx_ep_rx_process_context_noinline (struct fi_opx_ep * opx_ep, /* the 'context->len' field will be updated to the * new multi-receive buffer free space as part of * the receive completion */ - complete_receive_operation(ep, + opx_ep_complete_receive_operation(ep, &uepkt->hdr, (union fi_opx_hfi1_packet_payload *)&uepkt->payload, uepkt->hdr.match.ofi_tag, context, uepkt->hdr.bth.opcode, - OPX_CONTEXT_EXTENDED_FALSE, OPX_MULTI_RECV_TRUE, OPX_HMEM_FALSE, is_intranode, diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index b697f0c095b..2988b220998 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -233,7 +233,7 @@ void process_hfi_lookup(int hfi_unit, unsigned int lid) } -void fi_opx_init_hfi_lookup() +void fi_opx_init_hfi_lookup() { int hfi_unit = 0; int hfi_units = MIN(opx_hfi_get_num_units(), FI_OPX_MAX_HFIS); @@ -1064,7 +1064,7 @@ int opx_hfi1_rx_rzv_rts_send_cts_intranode_16B(union fi_opx_hfi1_deferred_work * /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), because this is intranode and since it's a CTS packet, lrh.pktlen isn't used/needed */ - hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | ((uint64_t)((lrh_dlid_16B & 
OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B)); hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); @@ -2096,7 +2096,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, const void * const payload, const uint8_t u8_rx, const uint64_t niov, uintptr_t origin_byte_counter_vaddr, - union fi_opx_context *const target_context, + struct opx_context *const target_context, const uintptr_t dst_vaddr, const enum fi_hmem_iface dst_iface, const uint64_t dst_device, @@ -2316,7 +2316,7 @@ void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, params->work_elem.payload_copy = NULL; params->work_elem.complete = false; params->work_elem.work_type = OPX_WORK_TYPE_SHM; - + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; else @@ -2332,7 +2332,7 @@ void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, slid = hdr->lrh_9B.slid; else slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); - + if (slid == opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { @@ -2436,8 +2436,8 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) } else { const uint64_t additional_hdr_tail_byte = 2 * 8; /* 1 QW for hdr that spills to 2nd cacheline 1 QW for ICRC/tail */ - uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, - max_bytes_per_packet)); + uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, + max_bytes_per_packet)); uint64_t tail_bytes = payload_n_additional_hdr_tail_bytes & 0x3Ful; blocks_to_send_in_this_packet = (payload_n_additional_hdr_tail_bytes >> 6) + (tail_bytes ? 
1 : 0); bytes_to_send_this_packet = payload_n_additional_hdr_tail_bytes - additional_hdr_tail_byte; @@ -2457,7 +2457,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) pbc_dws = 2 + /* pbc */ 4 + /* lrh */ 3 + /* bth */ - 7 + /* kdeth */ + 7 + /* kdeth */ (blocks_to_send_in_this_packet << 4); // ICRC and the kdeth in the second cacheline are accounted for here lrh_dws = (pbc_dws - 1) >> 1; } @@ -2892,7 +2892,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) return -FI_EAGAIN; } - + opx_hfi1_sdma_flush(opx_ep, params->sdma_we, ¶ms->sdma_reqs, @@ -3403,7 +3403,8 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ const unsigned is_intranode, const enum ofi_reliability_kind reliability, const uint32_t u32_extended_rx, - const enum opx_hfi1_type hfi1_type) { + const enum opx_hfi1_type hfi1_type) +{ union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); struct fi_opx_hfi1_dput_params *params = &work->dput; @@ -3535,11 +3536,12 @@ uint64_t num_sends; uint64_t total_sendv_bytes; ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context, const uint32_t data, int lock_required, + void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, - const uint64_t dest_rx, const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, + const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) @@ -3552,7 +3554,6 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(addr.fi); 
assert(niov <= MIN(FI_OPX_MAX_DPUT_IOV, FI_OPX_MAX_HMEM_IOV)); - *origin_byte_counter_value = total_len; FI_OPX_DEBUG_COUNTERS_DECLARE_TMP(hmem_non_system); @@ -3596,7 +3597,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE( fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- RENDEZVOUS RTS Noncontig (begin) context %p\n",context); + "===================================== SENDV, SHM -- RENDEZVOUS RTS Noncontig (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-NONCONTIG-SHM"); uint64_t pos; @@ -3607,12 +3609,29 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz if (!hdr) return rc; + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = total_len; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); - hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; @@ -3665,15 +3684,22 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz .send.rzv_noncontig); opx_shm_tx_advance(&opx_ep->tx->shm, (void *)hdr, pos); + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, total_len, + lock_required, tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-RZV-RTS-NONCONTIG-SHM"); FI_DBG_TRACE( fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, SHM -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SENDV, SHM -- RENDEZVOUS RTS (end) context %p\n", + user_context); fi_opx_shm_poll_many(&opx_ep->ep_fid, 0, hfi1_type); return FI_SUCCESS; } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SENDV, HFI -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SENDV, HFI -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SENDV-RZV-RTS-HFI"); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; @@ -3693,13 +3719,34 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz } } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = total_len; + 
origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); - if(OFI_UNLIKELY(psn == -1)) { + if (OFI_UNLIKELY(psn == -1)) { + if (OFI_LIKELY(do_cq_completion)) { + OPX_BUF_FREE(context); + } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; } @@ -3822,7 +3869,6 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz replay_payload += 7; rem_payload_size = (sizeof(struct fi_opx_hmem_iov) * (niov - 2) + 8); // overflow 8 bytes from 2nd cacheline } - if (payload_blocks_total > 1) { assert(niov > 2); @@ -3851,6 +3897,10 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz /* update the hfi txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, total_len, + lock_required, tag, caps); + } OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SENDV-RZV-RTS-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SENDV, HFI -- RENDEZVOUS RTS (end) context %p\n",context); @@ -3860,14 +3910,13 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void* context, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t dest_rx, - const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, const enum ofi_reliability_kind 
reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface src_iface, const uint64_t src_device_id, const enum opx_hfi1_type hfi1_type) @@ -3888,13 +3937,6 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, * data block for alignment. Limit this to SDMA (8K+) for now */ const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " - "*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n", - immediate_block_count, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr, - origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, - origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, len, len ); - const uint64_t immediate_end_block_count = immediate_block_count; assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); @@ -3923,18 +3965,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, .unused = 0 }; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "max_immediate_block_count %#lX, len %#lX >> 6 %#lX, immediate_total %#lX, " - "immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX, " - "origin_byte_counter %lu/%#lX, adjusted origin_byte_counter %lu/%#lX\n", - max_immediate_block_count, len, (len >> 6), immediate_total, immediate_byte_count, - immediate_qw_count, immediate_block_count, *origin_byte_counter_value, - *origin_byte_counter_value, len - immediate_total, len - immediate_total); - assert(((len - immediate_total) & 0x003Fu) == 0); - *origin_byte_counter_value = len - immediate_total; - const uint64_t payload_blocks_total = 1 + /* rzv metadata */ immediate_fragment + @@ -3952,7 +3984,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE(fi_opx_global.prov, 
FI_LOG_EP_DATA, - "===================================== SEND, SHM -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SEND, SHM -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-SHM"); uint64_t pos; ssize_t rc; @@ -3966,6 +3999,24 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, return rc; } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != FI_HMEM_SYSTEM, opx_ep->debug_counters.hmem.intranode .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] @@ -4030,14 +4081,21 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, + lock_required, tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, SHM -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND, SHM -- RENDEZVOUS RTS (end) context %p\n", + user_context); return FI_SUCCESS; } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SEND, HFI -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-HFI:%ld", tag); /* @@ -4064,13 +4122,34 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, } } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); - if(OFI_UNLIKELY(psn == -1)) { + if (OFI_UNLIKELY(psn == -1)) { + if 
(OFI_LIKELY(do_cq_completion)) { + OPX_BUF_FREE(context); + } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; } @@ -4092,7 +4171,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, fi_opx_store_and_copy_qw(scb, local_temp, opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | - OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), + OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | ((caps & FI_MSG) ? @@ -4120,13 +4199,13 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); uint64_t temp[8]; fi_opx_store_and_copy_qw(scb_payload, temp, - (uintptr_t)buf + immediate_total, /* src_vaddr */ - (len - immediate_total) >> 6, /* src_blocks */ - src_device_id, - (uint64_t) src_iface, - immediate_info.qw0, - origin_byte_counter_vaddr, - 0, 0 /* unused */); + (uintptr_t)buf + immediate_total, /* src_vaddr */ + (len - immediate_total) >> 6, /* src_blocks */ + src_device_id, + (uint64_t) src_iface, + immediate_info.qw0, + origin_byte_counter_vaddr, + 0, 0 /* unused */); /* consume one credit for the rendezvous payload metadata */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); @@ -4219,7 +4298,6 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, if (immediate_end_block_count) { char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE SEND RZV buf %p, buf end %p, sbuf immediate end block %p\n",(char *)buf, (char *)buf+len, sbuf_end); union { uint8_t immediate_byte[64]; uint64_t immediate_qw[8]; @@ -4256,23 +4334,27 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, /* update the hfi txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, 
lock_required, tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-HFI:%ld",tag); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND, HFI -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND, HFI -- RENDEZVOUS RTS (end) context %p\n", + user_context); return FI_SUCCESS; } ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void* context, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, uint64_t tx_op_flags, const uint64_t dest_rx, - const uintptr_t origin_byte_counter_vaddr, - uint64_t *origin_byte_counter_value, const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface src_iface, const uint64_t src_device_id, const enum opx_hfi1_type hfi1_type) @@ -4295,13 +4377,6 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0; const uint64_t immediate_end_block_count = immediate_block_count; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " - "*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n", - immediate_block_count, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr, - origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, - origin_byte_counter_vaddr ? 
*(uint64_t*)origin_byte_counter_vaddr : -1UL, len, len ); - assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; @@ -4342,14 +4417,6 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, .unused = 0 }; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "max_immediate_block_count %#lX, len %#lX >> 6 %#lX, immediate_total %#lX, " - "immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX, " - "origin_byte_counter %lu/%#lX, adjusted origin_byte_counter %lu/%#lX\n", - max_immediate_block_count, len, (len >> 6), immediate_total, immediate_byte_count, - immediate_qw_count, immediate_block_count, *origin_byte_counter_value, - *origin_byte_counter_value, len - immediate_total, len - immediate_total); - assert(immediate_byte_count <= UINT8_MAX); assert(immediate_qw_count <= UINT8_MAX); assert(immediate_block_count <= UINT8_MAX); @@ -4359,8 +4426,6 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, assert(((len - immediate_total) & 0x003Fu) == 0); - *origin_byte_counter_value = len - immediate_total; - /* full blocks only. 
icrc_end_block/icrc_fragment_block count 1 qw only */ const uint64_t payload_blocks_total = 1 + /* rzv metadata */ @@ -4376,12 +4441,13 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, 4 + /* software kdeth + unused */ (payload_blocks_total << 4) + ((icrc_end_block | icrc_fragment_block) << 1); /* 1 QW of any added tail block */ - + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; // Does not include PBC and is in QW if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-SHM"); uint64_t pos; ssize_t rc; @@ -4395,16 +4461,34 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, return rc; } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + FI_OPX_DEBUG_COUNTERS_INC_COND(src_iface != FI_HMEM_SYSTEM, opx_ep->debug_counters.hmem.intranode .kind[(caps & FI_MSG) ? 
FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv); - hdr->qw_16B[0] = opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + hdr->qw_16B[0] = opx_ep->tx->rzv_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); - hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | @@ -4463,12 +4547,18 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-SHM"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (end) context %p\n", + user_context); + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, + lock_required, tag, caps); + } return FI_SUCCESS; } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (begin) context %p\n",context); + "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (begin) context %p\n", + user_context); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-RTS-HFI:%ld", tag); /* @@ -4494,13 +4584,34 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, } } + struct opx_context *context; + uintptr_t origin_byte_counter_vaddr; + if (OFI_LIKELY(do_cq_completion)) { + context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->err_entry.err = 0; + context->err_entry.op_context = user_context; + context->next = NULL; + context->byte_counter = len - immediate_total; + 
origin_byte_counter_vaddr = (uintptr_t) &context->byte_counter; + } else { + context = NULL; + origin_byte_counter_vaddr = (uintptr_t) NULL; + } + struct fi_opx_reliability_tx_replay *replay; union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability, hfi1_type); - if(OFI_UNLIKELY(psn == -1)) { + if (OFI_UNLIKELY(psn == -1)) { + if (OFI_LIKELY(do_cq_completion)) { + OPX_BUF_FREE(context); + } return -FI_EAGAIN; } @@ -4522,10 +4633,10 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, fi_opx_store_and_copy_scb_16B(scb, &tmp, opx_ep->tx->rzv_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), - opx_ep->tx->rzv_16B.hdr.qw_16B[0] | + opx_ep->tx->rzv_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20), - opx_ep->tx->rzv_16B.hdr.qw_16B[1] | + opx_ep->tx->rzv_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | ((caps & FI_MSG) ? 
@@ -4608,7 +4719,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, struct tmp_payload_t *tmp_payload = (void*)temp; if (immediate_byte_count > 0) { memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); - } + } for (int i=0; iimmediate_qw[i] = sbuf_qw[i]; @@ -4657,7 +4768,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, if(immediate_block_count) { #ifndef NDEBUG - /* Tail will be it's own block */ + /* Tail will be it's own block */ assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment && immediate_end_block_count); /* assert immediate_block_count can be used for both * full_block_credits_needed and total_credits_available parameters @@ -4739,9 +4850,14 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, /* update the hfi txe state */ opx_ep->tx->pio_state->qw0 = pio_state.qw0; + if (OFI_LIKELY(do_cq_completion)) { + fi_opx_ep_tx_cq_completion_rzv(ep, context, len, lock_required, tag, caps); + } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-RZV-RTS-HFI:%ld",tag); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (end) context %p\n",context); + "===================================== SEND 16B, HFI -- RENDEZVOUS RTS (end) context %p\n", + user_context); return FI_SUCCESS; } diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 0e340c735c8..6aafeb433c8 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -158,7 +158,7 @@ int fi_opx_check_info(const struct fi_info *info) static int fi_opx_fillinfo(struct fi_info *fi, const char *node, const char* service, const struct fi_info *hints, - uint64_t flags, enum fi_progress progress) + uint64_t flags, enum fi_progress progress) { int ret; uint64_t caps; @@ -172,19 +172,11 @@ static int fi_opx_fillinfo(struct fi_info *fi, const char *node, if (!hints && !node && !service) goto err; - if (hints && (((hints->mode & FI_CONTEXT) != 0) && 
((hints->mode & FI_CONTEXT2) == 0))) { - FI_WARN(fi_opx_global.prov, FI_LOG_FABRIC, - "FI_CONTEXT mode is not supported. Use FI_CONTEXT2 mode instead.\n"); - errno = FI_ENODATA; - return -errno; - } - fi->next = NULL; fi->caps = FI_OPX_DEFAULT_CAPS; /* set the mode that we require */ fi->mode = FI_ASYNC_IOV; - fi->mode |= (FI_CONTEXT2); fi->addr_format = FI_ADDR_OPX; fi->src_addrlen = 0; @@ -195,7 +187,7 @@ static int fi_opx_fillinfo(struct fi_info *fi, const char *node, // Process the node field. Service is treated identically to node. if (node) { if (!ofi_str_toaddr(node, &fmt, (void **)&addr, &len) && - fmt == FI_ADDR_OPX) { + fmt == FI_ADDR_OPX) { if (flags & FI_SOURCE) { fi->src_addr = addr; fi->src_addrlen = sizeof(union fi_opx_addr); @@ -666,11 +658,6 @@ static void do_static_assert_tests() OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == sizeof(payload->rendezvous.noncontiguous), "Non-contiguous rendezvous payload size error"); - OPX_COMPILE_TIME_ASSERT(sizeof(struct fi_context2) == sizeof(union fi_opx_context), - "fi_opx_context size error"); - - OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_context_ext) & 0x1F) == 0, - "sizeof(fi_opx_context_ext) should be a multiple of 32") ; OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_hmem_info) >> 3) == OPX_HMEM_SIZE_QWS, "sizeof(fi_opx_hmem_info) >> 3 != OPX_HMEM_SIZE_QWS") ; OPX_COMPILE_TIME_ASSERT(OPX_HFI1_TID_PAGESIZE == 4096, diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 0dae5a3bbf0..3850c7f822d 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -57,16 +57,19 @@ void fi_opx_hit_zero(struct fi_opx_completion_counter *cc) FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== NO COUNTER INCREMENT\n"); } if (cc->cq && cc->context) { - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - opx_context->next = NULL; - opx_context->len = 0; - opx_context->buf = NULL; - opx_context->byte_counter = 0; - opx_context->tag = 0; + 
cc->context->next = NULL; + cc->context->len = 0; + cc->context->buf = NULL; + cc->context->byte_counter = 0; + cc->context->tag = 0; + assert(cc->context->err_entry.op_context != NULL); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== CQ ENQUEUE COMPLETION\n"); fi_opx_cq_enqueue_completed(cc->cq, cc->context, FI_OPX_LOCK_NOT_REQUIRED); } else { + if (cc->context) { + OPX_BUF_FREE(cc->context); + } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "=================== NO CQ COMPLETION\n"); } OPX_BUF_FREE(cc); @@ -192,7 +195,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { fi_opx_store_and_copy_qw(scb, local_temp, opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | credit_return | - params->pbc_dlid, + params->pbc_dlid, opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | params->lrh_dlid | (params->lrh_dws << 32), opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | params->bth_rx, opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn, @@ -233,12 +236,12 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(params->lrh_dlid)); fi_opx_store_and_copy_qw(scb, local_temp, opx_ep->rx->tx.cts_16B.qw0 | OPX_PBC_LEN(params->pbc_dws, hfi1_type) | - credit_return | params->pbc_dlid, + credit_return | params->pbc_dlid, opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | - ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | - ((uint64_t)params->lrh_dws << 20), + ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | + ((uint64_t)params->lrh_dws << 20), opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | - ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), + ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | params->bth_rx, 
opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn, opx_ep->rx->tx.cts_16B.hdr.qw_16B[4], @@ -292,7 +295,7 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -335,9 +338,10 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t const uint64_t is_hmem = fi_opx_hmem_iov_init(buf, len, NULL, &iov); fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr_offset, key, - NULL, cc, FI_VOID, FI_NOOP, + cc, FI_VOID, FI_NOOP, opx_ep->tx->op_flags | FI_INJECT, - is_hmem, lock_required, caps, reliability, hfi1_type); + is_hmem, lock_required, caps, reliability, + hfi1_type); return 0; } @@ -362,10 +366,10 @@ inline ssize_t fi_opx_inject_write_generic(struct fid_ep *ep, const void *buf, s __OPX_FORCE_INLINE__ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dst_addr, uint64_t addr_offset, uint64_t key, - void *context, int lock_required, const enum fi_av_type av_type, - const uint64_t caps, + void *user_context, int lock_required, + const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -385,24 +389,33 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_dst_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dst_addr); + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->initial_byte_count = len; cc->byte_counter = len; cc->cntr = opx_ep->write_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if (opx_context && cc->cq) { - opx_context->flags = FI_RMA | FI_WRITE; - } - cc->hit_zero = fi_opx_hit_zero; struct fi_opx_hmem_iov iov; const uint64_t is_hmem = fi_opx_hmem_iov_init(buf, len, desc, &iov); fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, addr_offset, key, - (union fi_opx_context *)context, cc, FI_VOID, + cc, FI_VOID, FI_NOOP, opx_ep->tx->op_flags, is_hmem, lock_required, caps, reliability, hfi1_type); @@ -414,7 +427,7 @@ inline ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t l void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); @@ -428,10 +441,10 @@ inline ssize_t fi_opx_write_generic(struct fid_ep *ep, const void *buf, size_t l __OPX_FORCE_INLINE__ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dst_addr, uint64_t addr_offset, - uint64_t key, void *context, int lock_required, + uint64_t key, void *user_context, int lock_required, 
const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -452,7 +465,21 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_dst_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dst_addr); + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -461,11 +488,8 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void } cc->initial_byte_count = cc->byte_counter; cc->cntr = opx_ep->write_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_WRITE; - cc->hit_zero = fi_opx_hit_zero; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)desc; @@ -483,10 +507,9 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void mr_ptr, &hmem_iov); fi_opx_write_internal(opx_ep, &hmem_iov, 1, opx_dst_addr, - addr_offset, key, - (union fi_opx_context *)context, cc, - FI_VOID, FI_NOOP, 0, is_hmem, - lock_required, caps, reliability, hfi1_type); + addr_offset, key, cc, FI_VOID, FI_NOOP, + 0, is_hmem, lock_required, caps, + reliability, hfi1_type); addr_offset += iov[index].iov_len; } @@ -499,7 +522,7 @@ inline ssize_t fi_opx_writev_generic(struct fid_ep *ep, const struct iovec *iov, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); @@ -552,10 +575,10 @@ void fi_opx_get_daos_av_addr_rank(struct fi_opx_ep *opx_ep, __OPX_FORCE_INLINE__ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags, int lock_required, - const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + uint64_t flags, int lock_required, + const enum fi_av_type av_type, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -577,7 +600,21 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg const union fi_opx_addr opx_dst_addr = 
FI_OPX_EP_AV_ADDR(av_type,opx_ep,msg->addr); fi_opx_get_daos_av_addr_rank(opx_ep, opx_dst_addr); + struct fi_opx_cq *cq = (flags & FI_COMPLETION) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -587,11 +624,8 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg cc->initial_byte_count = cc->byte_counter; cc->cntr = opx_ep->write_cntr; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? opx_ep->rx->cq : NULL; - cc->context = msg->context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_WRITE; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; size_t rma_iov_index = 0; @@ -621,7 +655,7 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg iov.buf = msg_iov_vaddr; iov.len = len; fi_opx_write_internal(opx_ep, &iov, 1, opx_dst_addr, rma_iov_addr, - rma_iov_key, NULL, cc, FI_VOID, FI_NOOP, 0, + rma_iov_key, cc, FI_VOID, FI_NOOP, 0, is_hmem, lock_required, caps, reliability, hfi1_type); @@ -653,7 +687,7 @@ inline ssize_t fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rm uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); @@ -667,9 +701,9 @@ inline ssize_t 
fi_opx_writemsg_generic(struct fid_ep *ep, const struct fi_msg_rm __OPX_FORCE_INLINE__ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr_offset, uint64_t key, - void *context, int lock_required, const enum fi_av_type av_type, + void *user_context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -705,20 +739,30 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des const union fi_opx_addr opx_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,src_addr); fi_opx_get_daos_av_addr_rank(opx_ep, opx_addr); + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + cc->next = NULL; cc->byte_counter = len; cc->initial_byte_count = len; cc->cntr = opx_ep->read_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - union fi_opx_context * opx_context = (union fi_opx_context *)cc->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_READ; - cc->hit_zero = fi_opx_hit_zero; fi_opx_readv_internal(opx_ep, &iov, 1, opx_addr, &addr_offset, &key, - (union fi_opx_context *)context, opx_ep->tx->op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, @@ -745,10 +789,10 @@ inline ssize_t fi_opx_read_generic(struct fid_ep *ep, void *buf, size_t len, voi __OPX_FORCE_INLINE__ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr_offset, - uint64_t key, void *context, int lock_required, + uint64_t key, void *user_context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -768,7 +812,12 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,src_addr); - union fi_opx_context *opx_context = (union fi_opx_context *)context; + struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + const uint64_t tx_op_flags = opx_ep->tx->op_flags; uint64_t addr_v[8] = { addr_offset, addr_offset, addr_offset, addr_offset, @@ -776,6 +825,14 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, uint64_t key_v[8] = { key, key, key, key, key, key, key, key }; struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + size_t index; cc->next = NULL; cc->byte_counter = 0; @@ -784,9 +841,8 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, } cc->initial_byte_count = cc->byte_counter; cc->cntr = opx_ep->read_cntr; - cc->cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; + cc->cq = cq; cc->context = context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_READ; cc->hit_zero = fi_opx_hit_zero; uint64_t hmem_device; @@ -812,7 +868,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, hmem_iovs[i].device = hmem_device; } fi_opx_readv_internal(opx_ep, hmem_iovs, 8, opx_addr, addr_v, key_v, - NULL, 0, NULL, NULL, cc, FI_VOID, FI_NOOP, + 0, NULL, NULL, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); } @@ -833,10 +889,11 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = hmem_device; } - fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, key_v, - opx_context, tx_op_flags, opx_ep->rx->cq, opx_ep->read_cntr, cc, - FI_VOID, FI_NOOP, - FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); + fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, + key_v, tx_op_flags, opx_ep->rx->cq, + opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, + FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, + caps, reliability, hfi1_type); return 0; } @@ -846,7 +903,7 @@ inline ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, uint64_t key, void *context, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); @@ -859,10 +916,10 @@ inline ssize_t fi_opx_readv_generic(struct fid_ep *ep, const struct iovec *iov, __OPX_FORCE_INLINE__ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags, int lock_required, - const enum fi_av_type av_type, const uint64_t caps, - const enum ofi_reliability_kind reliability, 
- const enum opx_hfi1_type hfi1_type) + uint64_t flags, int lock_required, + const enum fi_av_type av_type, const uint64_t caps, + const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -878,20 +935,17 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, abort(); } - /* TODO - should this be a different cq than the one used by tsend, etc? */ - struct fi_opx_cq *cq = opx_ep->tx->cq; - if (((cq != NULL) && - ((cq->bflags & FI_SELECTIVE_COMPLETION) && (flags & FI_COMPLETION) == 0))) { - cq = NULL; - } - - union fi_opx_context *opx_context = (union fi_opx_context *)msg->context; - assert(msg->addr != FI_ADDR_UNSPEC); assert((FI_AV_TABLE == opx_ep->av_type) || (FI_AV_MAP == opx_ep->av_type)); const union fi_opx_addr opx_src_addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,msg->addr); fi_opx_get_daos_av_addr_rank(opx_ep, opx_src_addr); + struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; + struct opx_context *context; + if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + return -FI_ENOMEM; + } + /* for fi_read*(), the 'src' is the remote data */ size_t src_iov_index = 0; const size_t src_iov_count = msg->rma_iov_count; @@ -912,6 +966,14 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, ssize_t index; struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); + if (OFI_UNLIKELY(cc == NULL)) { + if (context) { + OPX_BUF_FREE(context); + } + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + memset(cc, 0, sizeof(*cc)); cc->byte_counter = 0; for(index=0; index < msg->iov_count; index++) { @@ -927,10 +989,8 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, assert(totsize == cc->byte_counter); #endif cc->cntr = opx_ep->read_cntr; - cc->cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; - cc->context = msg->context; - if(opx_context && cc->cq) opx_context->flags = FI_RMA | FI_READ; - + cc->cq = cq; + cc->context = context; cc->hit_zero = fi_opx_hit_zero; struct fi_opx_mr **mr_ptr_array = (struct fi_opx_mr **)msg->desc; @@ -971,7 +1031,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, #endif fi_opx_readv_internal( opx_ep, iov, niov + 1, opx_src_addr, addr, key, - opx_context, flags, cq, + flags, cq, opx_ep->read_cntr, /* enable_cq, enable_cntr */ cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); @@ -1019,7 +1079,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, assert(totsize_issued <= totsize); #endif fi_opx_readv_internal(opx_ep, iov, 8, opx_src_addr, addr, key, - NULL, 0, NULL, NULL, /* disable_cq, disable_cntr */ + 0, NULL, NULL, /* disable_cq, disable_cntr */ cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); @@ -1037,7 +1097,7 @@ inline ssize_t fi_opx_readmsg_generic(struct fid_ep *ep, const struct fi_msg_rma uint64_t flags, int lock_required, const enum fi_av_type av_type, const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); fi_opx_lock_if_required(&opx_ep->lock, lock_required); diff --git a/prov/opx/src/fi_opx_tagged.c b/prov/opx/src/fi_opx_tagged.c index 815ec13ed13..d0ef9a23aa3 100644 --- a/prov/opx/src/fi_opx_tagged.c +++ b/prov/opx/src/fi_opx_tagged.c @@ -59,40 +59,43 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, const enum fi_progress progress, const enum opx_hfi1_type hfi1_type) { + assert(!lock_required); + assert(!(flags & FI_MULTI_RECV)); /* Multi-receive incompatible with tagged receives */ + struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - union 
fi_opx_context * opx_context = NULL; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST TRECVMSG\n"); - assert(!lock_required); - assert(!(flags & FI_MULTI_RECV)); /* Multi-receive incompatible with tagged receives */ - assert(msg->context); - assert(((uintptr_t)msg->context & 0x07ull) == 0); /* must be 8 byte aligned */ + struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); + if (OFI_UNLIKELY(context == NULL)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + return -FI_ENOMEM; + } + context->next = NULL; + context->src_addr = msg->addr; + context->flags = flags; + context->err_entry.err = 0; + context->err_entry.op_context = msg->context; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.posted_recv_tag); if (msg->iov_count == 0) { - opx_context = (union fi_opx_context *) msg->context; - opx_context->next = NULL; - opx_context->src_addr = msg->addr; - opx_context->flags = flags; - opx_context->len = 0; - opx_context->buf = NULL; - opx_context->byte_counter = (uint64_t)-1; + context->len = 0; + context->buf = NULL; + context->byte_counter = (uint64_t)-1; + if ((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM) { /* do not overwrite state from a previous "peek|claim" operation */ - opx_context->tag = msg->tag; - opx_context->ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, - hfi1_type); + hfi1_type); } #ifdef OPX_HMEM @@ -116,104 +119,70 @@ ssize_t fi_opx_trecvmsg_generic (struct fid_ep *ep, #endif if (hmem_iface != FI_HMEM_SYSTEM) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.posted_recv_tag); - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == 
NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - return -FI_ENOMEM; - } - flags |= FI_OPX_CQ_CONTEXT_EXT | FI_OPX_CQ_CONTEXT_HMEM; + flags |= FI_OPX_CQ_CONTEXT_HMEM; - ext->err_entry.err = 0; - ext->opx_context.next = NULL; - ext->opx_context.src_addr = msg->addr; - ext->opx_context.flags = flags; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->msg.op_context = msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; + context->byte_counter = (uint64_t)-1; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; if (msg->iov_count == 1) { - ext->opx_context.len = msg->msg_iov[0].iov_len; - ext->opx_context.buf = msg->msg_iov[0].iov_base; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; if ((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM) { /* do not overwrite state from a previous "peek|claim" operation */ - ext->opx_context.tag = msg->tag; - ext->opx_context.ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } } else { assert((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM); /* TODO - why not? 
*/ - ext->opx_context.tag = msg->tag; - ext->opx_context.ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } - struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) ext->hmem_info_qws; + struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, flags, OPX_HMEM_TRUE, lock_required, av_type, reliability, - hfi1_type); + hfi1_type); } #endif if (msg->iov_count == 1) { - opx_context = (union fi_opx_context *) msg->context; - opx_context->next = NULL; - opx_context->src_addr = msg->addr; - opx_context->flags = flags; - opx_context->len = msg->msg_iov[0].iov_len; - opx_context->buf = msg->msg_iov[0].iov_base; - opx_context->byte_counter = (uint64_t)-1; + context->len = msg->msg_iov[0].iov_len; + context->buf = msg->msg_iov[0].iov_base; + context->byte_counter = (uint64_t)-1; if ((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM) { /* do not overwrite state from a previous "peek|claim" operation */ - opx_context->tag = msg->tag; - opx_context->ignore = msg->ignore; + context->tag = msg->tag; + context->ignore = msg->ignore; } return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - opx_context, flags, - OPX_CONTEXT_EXTENDED_FALSE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, - hfi1_type); + hfi1_type); } assert((flags & (FI_PEEK | FI_CLAIM)) != FI_CLAIM); /* TODO - why not? 
*/ - struct fi_opx_context_ext * ext = (struct fi_opx_context_ext *) ofi_buf_alloc(opx_ep->rx->ctx_ext_pool); - if (OFI_UNLIKELY(ext == NULL)) { - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Out of memory.\n"); - return -FI_ENOMEM; - } - flags |= FI_OPX_CQ_CONTEXT_EXT; - - ext->err_entry.err = 0; - ext->opx_context.next = NULL; - ext->opx_context.src_addr = msg->addr; - ext->opx_context.flags = flags; - ext->opx_context.byte_counter = (uint64_t)-1; - ext->opx_context.tag = msg->tag; - ext->opx_context.ignore = msg->ignore; - ext->msg.op_context = msg->context; - ext->msg.iov_count = msg->iov_count; - ext->msg.iov = (struct iovec *)msg->msg_iov; + context->byte_counter = (uint64_t)-1; + context->tag = msg->tag; + context->ignore = msg->ignore; + context->msg.iov_count = msg->iov_count; + context->msg.iov = (struct iovec *)msg->msg_iov; return fi_opx_ep_rx_process_context(opx_ep, FI_TAGGED, - OPX_CANCEL_CONTEXT_FALSE, - (union fi_opx_context *) ext, flags, - OPX_CONTEXT_EXTENDED_TRUE, + context, flags, OPX_HMEM_FALSE, lock_required, av_type, reliability, - hfi1_type); + hfi1_type); } From cb7fed3dd211d9e4d062f96040137fb7f1501890 Mon Sep 17 00:00:00 2001 From: Thomas Huber Date: Tue, 3 Sep 2024 15:13:30 -0400 Subject: [PATCH 143/393] prov/opx: Capitalized env var used for production override, also added opx to the front. Signed-off-by: Thomas Huber --- prov/opx/configure.m4 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/prov/opx/configure.m4 b/prov/opx/configure.m4 index b8cb174a1b6..25d9d168fff 100644 --- a/prov/opx/configure.m4 +++ b/prov/opx/configure.m4 @@ -154,6 +154,10 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ AC_MSG_NOTICE([hfi1_user.h struct sdma_req_meminfo defined... 
no]) opx_happy=0 ]) + OPX_PRODUCTION_BUILD_OVERRIDE=${OPX_PRODUCTION_BUILD_OVERRIDE:-""} + AS_IF([test "x$OPX_PRODUCTION_BUILD_OVERRIDE" != "x"], [ + AC_MSG_NOTICE([OPX_PRODUCTION_BUILD_OVERRIDE is set to $OPX_PRODUCTION_BUILD_OVERRIDE]) + ]) CPPFLAGS=$save_CPPFLAGS opx_hfi_version=$(/sbin/modinfo hfi1 -F version) opx_hfi_version_sorted=$(echo -e "10.14.0.0\n$opx_hfi_version" | sort -V | tail -n 1) @@ -164,7 +168,7 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ test $opx_hfi_version != $opx_hfi_version_sorted],[ opx_hfi_dev_override=$(echo $CPPFLAGS | grep -w "DOPX_DEV_OVERRIDE") - AS_IF([test "x$opx_hfi_dev_override" != "x"],[ + AS_IF([test "x$opx_hfi_dev_override" != "x" -o "x$OPX_PRODUCTION_BUILD_OVERRIDE" != "x"],[ AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... no, overridden]) ],[ AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... no]) From f124b4e13eea665d1677ccef4e9d5f04104ba239 Mon Sep 17 00:00:00 2001 From: Thomas Huber Date: Wed, 4 Sep 2024 10:58:37 -0400 Subject: [PATCH 144/393] prov/opx: Updated configure.m4 for ROCR Signed-off-by: Thomas Huber --- prov/opx/configure.m4 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/prov/opx/configure.m4 b/prov/opx/configure.m4 index 25d9d168fff..a678e602e72 100644 --- a/prov/opx/configure.m4 +++ b/prov/opx/configure.m4 @@ -141,7 +141,7 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ opx_happy=0 ]) ]) - AS_IF([test $opx_happy -eq 1 && test $have_cuda -eq 1],[ + AS_IF([test $opx_happy -eq 1 && (test $have_cuda -eq 1 || test $have_rocr -eq 1)], [ save_CPPFLAGS=$CPPFLAGS CPPFLAGS="-I/usr/include/uapi" AC_COMPILE_IFELSE([AC_LANG_PROGRAM( @@ -169,14 +169,14 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ opx_hfi_dev_override=$(echo $CPPFLAGS | grep -w "DOPX_DEV_OVERRIDE") AS_IF([test "x$opx_hfi_dev_override" != "x" -o "x$OPX_PRODUCTION_BUILD_OVERRIDE" != "x"],[ - AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... no, overridden]) + AC_MSG_NOTICE([hfi1 driver version is GPU-compatible... 
no, overridden]) ],[ - AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... no]) + AC_MSG_NOTICE([hfi1 driver version is GPU-compatible... no]) opx_happy=0 ]) ], - [AC_MSG_NOTICE([hfi1 driver version is CUDA-compatible... yes]) + [AC_MSG_NOTICE([hfi1 driver version is GPU-compatible... yes]) ]) AS_IF([test $opx_happy -eq 1],[ AC_MSG_NOTICE([Appending OPX_HMEM to opx_CPPFLAGS]) From 2335628793aaaa449ae12ea1a14da49228ac24f2 Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Tue, 10 Sep 2024 23:01:53 -0400 Subject: [PATCH 145/393] github/actions: Adjust Cornelis Networks internal workflows Replace running of on-merge workflow with a nightly workflow instead. Signed-off-by: Jack Morrison --- .github/workflows/cn.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml index b6ca2fabe0a..d0df84b405f 100644 --- a/.github/workflows/cn.yml +++ b/.github/workflows/cn.yml @@ -13,6 +13,8 @@ on: paths-ignore: - 'man/**' - 'docs/**' + schedule: + - cron: '0 21 * * *' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -22,12 +24,11 @@ jobs: opx-ci: name: OPX CI if: | - github.repository == 'cornelisnetworks/libfabric-internal' && - github.event.pull_request.merged != true + github.repository == 'cornelisnetworks/libfabric-internal' uses: cornelisnetworks/libfabric-devel/.github/workflows/cn.yml@master - on-merge: - name: On-Merge + nightly: + name: Nightly if: | github.repository == 'cornelisnetworks/libfabric-internal' && - github.event.pull_request.merged == true - uses: cornelisnetworks/libfabric-devel/.github/workflows/merge.yml@master + github.event.schedule == '0 21 * * *' + uses: cornelisnetworks/libfabric-devel/.github/workflows/nightly.yml@master From bfc50d193c2e4e68c7f005e243c7afaa888a08f1 Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Wed, 11 Sep 2024 15:15:57 -0400 Subject: [PATCH 146/393] github/actions: Cornelis Networks workflows 
Do not use PR closed events as workflow triggers. Allow triggering PR events when targeting any branch, not just main. Change cron schedule to account for UTC. Improve conditional execution of reusable workflows. Signed-off-by: Jack Morrison --- .github/workflows/cn.yml | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml index d0df84b405f..5669c52f4ea 100644 --- a/.github/workflows/cn.yml +++ b/.github/workflows/cn.yml @@ -7,14 +7,8 @@ on: - opened - reopened - synchronize - - closed - branches: - - main - paths-ignore: - - 'man/**' - - 'docs/**' schedule: - - cron: '0 21 * * *' + - cron: '0 23 * * *' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -23,12 +17,9 @@ concurrency: jobs: opx-ci: name: OPX CI - if: | - github.repository == 'cornelisnetworks/libfabric-internal' + if: ${{ github.repository == 'cornelisnetworks/libfabric-internal' && github.event_name == 'pull_request' }} uses: cornelisnetworks/libfabric-devel/.github/workflows/cn.yml@master nightly: name: Nightly - if: | - github.repository == 'cornelisnetworks/libfabric-internal' && - github.event.schedule == '0 21 * * *' + if: ${{ github.repository == 'cornelisnetworks/libfabric-internal' && github.event_name == 'schedule' }} uses: cornelisnetworks/libfabric-devel/.github/workflows/nightly.yml@master From a57e788b2d2c5936ad698f79c7b4cbff5cd1d127 Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Fri, 13 Sep 2024 08:33:05 -0500 Subject: [PATCH 147/393] prov/opx: scb/hdr changes Signed-off-by: Bob Cernohous --- prov/opx/include/rdma/fi_direct_atomic.h | 2 +- prov/opx/include/rdma/opx/fi_opx_hfi1.h | 21 ++--- .../opx/include/rdma/opx/fi_opx_hfi1_packet.h | 37 +++++---- prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h | 33 +++++--- .../include/rdma/opx/fi_opx_hfi1_transport.h | 42 +++++----- .../opx/include/rdma/opx/fi_opx_reliability.h | 29 +++---- prov/opx/src/fi_opx_hfi1.c | 
80 +++++++++---------- prov/opx/src/fi_opx_hfi1_sdma.c | 6 +- prov/opx/src/fi_opx_msg.c | 14 ++-- prov/opx/src/fi_opx_reliability.c | 66 +++++++-------- prov/opx/src/fi_opx_rma.c | 4 +- 11 files changed, 169 insertions(+), 165 deletions(-) diff --git a/prov/opx/include/rdma/fi_direct_atomic.h b/prov/opx/include/rdma/fi_direct_atomic.h index 65487254fd1..61ca69d7bd7 100644 --- a/prov/opx/include/rdma/fi_direct_atomic.h +++ b/prov/opx/include/rdma/fi_direct_atomic.h @@ -48,7 +48,7 @@ extern "C" { #define fi_inject_atomic(ep, buf, count, dest_addr, addr, key, \ datatype, op) \ - (fi_opx_inject_atomic_FABRIC_DIRECT(ep, buf, count, dest_addr,\ + (fi_opx_inject_atomic_FABRIC_DIRECT(ep, buf, count, dest_addr, \ addr, key, datatype, op)) #define fi_fetch_atomic(ep, buf, count, desc, result, result_desc, \ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index 22fd27eccf1..405b2f178b9 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -305,7 +305,7 @@ static inline void fi_opx_store_scb_qw(volatile uint64_t dest[8], const uint64_t */ -/* Only 8 QWs valid in 16 QW storage. */ +/* 8 QWs valid in 16 QW storage. */ struct fi_opx_hfi1_txe_scb_9B { union { /* 15 QWs union*/ @@ -323,28 +323,23 @@ struct fi_opx_hfi1_txe_scb_9B { uint64_t pad; /* 1 QW pad (to 16 QWs) */ } __attribute__((__aligned__(8))) __attribute__((packed)); -/* 16 QW valid in 16 QW storage. */ +/* 9 QWs valid in 16 QW storage. 
*/ struct fi_opx_hfi1_txe_scb_16B { - uint64_t qw0; /* PBC */ - union opx_hfi1_packet_hdr hdr; /* 15 QWs 16B header */ + uint64_t qw0; /* PBC */ + union opx_hfi1_packet_hdr hdr; /* 8 QWs 16B header + 7 QWs currently unused */ } __attribute__((__aligned__(8))) __attribute__((packed)); -static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == sizeof(struct fi_opx_hfi1_txe_scb_16B)), "storge for scbs should match"); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == sizeof(struct fi_opx_hfi1_txe_scb_16B)), "storage for scbs should match"); static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == (sizeof(uint64_t)*16)), "16 qw scb storage"); /* Storage for a scb. Use HFI1 type to access the correct structure */ union opx_hfi1_txe_scb_union { struct fi_opx_hfi1_txe_scb_9B scb_9B; struct fi_opx_hfi1_txe_scb_16B scb_16B; -}; - -struct fi_opx_hfi1_rxe_hdr { - - union opx_hfi1_packet_hdr hdr; - uint64_t rhf; - -} __attribute__((__aligned__(64))); +} __attribute__((__aligned__(8))) __attribute__((packed)); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_9B) == sizeof(union opx_hfi1_txe_scb_union)), "storage for scbs should match"); +static_assert((sizeof(struct fi_opx_hfi1_txe_scb_16B) == sizeof(union opx_hfi1_txe_scb_union)), "storage for scbs should match"); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 746ba96aa2c..b0d826e3c60 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -997,7 +997,7 @@ union opx_hfi1_packet_hdr { uint64_t reserved_3[2]; /* QW[5-6] SW */ uint64_t ofi_tag; /* QW[7] SW last 9B quadword */ - uint64_t reserved_n[6]; /* QW[8-14] SW */ + uint64_t reserved_n[7]; /* QW[8-14] SW */ } __attribute__((__packed__)) match; @@ -1022,7 +1022,7 @@ union opx_hfi1_packet_hdr { uint64_t app_data_u64[2]; }; - uint64_t reserved_n[7]; /* QW[7-14] SW */ + uint64_t reserved_n[8]; /* QW[7-14] SW */ } __attribute__((__packed__)) inject; 
@@ -1046,7 +1046,7 @@ union opx_hfi1_packet_hdr { /* QW[6] SW */ uint64_t xfer_tail; - uint64_t reserved_n[7]; /* QW[7-14] SW */ + uint64_t reserved_n[8]; /* QW[7-14] SW */ } __attribute__((__packed__)) send; @@ -1069,7 +1069,7 @@ union opx_hfi1_packet_hdr { /* QW[5-6] SW */ uint64_t xfer_tail[2]; - uint64_t reserved_n[7]; /* QW[7-14] SW */ + uint64_t reserved_n[8]; /* QW[7-14] SW */ } __attribute__((__packed__)) mp_eager_first; @@ -1092,7 +1092,7 @@ union opx_hfi1_packet_hdr { uint32_t payload_offset; uint32_t mp_egr_uid; - uint64_t reserved_n[6]; /* QW[8-14] SW */ + uint64_t reserved_n[7]; /* QW[8-14] SW */ } __attribute__((__packed__)) mp_eager_nth; @@ -1117,7 +1117,7 @@ union opx_hfi1_packet_hdr { /* QW[6] SW */ uint64_t message_length; /* total length in bytes of all non-contiguous buffers and immediate data */ - uint64_t reserved_n[7]; /* QW[7-14] SW */ + uint64_t reserved_n[8]; /* QW[7-14] SW */ } __attribute__((__packed__)) rendezvous; @@ -1133,7 +1133,7 @@ union opx_hfi1_packet_hdr { /* QW[3-4] BTH/KDETH */ uint64_t reserved_3[2]; - /* QW[5-14] SW */ + /* QW[5-7] SW */ union { uint8_t opcode; struct { @@ -1176,7 +1176,7 @@ union opx_hfi1_packet_hdr { } fence; } target; - uint64_t reserved_n[6]; /* QW[8-14] SW */ + uint64_t reserved_n[7]; /* QW[8-14] SW */ } __attribute__((__packed__)) cts; @@ -1189,16 +1189,17 @@ union opx_hfi1_packet_hdr { uint8_t origin_rx; uint8_t reserved_o2; - /* == quadword 2 == */ + /* QW[3] BTH/KDETH */ uint64_t reserved_3; - /* == quadword 3 == */ + /* QW[4] KDETH/SW */ uint64_t reserved_4; + /* QW[5,6,7] KDETH/SW */ union { - /* QW[5] SW */ /* Common fields */ struct { + /* QW[5] KDETH/SW */ uint8_t opcode; uint8_t origin_tx; uint8_t dt; @@ -1206,6 +1207,7 @@ union opx_hfi1_packet_hdr { uint16_t last_bytes; uint16_t bytes; + /* QW[6,7] SW */ uint64_t reserved[2]; /* op-specific */ }; @@ -1233,7 +1235,7 @@ union opx_hfi1_packet_hdr { /* QW[5] SW */ uint64_t reserved; /* Common fields */ - /* QW[6-7] SW */ + /* QW[6,7] SW */ 
uintptr_t key; uintptr_t offset; } mr; @@ -1242,13 +1244,13 @@ union opx_hfi1_packet_hdr { /* QW[5] SW */ uint64_t reserved; /* Common fields */ - /* QW[6-7] SW */ + /* QW[6,7] SW */ uintptr_t completion_counter; uint64_t bytes_to_fence; } fence; } target; - uint64_t reserved_n[6]; /* QW[8-14] SW */ + uint64_t reserved_n[7]; /* QW[8-14] SW */ } __attribute__((__packed__)) dput; @@ -1261,7 +1263,7 @@ union opx_hfi1_packet_hdr { uint8_t opcode; uint8_t reserved_2; - uint64_t reserved_n[11]; /* QW[3-14] SW */ + uint64_t reserved_n[12]; /* QW[3-14] SW */ } __attribute__((__packed__)) ud; @@ -1283,11 +1285,14 @@ union opx_hfi1_packet_hdr { uint64_t psn_start; uint64_t key; /* fi_opx_reliability_service_flow_key */ - uint64_t reserved_n[6]; /* QW[8-14] SW */ + uint64_t reserved_n[7]; /* QW[8-14] SW */ } __attribute__((__packed__)) service; /* "reliability service" */ } __attribute__((__packed__)) __attribute__((__aligned__(8))); +static_assert(sizeof(union opx_hfi1_packet_hdr) == sizeof(uint64_t[15]), + "sizeof(union opx_hfi1_packet_hdr) must be 15 qwords!"); + static inline fi_opx_uid_t fi_opx_hfi1_packet_hdr_uid (const union opx_hfi1_packet_hdr * const hdr, diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index dbd1c6d06b5..b18ea2f9095 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -99,8 +99,7 @@ struct fi_opx_hfi1_sdma_header_vec { #endif } hmem; }; - - struct fi_opx_hfi1_txe_scb_9B scb; + union opx_hfi1_txe_scb_union scb; }; static const size_t OPX_SDMA_REQ_INFO_OFFSET[2] = { @@ -134,7 +133,7 @@ struct opx_sdma_request { /* ==== CACHELINE 1 ==== */ struct iovec iovecs[OPX_SDMA_REQUEST_IOVS]; - struct fi_opx_hfi1_sdma_header_vec header_vec; // 72 bytes or 208 bytes (OPX_HMEM) + struct fi_opx_hfi1_sdma_header_vec header_vec; // 72 bytes 9B or 80 bytes 16B, plus 136 bytes (OPX_HMEM) }; OPX_COMPILE_TIME_ASSERT(offsetof(struct opx_sdma_request, iovecs) 
== FI_OPX_CACHE_LINE_SIZE, "Offset of opx_sdma_request->iovecs should start at cacheline 1!"); @@ -471,8 +470,7 @@ __OPX_FORCE_INLINE__ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, void *requester, enum opx_sdma_comp_state *requester_comp_state, - struct fi_opx_hfi1_txe_scb_9B *source_scb, -/* struct opx_hfi1_txe_scb_union *source_scb, */ + union opx_hfi1_txe_scb_union *source_scb, struct iovec *iovs, const uint16_t num_iovs, const uint16_t num_packets, @@ -517,13 +515,22 @@ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, uint64_t set_ack_bit = (num_packets == 1) ? (uint64_t)htonl(0x80000000) : 0; OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - request->header_vec.scb = *source_scb; - request->header_vec.scb.hdr.qw_9B[2] |= ((uint64_t)kdeth << 32) | set_ack_bit; - request->header_vec.scb.hdr.qw_9B[4] |= (last_packet_bytes << 32); - request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo]; + request->iovecs[0].iov_base = req_info; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + request->header_vec.scb.scb_9B = (source_scb->scb_9B); + request->header_vec.scb.scb_9B.hdr.qw_9B[2] |= ((uint64_t)kdeth << 32) | set_ack_bit; + request->header_vec.scb.scb_9B.hdr.qw_9B[4] |= (last_packet_bytes << 32); + request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo]; + } else { + request->header_vec.scb.scb_16B = (source_scb->scb_16B); + request->header_vec.scb.scb_16B.hdr.qw_16B[3] |= ((uint64_t)kdeth << 32) | set_ack_bit; + request->header_vec.scb.scb_16B.hdr.qw_16B[5] |= (last_packet_bytes << 32); + request->iovecs[0].iov_len = OPX_SDMA_REQ_HDR_SIZE[set_meminfo] + 8; // extra QWORD in 16B LRH + } + for (int i = 0; i < num_iovs; ++i) { request->iovecs[i + 1] = iovs[i]; } @@ -552,7 +559,7 @@ int opx_hfi1_sdma_enqueue_replay(struct fi_opx_ep *opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing replay for SDMA Send\n"); return opx_hfi1_sdma_enqueue_request(opx_ep, we, &we->comp_state, - 
&replay->scb_9B, replay->iov, + &replay->scb, replay->iov, OPX_SDMA_REPLAY_DATA_IOV_COUNT, 1, // num_packets, (payload_bytes + 63) & 0xFFC0, // Frag_size @@ -599,7 +606,7 @@ uint16_t opx_hfi1_sdma_register_replays(struct fi_opx_ep *opx_ep, uint32_t fragsize = 0; for (int i = 0; i < we->num_packets; ++i) { fragsize = MAX(fragsize, we->packets[i].length); - we->packets[i].replay->scb_9B.hdr.qw_9B[2] |= (uint64_t)htonl((uint32_t)psn); + we->packets[i].replay->scb.scb_9B.hdr.qw_9B[2] |= (uint64_t)htonl((uint32_t)psn); we->packets[i].replay->sdma_we_use_count = we->bounce_buf.use_count; we->packets[i].replay->sdma_we = replay_back_ptr; we->packets[i].replay->hmem_iface = we->hmem.iface; @@ -634,7 +641,7 @@ void opx_hfi1_sdma_enqueue_dput(struct fi_opx_ep *opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing non-tid request for SDMA Send\n"); opx_hfi1_sdma_enqueue_request(opx_ep, we, &we->comp_state, - &we->packets[0].replay->scb_9B, + &we->packets[0].replay->scb, &payload_iov, OPX_SDMA_NONTID_DATA_IOV_COUNT, we->num_packets, @@ -699,7 +706,7 @@ void opx_hfi1_sdma_enqueue_dput_tid(struct fi_opx_ep *opx_ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing tid request for SDMA Send\n"); opx_hfi1_sdma_enqueue_request(opx_ep, we, &we->comp_state, - &we->packets[0].replay->scb_9B, + &we->packets[0].replay->scb, payload_tid_iovs, OPX_SDMA_TID_DATA_IOV_COUNT, we->num_packets, diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index fcbdbf1a3f9..c8025713ec4 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -1216,9 +1216,9 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + 
fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); } else { - fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_temp); } fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, addr.reliability_rx, @@ -1566,24 +1566,24 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz OPX_NO_16B_SUPPORT(hfi1_type); - replay->scb_9B.qw0 = opx_ep->tx->send_9B.qw0 | + replay->scb.scb_9B.qw0 = opx_ep->tx->send_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); - replay->scb_9B.hdr.qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); - replay->scb_9B.hdr.qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | + replay->scb.scb_9B.hdr.qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); + replay->scb.scb_9B.hdr.qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); - replay->scb_9B.hdr.qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2] | psn; - replay->scb_9B.hdr.qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); - replay->scb_9B.hdr.qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); + replay->scb.scb_9B.hdr.qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2] | psn; + replay->scb.scb_9B.hdr.qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); + replay->scb.scb_9B.hdr.qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); if (xfer_bytes_tail) { ssize_t tail_len = xfer_bytes_tail; remain = total_len - tail_len; while (false == fi_opx_hfi1_fill_from_iov8( iov_ptr, /* In: iovec array */ *niov_ptr, /* In: total iovecs */ - &replay->scb_9B.hdr.qw_9B[5], /* In: target buffer to fill */ + &replay->scb.scb_9B.hdr.qw_9B[5], /* In: target buffer to fill */ &tail_len, /* In/Out: buffer length to fill */ &iov_idx, /* In/Out: start index, returns end */ &iov_base_offset)) { /* In/Out: start offset, returns offset */ @@ -1591,7 +1591,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz } assert(tail_len == 0); } - replay->scb_9B.hdr.qw_9B[6] = tag; + replay->scb.scb_9B.hdr.qw_9B[6] = tag; remain = total_len - xfer_bytes_tail; uint64_t *payload = replay->payload; @@ -1883,19 +1883,19 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, OPX_NO_9B_SUPPORT(hfi1_type); - replay->scb_16B.qw0 = opx_ep->tx->send_16B.qw0 | + replay->scb.scb_16B.qw0 = opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid; //OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid_16B, hfi1_type); - replay->scb_16B.hdr.qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); - 
replay->scb_16B.hdr.qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] |((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); - replay->scb_16B.hdr.qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + replay->scb.scb_16B.hdr.qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); + replay->scb.scb_16B.hdr.qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] |((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); + replay->scb.scb_16B.hdr.qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); - replay->scb_16B.hdr.qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3] | psn; - replay->scb_16B.hdr.qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); - replay->scb_16B.hdr.qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); + replay->scb.scb_16B.hdr.qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3] | psn; + replay->scb.scb_16B.hdr.qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); + replay->scb.scb_16B.hdr.qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); if (xfer_bytes_tail) { ssize_t tail_len = xfer_bytes_tail; remain = total_len - tail_len; @@ -1903,7 +1903,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, fi_opx_hfi1_fill_from_iov8( iov_ptr, /* In: iovec array */ *niov_ptr, /* In: total iovecs */ - &replay->scb_16B.hdr.qw_16B[6], /* In: target buffer to fill */ + &replay->scb.scb_16B.hdr.qw_16B[6], /* In: target buffer to fill */ &tail_len, /* In/Out: buffer length to fill */ &iov_idx, /* In/Out: start index, returns end */ &iov_base_offset)) { /* In/Out: start offset, returns offset */ 
@@ -1911,7 +1911,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, } assert(tail_len == 0); } - replay->scb_16B.hdr.qw_16B[7] = tag; + replay->scb.scb_16B.hdr.qw_16B[7] = tag; remain = total_len - xfer_bytes_tail; uint64_t *payload = replay->payload; @@ -2486,9 +2486,9 @@ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, { if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_source); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_source); else - fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_source); + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_source); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); uint64_t * payload = replay->payload; diff --git a/prov/opx/include/rdma/opx/fi_opx_reliability.h b/prov/opx/include/rdma/opx/fi_opx_reliability.h index c43d13fe053..61046b9c12b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_reliability.h +++ b/prov/opx/include/rdma/opx/fi_opx_reliability.h @@ -284,10 +284,7 @@ struct fi_opx_reliability_tx_replay { /* == CACHE LINE == */ /* --- MUST BE 64 BYTE ALIGNED --- */ - union { - struct fi_opx_hfi1_txe_scb_9B scb_9B; - struct fi_opx_hfi1_txe_scb_16B scb_16B; - }; + union opx_hfi1_txe_scb_union scb; uint8_t data[]; } __attribute__((__aligned__(64))); @@ -295,11 +292,11 @@ struct fi_opx_reliability_tx_replay { #define OPX_REPLAY_HDR(_replay) OPX_REPLAY_HDR_TYPE(_replay, OPX_HFI1_TYPE) #define OPX_REPLAY_HDR_TYPE(_replay,_hfi1_type) ((_hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 
\ - (&((_replay)->scb_9B.hdr)) : (&((_replay)->scb_16B.hdr)) ) + (&((_replay)->scb.scb_9B.hdr)) : (&((_replay)->scb.scb_16B.hdr)) ) OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_reliability_tx_replay, sdma_we) == FI_OPX_CACHE_LINE_SIZE, "Reliability Replay sdma_we should start on first cacheline!"); -OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_reliability_tx_replay, scb_9B) & (FI_OPX_CACHE_LINE_SIZE - 1)) == 0, +OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_reliability_tx_replay, scb) & (FI_OPX_CACHE_LINE_SIZE - 1)) == 0, "Reliability Replay scb must be 64-byte aligned!"); struct fi_opx_reliability_resynch_flow { @@ -737,11 +734,11 @@ size_t fi_opx_reliability_replay_get_payload_size(struct fi_opx_reliability_tx_r /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - const uint16_t lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + const uint16_t lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); const size_t total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ return (total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); } else { - const uint16_t lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + const uint16_t lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; const size_t total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ return (total_bytes - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B)); } @@ -1217,16 +1214,16 @@ void fi_opx_reliability_client_replay_register_no_update (struct fi_opx_reliabil uint8_t hdr_rx; if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - hdr_dlid = 
replay->scb_9B.hdr.lrh_9B.dlid; + hdr_dlid = replay->scb.scb_9B.hdr.lrh_9B.dlid; /* hardcoded replay hfi type for macros */ hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)), hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)->bth.rx; } else { - lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ - hdr_dlid = htons(replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb_16B.hdr.lrh_16B.dlid); + hdr_dlid = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.dlid); /* hardcoded replay hfi type for macros */ hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)); hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)->bth.rx; @@ -1281,16 +1278,16 @@ void fi_opx_reliability_client_replay_register_with_update (struct fi_opx_reliab /* global note: runtime HFI1 type - may need macro/inlining/const parameter hfi1_type to be branchless */ if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ - hdr_dlid = replay->scb_9B.hdr.lrh_9B.dlid; + hdr_dlid = replay->scb.scb_9B.hdr.lrh_9B.dlid; /* hardcoded replay hfi type for macros */ hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)), hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR_9B)->bth.rx; } else { - lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ - hdr_dlid = htons(replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb_16B.hdr.lrh_16B.dlid); + hdr_dlid = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | 
replay->scb.scb_16B.hdr.lrh_16B.dlid); /* hardcoded replay hfi type for macros */ hdr_tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)); hdr_rx = OPX_REPLAY_HDR_TYPE(replay, OPX_HFI1_JKR)->bth.rx; diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 2988b220998..9c5d6003eac 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1179,19 +1179,19 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) // The "memcopy first" code is here as an alternative to the more complicated // direct write to pio followed by memory copy of the reliability buffer - replay->scb_9B.qw0 = opx_ep->rx->tx.cts_9B.qw0 | + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; - replay->scb_9B.hdr.qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | + replay->scb.scb_9B.hdr.qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t) lrh_dws << 32); - replay->scb_9B.hdr.qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; - replay->scb_9B.hdr.qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn; - replay->scb_9B.hdr.qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; - replay->scb_9B.hdr.qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | + replay->scb.scb_9B.hdr.qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | bth_rx; + replay->scb.scb_9B.hdr.qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2] | psn; + replay->scb.scb_9B.hdr.qw_9B[3] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[3]; + replay->scb.scb_9B.hdr.qw_9B[4] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[4] | ((uint64_t) params->tid_info.npairs << 32) | (params->niov << 48) | params->opcode; - replay->scb_9B.hdr.qw_9B[5] = params->origin_byte_counter_vaddr; - replay->scb_9B.hdr.qw_9B[6] = (uint64_t) params->rzv_comp; + replay->scb.scb_9B.hdr.qw_9B[5] = params->origin_byte_counter_vaddr; + replay->scb.scb_9B.hdr.qw_9B[6] = (uint64_t) params->rzv_comp; union fi_opx_hfi1_packet_payload *const tx_payload = (union 
fi_opx_hfi1_packet_payload *) replay->payload; @@ -1339,35 +1339,35 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) // The "memcopy first" code is here as an alternative to the more complicated // direct write to pio followed by memory copy of the reliability buffer - replay->scb_16B.qw0 = opx_ep->rx->tx.cts_16B.qw0 | + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.cts_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, OPX_HFI1_JKR); - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "replay->scb_16B.qw0 = %#lx pbc_dws = %ld\n", replay->scb_16B.qw0, pbc_dws); - replay->scb_16B.hdr.qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "replay->scb_16B.qw0 = %#lx pbc_dws = %ld\n", replay->scb.scb_16B.qw0, pbc_dws); + replay->scb.scb_16B.hdr.qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t) lrh_qws << 20); - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "lrh_qws = %d replay->scb_16B.hdr.lrh_16B.pktlen = %d\n", lrh_qws, replay->scb_16B.hdr.lrh_16B.pktlen); - replay->scb_16B.hdr.qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "lrh_qws = %d replay->scb_16B.hdr.lrh_16B.pktlen = %d\n", lrh_qws, replay->scb.scb_16B.hdr.lrh_16B.pktlen); + replay->scb.scb_16B.hdr.qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); - replay->scb_16B.hdr.qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; - replay->scb_16B.hdr.qw_16B[3] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn; - replay->scb_16B.hdr.qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; - replay->scb_16B.hdr.qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | + replay->scb.scb_16B.hdr.qw_16B[2] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[2] | bth_rx; + replay->scb.scb_16B.hdr.qw_16B[3] = 
opx_ep->rx->tx.cts_16B.hdr.qw_16B[3] | psn; + replay->scb.scb_16B.hdr.qw_16B[4] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[4]; + replay->scb.scb_16B.hdr.qw_16B[5] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[5] | ((uint64_t) params->tid_info.npairs << 32) | (params->niov << 48) | params->opcode; - replay->scb_16B.hdr.qw_16B[6] = params->origin_byte_counter_vaddr; + replay->scb.scb_16B.hdr.qw_16B[6] = params->origin_byte_counter_vaddr; - replay->scb_16B.hdr.qw_16B[7] = (uint64_t) params->rzv_comp; + replay->scb.scb_16B.hdr.qw_16B[7] = (uint64_t) params->rzv_comp; #ifndef NDEBUG if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - OPX_JKR_PRINT_16B_PBC(replay->scb_16B.qw0); - OPX_JKR_PRINT_16B_LRH(replay->scb_16B.hdr.qw_16B[0], replay->scb_16B.hdr.qw_16B[1]); - OPX_JKR_PRINT_16B_BTH(replay->scb_16B.hdr.qw_16B[2], replay->scb_16B.hdr.qw_16B[3]); + OPX_JKR_PRINT_16B_PBC(replay->scb.scb_16B.qw0); + OPX_JKR_PRINT_16B_LRH(replay->scb.scb_16B.hdr.qw_16B[0], replay->scb.scb_16B.hdr.qw_16B[1]); + OPX_JKR_PRINT_16B_BTH(replay->scb.scb_16B.hdr.qw_16B[2], replay->scb.scb_16B.hdr.qw_16B[3]); } else { abort(); - fi_opx_hfi1_dump_packet_hdr(&(replay->scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); + fi_opx_hfi1_dump_packet_hdr(&(replay->scb.scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); } #endif @@ -1392,12 +1392,12 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) } #ifndef NDEBUG if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - OPX_JKR_PRINT_16B_PBC(replay->scb_16B.qw0); - OPX_JKR_PRINT_16B_LRH(replay->scb_16B.hdr.qw_16B[0], replay->scb_16B.hdr.qw_16B[1]); - OPX_JKR_PRINT_16B_BTH(replay->scb_16B.hdr.qw_16B[2], replay->scb_16B.hdr.qw_16B[3]); + OPX_JKR_PRINT_16B_PBC(replay->scb.scb_16B.qw0); + OPX_JKR_PRINT_16B_LRH(replay->scb.scb_16B.hdr.qw_16B[0], replay->scb.scb_16B.hdr.qw_16B[1]); + OPX_JKR_PRINT_16B_BTH(replay->scb.scb_16B.hdr.qw_16B[2], replay->scb.scb_16B.hdr.qw_16B[3]); } else { abort(); - fi_opx_hfi1_dump_packet_hdr(&(replay->scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); + 
fi_opx_hfi1_dump_packet_hdr(&(replay->scb.scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); } #endif @@ -1871,7 +1871,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc(union fi_opx_hfi1_deferred_work *work) volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - fi_opx_store_and_copy_scb_9B(scb, &replay->scb_9B, + fi_opx_store_and_copy_scb_9B(scb, &replay->scb.scb_9B, opx_ep->rx->tx.cts_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid, opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t) lrh_dws << 32), @@ -1959,7 +1959,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - fi_opx_store_and_copy_scb_16B(scb, &replay->scb_16B, + fi_opx_store_and_copy_scb_16B(scb, &replay->scb.scb_16B, opx_ep->rx->tx.cts_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, OPX_HFI1_JKR), @@ -1982,7 +1982,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) volatile uint64_t * const scb2 = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - fi_opx_store_and_copy_qw(scb2, &replay->scb_16B.hdr.qw_16B[7], + fi_opx_store_and_copy_qw(scb2, &replay->scb.scb_16B.hdr.qw_16B[7], 0, 0, 0, 0, 0, 0, 0, 0); FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); @@ -2530,12 +2530,12 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); if (hfi1_type & OPX_HFI1_JKR) { - replay->scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | params->pbc_dlid; } else { - replay->scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | 
OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | params->pbc_dlid; @@ -2853,10 +2853,10 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) assert(replay != NULL); if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - replay->scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; } else { - replay->scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; } @@ -3285,10 +3285,10 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - replay->scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; } else { - replay->scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | params->pbc_dlid; } @@ -3789,7 +3789,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, total_len, tag); - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); } else { const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); fi_opx_store_and_copy_qw(scb, local_temp, @@ -3864,7 +3864,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz rem_payload_size = sizeof(struct fi_opx_hmem_iov) * (niov - 2); } else { local_temp[7] = local_temp_payload[0]; - fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); + 
fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_temp); fi_opx_copy_cacheline(replay_payload, &local_temp_payload[1]); replay_payload += 7; rem_payload_size = (sizeof(struct fi_opx_hmem_iov) * (niov - 2) + 8); // overflow 8 bytes from 2nd cacheline @@ -4190,7 +4190,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); /* * write the rendezvous payload "send control blocks" @@ -4656,7 +4656,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); tmp.hdr.qw_16B[7] = tag; - fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, (uint64_t *)&tmp.qw0); + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, (uint64_t *)&tmp.qw0); /* * write the rendezvous payload "send control blocks" diff --git a/prov/opx/src/fi_opx_hfi1_sdma.c b/prov/opx/src/fi_opx_hfi1_sdma.c index 6816ef9988b..6449d99f7bd 100644 --- a/prov/opx/src/fi_opx_hfi1_sdma.c +++ b/prov/opx/src/fi_opx_hfi1_sdma.c @@ -212,9 +212,9 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, } #endif fprintf(stderr, "(%d) [%d] PBC: %#16.16lX\n", - pid, req_num, header_vec->scb.qw0); + pid, req_num, header_vec->scb.scb_9B.qw0); - fi_opx_hfi1_dump_packet_hdr(&header_vec->scb.hdr, OPX_HFI1_TYPE, func, line); + fi_opx_hfi1_dump_packet_hdr(&header_vec->scb.scb_9B.hdr, OPX_HFI1_TYPE, func, line); fprintf(stderr, "(%d) [%d] req data iov=%p len=%lu\n", pid, req_num, iov_ptr[1].iov_base, iov_ptr[1].iov_len); @@ -246,7 +246,7 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, "(%d) [%d] ERROR: Request opcode is set to EXPECTED (TID), but TID IOV's length is < minimum!\n", pid, req_num); } - uint32_t kdeth = (uint32_t) (header_vec->scb.hdr.qw_9B[2] >> 32); + uint32_t kdeth = (uint32_t) (header_vec->scb.scb_9B.hdr.qw_9B[2] >> 32); 
uint32_t tidctrl = (kdeth >> FI_OPX_HFI1_KDETH_TIDCTRL_SHIFT) & FI_OPX_HFI1_KDETH_TIDCTRL; uint32_t tididx = (kdeth >> FI_OPX_HFI1_KDETH_TID_SHIFT) & FI_OPX_HFI1_KDETH_TID; uint32_t tidOMshift = (kdeth >> KDETH_OM_SHIFT) & KDETH_OM_MASK; diff --git a/prov/opx/src/fi_opx_msg.c b/prov/opx/src/fi_opx_msg.c index 6cc9c0a343c..0ec84a5a183 100644 --- a/prov/opx/src/fi_opx_msg.c +++ b/prov/opx/src/fi_opx_msg.c @@ -194,22 +194,22 @@ FI_OPX_MSG_SPECIALIZED_FUNC(FI_OPX_LOCK_REQUIRED, FI_AV_UNSPEC, 0x0010000000 #define FI_OPX_MSG_OPS_STRUCT_NAME(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) -#define FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ +#define FI_OPX_MSG_OPS_STRUCT_NAME_(LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE) \ fi_opx_ops_msg_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE #define FI_OPX_MSG_OPS_STRUCT(LOCK,AV,CAPS,RELIABILITY,HFI1_TYPE) \ -static struct fi_ops_msg \ +static struct fi_ops_msg \ FI_OPX_MSG_OPS_STRUCT_NAME(LOCK,AV,CAPS,RELIABILITY, HFI1_TYPE) __attribute__ ((unused)) = { \ - .size = sizeof(struct fi_ops_msg), \ + .size = sizeof(struct fi_ops_msg), \ .recv = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recv, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ - .recvv = fi_no_msg_recvv, \ + .recvv = fi_no_msg_recvv, \ .recvmsg = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(recvmsg, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .send = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(send, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ - .sendv = fi_opx_sendv, \ - .sendmsg = fi_opx_sendmsg, \ + .sendv = fi_opx_sendv, \ + .sendmsg = fi_opx_sendmsg, \ .inject = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(inject, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ .senddata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(senddata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ - .injectdata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE),\ + .injectdata = FI_OPX_MSG_SPECIALIZED_FUNC_NAME(injectdata, 
LOCK, AV, CAPS, RELIABILITY, HFI1_TYPE), \ } /* FI_LOCAL_COMM | FI_REMOTE_COMM = 0x0018000000000000ull */ diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index 44c8cfc59aa..31fa9960ed5 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -1195,10 +1195,10 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - lrh_pktlen_le = ntohs(tmp->scb_9B.hdr.lrh_9B.pktlen); + lrh_pktlen_le = ntohs(tmp->scb.scb_9B.hdr.lrh_9B.pktlen); total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ } else { - lrh_pktlen_le = tmp->scb_16B.hdr.lrh_16B.pktlen; + lrh_pktlen_le = tmp->scb.scb_16B.hdr.lrh_16B.pktlen; total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ } tmp->psn_ptr->psn.bytes_outstanding -= total_bytes; @@ -1346,10 +1346,10 @@ void fi_opx_hfi1_rx_reliability_ack (struct fid_ep *ep, size_t total_bytes; /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - lrh_pktlen_le = ntohs(tmp->scb_9B.hdr.lrh_9B.pktlen); + lrh_pktlen_le = ntohs(tmp->scb.scb_9B.hdr.lrh_9B.pktlen); total_bytes = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ } else { - lrh_pktlen_le = tmp->scb_16B.hdr.lrh_16B.pktlen; + lrh_pktlen_le = tmp->scb.scb_16B.hdr.lrh_16B.pktlen; total_bytes = (lrh_pktlen_le - 1) * 8; /* do not copy the trailing icrc */ } tmp->psn_ptr->psn.bytes_outstanding -= total_bytes; @@ -1464,12 +1464,12 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - key.slid = (uint32_t)start_replay->scb_9B.hdr.lrh_9B.slid; - key.dlid = 
(uint32_t)start_replay->scb_9B.hdr.lrh_9B.dlid; + key.slid = (uint32_t)start_replay->scb.scb_9B.hdr.lrh_9B.slid; + key.dlid = (uint32_t)start_replay->scb.scb_9B.hdr.lrh_9B.dlid; } else { - key.slid = htons(start_replay->scb_16B.hdr.lrh_16B.slid20 << 20 | start_replay->scb_16B.hdr.lrh_16B.slid); - key.dlid = htons(start_replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | start_replay->scb_16B.hdr.lrh_16B.dlid); + key.slid = htons(start_replay->scb.scb_16B.hdr.lrh_16B.slid20 << 20 | start_replay->scb.scb_16B.hdr.lrh_16B.slid); + key.dlid = htons(start_replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | start_replay->scb.scb_16B.hdr.lrh_16B.dlid); } key.tx = (uint32_t)(OPX_REPLAY_HDR(start_replay)->reliability.origin_tx); key.rx = (uint32_t)(OPX_REPLAY_HDR(start_replay)->bth.rx); @@ -1556,11 +1556,11 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - key.slid = (uint32_t)replay->scb_9B.hdr.lrh_9B.slid; - key.dlid = (uint32_t)replay->scb_9B.hdr.lrh_9B.dlid; + key.slid = (uint32_t)replay->scb.scb_9B.hdr.lrh_9B.slid; + key.dlid = (uint32_t)replay->scb.scb_9B.hdr.lrh_9B.dlid; } else { - key.slid = htons(replay->scb_16B.hdr.lrh_16B.slid20 << 20 | replay->scb_16B.hdr.lrh_16B.slid); - key.dlid = htons(replay->scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb_16B.hdr.lrh_16B.dlid); + key.slid = htons(replay->scb.scb_16B.hdr.lrh_16B.slid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.slid); + key.dlid = htons(replay->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | replay->scb.scb_16B.hdr.lrh_16B.dlid); } key.tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR(replay)); key.rx = (uint32_t)(OPX_REPLAY_HDR(replay)->bth.rx); @@ -1576,12 +1576,12 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service uint16_t payload_credits_needed; int payload_qw_to_copy_with_header = 0; if (hfi1_type & 
(OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - lrh_pktlen_le = ntohs(replay->scb_9B.hdr.lrh_9B.pktlen); + lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B); payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload */ } else { - lrh_pktlen_le = replay->scb_16B.hdr.lrh_16B.pktlen; + lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; total_bytes_to_copy = (lrh_pktlen_le) * 8; /* including trailing icrc */ payload_bytes_to_copy = (total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B)); payload_qw_to_copy_with_header = MIN((7*8), payload_bytes_to_copy)>>3; /* up to 7 qwords */ @@ -1658,14 +1658,14 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_sop_first, pio_state); if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - OPX_HFI1_BAR_STORE(&scb[0], replay->scb_9B.qw0); - OPX_HFI1_BAR_STORE(&scb[1], replay->scb_9B.hdr.qw_9B[0]); - OPX_HFI1_BAR_STORE(&scb[2], replay->scb_9B.hdr.qw_9B[1]); - OPX_HFI1_BAR_STORE(&scb[3], replay->scb_9B.hdr.qw_9B[2]); - OPX_HFI1_BAR_STORE(&scb[4], replay->scb_9B.hdr.qw_9B[3]); - OPX_HFI1_BAR_STORE(&scb[5], replay->scb_9B.hdr.qw_9B[4]); - OPX_HFI1_BAR_STORE(&scb[6], replay->scb_9B.hdr.qw_9B[5]); - OPX_HFI1_BAR_STORE(&scb[7], replay->scb_9B.hdr.qw_9B[6]); + OPX_HFI1_BAR_STORE(&scb[0], replay->scb.scb_9B.qw0); + OPX_HFI1_BAR_STORE(&scb[1], replay->scb.scb_9B.hdr.qw_9B[0]); + OPX_HFI1_BAR_STORE(&scb[2], replay->scb.scb_9B.hdr.qw_9B[1]); + OPX_HFI1_BAR_STORE(&scb[3], replay->scb.scb_9B.hdr.qw_9B[2]); + OPX_HFI1_BAR_STORE(&scb[4], replay->scb.scb_9B.hdr.qw_9B[3]); + OPX_HFI1_BAR_STORE(&scb[5], replay->scb.scb_9B.hdr.qw_9B[4]); + OPX_HFI1_BAR_STORE(&scb[6], replay->scb.scb_9B.hdr.qw_9B[5]); + 
OPX_HFI1_BAR_STORE(&scb[7], replay->scb.scb_9B.hdr.qw_9B[6]); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); @@ -1677,14 +1677,14 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service consumed_credits = 1; #endif } else { - OPX_HFI1_BAR_STORE(&scb[0], replay->scb_16B.qw0); - OPX_HFI1_BAR_STORE(&scb[1], replay->scb_16B.hdr.qw_16B[0]); - OPX_HFI1_BAR_STORE(&scb[2], replay->scb_16B.hdr.qw_16B[1]); - OPX_HFI1_BAR_STORE(&scb[3], replay->scb_16B.hdr.qw_16B[2]); - OPX_HFI1_BAR_STORE(&scb[4], replay->scb_16B.hdr.qw_16B[3]); - OPX_HFI1_BAR_STORE(&scb[5], replay->scb_16B.hdr.qw_16B[4]); - OPX_HFI1_BAR_STORE(&scb[6], replay->scb_16B.hdr.qw_16B[5]); - OPX_HFI1_BAR_STORE(&scb[7], replay->scb_16B.hdr.qw_16B[6]); + OPX_HFI1_BAR_STORE(&scb[0], replay->scb.scb_16B.qw0); + OPX_HFI1_BAR_STORE(&scb[1], replay->scb.scb_16B.hdr.qw_16B[0]); + OPX_HFI1_BAR_STORE(&scb[2], replay->scb.scb_16B.hdr.qw_16B[1]); + OPX_HFI1_BAR_STORE(&scb[3], replay->scb.scb_16B.hdr.qw_16B[2]); + OPX_HFI1_BAR_STORE(&scb[4], replay->scb.scb_16B.hdr.qw_16B[3]); + OPX_HFI1_BAR_STORE(&scb[5], replay->scb.scb_16B.hdr.qw_16B[4]); + OPX_HFI1_BAR_STORE(&scb[6], replay->scb.scb_16B.hdr.qw_16B[5]); + OPX_HFI1_BAR_STORE(&scb[7], replay->scb.scb_16B.hdr.qw_16B[6]); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); @@ -1696,7 +1696,7 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); // spill from 1st cacheline (SOP) - OPX_HFI1_BAR_STORE(&scb_payload[0], replay->scb_16B.hdr.qw_16B[7]); // header + OPX_HFI1_BAR_STORE(&scb_payload[0], replay->scb.scb_16B.hdr.qw_16B[7]); // header int i; @@ -2123,9 +2123,9 @@ ssize_t fi_opx_reliability_send_ping(struct fid_ep *ep, uint64_t dlid; /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - dlid = 
(uint64_t) head->scb_9B.hdr.lrh_9B.dlid; + dlid = (uint64_t) head->scb.scb_9B.hdr.lrh_9B.dlid; } else { - dlid = (uint64_t) htons(head->scb_16B.hdr.lrh_16B.dlid20 << 20 | head->scb_16B.hdr.lrh_16B.dlid); + dlid = (uint64_t) htons(head->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | head->scb.scb_16B.hdr.lrh_16B.dlid); } const uint64_t rx = (uint64_t)head->target_reliability_rx; diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 3850c7f822d..df88b3f0d9f 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -208,7 +208,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - fi_opx_copy_hdr9B_cacheline(&replay->scb_9B, local_temp); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); /* write the CTS payload "send control block" */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); @@ -266,7 +266,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); local_temp[8] = temp[0]; - fi_opx_copy_hdr16B_cacheline(&replay->scb_16B, local_temp); + fi_opx_copy_hdr16B_cacheline(&replay->scb.scb_16B, local_temp); replay->payload[0] = temp[1]; replay->payload[1] = temp[2]; From 0c1002e8aaf7f9dc2e9b4baa985078fbe02a24e4 Mon Sep 17 00:00:00 2001 From: Archana Venkatesha Date: Mon, 16 Sep 2024 09:17:30 -0400 Subject: [PATCH 148/393] prov/opx: Link bounce support for OPX WFR This commit adds support for link up/down events in OPX for WFR platforms. 
Signed-off-by: Archana Venkatesha --- prov/opx/include/opa_user_gen1.h | 5 +- prov/opx/include/rdma/opx/fi_opx_hfi1.h | 66 +++++++ .../include/rdma/opx/fi_opx_hfi1_progress.h | 161 +++++++++++------- .../include/rdma/opx/fi_opx_hfi1_transport.h | 1 - prov/opx/src/fi_opx_hfi1.c | 27 ++- prov/opx/src/fi_opx_hfi1_sdma.c | 21 ++- prov/opx/src/fi_opx_service.c | 61 +++++++ 7 files changed, 275 insertions(+), 67 deletions(-) diff --git a/prov/opx/include/opa_user_gen1.h b/prov/opx/include/opa_user_gen1.h index d09f015f866..99b4c141146 100644 --- a/prov/opx/include/opa_user_gen1.h +++ b/prov/opx/include/opa_user_gen1.h @@ -329,7 +329,10 @@ int opx_hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits); int opx_hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type); /* reset halted send context, error if context is not halted. */ -int opx_hfi_reset_context(struct _hfi_ctrl *ctrl); +int opx_hfi_reset_context(int fd); + +/* ack hfi events */ +int opx_hfi_ack_events(int fd, uint64_t ackbits); /* * Safe version of opx_hfi_[d/q]wordcpy that is guaranteed to only copy each byte once. 
diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index 405b2f178b9..aa202ad6bf8 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -52,6 +52,7 @@ #include "rdma/opx/opx_hfi1_sim.h" #include "rdma/opx/fi_opx_hfi1_version.h" +#include "rdma/opx/fi_opx_timer.h" // #define FI_OPX_TRACE 1 @@ -526,6 +527,11 @@ struct fi_opx_hfi1_context { } daos_info; int64_t ref_cnt; + size_t status_lasterr; + time_t network_lost_time; + union fi_opx_timer_stamp link_status_timestamp; + union fi_opx_timer_state link_status_timer; + uint64_t status_check_next_usec; }; struct fi_opx_hfi1_context_internal { @@ -748,4 +754,64 @@ void opx_print_context(struct fi_opx_hfi1_context *context) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context ref_cnt %#lX \n",context->ref_cnt); } +void opx_reset_context(struct fi_opx_ep * opx_ep); + +#define OPX_CONTEXT_STATUS_CHECK_INTERVAL_USEC 250000 /* 250 ms*/ + +__OPX_FORCE_INLINE__ +uint64_t opx_get_hw_status(struct fi_opx_hfi1_context *context) +{ + struct hfi1_status *status = + (struct hfi1_status *) context->ctrl->base_info.status_bufbase; + + return((status->dev & (HFI1_STATUS_INITTED | HFI1_STATUS_CHIP_PRESENT | HFI1_STATUS_HWERROR)) + | (status->port & (HFI1_STATUS_IB_READY | HFI1_STATUS_IB_CONF))); +} + +#define OPX_HFI1_HW_CHIP_STATUS (HFI1_STATUS_CHIP_PRESENT | HFI1_STATUS_INITTED) +#define OPX_HFI1_IB_STATUS (HFI1_STATUS_IB_CONF | HFI1_STATUS_IB_READY) + +/* The linkup time duration for a system should allow the time needed + to complete 3 LNI passes which is: + 50 seconds for a passive copper channel + 65 seconds for optical channel. + (we add 5 seconds of margin.) 
*/ +#define OPX_LINK_DOWN_MAX_SEC 70.0 + +__OPX_FORCE_INLINE__ +size_t fi_opx_context_check_status(struct fi_opx_hfi1_context *context) +{ + size_t err = FI_SUCCESS; + uint64_t status = opx_get_hw_status(context); + + /* Fatal chip-related errors */ + if (!((status & OPX_HFI1_HW_CHIP_STATUS) == OPX_HFI1_HW_CHIP_STATUS) || + (status & HFI1_STATUS_HWERROR)) { + err = FI_ENETUNREACH; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_CTRL, "HFI1 chip error detected\n"); + abort(); + return(err); + } else if (!((status & OPX_HFI1_IB_STATUS) == OPX_HFI1_IB_STATUS)) { + err = FI_ENETDOWN; + if (err != context->status_lasterr) { + context->network_lost_time = time(NULL); + } else { + time_t now = time(NULL); + + if (difftime(now,context->network_lost_time) > OPX_LINK_DOWN_MAX_SEC) + { + fprintf(stderr, "Link has been down more than 70s. Aborting\n"); + abort(); + return(err); + } + } + } + + if (err != FI_SUCCESS) { + context->status_lasterr = err; /* record error */ + } + + return err; +} + #endif /* _FI_PROV_OPX_HFI1_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h index 9e445a4e415..5258bc47587 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h @@ -760,7 +760,78 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, } } +__OPX_FORCE_INLINE__ +void fi_opx_hfi1_poll_sdma_completion(struct fi_opx_ep *opx_ep) +{ + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SDMA POLL BEGIN\n"); + struct fi_opx_hfi1_context *hfi = opx_ep->hfi; + uint16_t queue_size = hfi->info.sdma.queue_size; + while (hfi->info.sdma.available_counter < queue_size) { + volatile struct hfi1_sdma_comp_entry * entry = + &hfi->info.sdma.completion_queue[hfi->info.sdma.done_index]; + if (entry->status == QUEUED) { + break; + } + + // Update the status/errcode of the work entry who was using this index + 
assert(hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]); + hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]->status = entry->status; + OPX_TRACER_TRACE_SDMA(OPX_TRACER_END_SUCCESS, "SDMA_COMPLETE_%hu", hfi->info.sdma.done_index); + hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]->errcode = entry->errcode; + hfi->info.sdma.queued_entries[hfi->info.sdma.done_index] = NULL; + + assert(entry->status == COMPLETE || entry->status == FREE || + (entry->status == ERROR && entry->errcode != ECOMM)); // If it is a network error, retry + ++hfi->info.sdma.available_counter; + hfi->info.sdma.done_index = (hfi->info.sdma.done_index + 1) % (queue_size); + if (hfi->info.sdma.done_index == hfi->info.sdma.fill_index) { + assert(hfi->info.sdma.available_counter == queue_size); + } + } + assert(hfi->info.sdma.available_counter >= opx_ep->tx->sdma_request_queue.slots_avail); + opx_ep->tx->sdma_request_queue.slots_avail = hfi->info.sdma.available_counter; + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SDMA POLL COMPLETE\n"); +} + +__OPX_FORCE_INLINE__ +int opx_is_rhf_empty(struct fi_opx_ep *opx_ep, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + const uint64_t local_hdrq_mask = (hdrq_mask == FI_OPX_HDRQ_MASK_RUNTIME) ? 
+ opx_ep->hfi->info.rxe.hdrq.rx_poll_mask : + hdrq_mask; + const uint64_t hdrq_offset = opx_ep->rx->state.hdrq.head & local_hdrq_mask; + volatile uint32_t *rhf_ptr = opx_ep->rx->hdrq.rhf_base + hdrq_offset; + const uint64_t rhf_rcvd = *((volatile uint64_t *)rhf_ptr); + const uint64_t rhf_seq = opx_ep->rx->state.hdrq.rhf_seq; + + if (!OPX_RHF_SEQ_MATCH(rhf_seq, rhf_rcvd, hfi1_type)) { + return 1; + } + return 0; +} + +__OPX_FORCE_INLINE__ +void opx_handle_events(struct fi_opx_ep *opx_ep, + const uint64_t hdrq_mask, + const enum opx_hfi1_type hfi1_type) +{ + uint64_t events = *(uint64_t *)(opx_ep->hfi->ctrl->base_info.events_bufbase); + if (events & HFI1_EVENT_FROZEN) { + /* reset context only if RHF queue is empty */ + if (opx_is_rhf_empty(opx_ep, hdrq_mask, hfi1_type)) { + opx_reset_context(opx_ep); + opx_hfi_ack_events(opx_ep->hfi->fd, events); + } else { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "Context frozen: Not resetting because packets are present in receive queue\n"); + } + } +} __OPX_FORCE_INLINE__ void fi_opx_hfi1_poll_many (struct fid_ep *ep, @@ -789,28 +860,36 @@ void fi_opx_hfi1_poll_many (struct fid_ep *ep, packets = fi_opx_hfi1_poll_once(ep, FI_OPX_LOCK_NOT_REQUIRED, reliability, hdrq_mask, hfi1_type); } while ((packets > 0) && (hfi1_poll_count++ < hfi1_poll_max)); - - - if (reliability == OFI_RELIABILITY_KIND_ONLOAD) { /* compile-time constant expression */ - - struct fi_opx_reliability_service *service = opx_ep->reliability->state.service; - - union fi_opx_timer_state *timer = &service->tx.timer; - union fi_opx_timer_stamp *timestamp = &service->tx.timestamp; - uint64_t compare = fi_opx_timer_now(timestamp, timer); - - //TODO: There needs to be feedback from the replay buffer pool into this following if as well - // If the pool is getting full, then send pings out more frequently - - if (OFI_UNLIKELY(compare > service->usec_next)) { - // Drain all coalesced pings - fi_opx_hfi_rx_reliablity_process_requests(ep, 
PENDING_RX_RELIABLITY_COUNT_MAX); - fi_reliability_service_ping_remote(ep, service); - // Fetch the timer again as it could have taken us a while to get through reliability - fi_opx_timer_now(timestamp, timer); - service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); - }// End timer fired - + struct fi_opx_reliability_service *service = &opx_ep->reliability->service; + union fi_opx_timer_state *timer = &service->tx.timer; + union fi_opx_timer_stamp *timestamp = &service->tx.timestamp; + uint64_t compare = fi_opx_timer_now(timestamp, timer); + + //TODO: There needs to be feedback from the replay buffer pool into this following if as well + // If the pool is getting full, then send pings out more frequently + + if (OFI_UNLIKELY(compare > service->usec_next)) { + // Drain all coalesced pings + fi_opx_hfi_rx_reliablity_process_requests(ep, PENDING_RX_RELIABLITY_COUNT_MAX); + fi_reliability_service_ping_remote(ep, service); + // Fetch the timer again as it could have taken us a while to get through reliability + compare = fi_opx_timer_now(timestamp, timer); + service->usec_next = fi_opx_timer_next_event_usec(timer, timestamp, service->usec_max); + } // End timer fired + + struct fi_opx_hfi1_context *context = opx_ep->hfi; + timer = &context->link_status_timer; + timestamp = &context->link_status_timestamp; + + if (OFI_UNLIKELY(compare > context->status_check_next_usec)) { + int prev_link_status = context->status_lasterr; + int err = fi_opx_context_check_status(context); + // check for hfi event if link is moving from down to up + if ((prev_link_status != FI_SUCCESS) && (err == FI_SUCCESS)) { // check for hfi event if + context->status_lasterr = FI_SUCCESS; /* clear error */ + opx_handle_events(opx_ep, hdrq_mask, hfi1_type); + } + context->status_check_next_usec = fi_opx_timer_next_event_usec(timer, timestamp, OPX_CONTEXT_STATUS_CHECK_INTERVAL_USEC); } } @@ -820,42 +899,4 @@ void fi_opx_hfi1_poll_many (struct fid_ep *ep, return; } 
-__OPX_FORCE_INLINE__ -void fi_opx_hfi1_poll_sdma_completion(struct fi_opx_ep *opx_ep) -{ - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SDMA POLL BEGIN\n"); - struct fi_opx_hfi1_context *hfi = opx_ep->hfi; - uint16_t queue_size = hfi->info.sdma.queue_size; - - while (hfi->info.sdma.available_counter < queue_size) { - volatile struct hfi1_sdma_comp_entry * entry = - &hfi->info.sdma.completion_queue[hfi->info.sdma.done_index]; - if (entry->status == QUEUED) { - break; - } - - // Update the status/errcode of the work entry who was using this index - assert(hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]); - hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]->status = entry->status; - OPX_TRACER_TRACE_SDMA(OPX_TRACER_END_SUCCESS, "SDMA_COMPLETE_%hu", hfi->info.sdma.done_index); - hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]->errcode = entry->errcode; - hfi->info.sdma.queued_entries[hfi->info.sdma.done_index] = NULL; - - assert(entry->status == COMPLETE || entry->status == FREE); - ++hfi->info.sdma.available_counter; - hfi->info.sdma.done_index = (hfi->info.sdma.done_index + 1) % (queue_size); - if (hfi->info.sdma.done_index == hfi->info.sdma.fill_index) { - assert(hfi->info.sdma.available_counter == queue_size); - } - } - assert(hfi->info.sdma.available_counter >= opx_ep->tx->sdma_request_queue.slots_avail); - opx_ep->tx->sdma_request_queue.slots_avail = hfi->info.sdma.available_counter; - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SDMA POLL COMPLETE\n"); -} - - - - #endif /* _FI_PROV_OPX_HFI1_PROGRESS_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index c8025713ec4..834a94d25e2 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -952,7 +952,6 @@ void fi_opx_force_credit_return(struct fid_ep *ep, while 
(OFI_UNLIKELY(available_credits < credits_needed)) { if (loop++ & 0x10) { opx_ep->tx->pio_state->qw0 = pio_state.qw0; - return; } FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 9c5d6003eac..59371385b89 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -159,6 +159,29 @@ static int opx_open_hfi_and_context(struct _hfi_ctrl **ctrl, return fd; } +void opx_reset_context(struct fi_opx_ep * opx_ep) +{ + fi_opx_compiler_msync_writes(); + opx_ep->rx->state.hdrq.rhf_seq = OPX_RHF_SEQ_INIT_VAL(OPX_HFI1_TYPE); + opx_ep->rx->state.hdrq.head = 0; + + if (opx_hfi_reset_context(opx_ep->hfi->fd)) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Send context reset failed: %d.\n", + errno); + abort(); + } + + opx_ep->tx->pio_state->fill_counter = 0; + opx_ep->tx->pio_state->scb_head_index = 0; + union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; + FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); + opx_ep->tx->pio_state->qw0 = pio_state.qw0; + + fi_opx_hfi1_poll_sdma_completion(opx_ep); + opx_hfi1_sdma_process_pending(opx_ep); +} + + static int fi_opx_get_daos_hfi_rank_inst(const uint8_t hfi_unit_number, const uint32_t rank) { struct fi_opx_daos_hfi_rank_key key; @@ -895,6 +918,9 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "Context configured with HFI=%d PORT=%d LID=0x%x JKEY=%d\n", context->hfi_unit, context->hfi_port, context->lid, context->jkey); + context->status_lasterr = 0; + context->status_check_next_usec = fi_opx_timer_now(&context->link_status_timestamp, &context->link_status_timer); + opx_print_context(context); return context; @@ -3714,7 +3740,6 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz &opx_ep->tx->force_credit_return, total_credits_needed); if (total_credits_available < total_credits_needed) { 
opx_ep->tx->pio_state->qw0 = pio_state.qw0; - return -FI_EAGAIN; } } diff --git a/prov/opx/src/fi_opx_hfi1_sdma.c b/prov/opx/src/fi_opx_hfi1_sdma.c index 6449d99f7bd..7d4b100a144 100644 --- a/prov/opx/src/fi_opx_hfi1_sdma.c +++ b/prov/opx/src/fi_opx_hfi1_sdma.c @@ -94,7 +94,7 @@ int fi_opx_hfi1_dput_sdma_pending_completion(union fi_opx_hfi1_deferred_work *wo FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; } - assert(we->comp_state == OPX_SDMA_COMP_COMPLETE); + assert(we->comp_state == OPX_SDMA_COMP_COMPLETE || we->comp_state == OPX_SDMA_COMP_ERROR); slist_remove_head(¶ms->sdma_reqs); we->next = NULL; @@ -143,6 +143,14 @@ void fi_opx_hfi1_sdma_handle_errors(struct fi_opx_ep *opx_ep, { const pid_t pid = getpid(); + if (errno == ECOMM || errno == EINTR) { + int err = fi_opx_context_check_status(opx_ep->hfi); + if (err != FI_SUCCESS) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Link down detected\n"); + return; + } + } + fprintf(stderr, "(%d) %s:%s():%d ERROR: SDMA Abort errno=%d (%s)\n", pid, file, func, line, errno, strerror(errno)); fprintf(stderr, "(%d) ===================================== SDMA_WE -- " @@ -315,7 +323,7 @@ void opx_hfi1_sdma_process_pending(struct fi_opx_ep *opx_ep) } __OPX_FORCE_INLINE__ -void opx_hfi1_sdma_writev(struct fi_opx_ep *opx_ep, +int opx_hfi1_sdma_writev(struct fi_opx_ep *opx_ep, struct iovec *iovecs, int iovs_used, uint16_t avail, @@ -356,6 +364,7 @@ void opx_hfi1_sdma_writev(struct fi_opx_ep *opx_ep, } FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.writev_calls[iovs_used]); + return(writev_rc); } void opx_hfi1_sdma_process_requests(struct fi_opx_ep *opx_ep) @@ -390,9 +399,13 @@ void opx_hfi1_sdma_process_requests(struct fi_opx_ep *opx_ep) if (iovs_free < request->num_iovs) #endif { - opx_hfi1_sdma_writev(opx_ep, iovecs, iovs_used, avail, + int err = opx_hfi1_sdma_writev(opx_ep, iovecs, iovs_used, avail, fill_index, __FILE__, __func__, __LINE__); - + if (err < 0) { + /* Error occured in 
writev. Add the request back to queue */ + slist_insert_head((struct slist_entry *)request, &queue->list); + return; + } iovs_used = 0; iovs_free = OPX_SDMA_HFI_MAX_IOVS_PER_WRITE; } diff --git a/prov/opx/src/fi_opx_service.c b/prov/opx/src/fi_opx_service.c index 84aa5279c17..30c6341bb4e 100644 --- a/prov/opx/src/fi_opx_service.c +++ b/prov/opx/src/fi_opx_service.c @@ -856,3 +856,64 @@ int opx_hfi_get_hfi1_count() { } return hfi1_count; } + +/** + * @brief Reset the HFI context. + * + * This function resets the HFI context by sending a command to the specified file descriptor. + * The command type is set to OPX_HFI_CMD_CTXT_RESET and the command length and address are set to 0. + * If the command write fails, the function will retry if the error is ENOLCK. + * If the error is not EINVAL, a warning message will be printed. + * + * @param fd The file descriptor to send the command to. + * @return 0 on success, -1 on failure. + */ +int opx_hfi_reset_context(int fd) +{ + struct hfi1_cmd cmd; + + cmd.type = OPX_HFI_CMD_CTXT_RESET; + cmd.len = 0; + cmd.addr = 0; + +retry: + if (opx_hfi_cmd_write(fd, &cmd, sizeof(cmd)) == -1) { + if (errno == ENOLCK) + goto retry; + + if (errno != EINVAL) + _HFI_INFO("reset ctxt failed: %s\n", strerror(errno)); + + return -1; + } + return 0; +} + +/** + * @brief Acknowledge events for the HFI. + * + * This function sends an acknowledgment for events to the HFI. + * + * @param fd The file descriptor for the HFI control. + * @param ackbits The bits to be acknowledged. + * @return 0 on success, -1 on failure. 
+ */ +int opx_hfi_ack_events(int fd, uint64_t ackbits) +{ + struct hfi1_cmd cmd; + + cmd.type = OPX_HFI_CMD_ACK_EVENT; + cmd.len = 0; + cmd.addr = ackbits; + +retry: + if (opx_hfi_cmd_write(fd, &cmd, sizeof(cmd)) == -1) { + if (errno == ENOLCK) + goto retry; + + if (errno != EINVAL) + _HFI_INFO("ack event failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} From 4b78fc2a70a0ba85ad7f55490d3719332fc896eb Mon Sep 17 00:00:00 2001 From: Mike Wilkins Date: Mon, 16 Sep 2024 14:59:00 -0500 Subject: [PATCH 149/393] man: Document OPX max ping envvars Signed-off-by: Mike Wilkins --- man/fi_opx.7.md | 12 ++++++++++++ man/man7/fi_opx.7 | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md index 90b63dc5eb9..6810fa9d11a 100644 --- a/man/fi_opx.7.md +++ b/man/fi_opx.7.md @@ -134,6 +134,18 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL. Default setting is 64. +*FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS* +: Integer. This setting controls how many PING requests the reliability/replay + function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX in situations + with less contending outgoing traffic from the HFI. + Default setting is 128. Range of valid values is 1-65535. + +*FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS* +: Integer. This setting controls how many PING requests the reliability/replay + function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX in situations + with more contending, outgoing traffic from the HFI. + Default setting is 4. Range of valid values is 1-65535. + *FI_OPX_SELINUX* : Boolean (0/1, on/off, true/false, yes/no). Set to true if you're running a security-enhanced Linux. This enables updating the Jkey used based on system diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index 2a5fbf57c42..3481361d6e2 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -149,6 +149,22 @@ inclusive. .PP Default setting is 64. 
.TP +\f[I]FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS\f[R] +Integer. +This setting controls how many PING requests the reliability/replay +function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX +in situations with less contending, outgoing traffic from the HFI. +.PP +Default setting is 128. Range of valid values is 1-65535. +.TP +\f[I]FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS\f[R] +Integer. +This setting controls how many PING requests the reliability/replay +function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX +in situations with more contending, outgoing traffic from the HFI. +.PP +Default setting is 4. Range of valid values is 1-65535. +.TP \f[I]FI_OPX_SELINUX\f[R] Boolean (0/1, on/off, true/false, yes/no). Set to true if you\[cq]re running a security-enhanced Linux. From ed72c6e5a8a31da38034b0a033bbd1c40340508d Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Tue, 17 Sep 2024 07:51:30 -0500 Subject: [PATCH 150/393] prov/opx: 16B SDMA header support Signed-off-by: Bob Cernohous --- prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h | 18 +-- .../include/rdma/opx/fi_opx_hfi1_transport.h | 50 +++--- prov/opx/include/rdma/opx/fi_opx_rma.h | 10 +- prov/opx/src/fi_opx_ep.c | 17 +- prov/opx/src/fi_opx_hfi1.c | 149 +++++++++--------- prov/opx/src/fi_opx_reliability.c | 21 +-- 6 files changed, 127 insertions(+), 138 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index b18ea2f9095..d39f59993b0 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -514,9 +514,6 @@ int opx_hfi1_sdma_enqueue_request(struct fi_opx_ep *opx_ep, /* Set the Acknowledge Request Bit if we're only sending one packet */ uint64_t set_ack_bit = (num_packets == 1) ? 
(uint64_t)htonl(0x80000000) : 0; - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - - request->iovecs[0].iov_base = req_info; if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { @@ -553,8 +550,6 @@ int opx_hfi1_sdma_enqueue_replay(struct fi_opx_ep *opx_ep, assert(replay->use_iov); assert(replay->iov->iov_len == payload_bytes); - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.replay_requests); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing replay for SDMA Send\n"); @@ -601,12 +596,15 @@ uint16_t opx_hfi1_sdma_register_replays(struct fi_opx_ep *opx_ep, we->dlid, we->rx, we->rs, &we->psn_ptr, we->num_packets); - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - uint32_t fragsize = 0; for (int i = 0; i < we->num_packets; ++i) { fragsize = MAX(fragsize, we->packets[i].length); - we->packets[i].replay->scb.scb_9B.hdr.qw_9B[2] |= (uint64_t)htonl((uint32_t)psn); + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + we->packets[i].replay->scb.scb_9B.hdr.qw_9B[2] |= (uint64_t)htonl((uint32_t)psn); + } else { + we->packets[i].replay->scb.scb_16B.hdr.qw_16B[3] |= (uint64_t)htonl((uint32_t)psn); + } + we->packets[i].replay->sdma_we_use_count = we->bounce_buf.use_count; we->packets[i].replay->sdma_we = replay_back_ptr; we->packets[i].replay->hmem_iface = we->hmem.iface; @@ -635,8 +633,6 @@ void opx_hfi1_sdma_enqueue_dput(struct fi_opx_ep *opx_ep, .iov_len = (we->total_payload + 3) & -4 }; - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.nontid_requests); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== Enqueuing non-tid request for SDMA Send\n"); @@ -700,8 +696,6 @@ void opx_hfi1_sdma_enqueue_dput_tid(struct fi_opx_ep *opx_ep, } }; - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.tid_requests); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, 
"===================================== Enqueuing tid request for SDMA Send\n"); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 834a94d25e2..ea1fa89189f 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -926,8 +926,8 @@ void fi_opx_force_credit_return(struct fid_ep *ep, const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); const uint64_t pbc_dws = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 16 : 20; - const uint16_t lrh_dws = htons(pbc_dws-1); - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ const uint64_t force_credit_return = OPX_PBC_CR(0x1, hfi1_type); @@ -1482,8 +1482,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ ((total_credits_needed-1) << 4); - /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ struct iovec *iov_ptr = (struct iovec *) iov; size_t *niov_ptr = &niov; @@ -1795,12 +1794,17 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, 2 + /* kdeth9 remaining 2 dws */ //--------------------- header split point KDETH 9 DWS (payload_qws_total << 1) + /* one packet payload */ - 2 ; /* tail 1 qws/2 dws */ + 2 ; /* ICRC/tail 1 qws/2 dws */ + + /* Descriptive code above, but for reference most code just has: */ + /* 9 + kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + /* 2; ICRC/tail 
*/ + const uint16_t total_credits_needed = (pbc_dws + 15 ) >> 4; /* round up to full blocks */ /* 16B LRH is qws */ - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ struct iovec *iov_ptr = (struct iovec *) iov; size_t *niov_ptr = &niov; @@ -2037,7 +2041,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (payload_qws_total << 1); - const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, SHM -- EAGER (begin)\n"); @@ -2138,15 +2142,13 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, const uint64_t pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ - 3 + /* kdeth */ - 4 + /* software kdeth + unused */ - 2 + /* second cacheline */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ ((payload_qws_total) << 1) + - 2; //ICRC + Tail + 2; /* ICRC/tail */ - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND 16B, SHM -- EAGER (begin)\n"); @@ -2544,15 +2546,13 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, 2 + /* pbc */ 2 + /* lhr */ 3 + /* bth */ - 3 + /* kdeth */ - 6 + /* software kdeth */ - //--------------------- header split point KDETH 9 DWS + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ /* PIO is everything else */ (payload_qws_total << 1); /* one packet payload */ /* 9B LRH is dws */ - const uint16_t lrh_dws = htons(pbc_dws - 
2 + 1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ assert(lock_required == 0); @@ -2709,7 +2709,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, (tail_qws_total << 1) ; /* tail 1 qws/2 dws */ /* 16B LRH is qws */ - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ assert(lock_required == 0); @@ -3477,7 +3477,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last (struct fi_opx_ep *opx_ep, const uint64_t pbc_dws = 16 + /* pbc + packet header */ (payload_qws_total << 1); - const uint16_t lrh_dws = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; @@ -3617,7 +3617,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, (payload_qws_total << 1) + /* one packet payload */ (tail_qws_total << 1) ; /* tail 1 qws/2 dws */ - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* does not include pbc (8 bytes) */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ uint16_t total_credits_needed = 1 + /* PIO SOP -- 1 credit */ @@ -3745,7 +3745,7 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (0 << 4); - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ hdr->qw_9B[0] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[1] | 
bth_rx; hdr->qw_9B[2] = opx_ep->rx->tx.cts_9B.hdr.qw_9B[2]; @@ -3755,11 +3755,11 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, hdr->qw_9B[6] = bytes_to_sync; } else { const uint64_t pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ - 9 + /* kdeth */ - 2; /* ICRC */ - const uint16_t lrh_dws = (pbc_dws - 1) >> 1; + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2; /* ICRC/tail */ + const uint16_t lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index b1da57de82e..998ebaf5b83 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -93,17 +93,15 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ - /* lrh does not include pbc (8 bytes/2 dws), but does include icrc (4 bytes/1 dws), - so subtract 1 dws */ - params->lrh_dws = htons(params->pbc_dws - 1); + params->lrh_dws = htons(params->pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ } else { params->pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ 16 + /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ - 2; /* ICRC */ - params->lrh_dws = (params->pbc_dws - 2) >> 1; + 2; /* ICRC/tail */ + params->lrh_dws = (params->pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ } params->is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, opx_target_addr, caps); params->reliability = reliability; diff 
--git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 826266422c5..164b4829c56 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -293,8 +293,7 @@ void fi_opx_ep_tx_model_init (struct fi_opx_hfi1_context * hfi, OPX_PBC_PORTIDX(hfi->hfi_port,hfi1_type) | OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type); - /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ - inject_9B->hdr.lrh_9B.pktlen = htons(inject_pbc_dws-1); + inject_9B->hdr.lrh_9B.pktlen = htons(inject_pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ /* specified at runtime */ inject_9B->hdr.inject.message_length = 0; @@ -381,13 +380,11 @@ void fi_opx_ep_tx_model_init_16B (struct fi_opx_hfi1_context * hfi, *inject_16B = *send_16B; const uint64_t pbc_dws = - 2 + /* pbc */ - 4 + /* lrh */ - 3 + /* bth */ - 3 + /* kdeth */ - 4 + /* software kdeth + unused */ - 2 + /* ICRC and tail */ - 2 ; /* second cacheline */ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 ; /* ICRC/tail */ inject_16B->qw0 = OPX_PBC_LEN(pbc_dws,hfi1_type) /* length_dws */ | OPX_PBC_VL(hfi->vl,hfi1_type) | @@ -398,12 +395,12 @@ void fi_opx_ep_tx_model_init_16B (struct fi_opx_hfi1_context * hfi, OPX_PBC_SCTXT(hfi->send_ctxt,hfi1_type) | OPX_PBC_JKR_INSERT_NON9B_ICRC; + /* (LRH QW) does not include pbc (8 bytes) */ const uint32_t packetLength = (pbc_dws - 2) * 4; const uint32_t lrh_qws = (packetLength >> 3) + ((packetLength & 0x07u) != 0); - /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ inject_16B->hdr.lrh_16B.pktlen = lrh_qws; /* specified at runtime */ diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 59371385b89..8fe879c402b 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1150,7 +1150,7 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) 3 + /* bth */ 9 + /* kdeth; from 
"RcvHdrSize[i].HdrSize" CSR */ ((payload_bytes + 3) >> 2); - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t total_credits_needed = 1 + /* packet header */ ((payload_bytes + 63) >> 6); /* payload blocks needed */ @@ -1306,12 +1306,12 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "payload_bytes = %ld\n", payload_bytes); const uint64_t pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ - 7 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ ((payload_bytes + 3) >> 2) + - 2; // ICRC - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; + 2; /* ICRC/tail */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t total_credits_needed = 1 + /* packet header */ ((payload_bytes + 63) >> 6); /* payload blocks needed */ @@ -1386,17 +1386,6 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) replay->scb.scb_16B.hdr.qw_16B[7] = (uint64_t) params->rzv_comp; -#ifndef NDEBUG - if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - OPX_JKR_PRINT_16B_PBC(replay->scb.scb_16B.qw0); - OPX_JKR_PRINT_16B_LRH(replay->scb.scb_16B.hdr.qw_16B[0], replay->scb.scb_16B.hdr.qw_16B[1]); - OPX_JKR_PRINT_16B_BTH(replay->scb.scb_16B.hdr.qw_16B[2], replay->scb.scb_16B.hdr.qw_16B[3]); - } else { - abort(); - fi_opx_hfi1_dump_packet_hdr(&(replay->scb.scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); - } -#endif - union fi_opx_hfi1_packet_payload *const tx_payload = (union fi_opx_hfi1_packet_payload *) (replay->payload); @@ -1416,16 +1405,6 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) 
tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; vaddr_with_offset += params->dput_iov[i].bytes; } -#ifndef NDEBUG - if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - OPX_JKR_PRINT_16B_PBC(replay->scb.scb_16B.qw0); - OPX_JKR_PRINT_16B_LRH(replay->scb.scb_16B.hdr.qw_16B[0], replay->scb.scb_16B.hdr.qw_16B[1]); - OPX_JKR_PRINT_16B_BTH(replay->scb.scb_16B.hdr.qw_16B[2], replay->scb.scb_16B.hdr.qw_16B[3]); - } else { - abort(); - fi_opx_hfi1_dump_packet_hdr(&(replay->scb.scb_9B.hdr), OPX_HFI1_TYPE, __func__, __LINE__); - } -#endif /* copy tidpairs to packet */ if (params->tid_info.npairs) { @@ -1863,7 +1842,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc(union fi_opx_hfi1_deferred_work *work) 2 + /* lrh */ 3 + /* bth */ 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 1) < 1)) { @@ -1944,11 +1923,11 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) const uint64_t pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - 2; - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; + 2; /* ICRC/tail */ + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; // Note: Only need 1 credit here for the message truncation error case. 
Just @@ -2292,7 +2271,7 @@ int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) 2 + /* lrh */ 3 + /* bth */ 9; /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - const uint16_t lrh_dws = htons(pbc_dws - 1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ hdr->qw_9B[0] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[0] | params->lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[1] | params->bth_rx; @@ -2303,11 +2282,11 @@ int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work) hdr->qw_9B[6] = params->bytes_to_fence; } else { const uint64_t pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ - 9 + /* kdeth */ - 2; /* ICRC */ - const uint16_t lrh_dws = (pbc_dws - 1) >> 1; + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2; /* ICRC/tail */ + const uint16_t lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ hdr->qw_16B[0] = opx_ep->rx->tx.dput_16B.hdr.qw_16B[0] | ((uint64_t)(params->lrh_dlid & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_dws << 20); @@ -2460,10 +2439,10 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) uint64_t tail_bytes = bytes_to_send_this_packet & 0x3Ful; blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 
1 : 0); } else { - const uint64_t additional_hdr_tail_byte = 2 * 8; /* 1 QW for hdr that spills to 2nd cacheline - 1 QW for ICRC/tail */ - uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, - max_bytes_per_packet)); + /* 1 QW for hdr that spills to 2nd cacheline + 1 QW for ICRC/tail */ + const uint64_t additional_hdr_tail_byte = 2 * 8; + uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, + max_bytes_per_packet)); uint64_t tail_bytes = payload_n_additional_hdr_tail_bytes & 0x3Ful; blocks_to_send_in_this_packet = (payload_n_additional_hdr_tail_bytes >> 6) + (tail_bytes ? 1 : 0); bytes_to_send_this_packet = payload_n_additional_hdr_tail_bytes - additional_hdr_tail_byte; @@ -2478,14 +2457,14 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (blocks_to_send_in_this_packet << 4); - lrh_dws = htons(pbc_dws - 1); + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ } else { pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ 7 + /* kdeth */ (blocks_to_send_in_this_packet << 4); // ICRC and the kdeth in the second cacheline are accounted for here - lrh_dws = (pbc_dws - 1) >> 1; + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ } uint64_t bytes_sent; @@ -2868,21 +2847,34 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) // Round packet_bytes up to the next multiple of 4, // then divide by 4 to get the correct number of dws. 
uint64_t payload_dws = ((packet_bytes + 3) & -4) >> 2; - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - payload_dws; - - const uint16_t lrh_dws = htons(pbc_dws - 1); + uint64_t pbc_dws; + uint16_t lrh_dws; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + payload_dws; + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 + /* ICRC/tail */ + payload_dws; + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } assert(replay != NULL); if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | params->pbc_dlid; } else { - replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | params->pbc_dlid; } @@ -3299,22 +3291,34 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) // Round packet_bytes up to the next multiple of 4, // then divide by 4 to get the correct number of dws. 
uint64_t payload_dws = (packet_bytes + 3) >> 2; - const uint64_t pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - payload_dws; - - const uint16_t lrh_dws = htons(pbc_dws - 1); - - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); + uint64_t pbc_dws; + uint16_t lrh_dws; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + payload_dws; + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ + } else { + pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 + /* ICRC/tail */ + payload_dws; + lrh_dws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ + } + assert(replay != NULL); if (OPX_HFI1_TYPE & OPX_HFI1_JKR) { - replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_16B.qw0 = opx_ep->rx->tx.dput_16B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | params->pbc_dlid; } else { - replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | + replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | + OPX_PBC_LEN(pbc_dws, OPX_HFI1_TYPE) | params->pbc_dlid; } @@ -3610,14 +3614,14 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (payload_blocks_total << 4); - lrh_dws = htons(pbc_dws - 1); + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ } else { pbc_dws = 2 + /* pbc */ - 4 + /* lrh */ + 4 + /* lrh uncompressed */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (payload_blocks_total << 4); - lrh_dws = (pbc_dws - 1) >> 1; + (payload_blocks_total << 4); /* ICRC/tail is accounted for here */ + lrh_dws = (pbc_dws - 2) >> 1; /* 
(LRH QW) does not include pbc (8 bytes) */ } if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { @@ -4005,7 +4009,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ (payload_blocks_total << 4); - const uint16_t lrh_dws = htons(pbc_dws-1); + const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -4453,7 +4457,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, /* full blocks only. icrc_end_block/icrc_fragment_block count 1 qw only */ const uint64_t payload_blocks_total = - 1 + /* rzv metadata */ + 1 + /* last kdeth + rzv metadata */ immediate_fragment + immediate_block_count + immediate_end_block_count; @@ -4462,12 +4466,12 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, 2 + /* pbc */ 4 + /* lhr */ 3 + /* bth */ - 3 + /* kdeth */ - 4 + /* software kdeth + unused */ - (payload_blocks_total << 4) + + /* 9 + kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 7 + /* kdeth */ + (payload_blocks_total << 4) + /* includes last kdeth + metadata + immediate data */ ((icrc_end_block | icrc_fragment_block) << 1); /* 1 QW of any added tail block */ - const uint16_t lrh_qws = (pbc_dws - 2) >> 1; // Does not include PBC and is in QW + const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -4689,6 +4693,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); uint64_t temp[8]; + fi_opx_store_and_copy_qw(scb_payload, temp, tag, /* end of header */ /* start of receiver payload/cacheline */ diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index 
31fa9960ed5..6ebb90dee8a 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -1459,8 +1459,6 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, params->opx_ep = opx_ep; slist_init(¶ms->sdma_reqs); - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - #if defined(OPX_RELIABILITY_DEBUG) || !defined(NDEBUG) union fi_opx_reliability_service_flow_key key; if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { @@ -1505,8 +1503,6 @@ ssize_t fi_opx_reliability_service_do_replay_sdma (struct fid_ep *ep, sdma_we->replay = replay; sdma_we->comp_state = OPX_SDMA_COMP_PENDING_WRITEV; - OPX_NO_16B_SUPPORT(OPX_HFI1_TYPE); - uint64_t payload_size = fi_opx_reliability_replay_get_payload_size(replay); #ifndef NDEBUG @@ -2444,13 +2440,11 @@ void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * servi { /* PBC */ const uint64_t pbc_dws = - 2 + /* pbc */ - 4 + /* lrh */ - 3 + /* bth */ - 3 + /* kdeth */ - 4 + /* software kdeth + unused */ - 2 + /* ICRC and tail */ - 2 ; /* second cacheline */ + 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 2 ; /* ICRC/tail */ /* Setup the 16B models whether or not they'll be used */ @@ -2466,13 +2460,14 @@ void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * servi OPX_PBC_JKR_INSERT_NON9B_ICRC; /* LRH */ + /* (LRH QW) does not include pbc (8 bytes) */ const uint32_t packetLength = (pbc_dws - 2) * 4; const uint32_t lrh_qws = (packetLength >> 3) + ((packetLength & 0x07u) != 0); service->tx.hfi1.ping_model_16B.hdr.lrh_16B.qw[0] = 0UL; service->tx.hfi1.ping_model_16B.hdr.lrh_16B.qw[1] = 0UL; - service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pktlen = lrh_qws; /* does not include pbc, but does include icrc */ + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pktlen = lrh_qws; service->tx.hfi1.ping_model_16B.hdr.lrh_16B.sc = hfi1->sc; service->tx.hfi1.ping_model_16B.hdr.lrh_16B.entropy = 0; 
service->tx.hfi1.ping_model_16B.hdr.lrh_16B.lt = 0; // need to add env variable to change @@ -2645,7 +2640,7 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser ((hfi1->sc & FI_OPX_HFI1_LRH_SC_MASK) << FI_OPX_HFI1_LRH_SC_SHIFT)); service->tx.hfi1.ping_model_9B.hdr.lrh_9B.dlid = 0; /* set at runtime */ - service->tx.hfi1.ping_model_9B.hdr.lrh_9B.pktlen = htons(pbc_dws-1); /* does not include pbc (8 bytes), but does include icrc (4 bytes) */ + service->tx.hfi1.ping_model_9B.hdr.lrh_9B.pktlen = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ service->tx.hfi1.ping_model_9B.hdr.lrh_9B.slid = htons(hfi1->lid); /* BTH */ From ced60a0d876aa53331e96ba32895c41853b8653c Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Wed, 18 Sep 2024 07:38:13 -0500 Subject: [PATCH 151/393] prov/opx: Fix uepkt 16B headers Signed-off-by: Bob Cernohous --- prov/opx/src/fi_opx_reliability.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index 6ebb90dee8a..9a077bb121e 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -3182,15 +3182,24 @@ struct fi_opx_reliability_rx_uepkt *fi_opx_reliability_allocate_uepkt(struct fi_ { struct fi_opx_reliability_rx_uepkt * tmp = ofi_buf_alloc(service->uepkt_pool); assert(tmp); - - /* tmp->hdr.unused_pad_9B = hdr->unused_pad_9B; */ - tmp->hdr.qw_9B[0] = hdr->qw_9B[0]; - tmp->hdr.qw_9B[1] = hdr->qw_9B[1]; - tmp->hdr.qw_9B[2] = hdr->qw_9B[2]; - tmp->hdr.qw_9B[3] = hdr->qw_9B[3]; - tmp->hdr.qw_9B[4] = hdr->qw_9B[4]; - tmp->hdr.qw_9B[5] = hdr->qw_9B[5]; - tmp->hdr.qw_9B[6] = hdr->qw_9B[6]; + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + tmp->hdr.qw_9B[0] = hdr->qw_9B[0]; + tmp->hdr.qw_9B[1] = hdr->qw_9B[1]; + tmp->hdr.qw_9B[2] = hdr->qw_9B[2]; + tmp->hdr.qw_9B[3] = hdr->qw_9B[3]; + tmp->hdr.qw_9B[4] = 
hdr->qw_9B[4]; + tmp->hdr.qw_9B[5] = hdr->qw_9B[5]; + tmp->hdr.qw_9B[6] = hdr->qw_9B[6]; + } else { + tmp->hdr.qw_16B[0] = hdr->qw_16B[0]; + tmp->hdr.qw_16B[1] = hdr->qw_16B[1]; + tmp->hdr.qw_16B[2] = hdr->qw_16B[2]; + tmp->hdr.qw_16B[3] = hdr->qw_16B[3]; + tmp->hdr.qw_16B[4] = hdr->qw_16B[4]; + tmp->hdr.qw_16B[5] = hdr->qw_16B[5]; + tmp->hdr.qw_16B[6] = hdr->qw_16B[6]; + tmp->hdr.qw_16B[7] = hdr->qw_16B[7]; + } if (payload && payload_bytes_to_copy > 0) memcpy((void*)&tmp->payload[0], (const void *)payload, payload_bytes_to_copy); From 09b7e352e73c45b0581fc0b7ebff88f304671f4a Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Wed, 18 Sep 2024 07:39:05 -0500 Subject: [PATCH 152/393] prov/opx: Support 16B SDMA CTS work Fix 16B PBC/payload lengths Signed-off-by: Bob Cernohous --- prov/opx/src/fi_opx_hfi1.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 8fe879c402b..6e31975860c 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1309,7 +1309,7 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) 4 + /* lrh uncompressed */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - ((payload_bytes + 3) >> 2) + + (((payload_bytes + 7) & -8) >> 2) + /* 16B is QW length/padded */ 2; /* ICRC/tail */ const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; @@ -1582,7 +1582,12 @@ union fi_opx_hfi1_deferred_work * opx_hfi1_rx_rzv_rts_tid_prep_cts( } assert(cur_addr_range_tid_len <= cts_params->rzv_comp->context->byte_counter); - cts_params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + cts_params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + } else { + cts_params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_16B; + } 
cts_params->work_elem.work_type = OPX_WORK_TYPE_PIO; return cts_work; @@ -1608,7 +1613,12 @@ int opx_hfi1_rx_rzv_rts_tid_fallback(union fi_opx_hfi1_deferred_work *work, params->dst_vaddr = params->dput_iov[params->cur_iov].rbuf; params->tid_info.npairs = 0; - params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + } else { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_16B; + } params->work_elem.work_type = OPX_WORK_TYPE_PIO; params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV; @@ -1620,7 +1630,8 @@ int opx_hfi1_rx_rzv_rts_tid_fallback(union fi_opx_hfi1_deferred_work *work, params->rzv_comp, params->rzv_comp->context); - return opx_hfi1_rx_rzv_rts_send_cts(work); + + return params->work_elem.work_fn(work); } int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) @@ -1680,7 +1691,12 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) if (last_cts) { assert(cts_work == work); - assert(work->work_elem.work_fn == opx_hfi1_rx_rzv_rts_send_cts); + + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + assert(work->work_elem.work_fn == opx_hfi1_rx_rzv_rts_send_cts); + } else { + assert(work->work_elem.work_fn == opx_hfi1_rx_rzv_rts_send_cts_16B); + } FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, HFI -- RENDEZVOUS RTS TID SETUP (end) SUCCESS (params=%p rzv_comp=%p context=%p)\n", params, @@ -1691,11 +1707,12 @@ int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) .expected_receive.rts_tid_setup_success); // This is the "FI_SUCCESS" exit point for this function - return opx_hfi1_rx_rzv_rts_send_cts(cts_work); + return cts_work->work_elem.work_fn(cts_work); } assert(cts_work != work); - int rc = opx_hfi1_rx_rzv_rts_send_cts(cts_work); + + int rc = cts_work->work_elem.work_fn(cts_work); if (rc == FI_SUCCESS) { OPX_BUF_FREE(cts_work); } else { 
@@ -3290,10 +3307,10 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) // Round packet_bytes up to the next multiple of 4, // then divide by 4 to get the correct number of dws. - uint64_t payload_dws = (packet_bytes + 3) >> 2; uint64_t pbc_dws; uint16_t lrh_dws; if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + uint64_t payload_dws = (packet_bytes + 3) >> 2; pbc_dws = 2 + /* pbc */ 2 + /* lrh */ 3 + /* bth */ @@ -3301,6 +3318,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) payload_dws; lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ } else { + uint64_t payload_dws = ((packet_bytes + 7) & -8) >> 2;/* 16B is QW length/padded */ pbc_dws = 2 + /* pbc */ 4 + /* lrh uncompressed */ 3 + /* bth */ From 20dd5afa1629f8b8ea66f25e6b45900ef323c23f Mon Sep 17 00:00:00 2001 From: Mike Wilkins Date: Wed, 18 Sep 2024 09:23:57 -0500 Subject: [PATCH 153/393] prov/opx: Remove polling call from internal rma write Signed-off-by: Mike Wilkins --- prov/opx/include/rdma/opx/fi_opx_rma.h | 1 - 1 file changed, 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index 998ebaf5b83..63c0e41b010 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -224,7 +224,6 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, */ ssize_t rc = fi_opx_shm_dynamic_tx_connect(params->is_intranode, opx_ep, params->u32_extended_rx, opx_dst_addr.hfi1_unit); assert(rc == FI_SUCCESS); - fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); fi_opx_hfi1_dput_sdma_init(opx_ep, params, iov->len, 0, 0, NULL, is_hmem); FI_OPX_DEBUG_COUNTERS_INC_COND(is_hmem && params->is_intranode, From c7d0fa8b93ddc52fe82f12e7d00615eef5d7d9b9 Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Fri, 20 Sep 2024 16:21:47 -0500 Subject: [PATCH 154/393] prov/opx: Fix 
credit return Signed-off-by: Bob Cernohous --- prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index ea1fa89189f..fc60d9a3ad0 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -975,6 +975,9 @@ void fi_opx_force_credit_return(struct fid_ep *ep, opx_ep->tx->send_9B.hdr.qw_9B[3], opx_ep->tx->send_9B.hdr.qw_9B[4], 0, 0); + + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + } else { uint32_t lrh_dlid_16B = htons(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); fi_opx_store_and_copy_qw(scb, local_temp, @@ -993,7 +996,10 @@ void fi_opx_force_credit_return(struct fid_ep *ep, opx_ep->tx->send_16B.hdr.qw_16B[5], 0); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_and_copy_qw(scb_payload, local_temp, 0UL, 0UL, @@ -1003,9 +1009,9 @@ void fi_opx_force_credit_return(struct fid_ep *ep, 0UL, 0UL, 0UL); + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); } - FI_OPX_HFI1_CONSUME_CREDITS(pio_state, credits_needed); opx_ep->tx->pio_state->qw0 = pio_state.qw0; FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); From fd7bac4ab389d91f2b4b39bf74baa26707c98fc3 Mon Sep 17 00:00:00 2001 From: Mike Wilkins Date: Sat, 21 Sep 2024 12:38:00 -0500 Subject: [PATCH 155/393] prov/opx: added OPX Tracer points to RMA code paths Signed-off-by: Mike Wilkins Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 8 ++- prov/opx/include/rdma/opx/fi_opx_rma.h | 8 +++ prov/opx/src/fi_opx_rma.c | 67 ++++++++++++++++++--- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 1429a393484..4edb41a7164 100644 --- 
a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -3532,6 +3532,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); if (OFI_UNLIKELY(context == NULL)) { FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "POST-RECVMSG"); return -FI_ENOMEM; } context->next = NULL; @@ -3566,6 +3567,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } else if (msg->iov_count == 0) { @@ -3585,6 +3587,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } @@ -3631,6 +3634,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG (HMEM) RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } #endif @@ -3652,6 +3656,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, hfi1_type); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); return rc; } @@ -3672,10 +3677,9 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, reliability, hfi1_type); - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "POST-RECVMSG"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== POST RECVMSG RETURN\n"); - + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, 
"POST-RECVMSG"); return rc; } diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index 63c0e41b010..c439b3f47da 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -69,6 +69,7 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READV_INTERNAL"); union fi_opx_hfi1_deferred_work *work = (union fi_opx_hfi1_deferred_work *) ofi_buf_alloc(opx_ep->tx->work_pending_pool); @@ -152,6 +153,7 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, int rc = params->work_elem.work_fn(work); if(rc == FI_SUCCESS) { OPX_BUF_FREE(work); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV_INTERNAL"); return; } assert(rc == -FI_EAGAIN); @@ -159,6 +161,8 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, /* Try again later*/ assert(work->work_elem.slist_entry.next == NULL); slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); + + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV_INTERNAL"); } __OPX_FORCE_INLINE__ @@ -175,6 +179,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITE_INTERNAL"); assert(niov == 1); // TODO, support something ... 
bigger assert(op == FI_NOOP || op < OFI_ATOMIC_OP_LAST); assert(dt == FI_VOID || dt < OFI_DATATYPE_LAST); @@ -235,11 +240,13 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, if (rc == FI_SUCCESS) { assert(params->work_elem.complete); OPX_BUF_FREE(work); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE_INTERNAL"); return; } assert(rc == -FI_EAGAIN); if (params->work_elem.work_type == OPX_WORK_TYPE_LAST) { slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending_completion); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE_INTERNAL"); return; } @@ -261,6 +268,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, /* Try again later*/ assert(work->work_elem.slist_entry.next == NULL); slist_insert_tail(&work->work_elem.slist_entry, &opx_ep->tx->work_pending[params->work_elem.work_type]); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE_INTERNAL"); } __OPX_FORCE_INLINE__ diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index df88b3f0d9f..68caded8c90 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -159,6 +159,7 @@ int fi_opx_do_readv_internal_intranode(union fi_opx_hfi1_deferred_work *work) int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "DO_READV"); struct fi_opx_hfi1_rx_readv_params *params = &work->readv; struct fi_opx_ep *opx_ep = params->opx_ep; const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; @@ -167,6 +168,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) ssize_t credits_available = fi_opx_hfi1_tx_check_credits(opx_ep, &pio_state, 2); if (OFI_UNLIKELY(credits_available < 2)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "DO_READV"); return -FI_EAGAIN; } @@ -180,6 +182,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) addr.reliability_rx, &psn_ptr, &replay, params->reliability, hfi1_type); if (OFI_UNLIKELY(psn == -1)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "DO_READV"); return 
-FI_EAGAIN; } @@ -286,6 +289,7 @@ int fi_opx_do_readv_internal(union fi_opx_hfi1_deferred_work *work) FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "DO_READV"); return FI_SUCCESS; } @@ -297,16 +301,20 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "INJECT_WRITE"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "INJECT_WRITE"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "INJECT_WRITE"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -322,6 +330,7 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t opx_dst_addr.reliability_rx, reliability))) { fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "INJECT_WRITE"); return -FI_EAGAIN; } @@ -343,6 +352,7 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t is_hmem, lock_required, caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "INJECT_WRITE"); return 0; } @@ -371,16 +381,20 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITE"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); 
fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -392,6 +406,7 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? opx_ep->rx->cq : NULL; struct opx_context *context; if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); return -FI_ENOMEM; } @@ -401,6 +416,7 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, OPX_BUF_FREE(context); } FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITE"); return -FI_ENOMEM; } @@ -419,6 +435,7 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, FI_NOOP, opx_ep->tx->op_flags, is_hmem, lock_required, caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE"); return 0; } @@ -446,17 +463,21 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITEV_INTERNAL"); struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -468,6 +489,7 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; struct opx_context *context; if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); return -FI_ENOMEM; } @@ -477,6 +499,7 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void OPX_BUF_FREE(context); } FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEV_INTERNAL"); return -FI_ENOMEM; } @@ -514,6 +537,7 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void addr_offset += iov[index].iov_len; } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITEV_INTERNAL"); return 0; } @@ -580,17 +604,21 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITEMSG_INTERNAL"); struct fi_opx_ep *opx_ep; opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -603,6 +631,7 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg struct fi_opx_cq *cq = (flags & FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; struct opx_context *context; if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_RMA | FI_WRITE, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); return -FI_ENOMEM; } @@ -612,6 +641,7 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg OPX_BUF_FREE(context); } FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "WRITEMSG_INTERNAL"); return -FI_ENOMEM; } @@ -680,6 +710,7 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg } } + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITEMSG_INTERNAL"); return 0; } @@ -705,16 +736,20 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des const uint64_t caps, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READ"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -742,6 +777,7 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; struct opx_context *context; if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); return -FI_ENOMEM; } @@ -751,6 +787,7 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des OPX_BUF_FREE(context); } FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READ"); return -FI_ENOMEM; } @@ -768,6 +805,7 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READ"); return FI_SUCCESS; } @@ -794,16 +832,20 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READV"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -815,6 +857,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, struct fi_opx_cq *cq = (opx_ep->tx->op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) ? 
opx_ep->rx->cq : NULL; struct opx_context *context; if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, user_context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); return -FI_ENOMEM; } @@ -830,6 +873,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, OPX_BUF_FREE(context); } FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READV"); return -FI_ENOMEM; } @@ -895,6 +939,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV"); return 0; } @@ -921,16 +966,20 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { + OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READMSG_INTERNAL"); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); #ifndef NDEBUG int ret = 0; ret = fi_opx_check_rma(opx_ep); - if (ret) + if (ret) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); return ret; + } #endif if (lock_required) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } @@ -943,6 +992,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, struct fi_opx_cq *cq = ((flags & FI_COMPLETION) == FI_COMPLETION) ? 
opx_ep->rx->cq : NULL; struct opx_context *context; if (OFI_UNLIKELY(opx_rma_get_context(opx_ep, msg->context, cq, FI_RMA | FI_READ, &context) != FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); return -FI_ENOMEM; } @@ -971,6 +1021,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, OPX_BUF_FREE(context); } FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Out of memory.\n"); + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); return -FI_ENOMEM; } @@ -1036,6 +1087,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READMSG_INTERNAL"); return 0; } else { @@ -1087,6 +1139,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, } /* end while */ /* should never get here */ + OPX_TRACER_TRACE(OPX_TRACER_END_ERROR, "READMSG_INTERNAL"); FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Fatal -FI_EPERM\n"); abort(); From 49918a65c45489713bdada81e100378a70ad707a Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Sat, 21 Sep 2024 12:38:59 -0500 Subject: [PATCH 156/393] prov/opx: Simplify fi_opx_check_rma() function. Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_rma.h | 8 +++++++- prov/opx/src/fi_opx_rma.c | 19 ------------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index c439b3f47da..4f66d9c2885 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -43,7 +43,13 @@ extern "C" { #endif -int fi_opx_check_rma(struct fi_opx_ep *opx_ep); +__OPX_FORCE_INLINE__ +int fi_opx_check_rma(struct fi_opx_ep *opx_ep) +{ + return OFI_UNLIKELY(!opx_ep || + (opx_ep->state != FI_OPX_EP_INITITALIZED_ENABLED) || + (opx_ep->av->type == FI_AV_UNSPEC)) ? 
-FI_EINVAL : 0; +} void fi_opx_hit_zero(struct fi_opx_completion_counter *cc); diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 68caded8c90..8be0bb7cdc3 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -75,25 +75,6 @@ void fi_opx_hit_zero(struct fi_opx_completion_counter *cc) OPX_BUF_FREE(cc); } -inline int fi_opx_check_rma(struct fi_opx_ep *opx_ep) -{ - if (!opx_ep) - return -FI_EINVAL; - if (opx_ep->state != FI_OPX_EP_INITITALIZED_ENABLED) - return -FI_EINVAL; - - const enum fi_av_type av_type = opx_ep->av->type; - - if (av_type == FI_AV_UNSPEC) - return -FI_EINVAL; - if (av_type == FI_AV_MAP && opx_ep->av->type != FI_AV_MAP) - return -FI_EINVAL; - if (av_type == FI_AV_TABLE && opx_ep->av->type != FI_AV_TABLE) - return -FI_EINVAL; - - return 0; -} - int fi_opx_do_readv_internal_intranode(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_readv_params *params = &work->readv; From 623302604b0217cf7a625342745c734b92afb96c Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Mon, 23 Sep 2024 09:10:25 -0400 Subject: [PATCH 157/393] prov/opx: Initialize nic info in fi_info Signed-off-by: Lindsay Reiser --- prov/opx/src/fi_opx_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 6aafeb433c8..bc059287636 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -396,6 +396,9 @@ static int fi_opx_fillinfo(struct fi_info *fi, const char *node, fi->ep_attr->rx_ctx_cnt = hints->ep_attr->rx_ctx_cnt; /* TODO - check */ } + fi->nic = ofi_nic_dup(NULL); + fi->nic->bus_attr->bus_type = FI_BUS_PCI; + return 0; err: From bd456086600028c4f6fdc103cdca84058022190d Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Mon, 23 Sep 2024 10:53:13 -0500 Subject: [PATCH 158/393] prov/opx: Fix incorrect calculation of immediate block offset in send rendezvous Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 4edb41a7164..9738f7abff1 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -1862,7 +1862,7 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, }; const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; - const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.byte_count + 63) >> 6); + const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.qw_count + 63) >> 6); const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, @@ -1889,7 +1889,7 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, }; const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; - const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.byte_count + 63) >> 6); + const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.qw_count + 63) >> 6); const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, From b9cd49f178ba62aa84a32f806c0a62c03f81a922 Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Tue, 24 Sep 2024 19:50:24 -0500 Subject: [PATCH 159/393] prov/opx: Add FI_OPX_TID_MIN_PAYLOAD_BYTES param Add ability to independently tune the minimum threshold to use expected receive (TID) when sending. 
Signed-off-by: Ben Lynam --- man/fi_opx.7.md | 5 +++ prov/opx/include/rdma/opx/fi_opx_endpoint.h | 3 +- prov/opx/include/rdma/opx/fi_opx_hfi1.h | 48 +++++++++++++-------- prov/opx/src/fi_opx_ep.c | 38 +++++++++++++--- prov/opx/src/fi_opx_hfi1.c | 11 ++--- prov/opx/src/fi_opx_init.c | 3 ++ 6 files changed, 78 insertions(+), 30 deletions(-) diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md index 6810fa9d11a..da3a638e21f 100644 --- a/man/fi_opx.7.md +++ b/man/fi_opx.7.md @@ -219,6 +219,11 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL. For messages smaller than this threshold, the send will be completed using PIO. Value must be between 64 and 2147483646. Defaults to 16385. +*FI_OPX_TID_MIN_PAYLOAD_BYTES* +: Integer. The minimum length in bytes where TID (Expected Receive) will be used. + For messages smaller than this threshold, the send will be completed using Eager Receive. + Value must be between 4096 and 2147483646. Defaults to 4096. + *FI_OPX_RZV_MIN_PAYLOAD_BYTES* : Integer. The minimum length in bytes where rendezvous will be used. For messages smaller than this threshold, the send will first try to be completed using eager or multi-packet eager. 
diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 9738f7abff1..a6b76bf0cba 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -316,9 +316,10 @@ struct fi_opx_ep_tx { struct ofi_bufpool *rma_request_pool; struct ofi_bufpool *sdma_work_pool; uint32_t sdma_min_payload_bytes; + uint32_t tid_min_payload_bytes; uint32_t rzv_min_payload_bytes; uint16_t mp_eager_max_payload_bytes; - uint8_t unused_cacheline6[6]; + uint16_t unused_cacheline6; /* == CACHE LINE 16 == */ struct opx_sdma_queue sdma_request_queue; diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index aa202ad6bf8..2c71240c4ab 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -98,7 +98,7 @@ #elif HAVE_ROCR #define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (256) #else -#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) #endif #define OPX_RZV_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min value */ #define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ @@ -125,16 +125,15 @@ The payload itself will be FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE - 16 */ -#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) \ - ((hfi1_type & OPX_HFI1_JKR) ? \ - (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 64 /* hdr */ + 8 /* tail */) - 16 /* payload */)) :\ - (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 56 /* hdr */) - 16 /* payload */))) - /* PAYLOAD BYTES CONSUMED */ +#define FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) \ + ((hfi1_type & OPX_HFI1_JKR) \ + ? 
(FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 64 /* hdr */ + 8 /* tail */) - 16 /* payload */)) \ + : (FI_OPX_MP_EGR_CHUNK_SIZE - ((8 /* PBC */ + 56 /* hdr */) - 16 /* payload */))) #define FI_OPX_MP_EGR_CHUNK_CREDITS (FI_OPX_MP_EGR_CHUNK_SIZE >> 6) /* PACKET CREDITS TOTAL */ #define FI_OPX_MP_EGR_CHUNK_DWS (FI_OPX_MP_EGR_CHUNK_SIZE >> 2) /* PBC DWS */ #define FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type) \ - ((FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)) >> 3) /* PAYLOAD QWS CONSUMED */ + ((FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type)) >> 3) /* PAYLOAD QWS CONSUMED */ #define FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL 16 #define FI_OPX_MP_EGR_XFER_BYTES_TAIL 0x0010000000000000ull @@ -207,21 +206,36 @@ static_assert(OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX >= OPX_MP_EGR_MAX_PAYLOAD_BYTES_D /* Default for payload threshold size for SDMA */ #ifndef FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT -#if HAVE_CUDA -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (4096) -#elif HAVE_ROCR -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (256) -#else -#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385) -#endif + #if HAVE_CUDA + #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #elif HAVE_ROCR + #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (256) + #else + #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385) + #endif #endif #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min Value */ #define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1) /* Max Value */ +/* Default for payload threshold size for TID */ +#ifndef OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT + #if HAVE_CUDA + #define OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #elif HAVE_ROCR + #define OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #else + #define OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT (4096) + #endif +#endif +#define OPX_TID_MIN_PAYLOAD_BYTES_MIN (OPX_HFI1_TID_PAGESIZE) +static_assert(OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT >= OPX_TID_MIN_PAYLOAD_BYTES_MIN, + "OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT must be >= 
OPX_TID_MIN_PAYLOAD_BYTES_MIN!\n"); -static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n"); -static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, "FI_OPX_HFI1_SDMA_MAX_WE must be >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX!\n"); +static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), + "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n"); +static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, + "FI_OPX_HFI1_SDMA_MAX_WE must be >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX!\n"); /* * SDMA includes 8B sdma hdr, 8B PBC, and message header. @@ -706,7 +720,7 @@ void opx_print_context(struct fi_opx_hfi1_context *context) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context info.sdma.queue_size %#X\n",context->info.sdma.queue_size); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context info.sdma.completion_queue %p errcode %#X status %#X\n",context->info.sdma.completion_queue, context->info.sdma.completion_queue->errcode, - context->info.sdma.completion_queue->status); + context->info.sdma.completion_queue->status); /* Not printing Context info.sdma.queued_entries); */ FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "Context info.rxe.hdrq.base_addr %p \n",context->info.rxe.hdrq.base_addr); diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 164b4829c56..b4868d49e97 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -1110,7 +1110,8 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, if (fi_param_get_int(fi_opx_global.prov, "sdma_disable", &sdma_disable) == FI_SUCCESS) { opx_ep->tx->use_sdma = !sdma_disable; OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, - "sdma_disable parm specified as %0X; opx_ep->tx->use_sdma set to %0hhX\n", sdma_disable, opx_ep->tx->use_sdma); + "sdma_disable parm specified as %0X; opx_ep->tx->use_sdma set to %0hhX\n", + sdma_disable, opx_ep->tx->use_sdma); } else { 
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "sdma_disable parm not specified; using SDMA\n"); opx_ep->tx->use_sdma = 1; @@ -1121,17 +1122,40 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, rc = fi_param_get_int(fi_opx_global.prov, "sdma_min_payload_bytes", &l_sdma_min_payload_bytes); if (rc != FI_SUCCESS) { opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", - opx_ep->tx->sdma_min_payload_bytes); - } else if (l_sdma_min_payload_bytes < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES || l_sdma_min_payload_bytes > INT_MAX) { + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_SDMA_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", + opx_ep->tx->sdma_min_payload_bytes); + } else if (l_sdma_min_payload_bytes < FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN || + l_sdma_min_payload_bytes > FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX) { opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT; FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Error: FI_OPX_SDMA_MIN_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n", - FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES, INT_MAX, opx_ep->tx->sdma_min_payload_bytes); + FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX, + FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT); } else { opx_ep->tx->sdma_min_payload_bytes = l_sdma_min_payload_bytes; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES was specified. Set to %d\n", - opx_ep->tx->sdma_min_payload_bytes); + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_SDMA_MIN_PAYLOAD_BYTES was specified. 
Set to %d\n", + opx_ep->tx->sdma_min_payload_bytes); + } + + int l_tid_min_payload_bytes; + rc = fi_param_get_int(fi_opx_global.prov, "tid_min_payload_bytes", &l_tid_min_payload_bytes); + if (rc != FI_SUCCESS) { + opx_ep->tx->tid_min_payload_bytes = OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_TID_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", + opx_ep->tx->tid_min_payload_bytes); + } else if (l_tid_min_payload_bytes < OPX_TID_MIN_PAYLOAD_BYTES_MIN) { + opx_ep->tx->tid_min_payload_bytes = OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Error: FI_OPX_TID_MIN_PAYLOAD_BYTES was set but is less than minimum allowed (%lu). Using default setting of %d\n", + OPX_TID_MIN_PAYLOAD_BYTES_MIN, OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT); + } else { + opx_ep->tx->tid_min_payload_bytes = l_tid_min_payload_bytes; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, + "FI_OPX_TID_MIN_PAYLOAD_BYTES was specified. Set to %d\n", + opx_ep->tx->tid_min_payload_bytes); } slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_SHM]); diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 6e31975860c..05036630c03 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1455,6 +1455,7 @@ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, if (is_intranode || !opx_ep->use_expected_tid_rzv || (niov != 1) + || (params->dput_iov[0].bytes < opx_ep->tx->tid_min_payload_bytes) || (opcode != FI_OPX_HFI_DPUT_OPCODE_RZV && opcode != FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG) || !fi_opx_hfi1_sdma_use_sdma(opx_ep, params->dput_iov[0].bytes, @@ -1826,7 +1827,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_intranode_16B(union fi_opx_hfi1_deferred_wor /* Note that we do not set stl.hdr.lrh.pktlen here (usually lrh_dws << 32), because this is intranode and since it's a CTS packet, lrh.pktlen isn't used/needed */ - tx_hdr->qw_16B[0] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | + tx_hdr->qw_16B[0] = 
opx_ep->rx->tx.cts_16B.hdr.qw_16B[0] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B)); tx_hdr->qw_16B[1] = opx_ep->rx->tx.cts_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); @@ -1949,7 +1950,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) // Note: Only need 1 credit here for the message truncation error case. Just // the opcode and origin_byte_counter_vaddr is needed for replaying back to the - // sender. + // sender. if (OFI_UNLIKELY(FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, 2) < 2)) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); opx_ep->tx->pio_state->qw0 = pio_state.qw0; @@ -2003,7 +2004,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) // 2nd cacheline volatile uint64_t * const scb2 = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - + fi_opx_store_and_copy_qw(scb2, &replay->scb.scb_16B.hdr.qw_16B[7], 0, 0, 0, 0, 0, 0, 0, 0); @@ -2019,7 +2020,7 @@ int opx_hfi1_rx_rzv_rts_send_etrunc_16B(union fi_opx_hfi1_deferred_work *work) params->origin_rx, psn_ptr, replay, - params->reliability, + params->reliability, OPX_HFI1_TYPE); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -2044,7 +2045,7 @@ void fi_opx_hfi1_rx_rzv_rts_etrunc (struct fi_opx_ep *opx_ep, params->opx_ep = opx_ep; params->work_elem.slist_entry.next = NULL; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u, opcode=%u\n", + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u, opcode=%u\n", is_intranode, FI_OPX_HFI_DPUT_OPCODE_RZV_ETRUNC); if (is_intranode) { diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index bc059287636..2322936ad0a 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -706,6 +706,9 @@ OPX_INI fi_param_define(&fi_opx_provider, "sdma_bounce_buf_threshold", 
FI_PARAM_INT, "The maximum message length in bytes that will be copied to the SDMA bounce buffer. For messages larger than this threshold, the send will not be completed until receiver has ACKed. Value must be between %d and %d. Defaults to %d.", OPX_SDMA_BOUNCE_BUF_MIN, OPX_SDMA_BOUNCE_BUF_MAX, OPX_SDMA_BOUNCE_BUF_THRESHOLD); fi_param_define(&fi_opx_provider, "sdma_disable", FI_PARAM_INT, "Disables SDMA offload hardware. Default is 0"); fi_param_define(&fi_opx_provider, "sdma_min_payload_bytes", FI_PARAM_INT, "The minimum message length in bytes where SDMA will be used. For messages smaller than this threshold, the send will be completed using PIO. Value must be between %d and %d. Defaults to %d.", FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT); + fi_param_define(&fi_opx_provider, "tid_min_payload_bytes", FI_PARAM_INT, + "The minimum message length in bytes where TID will be used. Value must be >= %d. Defaults to %d.", + OPX_TID_MIN_PAYLOAD_BYTES_MIN, OPX_TID_MIN_PAYLOAD_BYTES_DEFAULT); fi_param_define(&fi_opx_provider, "expected_receive_enable", FI_PARAM_BOOL, "Enables expected receive rendezvous using Token ID (TID). Defaults to \"No\"."); fi_param_define(&fi_opx_provider, "prog_affinity", FI_PARAM_STRING, "When set, specify the set of CPU cores to set the progress " From bf312ceb0b4c51ca138e7f2a10b609a04d8487d0 Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Thu, 26 Sep 2024 16:55:56 -0400 Subject: [PATCH 160/393] github/actions: Modify Cornelis Networks internal workflows Shorten the name field of opx-ci. Remove schedule-triggered Nightly job. 
Signed-off-by: Jack Morrison --- .github/workflows/cn.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/cn.yml b/.github/workflows/cn.yml index 5669c52f4ea..c54d8b82af6 100644 --- a/.github/workflows/cn.yml +++ b/.github/workflows/cn.yml @@ -7,8 +7,6 @@ on: - opened - reopened - synchronize - schedule: - - cron: '0 23 * * *' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -16,10 +14,6 @@ concurrency: jobs: opx-ci: - name: OPX CI - if: ${{ github.repository == 'cornelisnetworks/libfabric-internal' && github.event_name == 'pull_request' }} + name: CI + if: ${{ github.repository == 'cornelisnetworks/libfabric-internal' }} uses: cornelisnetworks/libfabric-devel/.github/workflows/cn.yml@master - nightly: - name: Nightly - if: ${{ github.repository == 'cornelisnetworks/libfabric-internal' && github.event_name == 'schedule' }} - uses: cornelisnetworks/libfabric-devel/.github/workflows/nightly.yml@master From 99f450e2658d596d25298cf84630a9c038b6deb8 Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Tue, 1 Oct 2024 20:21:52 -0500 Subject: [PATCH 161/393] prov/opx: Fix payload copy Signed-off-by: Bob Cernohous --- prov/opx/src/fi_opx_hfi1.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 05036630c03..be5b941a5df 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -4735,8 +4735,17 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); - fi_opx_copy_cacheline(replay_payload, temp); - replay_payload += 8; + + /* temp is hdr (1 QW) + payload (7 QW) */ + replay_payload[0] = temp[1]; + replay_payload[1] = temp[2]; + replay_payload[2] = temp[3]; + replay_payload[3] = temp[4]; + replay_payload[4] = temp[5]; + replay_payload[5] = temp[6]; + replay_payload[6] = temp[7]; + 
+ replay_payload += 7; uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { From 85e00a6570599d07f494061b4a32d6f54027de3b Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Wed, 2 Oct 2024 13:41:41 -0500 Subject: [PATCH 162/393] prov/opx: Fix eager and mp eager Also store pad value not buffer data Signed-off-by: Bob Cernohous --- prov/opx/include/rdma/opx/fi_opx.h | 3 +- prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h | 8 ++ .../include/rdma/opx/fi_opx_hfi1_transport.h | 80 ++++++++----- prov/opx/src/fi_opx_hfi1.c | 18 +-- prov/opx/src/fi_opx_reliability.c | 109 +++++++----------- 5 files changed, 113 insertions(+), 105 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx.h b/prov/opx/include/rdma/opx/fi_opx.h index c5e1f3a9167..306e1f87ba4 100644 --- a/prov/opx/include/rdma/opx/fi_opx.h +++ b/prov/opx/include/rdma/opx/fi_opx.h @@ -69,7 +69,8 @@ #define FI_OPX_DOMAIN_NAME "ib0" #define FI_OPX_DOMAIN_NAME_PREFIX "ib" -#define FI_OPX_CACHE_LINE_SIZE (64) +#define FI_OPX_CACHE_LINE_SIZE (64) +#define FI_OPX_CACHE_LINE_QWS (FI_OPX_CACHE_LINE_SIZE/sizeof(uint64_t)) #define FI_OPX_MAX_STRLEN (64) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h index e80420ca8f5..a2d9f06b924 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_jkr.h @@ -95,6 +95,14 @@ #define OPX_PBC_JKR_L2COMPRESSED_SHIFT 19 #define OPX_PBC_JKR_L2COMPRESSED_MASK 0x1 +/* The 16B ICRC/TAIL and pad qwords are necessary for PIO but the values are not used. + Use a poison value for pad for debug - it should not be in receive memory */ +#define OPX_JKR_16B_PAD_QWORD (uint64_t)0xDEAD00BEEF11DEAD + +/* 16B headers spill past the SOP cacheline by 1 qword. 
There's room for + payload in that 2nd non-SOP cacheline */ +#define OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS (FI_OPX_CACHE_LINE_QWS - 1) + /* Fields that unused on JKR (zero will be OR'd) */ #define OPX_PBC_JKR_UNUSED 0UL diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index fc60d9a3ad0..835633e7a68 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -637,13 +637,13 @@ void fi_opx_store_inject_and_copy_scb2_16B(volatile uint64_t scb[8], // 2nd cacheline PIO (only) padded out OPX_HFI1_BAR_STORE(&scb[0], d8); // tag - OPX_HFI1_BAR_STORE(&scb[1], 0); - OPX_HFI1_BAR_STORE(&scb[2], 0); - OPX_HFI1_BAR_STORE(&scb[3], 0); - OPX_HFI1_BAR_STORE(&scb[4], 0); - OPX_HFI1_BAR_STORE(&scb[5], 0); - OPX_HFI1_BAR_STORE(&scb[6], 0); - OPX_HFI1_BAR_STORE(&scb[7], 0); + OPX_HFI1_BAR_STORE(&scb[1], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[2], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[3], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[4], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[5], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[6], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb[7], OPX_JKR_16B_PAD_QWORD); local[8] = d8; } @@ -2365,8 +2365,8 @@ ssize_t fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(struct fi_opx_ep *opx_ep } if (hdr_and_payload_qws < 8) { /* less than a full block stored? 
pad it out */ for (; i<8 ; ++i) { - OPX_HFI1_BAR_STORE(&scb_payload[i], -1UL); - local_storage[8 + i] = -1UL; + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); + local_storage[8 + i] = OPX_JKR_16B_PAD_QWORD; } } @@ -2411,8 +2411,8 @@ ssize_t fi_opx_hfi1_tx_egr_store_full_payload_blocks(struct fi_opx_ep *opx_ep, OPX_HFI1_BAR_STORE(&scb_payload[5], buf_qws[5]); OPX_HFI1_BAR_STORE(&scb_payload[6], buf_qws[6]); OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); - scb_payload += 8; - buf_qws += 8; + scb_payload += FI_OPX_CACHE_LINE_QWS; + buf_qws += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); } @@ -2439,8 +2439,8 @@ ssize_t fi_opx_hfi1_tx_egr_store_full_payload_blocks(struct fi_opx_ep *opx_ep, OPX_HFI1_BAR_STORE(&scb_payload[5], buf_qws[5]); OPX_HFI1_BAR_STORE(&scb_payload[6], buf_qws[6]); OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); - scb_payload += 8; - buf_qws += 8; + scb_payload += FI_OPX_CACHE_LINE_QWS; + buf_qws += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); } @@ -2471,7 +2471,7 @@ ssize_t fi_opx_hfi1_tx_egr_store_payload_tail(struct fi_opx_ep *opx_ep, if (payload_qws_tail < 8) { /* less than a full block stored? pad it out */ for (; i<8; ++i) { - OPX_HFI1_BAR_STORE(&scb_payload[i], -1UL); + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); } } FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(*pio_state); @@ -2772,15 +2772,17 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, /* write one block of PIO non-SOP, either one full block (8 qws) or the partial qws/block */ const size_t first_block_qws = full_block_credits_needed ? 
8 : tail_partial_block_qws ; + #ifndef NDEBUG credits_consumed += #endif fi_opx_hfi1_tx_egr_store_packet_hdr_and_payload(opx_ep, &pio_state, local_temp, buf_qws, first_block_qws, tag); - buf_qws = buf_qws + first_block_qws - 1 /* not the kdeth qword */; + buf_qws = buf_qws + first_block_qws - 1 /* qws of payload, not the kdeth qword */; /* adjust full or partial for what we just consumed */ if (full_block_credits_needed) full_block_credits_needed--; + /* we wrote 7 qw, counts as partial tail*/ else tail_partial_block_qws = 0; @@ -2799,7 +2801,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, #endif fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, buf_qws + (full_block_credits_needed << 3), - tail_partial_block_qws); + tail_partial_block_qws - 1);// (tail_partial_block_qws-1) data + 1 QW ICRC } FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); @@ -2961,22 +2963,30 @@ ssize_t fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(struct fi_opx_ep *opx_ep, union fi_opx_hfi1_pio_state *pio_state, uint64_t *local_storage, const uint64_t tag, + const size_t payload_after_header_qws, uint64_t *buf_qws) { union fi_opx_hfi1_pio_state pio_local = *pio_state; volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_local); + assert(payload_after_header_qws <=7); // spill from 1st cacheline (SOP) OPX_HFI1_BAR_STORE(&scb_payload[0], tag); // header - local_storage[8] = tag; /* todo: pretty sure it's already there */ + local_storage[8] = tag; - int i; + int i = 1; /* start past the hdr qword */ - for (i = 1; i <= 7 ; ++i) { + /* store remaining buffer */ + for (; i <= payload_after_header_qws ; ++i) { OPX_HFI1_BAR_STORE(&scb_payload[i], buf_qws[i-1]); local_storage[8 + i] = buf_qws[i-1]; } + /* store padding if needed */ + for (; i <= 7 ; ++i) { + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); + local_storage[8 + i] = OPX_JKR_16B_PAD_QWORD; + } FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); 
@@ -3203,9 +3213,10 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common(struct fi_opx_ep *opx_ep, #ifndef NDEBUG credits_consumed += #endif - fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, tag, buf_qws); + fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, tag, + 7 /* qws of payload */, buf_qws); - buf_qws = buf_qws + 7; + buf_qws += OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS; uint32_t full_block_credits_needed = FI_OPX_MP_EGR_CHUNK_CREDITS - 3; // the last block needs to include icrc, #ifndef NDEBUG @@ -3397,8 +3408,9 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth_16B (struct fi_opx_ep *opx_ep, credits_consumed += #endif fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, - (((uint64_t) mp_egr_uid) << 32) | payload_offset, buf_qws); - buf_qws = (uint64_t*)((uintptr_t)buf + 56); + (((uint64_t) mp_egr_uid) << 32) | payload_offset, + 7 /* qws of payload */, buf_qws); + buf_qws += OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS; uint16_t full_block_credits_needed = FI_OPX_MP_EGR_CHUNK_CREDITS - 3; #ifndef NDEBUG @@ -3429,8 +3441,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_nth_16B (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_state->qw0 = pio_state.qw0; fi_opx_hfi1_tx_send_egr_write_replay_data(opx_ep, addr, replay, psn_ptr, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf, - FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); + FI_OPX_MP_EGR_CHUNK_PAYLOAD_TAIL, local_temp, buf, + FI_OPX_MP_EGR_CHUNK_PAYLOAD_QWS(hfi1_type), reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND-MP-EAGER-NTH-HFI"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3667,17 +3679,27 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, #endif fi_opx_hfi1_tx_mp_egr_write_nth_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, lrh_qws, pbc_dlid, pbc_dws, xfer_bytes_tail, payload_offset, psn, mp_egr_uid, hfi1_type); - uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + 
xfer_bytes_tail); + uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); + + /* write 7 qwords of payload data or the partial tail qws/block minus hdr/kdeth minus tail (not in buffer) */ + const size_t payload_after_hdr_qws = full_block_credits_needed ? + OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS : + tail_partial_block_qws - kdeth9_qws_total - tail_qws_total ; /* header and payload */ #ifndef NDEBUG credits_consumed += #endif fi_opx_hfi1_tx_mp_egr_store_hdr_and_payload(opx_ep, &pio_state, local_temp, - (((uint64_t) mp_egr_uid) << 32) | payload_offset, buf_qws); - buf_qws = (uint64_t*)((uintptr_t)buf + 56); + (((uint64_t) mp_egr_uid) << 32) | payload_offset, + payload_after_hdr_qws, buf_qws); + + buf_qws += payload_after_hdr_qws /* qws of payload, not the kdeth qword */; + /* adjust full or partial for what we just consumed */ if (full_block_credits_needed) full_block_credits_needed--; + /* we wrote 7 qw, counts as partial tail*/ + else tail_partial_block_qws = 0; if (OFI_LIKELY(full_block_credits_needed)) { #ifndef NDEBUG @@ -3695,7 +3717,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_last_16B (struct fi_opx_ep *opx_ep, #endif fi_opx_hfi1_tx_egr_store_payload_tail(opx_ep, &pio_state, buf_qws + (full_block_credits_needed << 3), - tail_partial_block_qws); + tail_partial_block_qws - 1);// (tail_partial_block_qws-1) data + 1 QW ICRC } } diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index be5b941a5df..d426622b707 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -3908,7 +3908,7 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz uint64_t rem_payload_size; if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { fi_opx_copy_cacheline(replay_payload, local_temp_payload); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; rem_payload_size = sizeof(struct fi_opx_hmem_iov) * (niov - 2); } else { local_temp[7] = local_temp_payload[0]; @@ -4266,7 +4266,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv 
(struct fid_ep *ep, assert(!replay->use_iov); assert(((uint8_t *)replay_payload) == ((uint8_t *)&replay->data)); fi_opx_copy_cacheline(replay_payload, temp); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { @@ -4309,7 +4309,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, sbuf_qw += immediate_qw_count; fi_opx_copy_cacheline(replay_payload, temp); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; /* consume one credit for the rendezvous payload immediate data */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); @@ -4362,7 +4362,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, fi_opx_store_scb_qw(scb_payload, align_tmp.immediate_qw); fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG @@ -4745,7 +4745,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, replay_payload[5] = temp[6]; replay_payload[6] = temp[7]; - replay_payload += 7; + replay_payload += OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS; uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { @@ -4787,7 +4787,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, sbuf_qw += immediate_qw_count; fi_opx_copy_cacheline(replay_payload, temp); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; /* consume one credit for the rendezvous payload immediate data */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); @@ -4805,7 +4805,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, fi_opx_store_scb_qw(scb_payload, temp_0); fi_opx_copy_cacheline(replay_payload, temp_0); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG @@ -4872,7 +4872,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, fi_opx_store_scb_qw(scb_payload, align_tmp.immediate_qw); 
fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG @@ -4887,7 +4887,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); fi_opx_store_scb_qw(scb_payload, temp_0); fi_opx_copy_cacheline(replay_payload, temp_0); - replay_payload += 8; + replay_payload += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index 9a077bb121e..f05e285b144 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -478,13 +478,13 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); OPX_HFI1_BAR_STORE(&scb_payload[0], key); - OPX_HFI1_BAR_STORE(&scb_payload[1], 0); - OPX_HFI1_BAR_STORE(&scb_payload[2], 0); - OPX_HFI1_BAR_STORE(&scb_payload[3], 0); - OPX_HFI1_BAR_STORE(&scb_payload[4], 0); - OPX_HFI1_BAR_STORE(&scb_payload[5], 0); - OPX_HFI1_BAR_STORE(&scb_payload[6], 0); - OPX_HFI1_BAR_STORE(&scb_payload[7], 0); + OPX_HFI1_BAR_STORE(&scb_payload[1], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[2], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[3], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[4], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[5], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[6], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb_payload[7], OPX_JKR_16B_PAD_QWORD); FI_OPX_HFI1_CONSUME_CREDITS(pio_state, 1); } @@ -727,16 +727,13 @@ ssize_t fi_opx_hfi1_tx_reliability_inject (struct fid_ep *ep, FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); OPX_HFI1_BAR_STORE(&scb2[0], key); - - //fi_opx_compiler_msync_writes(); - - OPX_HFI1_BAR_STORE(&scb2[1], 0); - 
OPX_HFI1_BAR_STORE(&scb2[2], 0); - OPX_HFI1_BAR_STORE(&scb2[3], 0); - OPX_HFI1_BAR_STORE(&scb2[4], 0); - OPX_HFI1_BAR_STORE(&scb2[5], 0); - OPX_HFI1_BAR_STORE(&scb2[6], 0); - OPX_HFI1_BAR_STORE(&scb2[7], 0); + OPX_HFI1_BAR_STORE(&scb2[1], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[2], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[3], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[4], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[5], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[6], OPX_JKR_16B_PAD_QWORD); + OPX_HFI1_BAR_STORE(&scb2[7], OPX_JKR_16B_PAD_QWORD); FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR(opx_ep->tx->pio_credits_addr); @@ -1561,16 +1558,19 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service key.tx = (uint32_t)FI_OPX_HFI1_PACKET_ORIGIN_TX(OPX_REPLAY_HDR(replay)); key.rx = (uint32_t)(OPX_REPLAY_HDR(replay)->bth.rx); #endif + /* runtime checks for non-inlined functions */ + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; /* reported in LRH as the number of 4-byte words in the packet; header + payload + icrc */ uint16_t lrh_pktlen_le; - size_t total_bytes_to_copy; - size_t payload_bytes_to_copy; - /* runtime checks for non-inlined functions */ - const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; + size_t total_bytes_to_copy; + size_t payload_bytes_to_copy; /* payload without (16B) icrc tail */ uint16_t payload_credits_needed; - int payload_qw_to_copy_with_header = 0; + + uint32_t payload_qw_to_copy_with_header = 0; + bool tail_block_needed = false; /* 16B tail needed */ + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { lrh_pktlen_le = ntohs(replay->scb.scb_9B.hdr.lrh_9B.pktlen); total_bytes_to_copy = (lrh_pktlen_le - 1) * 4; /* do not copy the trailing icrc */ @@ -1579,12 +1579,15 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service } else { lrh_pktlen_le = replay->scb.scb_16B.hdr.lrh_16B.pktlen; total_bytes_to_copy = 
(lrh_pktlen_le) * 8; /* including trailing icrc */ - payload_bytes_to_copy = (total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B)); + /* do not copy icrc, it is "pad" not user data */ + payload_bytes_to_copy = (total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B) - 8 /* icrc */); payload_qw_to_copy_with_header = MIN((7*8), payload_bytes_to_copy)>>3; /* up to 7 qwords */ - assert(payload_bytes_to_copy >= payload_qw_to_copy_with_header*8); + assert(payload_bytes_to_copy >= payload_qw_to_copy_with_header * 8); payload_bytes_to_copy -= payload_qw_to_copy_with_header<<3; - /* ICRC/tail qword is already accounted for in the lrh */ - payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload */ + payload_credits_needed = (payload_bytes_to_copy >> 6); /* number of full 64-byte blocks of payload - icrc */ + if (payload_qw_to_copy_with_header >= 7) { /* if tail is not in with hdr/payload block */ + tail_block_needed = true; /* tail needed even if there's no partial payload block*/ + } } union fi_opx_hfi1_pio_state pio_state = *service->tx.hfi1.pio_state; @@ -1592,9 +1595,10 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service /* Non-inlined functions should just use the runtime HFI1 type check, no optimizations */ const uint16_t credits_needed = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? 1 : 2; + const uint16_t last_partial_block = (((payload_bytes_to_copy & 0x3Ful) || tail_block_needed) ? 1 : 0); const uint16_t total_credits_needed = credits_needed + /* header */ payload_credits_needed + /* full payload blocks */ - ((payload_bytes_to_copy & 0x3Ful) ? 
1 : 0); /* last partial block */ + last_partial_block ; /* last partial block */ uint16_t total_credits_available = FI_OPX_HFI1_AVAILABLE_RELIABILITY_CREDITS(pio_state); if (total_credits_available < total_credits_needed) { FI_OPX_HFI1_UPDATE_CREDITS(pio_state, service->tx.hfi1.pio_credits_addr); @@ -1642,7 +1646,6 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service } fi_opx_hfi1_reliability_iov_payload_check(replay, key.value, "Replaying packet (PIO) where source buffer has changed!", __FILE__, __func__, __LINE__); #endif - /* TODO if using user iov we can't go past their buffer for the tail */ buf_qws = replay->iov[0].iov_base; } else { buf_qws = replay->payload; @@ -1701,7 +1704,7 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service buf_qws += 1; } for (i = payload_qw_to_copy_with_header+1; i <= 7 ; ++i) { - OPX_HFI1_BAR_STORE(&scb_payload[i], 0UL); + OPX_HFI1_BAR_STORE(&scb_payload[i], OPX_JKR_16B_PAD_QWORD); } @@ -1714,12 +1717,8 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service consumed_credits = 2; #endif } - - /* Skip last block if there a partial 64-byte block of payload */ - const uint16_t last_partial_block = (payload_bytes_to_copy & 0x3Ful) ? 
1 : 0; + /* Copy full blocks of payload */ while (payload_credits_needed) { - /* TODO if using user iov we can't go past their buffer for the tail */ - volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); @@ -1744,8 +1743,8 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service OPX_HFI1_BAR_STORE(&scb_payload[6], buf_qws[6]); OPX_HFI1_BAR_STORE(&scb_payload[7], buf_qws[7]); - scb_payload += 8; - buf_qws += 8; + scb_payload += FI_OPX_CACHE_LINE_QWS; + buf_qws += FI_OPX_CACHE_LINE_QWS; } payload_credits_needed -= contiguous_full_blocks_to_write; @@ -1757,16 +1756,15 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service } /* Store last partial 64-byte block of payload */ if(last_partial_block != 0) { - /* TODO if using user iov we can't go past their buffer for the tail */ - int16_t tail_bytes = (payload_bytes_to_copy & 0x3Ful) ; + int16_t payload_tail_bytes = (payload_bytes_to_copy & 0x3Ful) ; /* not icrc/pad */ /* We have a credit so we don't have to worry about this wrapping on one block */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); uint16_t i = 0; - for ( ; tail_bytes >= 8; tail_bytes-=8) { + for ( ; payload_tail_bytes >= 8; payload_tail_bytes-=8) { OPX_HFI1_BAR_STORE(scb_payload, *buf_qws); scb_payload += 1; @@ -1775,22 +1773,22 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service } /* LRH packets are dword (9B) or qword (16b) aligned */ - assert((tail_bytes == 4) || (tail_bytes == 0)); + assert((payload_tail_bytes == 4) || (payload_tail_bytes == 0)); if (hfi1_type != OPX_HFI1_JKR) { - if (tail_bytes) { + if (payload_tail_bytes) { OPX_HFI1_BAR_STORE(scb_payload, ((*buf_qws))); scb_payload += 1; i++; } } else { /* QWORD aligned for 16B */ - assert(tail_bytes == 0); - /* TODO if using user iov we can't go past their buffer for the tail */ - /* assert(i<-8); // left a pad 
for tail */ + assert(payload_tail_bytes == 0); + /* Have not yet stored icrc/pad */ + assert(i < 8); } /* Pad out the cacheline/block */ for (; i <8; i++) { - OPX_HFI1_BAR_STORE(scb_payload, 0UL); + OPX_HFI1_BAR_STORE(scb_payload, OPX_JKR_16B_PAD_QWORD); scb_payload += 1; } @@ -1802,27 +1800,6 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service #ifndef NDEBUG consumed_credits += 1; #endif - } else if (0 /* TODO */ ) /* (hfi1_type & OPX_HFI1_JKR) */ { - /* TODO if using user iov we can't go past their buffer for the tail */ - - /* The padding counted as a tail above but if we wrote - * all full blocks of payload, we need to write another - * block just to send a tail qword - */ - volatile uint64_t * scb_payload = - FI_OPX_HFI1_PIO_SCB_HEAD(service->tx.hfi1.pio_scb_first, pio_state); - for (int i = 0; i <8; i++) { - OPX_HFI1_BAR_STORE(scb_payload, 0UL); - scb_payload += 1; - } - FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); - - /* consume one credit for the tail partial block payload */ - --total_credits_available; - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); -#ifndef NDEBUG - consumed_credits += 1; -#endif } #ifndef NDEBUG From 4eebbb33022686330ddcb17e67bc54de270ebd05 Mon Sep 17 00:00:00 2001 From: Thomas Huber Date: Fri, 4 Oct 2024 16:24:11 -0400 Subject: [PATCH 163/393] prov/opx: Fix last_bytes field for replay over sdma Signed-off-by: Thomas Huber --- prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index d39f59993b0..76ce0b47455 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -561,7 +561,7 @@ int opx_hfi1_sdma_enqueue_replay(struct fi_opx_ep *opx_ep, FI_OPX_HFI1_SDMA_REQ_HEADER_EAGER_FIXEDBITS, replay->hmem_iface, replay->hmem_device, - 0, // last packet bytes unused for replays + 
replay->scb.scb_9B.hdr.dput.target.bytes, // last packet bytes 0 // kdeth tid info unused for replays ); } From f27d721c20abc8f98c6b2a77b3ac67e4325e14f8 Mon Sep 17 00:00:00 2001 From: Elias Kozah Date: Wed, 9 Oct 2024 09:56:27 -0400 Subject: [PATCH 164/393] prov/opx: fi_info -e fix for FI_OPX_UUID env var Signed-off-by: Elias Kozah --- prov/opx/src/fi_opx_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 2322936ad0a..4c03a191bbe 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -691,7 +691,7 @@ OPX_INI fi_opx_init = 1; - fi_param_define(&fi_opx_provider, "uuid", FI_PARAM_STRING, "Globally unique ID for preventing OPX jobs from conflicting either in shared memory or over the OPX fabric. Defaults to \"%s\"", + fi_param_define(&fi_opx_provider, "uuid", FI_PARAM_STRING, "Globally unique ID for preventing OPX jobs from conflicting either in shared memory or over the OPX fabric. Defaults to the Slurm job ID if one exists, otherwise defaults to Intel MPI UUID if one exists, otherwise defaults to \"%s\"", OPX_DEFAULT_JOB_KEY_STR); fi_param_define(&fi_opx_provider, "force_cpuaffinity", FI_PARAM_BOOL, "Causes the thread to bind itself to the cpu core it is running on. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "reliability_service_usec_max", FI_PARAM_INT, "The number of microseconds between pings for un-acknowledged packets. 
Defaults to 500 usec."); From b75a0be12b5d728d3c62ba0f622813a1837388eb Mon Sep 17 00:00:00 2001 From: Elias Kozah Date: Wed, 9 Oct 2024 09:57:50 -0400 Subject: [PATCH 165/393] prov/opx: Investigate and address indeterminate behavior or segfault resulting from ignored context creation error Signed-off-by: Elias Kozah --- prov/opx/include/rdma/opx/fi_opx_rma.h | 8 +---- prov/opx/src/fi_opx_ep.c | 6 +++- prov/opx/src/fi_opx_reliability.c | 50 +------------------------- 3 files changed, 7 insertions(+), 57 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index 4f66d9c2885..c088686fae9 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -230,19 +230,13 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, params->payload_bytes_for_iovec = 0; params->target_hfi_unit = opx_dst_addr.hfi1_unit; - /* Possible SHM connections required for certain applications (i.e., DAOS) - * exceeds the max value of the legacy u8_rx field. Use u32_extended field. 
- */ - ssize_t rc = fi_opx_shm_dynamic_tx_connect(params->is_intranode, opx_ep, params->u32_extended_rx, opx_dst_addr.hfi1_unit); - assert(rc == FI_SUCCESS); - fi_opx_hfi1_dput_sdma_init(opx_ep, params, iov->len, 0, 0, NULL, is_hmem); FI_OPX_DEBUG_COUNTERS_INC_COND(is_hmem && params->is_intranode, opx_ep->debug_counters.hmem.rma_write_intranode); FI_OPX_DEBUG_COUNTERS_INC_COND(is_hmem && !params->is_intranode, opx_ep->debug_counters.hmem.rma_write_hfi); - rc = params->work_elem.work_fn(work); + ssize_t rc = params->work_elem.work_fn(work); if (rc == FI_SUCCESS) { assert(params->work_elem.complete); OPX_BUF_FREE(work); diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index b4868d49e97..3cd5eabfdae 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -1414,9 +1414,13 @@ static int fi_opx_ep_rx_init (struct fi_opx_ep *opx_ep) snprintf(buffer,sizeof(buffer),"%s-%02x.%d", opx_domain->unique_job_key_str, hfi_unit, inst); - opx_shm_rx_init(&opx_ep->rx->shm, fi_opx_global.prov, + ssize_t rc = opx_shm_rx_init(&opx_ep->rx->shm, fi_opx_global.prov, (const char *)buffer, rx_index, FI_OPX_SHM_FIFO_SIZE, FI_OPX_SHM_PACKET_SIZE); + if (OFI_UNLIKELY(rc != FI_SUCCESS)) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, "Shared memory initialization failed.\n"); + goto err; + } } /* Now that endpoint is complete enough to have context information from the hfi, diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index f05e285b144..8b46b8fac14 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -2507,55 +2507,7 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser { uint8_t origin_reliability_rx = (uint8_t)-1; - if (OFI_RELIABILITY_KIND_OFFLOAD == reliability_kind) { - - assert (hfi1 == NULL); - - service->reliability_kind = reliability_kind; - - /* - * open the hfi1 context, determines JKR or WFR - */ - service->context = fi_opx_hfi1_context_open(NULL, 
unique_job_key); - FI_INFO(fi_opx_global.prov, FI_LOG_EP_DATA, - "Opened hfi %p, HFI type %#X/%#X, unit %#X, port %#X, ref_cnt %#lX," - " rcv ctxt %#X, send ctxt %#X, \n", - service->context, service->context->hfi_hfi1_type, OPX_HFI1_TYPE, - service->context->hfi_unit, service->context->hfi_port, - service->context->ref_cnt, - service->context->ctrl->ctxt_info.ctxt, - service->context->ctrl->ctxt_info.send_ctxt); - - assert (service->context != NULL); - - hfi1 = service->context; - init_hfi1_rxe_state(hfi1, &service->rx.hfi1.state); - - service->lid_be = (uint32_t)htons(hfi1->lid); - - /* - * COPY the rx static information from the hfi context structure. - * This is to improve cache layout. - */ - service->rx.hfi1.hdrq.rhf_base = hfi1->info.rxe.hdrq.rhf_base; - service->rx.hfi1.hdrq.head_register = hfi1->info.rxe.hdrq.head_register; - service->rx.hfi1.egrq.base_addr = hfi1->info.rxe.egrq.base_addr; - service->rx.hfi1.egrq.elemsz = hfi1->info.rxe.egrq.elemsz; - service->rx.hfi1.egrq.last_egrbfr_index = 0; - service->rx.hfi1.egrq.head_register = hfi1->info.rxe.egrq.head_register; - - - /* the 'state' fields will change after every tx operation */ - service->tx.hfi1.pio_state = &hfi1->state.pio; - - /* the 'info' fields do not change; the values can be safely copied */ - service->tx.hfi1.pio_scb_sop_first = hfi1->info.pio.scb_sop_first; - service->tx.hfi1.pio_scb_first = hfi1->info.pio.scb_first; - service->tx.hfi1.pio_credits_addr = hfi1->info.pio.credits_addr; - - origin_reliability_rx = hfi1->info.rxe.id; - - } else if (OFI_RELIABILITY_KIND_ONLOAD == reliability_kind) { + if (OFI_RELIABILITY_KIND_ONLOAD == reliability_kind) { assert(hfi1 != NULL); From 890c20115a5af8b3cbd0099077a912a59d07db6c Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Wed, 9 Oct 2024 14:33:11 -0500 Subject: [PATCH 166/393] prov/opx: Include less immediate data in RTS packet to improve rendezvous performance Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 306 
++++++------- .../opx/include/rdma/opx/fi_opx_hfi1_packet.h | 146 ++----- .../include/rdma/opx/fi_opx_hfi1_transport.h | 3 +- prov/opx/src/fi_opx_hfi1.c | 406 ++++++++---------- prov/opx/src/fi_opx_init.c | 2 +- 5 files changed, 363 insertions(+), 500 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index a6b76bf0cba..59b8d5d3d6e 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -968,6 +968,61 @@ void fi_opx_enqueue_completed(struct slist *queue, struct opx_context *context, slist_insert_tail((struct slist_entry *) context, queue); } +__OPX_FORCE_INLINE__ +void opx_ep_copy_immediate_data(struct fi_opx_ep * opx_ep, + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info, + struct opx_payload_rzv_contig *contiguous, + const uint64_t immediate_byte_count, + const uint64_t immediate_qw_count, + const uint64_t immediate_block, + const uint64_t immediate_tail, + const uint64_t immediate_total, + const size_t xfer_len, + const uint64_t is_hmem, + const enum fi_hmem_iface rbuf_iface, + const uint64_t rbuf_device, + const uint64_t hmem_handle, + uint8_t *rbuf_in) +{ + uint8_t *rbuf = is_hmem ? opx_ep->hmem_copy_buf : rbuf_in; + + for (int i = 0; i < immediate_byte_count; ++i) { + rbuf[i] = contiguous->immediate_byte[i]; + } + rbuf += immediate_byte_count; + + uint64_t * rbuf_qw = (uint64_t *)rbuf; + for (int i = 0; i < immediate_qw_count; ++i) { + rbuf_qw[i] = contiguous->immediate_qw[i]; + } + rbuf += immediate_qw_count * sizeof(uint64_t); + + if (immediate_block) { + const uint64_t immediate_fragment = (immediate_byte_count || immediate_qw_count) ? 
1 : 0; + memcpy(rbuf, (void *) (&contiguous->cache_line_1 + immediate_fragment), FI_OPX_CACHE_LINE_SIZE); + } + + if (is_hmem && immediate_total) { + opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, + rbuf_in, opx_ep->hmem_copy_buf, immediate_total, + OPX_HMEM_DEV_REG_RECV_THRESHOLD); + } + + if (immediate_tail) { + uint8_t *rbuf_start = rbuf_in + xfer_len - OPX_IMMEDIATE_TAIL_BYTE_COUNT; + + if (!is_hmem) { + for (int i = 0; i < OPX_IMMEDIATE_TAIL_BYTE_COUNT; ++i) { + rbuf_start[i] = immediate_info.tail_bytes[i]; + } + } else { + opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, rbuf_start, + immediate_info.tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, + OPX_HMEM_DEV_REG_RECV_THRESHOLD); + } + } +} + __OPX_FORCE_INLINE__ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, const union fi_opx_hfi1_packet_payload * const payload, @@ -980,14 +1035,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, const uint64_t is_hmem, const int lock_required, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type, - const uintptr_t origin_byte_counter_vaddr, - const struct fi_opx_hmem_iov *iov, - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info, - const struct fi_opx_hmem_iov *src_dst_iov, - const uint8_t * const immediate_byte, - const uint64_t * const immediate_qw, - const union cacheline * const immediate_block) + const enum opx_hfi1_type hfi1_type) { assert( (opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) || (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); @@ -1028,14 +1076,14 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, hdr, payload, u8_rx, niov, - origin_byte_counter_vaddr, + payload->rendezvous.noncontiguous.origin_byte_counter_vaddr, context, (uintptr_t)(rbuf), /* receive buffer virtual address */ FI_HMEM_SYSTEM, /* receive buffer iface */ 0UL, /* receive buffer device */ 0UL, /* immediate_data */ 0UL, /* immediate_end_block_count */ - iov, + 
&payload->rendezvous.noncontiguous.iov[0], FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG, is_intranode, reliability, /* compile-time constant expression */ @@ -1044,16 +1092,35 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, } else { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.recv.multi_recv_rzv_contig); assert(niov == 1); - const uint64_t immediate_byte_count = immediate_info.byte_count; - const uint64_t immediate_qw_count = immediate_info.qw_count; - const uint64_t immediate_block_count = immediate_info.block_count; - const uint64_t immediate_total = immediate_byte_count + - immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - const uint64_t immediate_end_block_count = immediate_info.end_block_count; + struct opx_payload_rzv_contig *contiguous = (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) + ? (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous + : (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous_16B; + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + .qw0 = contiguous->immediate_info + }; + const uint64_t immediate_byte_count = (immediate_info.count & OPX_IMMEDIATE_BYTE_COUNT_MASK) + >> OPX_IMMEDIATE_BYTE_COUNT_SHIFT; + const uint64_t immediate_qw_count = (immediate_info.count & OPX_IMMEDIATE_QW_COUNT_MASK) + >> OPX_IMMEDIATE_QW_COUNT_SHIFT; + const uint64_t immediate_block = (immediate_info.count & OPX_IMMEDIATE_BLOCK_MASK) + >> OPX_IMMEDIATE_BLOCK_SHIFT; + const uint64_t immediate_tail = (immediate_info.count & OPX_IMMEDIATE_TAIL_MASK) + >> OPX_IMMEDIATE_TAIL_SHIFT; + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block * sizeof(union cacheline); + + const struct fi_opx_hmem_iov src_dst_iov[1] = { + { + .buf = contiguous->src_vaddr, + .len = (contiguous->src_blocks << 6), + .device = contiguous->src_device_id, + .iface = (enum fi_hmem_iface) contiguous->src_iface + } + }; 
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", - immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); + immediate_total, immediate_byte_count, immediate_qw_count, immediate_block); context->byte_counter -= immediate_total; @@ -1061,13 +1128,13 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, hdr, payload, u8_rx, niov, - origin_byte_counter_vaddr, + contiguous->origin_byte_counter_vaddr, context, (uintptr_t)(rbuf + immediate_total), /* receive buffer virtual address */ FI_HMEM_SYSTEM, /* receive buffer iface */ 0UL, /* receive buffer device */ immediate_total, - immediate_end_block_count, + immediate_tail, src_dst_iov, FI_OPX_HFI_DPUT_OPCODE_RZV, is_intranode, @@ -1075,41 +1142,10 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, u32_ext_rx, hfi1_type); - /* - * copy the immediate payload data - */ - unsigned i; - - if (immediate_byte_count) { - for (i=0; irendezvous.contiguous + : (struct opx_payload_rzv_contig *) &payload->rendezvous.contiguous_16B; + const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { + .qw0 = contiguous->immediate_info + }; + const uint64_t immediate_byte_count = (immediate_info.count & OPX_IMMEDIATE_BYTE_COUNT_MASK) + >> OPX_IMMEDIATE_BYTE_COUNT_SHIFT; + const uint64_t immediate_qw_count = (immediate_info.count & OPX_IMMEDIATE_QW_COUNT_MASK) + >> OPX_IMMEDIATE_QW_COUNT_SHIFT; + const uint64_t immediate_block = (immediate_info.count & OPX_IMMEDIATE_BLOCK_MASK) + >> OPX_IMMEDIATE_BLOCK_SHIFT; + const uint64_t immediate_tail = (immediate_info.count & OPX_IMMEDIATE_TAIL_MASK) + >> OPX_IMMEDIATE_TAIL_SHIFT; + const uint64_t immediate_total = immediate_byte_count + + immediate_qw_count * sizeof(uint64_t) + + immediate_block * sizeof(union cacheline); + + const struct fi_opx_hmem_iov src_dst_iov[1] = { + { + .buf = 
contiguous->src_vaddr, + .len = (contiguous->src_blocks << 6), + .device = contiguous->src_device_id, + .iface = (enum fi_hmem_iface) contiguous->src_iface + } + }; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE RZV_RTS immediate_total %#lX, immediate_byte_count %#lX, immediate_qw_count %#lX, immediate_block_count %#lX\n", - immediate_total, immediate_byte_count, immediate_qw_count, immediate_block_count); + immediate_total, immediate_byte_count, immediate_qw_count, immediate_block); context->byte_counter = xfer_len - immediate_total; FI_OPX_FABRIC_RX_RZV_RTS(opx_ep, hdr, payload, u8_rx, 1, - origin_byte_counter_vaddr, + contiguous->origin_byte_counter_vaddr, context, (uintptr_t) (rbuf + immediate_total), rbuf_iface, rbuf_device, immediate_total, - immediate_end_block_count, + immediate_tail, src_dst_iov, FI_OPX_HFI_DPUT_OPCODE_RZV, is_intranode, @@ -1189,63 +1244,10 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, u32_ext_rx, hfi1_type); - /* - * copy the immediate payload data - */ - if (is_hmem) { - rbuf = opx_ep->hmem_copy_buf; - } - unsigned i; - - if (immediate_byte_count) { - for (i=0; ihmem_copy_buf) + - (immediate_block_count * sizeof(union cacheline)); - if (immediate_total) { - opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, - recv_buf, opx_ep->hmem_copy_buf, immediate_total, - OPX_HMEM_DEV_REG_RECV_THRESHOLD); - } - } - - /* up to 1 block of immediate end data after the immediate blocks - Copy this to the end of rbuf */ - if (immediate_end_block_count) { - uint8_t *rbuf_start = (uint8_t *)recv_buf; - rbuf_start += xfer_len - (immediate_end_block_count << 6); - if (!is_hmem) { - memcpy(rbuf_start, - immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6)); - } else { - opx_copy_to_hmem(rbuf_iface, rbuf_device, hmem_handle, rbuf_start, - immediate_block[immediate_block_count].qw, - (immediate_end_block_count << 6), - OPX_HMEM_DEV_REG_RECV_THRESHOLD); - } - } - + 
opx_ep_copy_immediate_data(opx_ep, immediate_info, contiguous, immediate_byte_count, + immediate_qw_count, immediate_block, immediate_tail, + immediate_total, xfer_len, is_hmem, rbuf_iface, + rbuf_device, hmem_handle, rbuf); } else { /*fi_opx_hfi1_dump_packet_hdr(hdr, __func__, __LINE__); */ FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1275,6 +1277,11 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, assert(payload != NULL); + uintptr_t origin_byte_counter_vaddr = is_noncontig ? + payload->rendezvous.noncontiguous.origin_byte_counter_vaddr : + (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) ? + payload->rendezvous.contiguous.origin_byte_counter_vaddr : + payload->rendezvous.contiguous_16B.origin_byte_counter_vaddr; FI_OPX_FABRIC_RX_RZV_RTS_ETRUNC(opx_ep, (const void * const)hdr, u8_rx, @@ -1842,61 +1849,10 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-MP-EAGER-NTH"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET EAGER NTH byte counter %lu (end)\n",context->byte_counter); - } else if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { /* 9B rendezvous packet */ - union fi_opx_hfi1_packet_payload *p = (union fi_opx_hfi1_packet_payload *) payload; - - const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; - - uintptr_t origin_byte_counter_vaddr = (is_noncontig == 1) ? 
p->rendezvous.noncontiguous.origin_byte_counter_vaddr : - p->rendezvous.contiguous.origin_byte_counter_vaddr; - - struct fi_opx_hmem_iov *iov = &p->rendezvous.noncontiguous.iov[0]; - - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .qw0 = p->rendezvous.contiguous.immediate_info - }; - const struct fi_opx_hmem_iov src_dst_iov = { - .buf = p->rendezvous.contiguous.src_vaddr, - .len = (p->rendezvous.contiguous.src_blocks << 6), - .device = p->rendezvous.contiguous.src_device_id, - .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - }; - const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; - const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; - const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.qw_count + 63) >> 6); - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - - fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, - context, is_multi_receive, is_intranode, is_hmem, - lock_required, reliability, hfi1_type, origin_byte_counter_vaddr, - iov, immediate_info, &src_dst_iov, immediate_byte, immediate_qw, immediate_block); - - } else { /* (hfi1_type & OPX_HFI1_JKR) 16B rendezvous packet */ - union fi_opx_hfi1_packet_payload_16B *p = (union fi_opx_hfi1_packet_payload_16B *) payload; - const uint64_t is_noncontig = hdr->rendezvous.flags & FI_OPX_PKT_RZV_FLAGS_NONCONTIG; - uintptr_t origin_byte_counter_vaddr = (is_noncontig == 1) ? 
p->rendezvous.noncontiguous.origin_byte_counter_vaddr : - p->rendezvous.contiguous.origin_byte_counter_vaddr; - struct fi_opx_hmem_iov *iov = &p->rendezvous.noncontiguous.iov[0]; - const union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .qw0 = p->rendezvous.contiguous.immediate_info - }; - - const struct fi_opx_hmem_iov src_dst_iov = { - .buf = p->rendezvous.contiguous.src_vaddr, - .len = (p->rendezvous.contiguous.src_blocks << 6), - .device = p->rendezvous.contiguous.src_device_id, - .iface = (enum fi_hmem_iface) p->rendezvous.contiguous.src_iface - - }; - const uint8_t * const immediate_byte = p->rendezvous.contiguous.immediate_byte; - const uint64_t * const immediate_qw = p->rendezvous.contiguous.immediate_qw; - const uint64_t immediate_fragment = ((immediate_info.byte_count + immediate_info.qw_count + 63) >> 6); - const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; - + } else { fi_opx_handle_recv_rts(hdr, payload, opx_ep, origin_tag, opcode, context, is_multi_receive, is_intranode, is_hmem, - lock_required, reliability, hfi1_type, origin_byte_counter_vaddr, - iov, immediate_info, &src_dst_iov, immediate_byte, immediate_qw, immediate_block); + lock_required, reliability, hfi1_type); } FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "\n"); } diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index b0d826e3c60..99685dadca5 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -1588,17 +1588,26 @@ struct fi_opx_hmem_iov { - (4 * sizeof(uint32_t))) \ / sizeof(uint32_t)) +#define OPX_IMMEDIATE_BYTE_COUNT_SHIFT (5) +#define OPX_IMMEDIATE_BYTE_COUNT_MASK (0xE0) +#define OPX_IMMEDIATE_QW_COUNT_SHIFT (2) +#define OPX_IMMEDIATE_QW_COUNT_MASK (0x1C) +#define OPX_IMMEDIATE_BLOCK_SHIFT (1) +#define OPX_IMMEDIATE_BLOCK_MASK (0x02) +#define OPX_IMMEDIATE_TAIL_SHIFT (0) +#define 
OPX_IMMEDIATE_TAIL_MASK (0x01) +#define OPX_IMMEDIATE_TAIL_BYTE_COUNT (7) + union fi_opx_hfi1_rzv_rts_immediate_info { uint64_t qw0; struct { - uint8_t byte_count; /* only need 3 bits (0..7 bytes) */ - uint8_t qw_count; /* only need 3 bits (0..7 quadwords) */ - uint8_t block_count; /* only need 1 bits (0 or 1) */ - uint8_t end_block_count;/* only need 1 bits (0 or 1) */ - uint32_t unused; + uint8_t tail_bytes[7]; + uint8_t count; }; }; +static_assert(sizeof(((union fi_opx_hfi1_rzv_rts_immediate_info *)0)->tail_bytes) == OPX_IMMEDIATE_TAIL_BYTE_COUNT, + "sizeof(immediate_info->tail_bytes) must be equal to OPX_IMMEDIATE_TAIL_BYTE_COUNT!"); /* Cache "blocked" payloads in 16B are currently "tricky". * The sender will always send 1 QW of header after SOP so STORE'ing @@ -1612,124 +1621,49 @@ union fi_opx_hfi1_rzv_rts_immediate_info { * fi_opx_init_hfi_lookupoptionally STORE(icrc/tail) if no more immediate data * * STORE(full block of immediate fragment unaligned data) - * STORe(full block of immediate data) + * STORE(full block of immediate data) * STORE(full block of immediate end data) * STORE(icrc/tail) */ -union fi_opx_hfi1_packet_payload_16B { - uint8_t byte[FI_OPX_HFI1_PACKET_MTU]; - uint64_t qw[FI_OPX_HFI1_PACKET_MTU>>3]; - union { - struct { - /* ==== CACHE LINE 0 ==== */ - - uintptr_t src_vaddr; - uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ - uint64_t src_device_id; - uint64_t src_iface; - uint64_t immediate_info; - uintptr_t origin_byte_counter_vaddr; - uint64_t unused[1]; - - /* Not cacheline aligned after the first block */ - union { - struct { - uint8_t immediate_byte[8]; - uint64_t immediate_qw[7]; - }; - - union cacheline cache_line_1; - }; - - union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; - - } contiguous; - struct { - /* ==== CACHE LINE 0 ==== */ - uintptr_t src_vaddr; - uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ - uint64_t src_device_id; - uint64_t src_iface; - 
uint64_t immediate_info; - uintptr_t origin_byte_counter_vaddr; - uint64_t unused[1]; - - union { - struct { - uint8_t immediate_byte[8]; - uint64_t immediate_qw[7]; - }; +struct opx_payload_rzv_contig { + /* ==== CACHE LINE 0 ==== */ - union cacheline cache_line_1; - }; - - union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; + uintptr_t src_vaddr; + uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ + uint64_t src_device_id; + uint64_t src_iface; + uint64_t immediate_info; + uintptr_t origin_byte_counter_vaddr; + uint64_t unused; - } contiguous_16B; + /* ==== CACHE LINE 1 (WFR/9B only) ==== */ + union { struct { - /* ==== CACHE LINE 0 ==== */ - - uintptr_t origin_byte_counter_vaddr; - struct fi_opx_hmem_iov iov[2]; - - /* ==== CACHE LINE 1-127 (for 8k mtu) ==== */ - struct fi_opx_hmem_iov iov_ext[FI_OPX_MAX_HMEM_IOV - 2]; - size_t unused; - - } noncontiguous; - } rendezvous; + uint8_t immediate_byte[8]; + uint64_t immediate_qw[7]; + }; - struct { - union fi_opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; - } cts; + union cacheline cache_line_1; + }; - /* tid_cts extends cts*/ - struct { - /* ==== CACHE LINE 0 ==== */ - union fi_opx_hfi1_dput_iov iov[1]; - uint32_t tid_offset; - uint32_t ntidpairs; - int32_t origin_byte_counter_adjust; - uint32_t unused; + /* ==== CACHE LINE 2-127 ==== */ - /* ==== CACHE LINE 1 ==== */ - uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; - } tid_cts; + union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; -} __attribute__((__aligned__(32))); +}; /* 9B and common payload structure */ union fi_opx_hfi1_packet_payload { uint8_t byte[FI_OPX_HFI1_PACKET_MTU]; - uint64_t qw[FI_OPX_HFI1_PACKET_MTU>>3]; + uint64_t qw[FI_OPX_HFI1_PACKET_MTU>>3]; union { struct { - /* ==== CACHE LINE 0 ==== */ - - uintptr_t src_vaddr; - uint64_t src_blocks; /* number of 64-byte data blocks to transfer */ - uint64_t src_device_id; - uint64_t src_iface; - uint64_t immediate_info; - 
uintptr_t origin_byte_counter_vaddr; - uint64_t unused[2]; - - /* ==== CACHE LINE 1 ==== */ - union { - struct { - uint8_t immediate_byte[8]; - uint64_t immediate_qw[7]; - }; - - union cacheline cache_line_1; - }; - - /* ==== CACHE LINE 2-127 ==== */ - - union cacheline immediate_block[FI_OPX_HFI1_PACKET_MTU / sizeof(union cacheline) - 2]; + uint64_t contig_9B_padding; + struct opx_payload_rzv_contig contiguous; + }; + struct opx_payload_rzv_contig contiguous_16B; - } contiguous; struct { /* ==== CACHE LINE 0 ==== */ @@ -1759,8 +1693,6 @@ union fi_opx_hfi1_packet_payload { /* ==== CACHE LINE 1 ==== */ uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; } tid_cts; - /* Union with 16B payload */ - union fi_opx_hfi1_packet_payload_16B payload_16B; } __attribute__((__aligned__(32))); static_assert(sizeof(union fi_opx_hfi1_packet_payload) <= FI_OPX_HFI1_PACKET_MTU, diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 835633e7a68..4ac1acdf7c0 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -118,7 +118,8 @@ ssize_t fi_opx_ep_tx_cq_inject_completion(struct fid_ep *ep, // faster than memcpy() for this amount of data. 
// DOES NOT SUPPORT SCB (PIO or UREG) (does not support SIM/BAR) // Unstructured copy - for payloads or other memcpy replacement -static inline void fi_opx_copy_cacheline(uint64_t dest[8], uint64_t source[8]) +__OPX_FORCE_INLINE__ +void fi_opx_copy_cacheline(uint64_t dest[8], uint64_t source[8]) { dest[0] = source[0]; dest[1] = source[1]; diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index d426622b707..cdd21771860 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1440,13 +1440,12 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) return FI_SUCCESS; } - __OPX_FORCE_INLINE__ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, struct fi_opx_hfi1_rx_rzv_rts_params *params, const uint64_t niov, const uint64_t immediate_data, - const uint64_t immediate_end_block_count, + const uint64_t immediate_tail, const uint64_t is_hmem, const uint64_t is_intranode, const enum fi_hmem_iface iface, @@ -1461,7 +1460,7 @@ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, || !fi_opx_hfi1_sdma_use_sdma(opx_ep, params->dput_iov[0].bytes, opcode, is_hmem, OPX_INTRANODE_FALSE) || (immediate_data == 0) - || (immediate_end_block_count == 0)) { + || (immediate_tail == 0)) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_tid_ineligible); return 0; @@ -1487,7 +1486,7 @@ int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, /* First adjust for the start page alignment, using immediate data that was sent.*/ const int64_t alignment_adjustment = (uint64_t)params->dst_vaddr - vaddr; const int64_t length_with_adjustment = params->dput_iov[0].bytes + alignment_adjustment; - const int64_t new_length = length_with_adjustment & -64; + const int64_t new_length = length_with_adjustment & -8; const int64_t len_difference = new_length - params->dput_iov[0].bytes; if (alignment_adjustment) { @@ -2124,7 +2123,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, const enum 
fi_hmem_iface dst_iface, const uint64_t dst_device, const uint64_t immediate_data, - const uint64_t immediate_end_block_count, + const uint64_t immediate_end_bytes, const struct fi_opx_hmem_iov *src_iovs, uint8_t opcode, const unsigned is_intranode, @@ -2232,7 +2231,7 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, if (opx_hfi1_rx_rzv_rts_tid_eligible(opx_ep, params, niov, immediate_data, - immediate_end_block_count, + immediate_end_bytes, is_hmem, is_intranode, dst_iface, opcode)) { params->tid_info.cur_addr_range.buf = params->dput_iov[0].rbuf; @@ -2458,9 +2457,9 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 1 : 0); } else { /* 1 QW for hdr that spills to 2nd cacheline + 1 QW for ICRC/tail */ - const uint64_t additional_hdr_tail_byte = 2 * 8; - uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, - max_bytes_per_packet)); + const uint64_t additional_hdr_tail_byte = 2 * 8; + uint64_t payload_n_additional_hdr_tail_bytes = (MIN(bytes_to_send + params->payload_bytes_for_iovec + additional_hdr_tail_byte, + max_bytes_per_packet)); uint64_t tail_bytes = payload_n_additional_hdr_tail_bytes & 0x3Ful; blocks_to_send_in_this_packet = (payload_n_additional_hdr_tail_bytes >> 6) + (tail_bytes ? 
1 : 0); bytes_to_send_this_packet = payload_n_additional_hdr_tail_bytes - additional_hdr_tail_byte; @@ -3978,39 +3977,46 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; + const uint64_t is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps); + #ifndef NDEBUG - const uint64_t max_immediate_block_count = (FI_OPX_HFI1_PACKET_MTU >> 6)-2 ; + const uint64_t max_immediate_block_count = (FI_OPX_HFI1_PACKET_MTU >> 6) - 2; #endif - /* Expected tid needs to send a leading data block and a trailing - * data block for alignment. Limit this to SDMA (8K+) for now */ - - const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0; - const uint64_t immediate_end_block_count = immediate_block_count; - - assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); + /* Expected tid needs to send a leading data block and trailing data + * for alignment. TID writes must start on a 64-byte boundary, so we + * need to send 64 bytes of leading immediate data that allow us + * to shift the receive buffer starting offset to a TID-friendly value. + * TID writes must also be a length that is a multiple of a DW (WFR & JKR 9B) + * or a QW (JKR), so send the last 7 bytes of the source data immediately + * so we can adjust the length after proper alignment has been achieved. */ + const uint8_t immediate_block = (!is_intranode && opx_ep->use_expected_tid_rzv && + len >= opx_ep->tx->sdma_min_payload_bytes && + len >= opx_ep->tx->tid_min_payload_bytes) ? 
1 : 0; + const uint8_t immediate_tail = immediate_block; + + assert(immediate_block <= 1); + assert(immediate_tail <= 1); + assert(immediate_block <= max_immediate_block_count); const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); - const uint64_t immediate_byte_count = len & 0x0007ul; - const uint64_t immediate_qw_count = (len >> 3) & 0x0007ul; - const uint64_t immediate_fragment = (((len & 0x003Ful) + 63) >> 6); + const uint8_t immediate_byte_count = (uint8_t) (len & 0x0007ul); + const uint8_t immediate_qw_count = (uint8_t) ((len >> 3) & 0x0007ul); + const uint8_t immediate_fragment = (uint8_t) (((len & 0x003Ful) + 63) >> 6); + assert(immediate_fragment == 1 || immediate_fragment == 0); + /* Immediate total does not include trailing block */ const uint64_t immediate_total = immediate_byte_count + immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); - - assert(immediate_byte_count <= UINT8_MAX); - assert(immediate_qw_count <= UINT8_MAX); - assert(immediate_block_count <= UINT8_MAX); - assert(immediate_end_block_count <= UINT8_MAX); + immediate_block * sizeof(union cacheline); union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .byte_count = (uint8_t) immediate_byte_count, - .qw_count = (uint8_t) immediate_qw_count, - .block_count = (uint8_t) immediate_block_count, - .end_block_count = (uint8_t) immediate_end_block_count, - .unused = 0 + .count = (immediate_byte_count << OPX_IMMEDIATE_BYTE_COUNT_SHIFT) | + (immediate_qw_count << OPX_IMMEDIATE_QW_COUNT_SHIFT) | + (immediate_block << OPX_IMMEDIATE_BLOCK_SHIFT) | + (immediate_tail << OPX_IMMEDIATE_TAIL_SHIFT), + .tail_bytes = {} }; assert(((len - immediate_total) & 0x003Fu) == 0); @@ -4018,8 +4024,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, const uint64_t payload_blocks_total = 1 + /* rzv metadata */ immediate_fragment + - immediate_block_count + - immediate_end_block_count; + immediate_block; 
const uint64_t pbc_dws = 2 + /* pbc */ @@ -4030,7 +4035,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, const uint16_t lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ - if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + if (is_intranode) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, SHM -- RENDEZVOUS RTS (begin) context %p\n", user_context); @@ -4088,14 +4093,15 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, hdr, payload, buf, ((char*)buf + immediate_total),immediate_total, (len - immediate_total)); - payload->rendezvous.contiguous.src_vaddr = (uintptr_t)buf + immediate_total; - payload->rendezvous.contiguous.src_blocks = (len - immediate_total) >> 6; - payload->rendezvous.contiguous.src_device_id = src_device_id; - payload->rendezvous.contiguous.src_iface = (uint64_t) src_iface; - payload->rendezvous.contiguous.immediate_info = immediate_info.qw0; - payload->rendezvous.contiguous.origin_byte_counter_vaddr = origin_byte_counter_vaddr; - payload->rendezvous.contiguous.unused[0] = 0; - payload->rendezvous.contiguous.unused[1] = 0; + struct opx_payload_rzv_contig *contiguous = &payload->rendezvous.contiguous; + payload->rendezvous.contig_9B_padding = 0; + contiguous->src_vaddr = (uintptr_t)buf + immediate_total; + contiguous->src_blocks = (len - immediate_total) >> 6; + contiguous->src_device_id = src_device_id; + contiguous->src_iface = (uint64_t) src_iface; + contiguous->immediate_info = immediate_info.qw0; + contiguous->origin_byte_counter_vaddr = origin_byte_counter_vaddr; + contiguous->unused = 0; if (immediate_total) { @@ -4103,28 +4109,32 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, if (src_iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(src_iface, src_device_id, - desc_mr->hmem_dev_reg_handle, + desc_mr ? 
desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, buf, immediate_total, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; } - if (immediate_byte_count > 0) { - memcpy((void*)&payload->rendezvous.contiguous.immediate_byte, (const void*)sbuf, immediate_byte_count); - sbuf += immediate_byte_count; + for (int i = 0; i < immediate_byte_count; ++i) { + contiguous->immediate_byte[i] = sbuf[i]; } + sbuf += immediate_byte_count; uint64_t * sbuf_qw = (uint64_t *)sbuf; - unsigned i=0; - for (i=0; irendezvous.contiguous.immediate_qw[i] = sbuf_qw[i]; + for (int i = 0; i < immediate_qw_count; ++i) { + contiguous->immediate_qw[i] = sbuf_qw[i]; } - sbuf_qw += immediate_qw_count; - memcpy((void*)(&payload->rendezvous.contiguous.cache_line_1 + immediate_fragment), - (const void *)sbuf_qw, immediate_block_count << 6); /* immediate_end_block_count */ + if (immediate_block) { + sbuf_qw += immediate_qw_count; + uint64_t *payload_cacheline = + (uint64_t *)(&contiguous->cache_line_1 + immediate_fragment); + fi_opx_copy_cacheline(payload_cacheline, sbuf_qw); + } } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -4206,6 +4216,24 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv); + if (immediate_tail) { + uint8_t *buf_tail_bytes = ((uint8_t *)buf + len) - OPX_IMMEDIATE_TAIL_BYTE_COUNT; + if (src_iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + opx_ep->hmem_copy_buf, buf_tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, + desc_mr ? 
OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf_tail_bytes = opx_ep->hmem_copy_buf; + } + + for (int i = 0; i < OPX_IMMEDIATE_TAIL_BYTE_COUNT; ++i) { + immediate_info.tail_bytes[i] = buf_tail_bytes[i]; + } + } + /* * Write the 'start of packet' (hw+sw header) 'send control block' * which will consume a single pio credit. @@ -4215,9 +4243,9 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, volatile uint64_t * const scb = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_sop_first, pio_state); - uint64_t local_temp[16] = {0}; + uint64_t temp[8]; - fi_opx_store_and_copy_qw(scb, local_temp, + fi_opx_store_and_copy_qw(scb, temp, opx_ep->tx->rzv_9B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | force_credit_return | OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), @@ -4238,22 +4266,22 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, local_temp); + fi_opx_copy_hdr9B_cacheline(&replay->scb.scb_9B, temp); /* * write the rendezvous payload "send control blocks" */ volatile uint64_t * scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - uint64_t temp[8]; fi_opx_store_and_copy_qw(scb_payload, temp, + 0, /* contig_9B_padding */ (uintptr_t)buf + immediate_total, /* src_vaddr */ (len - immediate_total) >> 6, /* src_blocks */ src_device_id, (uint64_t) src_iface, immediate_info.qw0, origin_byte_counter_vaddr, - 0, 0 /* unused */); + 0 /* unused */); /* consume one credit for the rendezvous payload metadata */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); @@ -4271,9 +4299,12 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, desc_mr->hmem_dev_reg_handle, + opx_copy_from_hmem(src_iface, 
src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, buf, immediate_total, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; @@ -4298,11 +4329,12 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); if (immediate_fragment) { struct tmp_payload_t *tmp_payload = (void*)temp; - if (immediate_byte_count > 0) { - memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); + + for (int i = 0; i < immediate_byte_count; ++i) { + tmp_payload->immediate_byte[i] = sbuf[i]; } - for (int i=0; iimmediate_qw[i] = sbuf_qw[i]; } fi_opx_store_scb_qw(scb_payload, temp); @@ -4318,51 +4350,10 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, #endif } - if(immediate_block_count) { -#ifndef NDEBUG - /* assert immediate_block_count can be used for both - * full_block_credits_needed and total_credits_available parameters - * on the call - */ - assert((credits_consumed + immediate_block_count) <= total_credits_needed); - ssize_t credits = -#endif - fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, - &pio_state, - sbuf_qw, - immediate_block_count, - immediate_block_count); - memcpy(replay_payload, sbuf_qw, (immediate_block_count << 6)); - /* replay_payload is pointer to uint64_t, not char */ - replay_payload += (immediate_block_count << 3); /* immediate_block_count << 6 / sizeof(uint64_t) */ - - -#ifndef NDEBUG - assert(credits == immediate_block_count); - credits_consumed+= (unsigned) credits; -#endif - - } - - if (immediate_end_block_count) { - char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6); - union { - uint8_t immediate_byte[64]; - uint64_t immediate_qw[8]; - } align_tmp; - assert(immediate_end_block_count == 1); - - OPX_HMEM_COPY_FROM(align_tmp.immediate_byte, sbuf_end, 
(immediate_block_count << 6), - desc ? ((struct fi_opx_mr *)desc)->hmem_dev_reg_handle - : OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_SEND_THRESHOLD, - src_iface, src_device_id); - - scb_payload = (uint64_t *)FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - fi_opx_store_scb_qw(scb_payload, align_tmp.immediate_qw); - - fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); - replay_payload += FI_OPX_CACHE_LINE_QWS; + if (immediate_block) { + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, sbuf_qw); + fi_opx_copy_cacheline(replay_payload, sbuf_qw); FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG @@ -4416,27 +4407,38 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = { .fi = dest_addr }; + const uint64_t is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps); + #ifndef NDEBUG const uint64_t max_immediate_block_count = (FI_OPX_HFI1_PACKET_MTU >> 6)-2 ; #endif - /* Expected tid needs to send a leading data block and a trailing - * data block for alignment. Limit this to SDMA (8K+) for now */ - - const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0; - const uint64_t immediate_end_block_count = immediate_block_count; - - assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); + /* Expected tid needs to send a leading data block and trailing data + * for alignment. TID writes must start on a 64-byte boundary, so we + * need to send 64 bytes of leading immediate data that allow us + * to shift the receive buffer starting offset to a TID-friendly value. 
+ * TID writes must also be a length that is a multiple of a DW (WFR & JKR 9B) + * or a QW (JKR), so send the last 7 bytes of the source data immediately + * so we can adjust the length after proper alignment has been achieved. */ + const uint8_t immediate_block = (!is_intranode && opx_ep->use_expected_tid_rzv && + len >= opx_ep->tx->sdma_min_payload_bytes && + len >= opx_ep->tx->tid_min_payload_bytes) ? 1 : 0; + const uint8_t immediate_tail = immediate_block; + + assert(immediate_block <= 1); + assert(immediate_tail <= 1); + assert(immediate_block <= max_immediate_block_count); const uint64_t bth_rx = ((uint64_t)dest_rx) << 56; const uint64_t lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID(dest_addr); const uint64_t lrh_dlid_16B = ntohs(FI_OPX_HFI1_LRH_DLID_TO_LID(lrh_dlid)); - const uint64_t immediate_byte_count = len & 0x0007ul; - uint64_t immediate_qw_count = (len >> 3) & 0x0007ul; - uint64_t immediate_fragment = (((len & 0x003Ful) + 63) >> 6); + const uint8_t immediate_byte_count = (uint8_t) (len & 0x0007ul); + const uint8_t immediate_qw_count = (uint8_t) ((len >> 3) & 0x0007ul); + const uint8_t immediate_fragment = (uint8_t) (((len & 0x003Ful) + 63) >> 6); + assert(immediate_fragment == 1 || immediate_fragment == 0); /* Need a full block for ICRC after the end block... */ - const uint64_t icrc_end_block = immediate_end_block_count; + const uint64_t icrc_end_block = immediate_block; /* ... otherwise need a qw (or block) in the immediate fragment */ const uint64_t icrc_fragment = icrc_end_block ? 
0 : immediate_fragment; @@ -4455,31 +4457,24 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, /* Immediate total does not include trailing block */ const uint64_t immediate_total = immediate_byte_count + immediate_qw_count * sizeof(uint64_t) + - immediate_block_count * sizeof(union cacheline); + immediate_block * sizeof(union cacheline); union fi_opx_hfi1_rzv_rts_immediate_info immediate_info = { - .byte_count = (uint8_t) immediate_byte_count, - .qw_count = (uint8_t) immediate_qw_count, - .block_count = (uint8_t) immediate_block_count, - .end_block_count = (uint8_t) immediate_end_block_count, - .unused = 0 + .count = (immediate_byte_count << OPX_IMMEDIATE_BYTE_COUNT_SHIFT) | + (immediate_qw_count << OPX_IMMEDIATE_QW_COUNT_SHIFT) | + (immediate_block << OPX_IMMEDIATE_BLOCK_SHIFT) | + (immediate_tail << OPX_IMMEDIATE_TAIL_SHIFT), + .tail_bytes = {} }; - assert(immediate_byte_count <= UINT8_MAX); - assert(immediate_qw_count <= UINT8_MAX); - assert(immediate_block_count <= UINT8_MAX); - assert(immediate_end_block_count <= UINT8_MAX); assert(icrc_end_block + icrc_fragment_block < 2); /* not both */ - assert(immediate_end_block_count == immediate_block_count); - assert(((len - immediate_total) & 0x003Fu) == 0); /* full blocks only. 
icrc_end_block/icrc_fragment_block count 1 qw only */ const uint64_t payload_blocks_total = 1 + /* last kdeth + rzv metadata */ immediate_fragment + - immediate_block_count + - immediate_end_block_count; + immediate_block; const uint64_t pbc_dws = 2 + /* pbc */ @@ -4492,7 +4487,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, const uint16_t lrh_qws = (pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ - if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { + if (is_intranode) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND 16B, SHM -- RENDEZVOUS RTS (begin) context %p\n", user_context); @@ -4550,16 +4545,17 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, hdr->qw_16B[6] = len; hdr->qw_16B[7] = tag; - union fi_opx_hfi1_packet_payload_16B * const payload = - (union fi_opx_hfi1_packet_payload_16B *)(hdr+1); + union fi_opx_hfi1_packet_payload * const payload = + (union fi_opx_hfi1_packet_payload *)(hdr+1); - payload->rendezvous.contiguous.src_vaddr = (uintptr_t)buf + immediate_total; - payload->rendezvous.contiguous.src_blocks = (len - immediate_total) >> 6; - payload->rendezvous.contiguous.src_device_id = src_device_id; - payload->rendezvous.contiguous.src_iface = (uint64_t) src_iface; - payload->rendezvous.contiguous.immediate_info = immediate_info.qw0; - payload->rendezvous.contiguous.origin_byte_counter_vaddr = origin_byte_counter_vaddr; - payload->rendezvous.contiguous.unused[0] = 0; + struct opx_payload_rzv_contig *contiguous = &payload->rendezvous.contiguous_16B; + contiguous->src_vaddr = (uintptr_t)buf + immediate_total; + contiguous->src_blocks = (len - immediate_total) >> 6; + contiguous->src_device_id = src_device_id; + contiguous->src_iface = (uint64_t) src_iface; + contiguous->immediate_info = immediate_info.qw0; + contiguous->origin_byte_counter_vaddr = origin_byte_counter_vaddr; + contiguous->unused = 0; if (immediate_total) { @@ -4567,28 +4563,32 @@ ssize_t 
fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, if (src_iface != FI_HMEM_SYSTEM) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; opx_copy_from_hmem(src_iface, src_device_id, - desc_mr->hmem_dev_reg_handle, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, buf, immediate_total, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; } - if (immediate_byte_count > 0) { - memcpy((void*)&payload->rendezvous.contiguous.immediate_byte, (const void*)sbuf, immediate_byte_count); - sbuf += immediate_byte_count; + for (int i = 0; i < immediate_byte_count; ++i) { + contiguous->immediate_byte[i] = sbuf[i]; } + sbuf += immediate_byte_count; uint64_t * sbuf_qw = (uint64_t *)sbuf; - unsigned i=0; - for (i=0; irendezvous.contiguous.immediate_qw[i] = sbuf_qw[i]; + for (int i = 0; i < immediate_qw_count; ++i) { + contiguous->immediate_qw[i] = sbuf_qw[i]; } - sbuf_qw += immediate_qw_count; - memcpy((void*)(&payload->rendezvous.contiguous.cache_line_1 + immediate_fragment), - (const void *)sbuf_qw, immediate_block_count << 6); /* immediate_end_block_count */ + if (immediate_block) { + sbuf_qw += immediate_qw_count; + uint64_t *payload_cacheline = + (uint64_t *)(&contiguous->cache_line_1 + immediate_fragment); + fi_opx_copy_cacheline(payload_cacheline, sbuf_qw); + } } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -4667,6 +4667,24 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, .kind[(caps & FI_MSG) ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] .send.rzv); + if (immediate_tail) { + uint8_t *buf_tail_bytes = ((uint8_t *)buf + len) - OPX_IMMEDIATE_TAIL_BYTE_COUNT; + if (src_iface != FI_HMEM_SYSTEM) { + struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? 
desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, + opx_ep->hmem_copy_buf, buf_tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + buf_tail_bytes = opx_ep->hmem_copy_buf; + } + + for (int i = 0; i < OPX_IMMEDIATE_TAIL_BYTE_COUNT; ++i) { + immediate_info.tail_bytes[i] = buf_tail_bytes[i]; + } + } + /* * Write the 'start of packet' (hw+sw header) 'send control block' * which will consume a single pio credit. @@ -4750,9 +4768,12 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { struct fi_opx_mr * desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, desc_mr->hmem_dev_reg_handle, + opx_copy_from_hmem(src_iface, src_device_id, + desc_mr ? desc_mr->hmem_dev_reg_handle + : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, buf, immediate_total, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD + : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; @@ -4775,11 +4796,12 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); if (immediate_fragment) { struct tmp_payload_t *tmp_payload = (void*)temp; - if (immediate_byte_count > 0) { - memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); + + for (int i = 0; i < immediate_byte_count; ++i) { + tmp_payload->immediate_byte[i] = sbuf[i]; } - for (int i=0; iimmediate_qw[i] = sbuf_qw[i]; } scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); @@ -4797,7 +4819,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, /* Need a full tail block */ if (icrc_fragment_block) { /* No other tail or immediate block after this */ - assert(!icrc_end_block && !immediate_block_count && !immediate_end_block_count); + assert(!icrc_end_block && !immediate_block); /* 
Write another block to accomodate the ICRC and tail */ uint64_t temp_0[8] = {-2UL}; @@ -4813,87 +4835,39 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, #endif } #ifndef NDEBUG - else if(icrc_fragment) { /* used an immediate qw for tail */ + else if (icrc_fragment) { /* used an immediate qw for tail */ /* No other tail or immediate block after this */ - assert(!icrc_end_block && !immediate_block_count && !immediate_end_block_count); + assert(!icrc_end_block && !immediate_block); } else { /* Must be tail and immediate blocks after this */ - assert(icrc_end_block && immediate_block_count && immediate_end_block_count); + assert(icrc_end_block && immediate_block); } #endif } - if(immediate_block_count) { -#ifndef NDEBUG + if (immediate_block) { /* Tail will be it's own block */ - assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment && immediate_end_block_count); - /* assert immediate_block_count can be used for both - * full_block_credits_needed and total_credits_available parameters - * on the call - */ - assert((credits_consumed + immediate_block_count) <= total_credits_needed); - ssize_t credits = -#endif - fi_opx_hfi1_tx_egr_store_full_payload_blocks(opx_ep, - &pio_state, - sbuf_qw, - immediate_block_count, - immediate_block_count); - memcpy(replay_payload, sbuf_qw, (immediate_block_count << 6)); - /* replay_payload is pointer to uint64_t, not char */ - replay_payload += (immediate_block_count << 3); /* immediate_block_count << 6 / sizeof(uint64_t) */ - - -#ifndef NDEBUG - assert(credits == immediate_block_count); - credits_consumed+= (unsigned) credits; -#endif - - } - - if (immediate_end_block_count) { - /* Tail will be it's own block */ - assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment && immediate_block_count); - char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6); - union { - uint8_t immediate_byte[64]; - uint64_t immediate_qw[8]; - } align_tmp; - assert(immediate_end_block_count == 1); - - 
OPX_HMEM_COPY_FROM(align_tmp.immediate_byte, sbuf_end, (immediate_block_count << 6), - desc ? ((struct fi_opx_mr *)desc)->hmem_dev_reg_handle - : OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_SEND_THRESHOLD, - src_iface, src_device_id); - - scb_payload = (uint64_t *)FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); - fi_opx_store_scb_qw(scb_payload, align_tmp.immediate_qw); - - fi_opx_copy_cacheline(replay_payload, align_tmp.immediate_qw); + assert(icrc_end_block && !icrc_fragment_block && !icrc_fragment); + scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); + fi_opx_store_scb_qw(scb_payload, sbuf_qw); + fi_opx_copy_cacheline(replay_payload, sbuf_qw); replay_payload += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG ++credits_consumed; #endif - - /* Need a full block for ICRC after the end block... */ - assert(icrc_end_block); - /* Write another block to accomodate the ICRC and tail */ uint64_t temp_0[8] = {-3UL}; scb_payload = FI_OPX_HFI1_PIO_SCB_HEAD(opx_ep->tx->pio_scb_first, pio_state); fi_opx_store_scb_qw(scb_payload, temp_0); fi_opx_copy_cacheline(replay_payload, temp_0); - replay_payload += FI_OPX_CACHE_LINE_QWS; FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG ++credits_consumed; #endif - } fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 4c03a191bbe..7967fdcf13b 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -655,7 +655,7 @@ static void do_static_assert_tests() OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == sizeof(payload->tid_cts), "Expected TID rendezvous CTS payload size error"); - OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == sizeof(payload->rendezvous.contiguous), + OPX_COMPILE_TIME_ASSERT(sizeof(*payload) >= sizeof(payload->rendezvous.contiguous), "Contiguous rendezvous payload size error"); OPX_COMPILE_TIME_ASSERT(sizeof(*payload) == 
sizeof(payload->rendezvous.noncontiguous), From 3a623e187338e8a680679c4133a25f8de3dd8ac0 Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Thu, 10 Oct 2024 11:39:53 -0400 Subject: [PATCH 167/393] prov/opx: Conditionally set FI_REMOTE_CQ_DATA on receive The OPX provider was explicitly setting FI_REMOTE_CQ_DATA on all receive operations; however, it should only set the flag to indicate that the data field contains the completion data provided by the peer as part of their transmit request. Signed-off-by: Lindsay Reiser --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 87 ++++++------- .../opx/include/rdma/opx/fi_opx_hfi1_packet.h | 116 ++++++++++++++---- .../include/rdma/opx/fi_opx_hfi1_progress.h | 20 +-- .../include/rdma/opx/fi_opx_hfi1_transport.h | 90 ++++++++------ prov/opx/include/rdma/opx/fi_opx_tagged.h | 24 ++-- prov/opx/src/fi_opx_ep.c | 2 +- prov/opx/src/fi_opx_hfi1.c | 36 +++--- prov/opx/src/fi_opx_reliability.c | 36 +++--- prov/opx/src/fi_opx_tagged.c | 1 + 9 files changed, 248 insertions(+), 164 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 59b8d5d3d6e..194637b7014 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -132,6 +132,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); dest_addr, 0, 0, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + 0, /* flags */ \ CAPS | FI_MSG, \ RELIABILITY, \ HFI1_TYPE); \ @@ -156,7 +157,7 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); AV, /* av_type */ \ 1, /* is_contiguous */ \ 0, /* override_flags */ \ - 0, /* flags */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_MSG, \ RELIABILITY, \ HFI1_TYPE); \ @@ -165,11 +166,12 @@ void fi_opx_cq_debug(struct fid_cq *cq, char *func, const int line); fi_opx_injectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ 
uint64_t data, fi_addr_t dest_addr) \ - { \ + { \ return fi_opx_ep_tx_inject(ep, buf, len, \ dest_addr, 0, data, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_MSG, \ RELIABILITY, \ HFI1_TYPE); \ @@ -1037,7 +1039,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { - assert( (opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) || (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); + assert(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- RENDEZVOUS RTS (%X) (begin) context %p is_multi_recv (%lu)\n", @@ -1053,7 +1055,7 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, const uint64_t recv_len = context->len; if (is_multi_receive) { /* compile-time constant expression */ - assert(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS); + assert(FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode) == FI_MSG); const uint8_t u8_rx = hdr->rendezvous.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); struct opx_context * original_multi_recv_context = context; @@ -1163,8 +1165,9 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, context->data = ofi_data; context->tag = origin_tag; context->next = NULL; - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? 
FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | + FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); const uint8_t u8_rx = hdr->rendezvous.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); @@ -1181,12 +1184,12 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, rbuf_iface = hmem_info->iface; hmem_handle = hmem_info->hmem_dev_reg_handle; FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.rzv); FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.rzv); } else { rbuf_device = 0; @@ -1270,8 +1273,8 @@ void fi_opx_handle_recv_rts(const union opx_hfi1_packet_hdr * const hdr, context->tag = origin_tag; context->next = NULL; context->byte_counter = 0; - context->flags = FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) ? 
FI_TAGGED : FI_MSG); + context->flags = FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); const uint8_t u8_rx = hdr->rendezvous.origin_rx; const uint32_t u32_ext_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, is_intranode, hdr->rendezvous.origin_rx); @@ -1339,7 +1342,6 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { - assert((is_multi_receive && !is_hmem) || !is_multi_receive); struct fi_opx_ep * opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -1356,7 +1358,7 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, OPX_DEBUG_PRINT_HDR(hdr, hfi1_type); - if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT || opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) { + if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- INJECT (begin)\n"); @@ -1402,12 +1404,12 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, recv_buf, hdr->inject.app_data_u8, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.inject); FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? 
FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.inject); } else { #pragma GCC diagnostic push @@ -1453,8 +1455,8 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, "INJECT send_len %lu <= recv_len %lu; enqueue cq (completed) ofi_data = %ld tag = %ld\n", send_len, recv_len, ofi_data, origin_tag); - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) ? FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); context->len = send_len; context->data = ofi_data; context->tag = origin_tag; @@ -1492,7 +1494,7 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- INJECT (end)\n"); - } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER || opcode == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) { + } else if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- EAGER (begin)\n"); @@ -1578,12 +1580,12 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, context->buf, opx_ep->hmem_copy_buf, send_len, OPX_HMEM_DEV_REG_RECV_THRESHOLD); FI_OPX_DEBUG_COUNTERS_INC_COND(is_intranode, opx_ep->debug_counters.hmem.intranode - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.eager); FI_OPX_DEBUG_COUNTERS_INC_COND(!is_intranode, opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? 
FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.eager); } @@ -1592,8 +1594,8 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "EAGER send_len %lu <= recv_len %lu; enqueue cq (completed), tag %#lX/%#lX, ofi_data %#lX \n", send_len, recv_len, context->tag, origin_tag, ofi_data); - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) ? FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); context->len = send_len; context->data = ofi_data; context->tag = origin_tag; @@ -1631,8 +1633,7 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- EAGER (end)\n"); - } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { + } else if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV -- MULTI PACKET EAGER FIRST (begin)\n"); @@ -1686,8 +1687,8 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, recv_buf_qw[i] = payload_qw[i]; } - context->flags |= FI_RECV | FI_REMOTE_CQ_DATA | - ((opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) ? FI_TAGGED : FI_MSG); + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); context->len = payload_total_len; context->data = ofi_data; context->tag = origin_tag; @@ -1702,8 +1703,8 @@ void opx_ep_complete_receive_operation (struct fid_ep *ep, /* MP Eager sends are never intranode */ FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi - .kind[(opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) - ? FI_OPX_KIND_MSG : FI_OPX_KIND_TAG] + .kind[FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) + ? 
FI_OPX_KIND_TAG : FI_OPX_KIND_MSG] .recv.mp_eager); } } else { /* truncation - unlikely */ @@ -2592,8 +2593,8 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "search the match queue\n"); const uint64_t kind = (static_flags & FI_TAGGED) ? FI_OPX_KIND_TAG : FI_OPX_KIND_MSG; - assert((kind == FI_OPX_KIND_TAG && opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) || - (kind == FI_OPX_KIND_MSG && opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST)); + assert((kind == FI_OPX_KIND_TAG && FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) || + (kind == FI_OPX_KIND_MSG && !FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode))); struct opx_context *context = (struct opx_context *) opx_ep->rx->queue[kind].mq.head; struct opx_context *prev = NULL; @@ -2616,7 +2617,7 @@ void fi_opx_ep_rx_process_header_mp_eager_first(struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "did not find a match .. add this packet to the unexpected queue\n"); - if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST)) + if (OFI_LIKELY(FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode))) fi_opx_ep_rx_append_ue_tag(opx_ep->rx, hdr, payload, payload_bytes, opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst, opx_ep->daos_info.hfi_rank_enabled, @@ -2777,8 +2778,7 @@ void fi_opx_ep_rx_process_header (struct fid_ep *ep, is_intranode, lock_required, reliability, hfi1_type); return; - } else if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { + } else if (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) { fi_opx_ep_rx_process_header_mp_eager_first(ep, hdr, payload, payload_bytes, static_flags, opcode, origin_rs, @@ -3194,8 +3194,7 @@ int fi_opx_ep_process_context_match_ue_packets(struct fi_opx_ep * opx_ep, if (uepkt) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "UEPKT found a match, uepkt = %p\n", uepkt); - uint8_t is_mp_eager = 
(uepkt->hdr.bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST || - uepkt->hdr.bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); + uint8_t is_mp_eager = (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(uepkt->hdr.bth.opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); const unsigned is_intranode = opx_lrh_is_intranode(&(uepkt->hdr), hfi1_type); if (is_mp_eager) { @@ -3734,7 +3733,7 @@ ssize_t fi_opx_hfi1_tx_send_try_mp_egr (struct fid_ep *ep, rc = fi_opx_hfi1_tx_send_mp_egr_first_common (opx_ep, (void **) &buf_bytes_ptr, len, desc, opx_ep->hmem_copy_buf, pbc_dlid, bth_rx, lrh_dlid, addr, tag, data, lock_required, - caps, reliability, &first_packet_psn, + tx_op_flags, caps, reliability, &first_packet_psn, hmem_iface, hmem_device, hfi1_type); if (rc != FI_SUCCESS) { @@ -4185,6 +4184,7 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, const uint32_t data, const int lock_required, const enum fi_av_type av_type, + uint64_t tx_op_flags, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) @@ -4231,7 +4231,7 @@ ssize_t fi_opx_ep_tx_inject_internal (struct fid_ep *ep, const union fi_opx_addr addr = FI_OPX_EP_AV_ADDR(av_type,opx_ep,dest_addr); const ssize_t rc = FI_OPX_FABRIC_TX_INJECT(ep, buf, len, addr.fi, tag, data, - lock_required, addr.hfi1_rx, caps, reliability, hfi1_type); + lock_required, addr.hfi1_rx, tx_op_flags, caps, reliability, hfi1_type); if (OFI_UNLIKELY(rc == -EAGAIN)) { // In this case we are probably out of replay buffers. 
To deal @@ -4258,6 +4258,7 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, const uint32_t data, const int lock_required, const enum fi_av_type av_type, + uint64_t tx_op_flags, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) @@ -4268,7 +4269,7 @@ ssize_t fi_opx_ep_tx_inject(struct fid_ep *ep, ssize_t rc = fi_opx_ep_tx_inject_internal(ep, buf, len, dest_addr, tag, data, FI_OPX_LOCK_NOT_REQUIRED, av_type, - caps, reliability, hfi1_type); + tx_op_flags, caps, reliability, hfi1_type); fi_opx_unlock_if_required(&opx_ep->lock, lock_required); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 99685dadca5..f7d8ef70fb2 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -64,17 +64,38 @@ #define FI_OPX_HFI_BTH_OPCODE_ATOMIC (0xC4) #define FI_OPX_HFI_BTH_OPCODE_ACK (0xC5) #define FI_OPX_HFI_BTH_OPCODE_UD (0xC6) /* unreliabile datagram */ -/* opcodes (0xC7..0xEF) are unused */ -#define FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH (0xF7) -#define FI_OPX_HFI_BTH_OPCODE_MSG_INJECT (0xF8) -#define FI_OPX_HFI_BTH_OPCODE_MSG_EAGER (0xF9) -#define FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST (0xFA) -#define FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS (0xFB) -#define FI_OPX_HFI_BTH_OPCODE_TAG_INJECT (0xFC) -#define FI_OPX_HFI_BTH_OPCODE_TAG_EAGER (0xFD) -#define FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST (0xFE) -#define FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS (0xFF) - +/* opcodes (0xC7..0xEE) are unused */ + +#define FI_OPX_HFI_BTH_OPCODE_CQ_BIT (0x01) +#define FI_OPX_HFI_BTH_OPCODE_TAG_BIT (0x02) +#define FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(opcode) \ + (opcode & ~(FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT)) +#define FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) \ + (opcode & ~(FI_OPX_HFI_BTH_OPCODE_CQ_BIT)) +#define FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) \ + ((opcode & FI_OPX_HFI_BTH_OPCODE_CQ_BIT) ? 
FI_REMOTE_CQ_DATA : 0) +#define FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode) \ + ((opcode & FI_OPX_HFI_BTH_OPCODE_TAG_BIT) ? FI_TAGGED : FI_MSG) +#define FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode) \ + ((opcode & FI_OPX_HFI_BTH_OPCODE_TAG_BIT) ? 1 : 0) + +#define FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH (0xEF) +#define FI_OPX_HFI_BTH_OPCODE_MSG_INJECT (0xF0) +#define FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_INJECT | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_INJECT (FI_OPX_HFI_BTH_OPCODE_MSG_INJECT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_INJECT | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_MSG_EAGER (0xF4) +#define FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_EAGER | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_EAGER (FI_OPX_HFI_BTH_OPCODE_MSG_EAGER | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_EAGER | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST (0xF8) +#define FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST (FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS (0xFC) +#define FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS | FI_OPX_HFI_BTH_OPCODE_CQ_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS (FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS | FI_OPX_HFI_BTH_OPCODE_TAG_BIT) +#define FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ (FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS | FI_OPX_HFI_BTH_OPCODE_CQ_BIT | 
FI_OPX_HFI_BTH_OPCODE_TAG_BIT) static const char* FI_OPX_HFI_BTH_LOW_OPCODE_STRINGS[] = { /* opcodes (0x00..0xBF) are reserved */ @@ -87,17 +108,46 @@ static const char* FI_OPX_HFI_BTH_LOW_OPCODE_STRINGS[] = { "FI_OPX_HFI_BTH_OPCODE_UD " }; static const char* FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[] = { - /* opcodes (0xC7..0xEF) are unused */ - "FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH ", - "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ", - "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ", - "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ", - "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ", - "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT ", - "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER ", - "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST ", - "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS ", - "INVALID BTH OPCODE " }; + /* opcodes (0xC7..0xEE) are unused */ + "FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH ", + "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ", + "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT ", + "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ ", + "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ", + "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER ", + "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ ", + "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ", + "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST ", + "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ ", + "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ", + "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ ", + "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS ", + "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ ", + "INVALID BTH OPCODE " }; + +OPX_COMPILE_TIME_ASSERT((FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH == (FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ - sizeof(FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS)/sizeof(char*) + 2)), "FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH must be first in the high opcode array, or dependent code conditionals need updated"); +OPX_COMPILE_TIME_ASSERT((FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ == 0xFF), "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ must be last in the high opcode array, or dependent code conditionals 
need updated"); + +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ^ FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT and FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ^ FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER and FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ^ FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST and FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ^ FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS and FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_INJECT ^ FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_INJECT and FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_EAGER ^ FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_EAGER and FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST ^ FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ) == FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST and FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS ^ FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ) == 
FI_OPX_HFI_BTH_OPCODE_CQ_BIT), "FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS and FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_CQ_BIT"); + +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_INJECT ^ FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT and FI_OPX_HFI_BTH_OPCODE_TAG_INJECT must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_EAGER ^ FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER and FI_OPX_HFI_BTH_OPCODE_TAG_EAGER must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST ^ FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST and FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS ^ FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS and FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS must only 
differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); +OPX_COMPILE_TIME_ASSERT(((FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ ^ FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ) == FI_OPX_HFI_BTH_OPCODE_TAG_BIT), "FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ and FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ must only differ by FI_OPX_HFI_BTH_OPCODE_TAG_BIT"); static inline const char* opx_hfi1_bth_opcode_to_string(uint16_t opcode) { @@ -106,7 +156,7 @@ static inline const char* opx_hfi1_bth_opcode_to_string(uint16_t opcode) (opcode <= (uint16_t) FI_OPX_HFI_BTH_OPCODE_UD)) { return FI_OPX_HFI_BTH_LOW_OPCODE_STRINGS[opcode-FI_OPX_HFI_BTH_OPCODE_INVALID]; } else if ((opcode >= (uint16_t) FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH) && - (opcode <= (uint16_t) FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)) { + (opcode <= (uint16_t) FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ)) { return FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[opcode-FI_OPX_HFI_BTH_OPCODE_MP_EAGER_NTH]; } return FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS[sizeof(FI_OPX_HFI_BTH_HIGH_OPCODE_STRINGS)/sizeof(char*)-1]; /* INVALID */ @@ -1315,18 +1365,26 @@ fi_opx_hfi1_packet_hdr_message_length (const union opx_hfi1_packet_hdr * const h switch (hdr->bth.opcode) { case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ: message_length = hdr->inject.message_length; break; case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ: message_length = hdr->send.xfer_bytes_tail + hdr->send.payload_qws_total * sizeof(uint64_t); break; case FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST: case FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST: + case FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ: message_length = hdr->mp_eager_first.payload_bytes_total & FI_OPX_HFI1_KDETH_VERSION_OFF_MASK; break; case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: case 
FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ: //assert(hdr->rendezvous.niov == 1); message_length = hdr->rendezvous.message_length; break; @@ -1444,18 +1502,24 @@ void fi_opx_hfi1_dump_packet_hdr (const union opx_hfi1_packet_hdr * const hdr, break; case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ: FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.message_length ... 0x%02x \n", pid, fn, ln, hdr->inject.message_length); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.app_data_u64[0] .. 0x%016lx \n", pid, fn, ln, hdr->inject.app_data_u64[0]); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .inject.app_data_u64[1] .. 0x%016lx \n", pid, fn, ln, hdr->inject.app_data_u64[1]); break; case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ: FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.xfer_bytes_tail .... 0x%02x \n", pid, fn, ln, hdr->send.xfer_bytes_tail); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.payload_qws_total .. 0x%04x \n", pid, fn, ln, hdr->send.payload_qws_total); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .send.xfer_tail .......... 0x%016lx \n", pid, fn, ln, hdr->send.xfer_tail); break; case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: /* calculate (?) total bytes to be transfered */ + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ: /* calculate (?) total bytes to be transfered */ case FI_OPX_HFI_BTH_OPCODE_RZV_CTS: FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.origin .......... 
0x%x \n", pid, fn, ln, hdr->cts.origin_rx); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"(%d) %s():%u .cts.target.vaddr.ntidpairs .......... 0x%x \n", pid, fn, ln, hdr->cts.target.vaddr.ntidpairs); @@ -1903,6 +1967,8 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr break; case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT: case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT: + case FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ: fprintf(stderr, "(%d) %s():%u .inject.message_length .............. 0x%02x\n", pid, fn, ln, hdr->inject.message_length); fprintf(stderr, "(%d) %s():%u .inject.app_data_u64[0] 0x%016lx\n", @@ -1912,6 +1978,8 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr break; case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER: case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER: + case FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ: fprintf(stderr, "(%d) %s():%u .send.xfer_bytes_tail ............... 0x%02x\n", pid, fn, ln, hdr->send.xfer_bytes_tail); fprintf(stderr, "(%d) %s():%u .send.payload_qws_total 0x%04x\n", @@ -1969,6 +2037,8 @@ void fi_opx_hfi1_dump_packet_hdr (const union fi_opx_hfi1_packet_hdr * const hdr break; case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS: case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS: /* calculate (?) total bytes to be transfered */ + case FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ: + case FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ: /* calculate (?) 
total bytes to be transfered */ break; default: break; diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h index 5258bc47587..3e57c064dc3 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h @@ -368,15 +368,15 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, "================ received a packet from the fabric\n"); if (!OPX_RHF_IS_USE_EGR_BUF(rhf,hfi1_type)) { - if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { + if (OFI_LIKELY(FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { /* "header only" packet - no payload */ fi_opx_ep_rx_process_header(&opx_ep->ep_fid, hdr, NULL, 0, FI_TAGGED, - FI_OPX_HFI_BTH_OPCODE_TAG_INJECT, + opcode, origin_rx, OPX_INTRANODE_FALSE, lock_required, reliability, hfi1_type, slid); - } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { + } else if (FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) { /* all other "tag" packets */ fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, NULL, 0, opcode, origin_rx, OPX_INTRANODE_FALSE, @@ -415,17 +415,17 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); } - if (OFI_LIKELY(opcode == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)) { + if (OFI_LIKELY(FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) == FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)) { fi_opx_ep_rx_process_header( &opx_ep->ep_fid, hdr, (const union fi_opx_hfi1_packet_payload *const)payload, - payload_bytes_to_copy, FI_TAGGED, FI_OPX_HFI_BTH_OPCODE_TAG_EAGER, + payload_bytes_to_copy, FI_TAGGED, opcode, origin_rx, OPX_INTRANODE_FALSE, lock_required, reliability, hfi1_type, slid); - } else if (opcode > FI_OPX_HFI_BTH_OPCODE_TAG_EAGER) { /* all other "tag" packets */ + } else if (FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) { /* all other "tag" packets */ 
fi_opx_ep_rx_process_header_tag(&opx_ep->ep_fid, hdr, payload, payload_bytes_to_copy, opcode, origin_rx, OPX_INTRANODE_FALSE, @@ -685,10 +685,10 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, slid = htons(hdr->lrh_16B.slid20 << 20 | hdr->lrh_16B.slid); } - if (opcode == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { + if (FI_OPX_HFI_BTH_OPCODE_WITHOUT_CQ(opcode) == FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { fi_opx_ep_rx_process_header(ep, hdr, NULL, 0, FI_TAGGED, - FI_OPX_HFI_BTH_OPCODE_TAG_INJECT, + opcode, (const uint8_t) origin_reliability_rx, OPX_INTRANODE_TRUE, lock_required, @@ -733,7 +733,7 @@ void fi_opx_shm_poll_many(struct fid_ep *ep, const int lock_required, payload_bytes_to_copy = total_bytes_to_copy - sizeof(struct fi_opx_hfi1_stl_packet_hdr_16B); } - if (opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT) { + if (FI_OPX_HFI_BTH_OPCODE_IS_TAGGED(opcode)) { fi_opx_ep_rx_process_header_tag(ep, hdr, payload, payload_bytes_to_copy, opcode, @@ -782,7 +782,7 @@ void fi_opx_hfi1_poll_sdma_completion(struct fi_opx_ep *opx_ep) hfi->info.sdma.queued_entries[hfi->info.sdma.done_index]->errcode = entry->errcode; hfi->info.sdma.queued_entries[hfi->info.sdma.done_index] = NULL; - assert(entry->status == COMPLETE || entry->status == FREE || + assert(entry->status == COMPLETE || entry->status == FREE || (entry->status == ERROR && entry->errcode != ECOMM)); // If it is a network error, retry ++hfi->info.sdma.available_counter; hfi->info.sdma.done_index = (hfi->info.sdma.done_index + 1) % (queue_size); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index 4ac1acdf7c0..344ab734e4b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -1033,6 +1033,7 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, const uint32_t data, int lock_required, const uint64_t dest_rx, + uint64_t 
tx_op_flags, const uint64_t caps, const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) @@ -1079,9 +1080,8 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, hdr->qw_9B[1] = opx_ep->tx->inject_9B.hdr.qw_9B[1] | bth_rx | (len << 48) | ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT); - + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)); hdr->qw_9B[2] = opx_ep->tx->inject_9B.hdr.qw_9B[2]; hdr->qw_9B[3] = opx_ep->tx->inject_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); @@ -1097,9 +1097,9 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, hdr->qw_16B[1] = opx_ep->tx->inject_16B.hdr.qw_16B[1] | (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | - ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT); + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)); hdr->qw_16B[3] = opx_ep->tx->inject_16B.hdr.qw_16B[3]; hdr->qw_16B[4] = opx_ep->tx->inject_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), @@ -1179,8 +1179,8 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, opx_ep->tx->inject_9B.hdr.qw_9B[1] | bth_rx | (len << 48) | ((caps & FI_MSG) ? 
/* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT), + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)), opx_ep->tx->inject_9B.hdr.qw_9B[2] | psn, opx_ep->tx->inject_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), @@ -1195,9 +1195,9 @@ ssize_t fi_opx_hfi1_tx_inject (struct fid_ep *ep, opx_ep->tx->inject_16B.hdr.qw_16B[1] | (((uint64_t)(dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->tx->inject_16B.hdr.qw_16B[2] | bth_rx | (len << 48) | - ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT), + ((caps & FI_MSG) ? /* compile-time constant expression */ + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_INJECT) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)), opx_ep->tx->inject_16B.hdr.qw_16B[3] | psn, opx_ep->tx->inject_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), @@ -1344,6 +1344,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, const uint32_t data, int lock_required, const uint64_t dest_rx, + uint64_t tx_op_flags, const uint64_t caps, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, @@ -1400,8 +1401,8 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, #endif hdr->qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); hdr->qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2]; hdr->qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); hdr->qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); @@ -1510,6 +1511,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz data, lock_required, dest_rx, + tx_op_flags, caps, do_cq_completion, iface, @@ -1577,8 +1579,8 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type); replay->scb.scb_9B.hdr.qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); replay->scb.scb_9B.hdr.qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER - : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); replay->scb.scb_9B.hdr.qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2] | psn; replay->scb.scb_9B.hdr.qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); replay->scb.scb_9B.hdr.qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); @@ -1648,6 +1650,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, const uint32_t data, int lock_required, const uint64_t dest_rx, + uint64_t tx_op_flags, const uint64_t caps, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, @@ -1709,8 +1712,8 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, ((uint64_t)((lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); hdr->qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3]; hdr->qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); hdr->qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); @@ -1832,6 +1835,7 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, data, lock_required, dest_rx, + tx_op_flags, caps, do_cq_completion, iface, @@ -1901,8 +1905,9 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, replay->scb.scb_16B.hdr.qw_16B[0] = opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_qws << 20); replay->scb.scb_16B.hdr.qw_16B[1] = opx_ep->tx->send_16B.hdr.qw_16B[1] |((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); replay->scb.scb_16B.hdr.qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((caps & FI_MSG) ? + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); replay->scb.scb_16B.hdr.qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3] | psn; replay->scb.scb_16B.hdr.qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); replay->scb.scb_16B.hdr.qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); @@ -2027,6 +2032,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, const uint32_t data, int lock_required, const uint64_t dest_rx, + uint64_t tx_op_flags, const uint64_t caps, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, @@ -2082,8 +2088,8 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, hdr->qw_9B[0] = opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); hdr->qw_9B[2] = opx_ep->tx->send_9B.hdr.qw_9B[2]; hdr->qw_9B[3] = opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); hdr->qw_9B[4] = opx_ep->tx->send_9B.hdr.qw_9B[4] | (payload_qws_total << 48); @@ -2133,6 +2139,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, const uint32_t data, int lock_required, const uint64_t dest_rx, + uint64_t tx_op_flags, const uint64_t caps, const uint64_t do_cq_completion, const enum fi_hmem_iface iface, @@ -2192,8 +2199,8 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, ((uint64_t)((lrh_dlid & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER); + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)); hdr->qw_16B[3] = opx_ep->tx->send_16B.hdr.qw_16B[3]; hdr->qw_16B[4] = opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); hdr->qw_16B[5] = opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48); @@ -2248,6 +2255,7 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, const uint32_t psn, const uint32_t data, const uint64_t tag, + uint64_t tx_op_flags, const uint64_t caps, const enum opx_hfi1_type hfi1_type) { @@ -2270,8 +2278,8 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? 
- (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), @@ -2291,8 +2299,8 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), @@ -2308,8 +2316,8 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), opx_ep->tx->send_9B.hdr.qw_9B[2] | psn, opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), @@ -2322,7 +2330,10 @@ ssize_t fi_opx_hfi1_tx_egr_write_packet_header(struct fi_opx_ep *opx_ep, opx_ep->tx->send_16B.qw0 | OPX_PBC_LEN(pbc_dws, hfi1_type) | OPX_PBC_CR(opx_ep->tx->force_credit_return, hfi1_type) | pbc_dlid, opx_ep->tx->send_16B.hdr.qw_16B[0] | ((uint64_t)(lrh_dlid_16B & OPX_LRH_JKR_16B_DLID_MASK_16B) << OPX_LRH_JKR_16B_DLID_SHIFT_16B) | ((uint64_t)lrh_packet_length << 20), opx_ep->tx->send_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), - opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER), + opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | (xfer_bytes_tail << 48) | + ((caps & FI_MSG) ? + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_EAGER) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_EAGER)), opx_ep->tx->send_16B.hdr.qw_16B[3] | psn, opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), opx_ep->tx->send_16B.hdr.qw_16B[5] | (payload_qws_total << 48), @@ -2531,7 +2542,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { return fi_opx_hfi1_tx_send_egr_intranode(ep, buf, len, desc, dest_addr, - tag, context, data, lock_required, dest_rx, caps, do_cq_completion, + tag, context, data, lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, iface, hmem_device); } @@ -2609,7 +2620,7 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, #endif fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, lrh_dws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, - payload_qws_total, psn, data, tag, caps, hfi1_type); + payload_qws_total, psn, data, tag, tx_op_flags, caps, hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); @@ -2678,7 +2689,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { return fi_opx_hfi1_tx_send_egr_intranode_16B(ep, buf, len, desc, dest_addr, - tag, context, data, lock_required, dest_rx, caps, do_cq_completion, + tag, context, data, lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, iface, hmem_device); } @@ -2765,7 +2776,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, #endif fi_opx_hfi1_tx_egr_write_packet_header(opx_ep, &pio_state, local_temp, buf, bth_rx, lrh_dlid, lrh_qws, pbc_dlid, pbc_dws, len, xfer_bytes_tail, - payload_qws_total, psn, data, tag, caps, hfi1_type); + payload_qws_total, psn, data, tag, tx_op_flags, caps, hfi1_type); uint64_t *buf_qws = (uint64_t*)((uintptr_t)buf + xfer_bytes_tail); @@ -2907,6 +2918,7 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ const uint32_t psn, const uint32_t data, 
const uint64_t tag, + uint64_t tx_op_flags, const uint64_t caps, const enum opx_hfi1_type hfi1_type) { @@ -2926,8 +2938,8 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ opx_ep->tx->send_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), opx_ep->tx->send_9B.hdr.qw_9B[1] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST), + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST)), opx_ep->tx->send_9B.hdr.qw_9B[2] | (payload_bytes_total << 32) | psn, opx_ep->tx->send_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), *((uint64_t *)buf), @@ -2944,8 +2956,8 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_initial_packet_header(struct fi_opx_ep *opx_ ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->tx->send_16B.hdr.qw_16B[2] | bth_rx | FI_OPX_MP_EGR_XFER_BYTES_TAIL | ((caps & FI_MSG) ? /* compile-time constant expression */ - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST), + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST_CQ : (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_MP_EAGER_FIRST)), opx_ep->tx->send_16B.hdr.qw_16B[3] | psn | (payload_bytes_total << 32), opx_ep->tx->send_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), *((uint64_t *)buf), @@ -3140,6 +3152,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common(struct fi_opx_ep *opx_ep, uint64_t tag, const uint32_t data, int lock_required, + uint64_t tx_op_flags, const uint64_t caps, const enum ofi_reliability_kind reliability, uint32_t *psn_out, @@ -3203,6 +3216,7 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common(struct fi_opx_ep *opx_ep, psn, data, tag, + tx_op_flags, caps, hfi1_type); diff --git a/prov/opx/include/rdma/opx/fi_opx_tagged.h b/prov/opx/include/rdma/opx/fi_opx_tagged.h index 38143f24278..ccf378a050e 100644 --- a/prov/opx/include/rdma/opx/fi_opx_tagged.h +++ b/prov/opx/include/rdma/opx/fi_opx_tagged.h @@ -78,11 +78,12 @@ dest_addr, tag, 0, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + 0, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY, \ + RELIABILITY, \ HFI1_TYPE); \ } \ - __OPX_FORCE_INLINE__ ssize_t \ + __OPX_FORCE_INLINE__ ssize_t \ fi_opx_tsenddata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ void *desc, uint64_t data, fi_addr_t dest_addr, \ @@ -90,27 +91,28 @@ { \ return fi_opx_ep_tx_send(ep, buf, len, desc, \ dest_addr, tag, context, data, \ - LOCK, /* lock_required */ \ - AV, /* av_type */ \ - 1, /* is_contiguous */ \ - 0, /* override_flags */ \ - 0, /* flags */ \ + LOCK, /* lock_required */ \ + AV, /* av_type */ \ + 1, /* is_contiguous */ \ + 0, /* override_flags */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY, \ + RELIABILITY, \ HFI1_TYPE); \ } \ - __OPX_FORCE_INLINE__ ssize_t \ + __OPX_FORCE_INLINE__ ssize_t \ fi_opx_tinjectdata_ ## LOCK ## _ ## AV ## _ ## CAPS ## _ ## RELIABILITY ## _ ## HFI1_TYPE \ (struct fid_ep *ep, const void *buf, size_t len, \ 
uint64_t data, fi_addr_t dest_addr, \ uint64_t tag) \ - { \ + { \ return fi_opx_ep_tx_inject(ep, buf, len, \ dest_addr, tag, data, \ LOCK, /* lock_required */ \ AV, /* av_type */ \ + FI_REMOTE_CQ_DATA, /* flags */ \ CAPS | FI_TAGGED, \ - RELIABILITY, \ + RELIABILITY, \ HFI1_TYPE); \ } diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 3cd5eabfdae..eebd26186a5 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -2956,7 +2956,7 @@ void fi_opx_ep_rx_reliability_process_packet (struct fid_ep * ep, slid = htons(((hdr->lrh_16B.slid20 << 20) | (hdr->lrh_16B.slid))); } - if (OFI_LIKELY(opcode >= FI_OPX_HFI_BTH_OPCODE_TAG_INJECT)) { + if (OFI_LIKELY(opcode & FI_OPX_HFI_BTH_OPCODE_TAG_BIT)) { fi_opx_ep_rx_process_header(ep, hdr, (const union fi_opx_hfi1_packet_payload * const) payload, payload_bytes, diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index cdd21771860..c4de298f527 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -3678,8 +3678,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); hdr->qw_9B[4] = opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; @@ -3693,8 +3693,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz hdr->qw_16B[1] = opx_ep->tx->rzv_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); hdr->qw_16B[3] = opx_ep->tx->rzv_16B.hdr.qw_16B[3]; hdr->qw_16B[4] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); hdr->qw_16B[5] = opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK; @@ -3830,8 +3830,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), opx_ep->tx->rzv_9B.hdr.qw_9B[2] | psn, opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, @@ -3848,8 +3848,8 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, siz opx_ep->tx->rzv_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | - ((caps & FI_MSG) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), opx_ep->tx->rzv_16B.hdr.qw_16B[3] | psn, opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (niov << 48) | FI_OPX_PKT_RZV_FLAGS_NONCONTIG_MASK, @@ -4077,9 +4077,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, hdr->qw_9B[0] = opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32); hdr->qw_9B[1] = opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); hdr->qw_9B[2] = opx_ep->tx->rzv_9B.hdr.qw_9B[2]; hdr->qw_9B[3] = opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32); @@ -4250,9 +4249,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, OPX_PBC_LRH_DLID_TO_PBC_DLID(lrh_dlid, hfi1_type), opx_ep->tx->rzv_9B.hdr.qw_9B[0] | lrh_dlid | ((uint64_t)lrh_dws << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[1] | bth_rx | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), opx_ep->tx->rzv_9B.hdr.qw_9B[2] | psn, opx_ep->tx->rzv_9B.hdr.qw_9B[3] | (((uint64_t)data) << 32), opx_ep->tx->rzv_9B.hdr.qw_9B[4] | (1ull << 48), @@ -4535,9 +4533,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)); hdr->qw_16B[2] = opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS); + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? 
(uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)); hdr->qw_16B[3] = opx_ep->tx->rzv_16B.hdr.qw_16B[3]; hdr->qw_16B[4] = opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32); @@ -4705,9 +4702,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B (struct fid_ep *ep, opx_ep->tx->rzv_16B.hdr.qw_16B[1] | ((uint64_t)((lrh_dlid_16B & OPX_LRH_JKR_16B_DLID20_MASK_16B) >> OPX_LRH_JKR_16B_DLID20_SHIFT_16B)), opx_ep->tx->rzv_16B.hdr.qw_16B[2] | bth_rx | - ((caps & FI_MSG) ? - (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS : - (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS), + ((caps & FI_MSG) ? ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) : + ((tx_op_flags & FI_REMOTE_CQ_DATA) ? (uint64_t)FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS_CQ : FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS)), opx_ep->tx->rzv_16B.hdr.qw_16B[3] | psn, opx_ep->tx->rzv_16B.hdr.qw_16B[4] | (((uint64_t)data) << 32), opx_ep->tx->rzv_16B.hdr.qw_16B[5] | (1ull << 48), diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index 8b46b8fac14..bda8429269e 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -454,7 +454,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, OPX_HFI1_BAR_STORE(&scb[5], 0UL); OPX_HFI1_BAR_STORE(&scb[6], 0UL); OPX_HFI1_BAR_STORE(&scb[7], key); - + /* consume one credit for the packet header */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); } else { @@ -841,7 +841,7 @@ void fi_opx_hfi1_rx_reliability_ping (struct fid_ep *ep, key, slid, rx, 0, /* psn_start */ 1, /* psn_count */ - FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, + FI_OPX_HFI_UD_OPCODE_RELIABILITY_NACK, OPX_HFI1_TYPE); INC_PING_STAT_COND(rc == FI_SUCCESS, NACKS_SENT, key, 0, 1); OPX_TRACER_TRACE_RELI(OPX_TRACER_END_ERROR, "RX_RELI_PING"); @@ -2003,7 +2003,7 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, if (!queing_replays) { #ifdef 
OPX_DEBUG_COUNTERS_RELIABILITY struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if(OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { + if(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(OPX_REPLAY_HDR(replay)->bth.opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rts); } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_cts); @@ -2047,7 +2047,7 @@ void fi_opx_hfi1_rx_reliability_nack (struct fid_ep *ep, } #ifdef OPX_DEBUG_COUNTERS_RELIABILITY struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); - if(OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS || OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_TAG_RZV_RTS) { + if(FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(OPX_REPLAY_HDR(replay)->bth.opcode) == FI_OPX_HFI_BTH_OPCODE_MSG_RZV_RTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_rts); } else if (OPX_REPLAY_HDR(replay)->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_CTS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.reliability.replay_cts); @@ -2095,7 +2095,7 @@ ssize_t fi_opx_reliability_send_ping(struct fid_ep *ep, uint64_t dlid; /* Inlined but called from non-inlined functions with no const hfi1 type, so just use the runtime check */ - if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + if (OPX_HFI1_TYPE & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { dlid = (uint64_t) head->scb.scb_9B.hdr.lrh_9B.dlid; } else { dlid = (uint64_t) htons(head->scb.scb_16B.hdr.lrh_16B.dlid20 << 20 | head->scb.scb_16B.hdr.lrh_16B.dlid); @@ -2158,16 +2158,16 @@ void fi_reliability_service_ping_remote (struct fid_ep *ep, fi_opx_rbt_key(itr, &key_value); rc = fi_opx_reliability_send_ping(ep, service, itr, key_value); - + /* advance to the next dlid */ - itr = 
rbtNext(service->tx.flow, itr); - + itr = rbtNext(service->tx.flow, itr); + if(rc == OPX_RELIABILITY_PING_SENT) { ++num_pings; } } - /* We ran out of credits on a particular ping. + /* We ran out of credits on a particular ping. * Store the failing key to be the first to try next time, * set the congested flag to limit future pings, and stop */ if (!rc) { @@ -2184,7 +2184,7 @@ void fi_reliability_service_ping_remote (struct fid_ep *ep, return; } service->tx.ping_start_key = 0; - return; + return; } /* We hit the end of the tree. If there was no starting key, we've iterated through the whole tree and we're done. */ @@ -2204,7 +2204,7 @@ void fi_reliability_service_ping_remote (struct fid_ep *ep, /* advance to the next dlid */ itr = rbtNext(service->tx.flow, itr); - + if(rc == OPX_RELIABILITY_PING_SENT) { ++num_pings; } @@ -2225,9 +2225,9 @@ void fi_reliability_service_ping_remote (struct fid_ep *ep, service->tx.ping_start_key = 0; return; } - + service->tx.ping_start_key = 0; - + // We iterated through the whole tree, unset the congested flag service->tx.congested_flag = 0; } @@ -2415,7 +2415,7 @@ void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * servi { /* Ping model */ { - /* PBC */ + /* PBC */ const uint64_t pbc_dws = 2 + /* pbc */ 4 + /* lrh uncompressed */ @@ -2423,7 +2423,7 @@ void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * servi 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ 2 ; /* ICRC/tail */ - + /* Setup the 16B models whether or not they'll be used */ enum opx_hfi1_type __attribute__ ((unused)) hfi1_type = OPX_HFI1_JKR; @@ -2453,8 +2453,8 @@ void fi_opx_reliability_model_init_16B(struct fi_opx_reliability_service * servi service->tx.hfi1.ping_model_16B.hdr.lrh_16B.rc = OPX_RC_IN_ORDER_0; service->tx.hfi1.ping_model_16B.hdr.lrh_16B.cspec = OPX_BTH_CSPEC_DEFAULT; /*NOT BTH CSPEC*/ service->tx.hfi1.ping_model_16B.hdr.lrh_16B.pkey = hfi1->pkey; - - + + service->tx.hfi1.ping_model_16B.hdr.lrh_16B.slid = 
hfi1->lid & 0xFFFFF; service->tx.hfi1.ping_model_16B.hdr.lrh_16B.slid20 = (hfi1->lid) >> 20; @@ -2710,7 +2710,7 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser max_uncongested_pings = OPX_RELIABILITY_MAX_UNCONGESTED_PINGS_DEFAULT; } service->tx.max_uncongested_pings = max_uncongested_pings; - + int max_congested_pings; if(fi_param_get_int(fi_opx_global.prov, "reliability_max_congested_pings", &max_congested_pings) == FI_SUCCESS) { if (max_congested_pings < OPX_RELIABILITY_MAX_CONGESTED_PINGS_MIN || max_congested_pings > OPX_RELIABILITY_MAX_CONGESTED_PINGS_MAX) { diff --git a/prov/opx/src/fi_opx_tagged.c b/prov/opx/src/fi_opx_tagged.c index d0ef9a23aa3..d2b03c056cc 100644 --- a/prov/opx/src/fi_opx_tagged.c +++ b/prov/opx/src/fi_opx_tagged.c @@ -238,6 +238,7 @@ ssize_t fi_opx_tsendmsg(struct fid_ep *ep, msg->addr, msg->tag, msg->data, FI_OPX_LOCK_NOT_REQUIRED, av_type, + flags, caps | FI_TAGGED, opx_ep->reliability->state.kind, OPX_HFI1_TYPE); From b88aa97ef0185a8d2744645a8c0265f7561e4bb8 Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Fri, 11 Oct 2024 10:18:36 -0500 Subject: [PATCH 168/393] prov/opx: Add debug check for zero-byte length data packets Signed-off-by: Ben Lynam --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 56 ++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 194637b7014..bab44eb0631 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -2067,7 +2067,17 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, hdr->dput.target.bytes; assert(bytes <= FI_OPX_HFI1_PACKET_MTU); - +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received RZV (non-TID) data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. 
Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif const uint64_t *sbuf_qws = (uint64_t*)&payload->byte[0]; #ifdef OPX_HMEM if (target_context->flags & FI_OPX_CQ_CONTEXT_HMEM) { @@ -2230,6 +2240,17 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, hdr->dput.target.bytes; assert(bytes <= FI_OPX_HFI1_PACKET_MTU); +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received RMA PUT data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif // Optimize Memcpy if(hdr->dput.target.op == FI_NOOP - 1 && hdr->dput.target.dt == FI_VOID - 1) { @@ -2265,6 +2286,17 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, assert(cc); assert(bytes <= FI_OPX_HFI1_PACKET_MTU); +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received RMA GET data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? 
"last_bytes" : "bytes"); + abort(); + } +#endif if (hdr->dput.target.dt == (FI_VOID - 1)) { OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, OPX_HMEM_NO_HANDLE, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, @@ -2310,6 +2342,17 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, hdr->dput.target.last_bytes : hdr->dput.target.bytes; +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received ATOMIC FETCH data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? "last_bytes" : "bytes"); + abort(); + } +#endif assert(bytes > sizeof(*dput_fetch)); uint64_t hmem_device; enum fi_hmem_iface hmem_iface = fi_opx_mr_get_iface(opx_mr, &hmem_device); @@ -2383,6 +2426,17 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, hdr->dput.target.last_bytes : hdr->dput.target.bytes; +#ifndef NDEBUG + if (bytes == 0) { + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Received ATOMIC COMPARE FETCH data packet with 0-byte payload size. hdr->dput.target.last_bytes=%hd, hdr->dput.target.bytes=%hd. Based on PSN high bit (%s), bytes was set to %s\n", + hdr->dput.target.last_bytes, + hdr->dput.target.bytes, + (ntohl(hdr->bth.psn) & 0x80000000) ? "ON" : "OFF", + (ntohl(hdr->bth.psn) & 0x80000000) ? 
"last_bytes" : "bytes"); + abort(); + } +#endif assert(bytes > sizeof(*dput_fetch)); uint64_t hmem_device; enum fi_hmem_iface hmem_iface = fi_opx_mr_get_iface(opx_mr, &hmem_device); From 55a9daf1407c30d0e0b2207ce1cd5405cd652577 Mon Sep 17 00:00:00 2001 From: Jack Morrison Date: Sat, 12 Oct 2024 14:25:21 -0400 Subject: [PATCH 169/393] github/actions: Remove unused Cornelis Networks formatting workflow Signed-off-by: Jack Morrison --- .github/workflows/clang-format-check-cn.yml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 .github/workflows/clang-format-check-cn.yml diff --git a/.github/workflows/clang-format-check-cn.yml b/.github/workflows/clang-format-check-cn.yml deleted file mode 100644 index 8474d3326d4..00000000000 --- a/.github/workflows/clang-format-check-cn.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: clang-format-cn Check -on: workflow_dispatch -jobs: - formatting-check: - name: Formatting Check - runs-on: ubuntu-latest - strategy: - matrix: - path: - - 'prov/opx' - steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Run clang-format style check for C/C++/Protobuf programs (Cornelis Networks-specific). 
- uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 - with: - clang-format-version: '15' - check-path: ${{ matrix.path }} From e5a9dae17e3963acc6a8bd00200ae123f248f0e3 Mon Sep 17 00:00:00 2001 From: Ben Lynam Date: Mon, 14 Oct 2024 11:40:59 -0500 Subject: [PATCH 170/393] prov/opx: Set immediate ACK requested bit when sending last packet of RMA PUT Signed-off-by: Ben Lynam --- .../include/rdma/opx/fi_opx_hfi1_progress.h | 8 ++-- prov/opx/src/fi_opx_hfi1.c | 47 ++++++++++++------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h index 3e57c064dc3..081058eb5da 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h @@ -473,11 +473,9 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, const uint8_t opcode, hdr, origin_rx, slid, hfi1_type); } else if (hdr->bth.opcode == FI_OPX_HFI_BTH_OPCODE_RZV_DATA && - ((ntohl(hdr->bth.psn) & 0x80000000) || - (hdr->dput.target.opcode == FI_OPX_HFI_DPUT_OPCODE_PUT))) { - /* Send preemptive ACKs on Rendezvous FI_OPX_HFI_DPUT_OPCODE_PUT or - * on the final packet of a Rendezvous SDMA writev (the high bit - * of the PSN - the Acknowledge Request bit - is set) + (ntohl(hdr->bth.psn) & 0x80000000)) { + /* Send preemptive ACKs on Rendezvous Data packets when + * the high bit of the PSN - the Acknowledge Request bit - is set */ uint32_t psn_count = MAX(MIN(opx_ep->reliability->service.preemptive_ack_rate, psn), 1); assert(psn >= psn_count - 1); diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index c4de298f527..18e1f48bcba 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -2449,12 +2449,21 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) uint64_t bytes_to_send = dput_iov[i].bytes - params->bytes_sent; while (bytes_to_send > 0) { - uint64_t 
bytes_to_send_this_packet, blocks_to_send_in_this_packet; + uint64_t bytes_to_send_this_packet; + uint64_t blocks_to_send_in_this_packet; + uint64_t pbc_dws; + uint16_t lrh_dws; if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { bytes_to_send_this_packet = MIN(bytes_to_send + params->payload_bytes_for_iovec, max_bytes_per_packet); uint64_t tail_bytes = bytes_to_send_this_packet & 0x3Ful; blocks_to_send_in_this_packet = (bytes_to_send_this_packet >> 6) + (tail_bytes ? 1 : 0); + pbc_dws = 2 + /* pbc */ + 2 + /* lrh */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + (blocks_to_send_in_this_packet << 4); + lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ } else { /* 1 QW for hdr that spills to 2nd cacheline + 1 QW for ICRC/tail */ const uint64_t additional_hdr_tail_byte = 2 * 8; @@ -2463,19 +2472,6 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) uint64_t tail_bytes = payload_n_additional_hdr_tail_bytes & 0x3Ful; blocks_to_send_in_this_packet = (payload_n_additional_hdr_tail_bytes >> 6) + (tail_bytes ? 
1 : 0); bytes_to_send_this_packet = payload_n_additional_hdr_tail_bytes - additional_hdr_tail_byte; - - } - - uint64_t pbc_dws; - uint16_t lrh_dws; - if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { - pbc_dws = 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - (blocks_to_send_in_this_packet << 4); - lrh_dws = htons(pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ - } else { pbc_dws = 2 + /* pbc */ 4 + /* lrh uncompressed */ 3 + /* bth */ @@ -2515,8 +2511,7 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) } else { union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - const uint16_t credits_needed = blocks_to_send_in_this_packet - + 1 /* header */; + const uint16_t credits_needed = blocks_to_send_in_this_packet + 1 /* header */; uint32_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, @@ -2539,8 +2534,9 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; - psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid, - u8_rx, params->origin_rs, &psn_ptr, &replay, reliability, hfi1_type); + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, + params->slid, u8_rx, params->origin_rs, &psn_ptr, + &replay, reliability, hfi1_type); if(OFI_UNLIKELY(psn == -1)) { return -FI_EAGAIN; } @@ -2578,6 +2574,21 @@ int fi_opx_hfi1_do_dput (union fi_opx_hfi1_deferred_work * work) FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); if (opcode == FI_OPX_HFI_DPUT_OPCODE_PUT) { + if (bytes_to_send == bytes_sent) { + /* This is the last packet to send for this PUT. 
+ Turn on the immediate ACK request bit so the + user gets control of their buffer back ASAP */ + const uint64_t set_ack_bit = (uint64_t)htonl(0x80000000); + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { + replay->scb.scb_9B.hdr.qw_9B[2] |= set_ack_bit; + replay->scb.scb_9B.hdr.dput.target.last_bytes = + replay->scb.scb_9B.hdr.dput.target.bytes; + } else { + replay->scb.scb_16B.hdr.qw_16B[3] |= set_ack_bit; + replay->scb.scb_16B.hdr.dput.target.last_bytes = + replay->scb.scb_16B.hdr.dput.target.bytes; + } + } fi_opx_reliability_client_replay_register_with_update( &opx_ep->reliability->state, params->slid, params->origin_rs, u8_rx, psn_ptr, replay, cc, From 12131e9133b8bf995a5e82609c86286bb63103c7 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Fri, 18 Oct 2024 17:53:40 +0000 Subject: [PATCH 171/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_opx.7 | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index 3481361d6e2..0653d54a3ef 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_opx" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_opx" "7" "2024\-10\-18" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -152,18 +152,18 @@ Default setting is 64. \f[I]FI_OPX_RELIABILITY_MAX_UNCONGESTED_PINGS\f[R] Integer. This setting controls how many PING requests the reliability/replay -function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX -in situations with less contending, outgoing traffic from the HFI. -.PP -Default setting is 128. Range of valid values is 1-65535. +function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX +in situations with less contending outgoing traffic from the HFI. +Default setting is 128. +Range of valid values is 1-65535. 
.TP \f[I]FI_OPX_RELIABILITY_MAX_CONGESTED_PINGS\f[R] Integer. This setting controls how many PING requests the reliability/replay -function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX +function will issue per iteration of FI_OPX_RELIABILITY_SERVICE_USEC_MAX in situations with more contending, outgoing traffic from the HFI. -.PP -Default setting is 4. Range of valid values is 1-65535. +Default setting is 4. +Range of valid values is 1-65535. .TP \f[I]FI_OPX_SELINUX\f[R] Boolean (0/1, on/off, true/false, yes/no). @@ -268,6 +268,14 @@ using PIO. Value must be between 64 and 2147483646. Defaults to 16385. .TP +\f[I]FI_OPX_TID_MIN_PAYLOAD_BYTES\f[R] +Integer. +The minimum length in bytes where TID (Expected Receive) will be used. +For messages smaller than this threshold, the send will be completed +using Eager Receive. +Value must be between 4096 and 2147483646. +Defaults to 4096. +.TP \f[I]FI_OPX_RZV_MIN_PAYLOAD_BYTES\f[R] Integer. The minimum length in bytes where rendezvous will be used. 
From 2ddb37553c62e9b365f2d3ea97f9cc023d0974a6 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 18 Oct 2024 20:24:37 +0000 Subject: [PATCH 172/393] prov/efa: Remove unused fields in efa_rdm_ope shm_desc and peer_unexp_entry are not used after moving to peer provider and util srx Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ope.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ope.h b/prov/efa/src/rdm/efa_rdm_ope.h index d8483d96223..626eb6dc8d4 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.h +++ b/prov/efa/src/rdm/efa_rdm_ope.h @@ -110,7 +110,6 @@ struct efa_rdm_ope { size_t iov_count; struct iovec iov[EFA_RDM_IOV_LIMIT]; void *desc[EFA_RDM_IOV_LIMIT]; - void *shm_desc[EFA_RDM_IOV_LIMIT]; struct fid_mr *mr[EFA_RDM_IOV_LIMIT]; size_t rma_iov_count; @@ -144,8 +143,6 @@ struct efa_rdm_ope { uint64_t bytes_copied; uint64_t bytes_queued_blocking_copy; - /* linked to peer->rx_unexp_list or peer->rx_unexp_tagged_list */ - struct dlist_entry peer_unexp_entry; #if ENABLE_DEBUG /* linked with ope_recv_list in efa_rdm_ep */ struct dlist_entry pending_recv_entry; From ce936d067ad85526c237ec56299d0072677d5005 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 18 Oct 2024 18:14:30 +0000 Subject: [PATCH 173/393] fabtests/efa: Remove rnr cq error message check The current check for rnr cq error message string is unnecessary and fragile as we may adjust the error message. This patch removes such error message check and relies on the error code check. 
Signed-off-by: Shi Jin --- fabtests/prov/efa/src/rdm_rnr_read_cq_error.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c index 1eb11acdbaa..4c7edf2886c 100644 --- a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c +++ b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c @@ -43,7 +43,6 @@ static int rnr_read_cq_error(void) { int total_send, expected_rnr_error; int ret, i, cnt, rnr_flag; - const char *prov_errmsg; expected_rnr_error = fi->rx_attr->size; rnr_flag = 0; @@ -89,16 +88,6 @@ static int rnr_read_cq_error(void) rnr_flag = 1; printf("Got RNR error CQ entry as expected: %d, %s\n", comp_err.err, fi_strerror(comp_err.err)); - prov_errmsg = fi_cq_strerror(txcq, comp_err.prov_errno, - comp_err.err_data, - comp_err.buf, - comp_err.len); - if (strstr(prov_errmsg, "Destination resource not ready") == NULL) { - printf("Got unexpected provider error message.\n"); - printf(" Expected error message to have \"Destination resource not ready\" in it\n"); - printf(" Got: %s\n", prov_errmsg); - return -FI_EINVAL; - } } else { printf("Got non-RNR error CQ entry: %d, %s\n", comp_err.err, fi_strerror(comp_err.err)); From a3164b0bbcd13ce0d196c21a370516cb042382fe Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 17 Oct 2024 13:37:48 -0700 Subject: [PATCH 174/393] fabtests: fix complex fill cast The previous implementation was trying to fill complex numbers with multiple chars in the integ_alphabet to randomize the data. Unfortunately, this resulted in many complex numbers becoming invalid numbers (NaN). Because of that, the data validation check would fail as NaN != NaN To fix this, when filling the data, set the real and imaginary parts as a single char element so the individual part is valid and can be used in the computation and check. 
Because the implementation of complex types differs on unix and Windows systems, this requires the addition of a new function that initializes the individual components of the complex number instead of the number as a whole. This requires moving the long_double typedef into the osd.h headers to help with the casting. Signed-off-by: Alexia Ingerson --- fabtests/include/ofi_atomic.h | 1 - fabtests/include/shared.h | 4 ++-- fabtests/include/unix/osd.h | 6 ++++++ fabtests/include/windows/osd.h | 6 ++++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fabtests/include/ofi_atomic.h b/fabtests/include/ofi_atomic.h index 765a4a8137f..abb87bc4620 100644 --- a/fabtests/include/ofi_atomic.h +++ b/fabtests/include/ofi_atomic.h @@ -40,7 +40,6 @@ extern "C" { #endif -typedef long double long_double; #define OFI_WRITE_OP_START FI_MIN #define OFI_WRITE_OP_LAST (FI_ATOMIC_WRITE + 1) diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 7ff3aa49cb6..e15c61daea7 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -775,8 +775,8 @@ static inline void *ft_get_page_end(const void *addr, size_t page_size) int i, a = 0; \ OFI_COMPLEX(type) *d = (dst); \ for (i = 0; i < cnt; i++) { \ - ofi_complex_set_##type (&d[i], \ - *(OFI_COMPLEX(type) *) &integ_alphabet[a]); \ + ofi_complex_fill_##type (&d[i], \ + (type) integ_alphabet[a]); \ if (++a >= integ_alphabet_length) \ a = 0; \ } \ diff --git a/fabtests/include/unix/osd.h b/fabtests/include/unix/osd.h index 07a3ab09f60..66956aeb79f 100644 --- a/fabtests/include/unix/osd.h +++ b/fabtests/include/unix/osd.h @@ -83,6 +83,8 @@ static inline int ofi_sockerr(void) return errno; } +typedef long double long_double; + /* complex operations implementation */ #define OFI_COMPLEX(name) ofi_complex_##name #define OFI_COMPLEX_OP(name, op) ofi_complex_##op##_##name @@ -120,6 +122,10 @@ static inline OFI_COMPLEX(name) OFI_COMPLEX_OP(name, lxor)(OFI_COMPLEX(name) v1, static inline void 
OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX(name) v2) \ { \ *v1 = v2; \ +} \ +static inline void OFI_COMPLEX_OP(name, fill)(OFI_COMPLEX(name) *v1, name v2) \ +{ \ + *v1 = CMPLX(v2, v2); \ } OFI_COMPLEX_OPS(float) diff --git a/fabtests/include/windows/osd.h b/fabtests/include/windows/osd.h index 49bee1d7751..bc9fd781977 100644 --- a/fabtests/include/windows/osd.h +++ b/fabtests/include/windows/osd.h @@ -724,6 +724,7 @@ ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags) return (ssize_t) send(fd, (const char*) buf, len, flags); } +typedef long double long_double; /* complex operations implementation */ #define OFI_COMPLEX(name) ofi_complex_##name @@ -790,6 +791,11 @@ static inline void OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX( { \ v1->re = v2.re; \ v1->im = v2.im; \ +} \ +static inline void OFI_COMPLEX_OP(name, fill)(OFI_COMPLEX(name) *v1, char v2) \ +{ \ + v1->re = v2; \ + v1->im = v2; \ } OFI_COMPLEX_OPS(float) From c677d1ec6c321f677c3a148ce617e6fad084a2e9 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 16 Oct 2024 14:46:34 -0500 Subject: [PATCH 175/393] util/mr_cache: Support compile default monitor --with-default-monitor can be used to define a default memory monitor instead of relying on the hardcoded ordered list. Options are memhooks, uffd, or disabled. 
Signed-off-by: Ian Ziemba --- configure.ac | 19 ++++++++++ prov/util/src/util_mem_monitor.c | 60 ++++++++++++++++++++------------ 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/configure.ac b/configure.ac index 777a870f69a..7e9985991e4 100644 --- a/configure.ac +++ b/configure.ac @@ -885,6 +885,25 @@ AC_ARG_ENABLE([uffd-monitor], AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd], [Define to 1 to enable uffd memory monitor]) +default_monitor="" +bad_default="0" +AC_ARG_WITH([default-monitor], + [AS_HELP_STRING([--with-default-monitor=], + [Select the default memory monitor.])], + [AS_CASE([$with_default_monitor], + [memhooks],[default_monitor=memhooks], + [uffd],[default_monitor=uffd], + [disabled], [default_monitor=disabled], + [AC_MSG_ERROR([Unknown monitor specified: $with_default_monitor. Choices are memhooks, uffd, or disabled.])]) + AS_CASE([$default_monitor], + [memhooks], [AS_IF([test "$enable_memhooks" != "1"], [bad_default=1])], + [uffd], [AS_IF([test "$enable_uffd" != "1"], [bad_default=1])], + []) + AS_IF([test "$bad_default" != "0"], + [AC_MSG_ERROR(["Default memory monitor is not available: $default_monitor."])]) + AC_DEFINE_UNQUOTED([HAVE_MR_CACHE_MONITOR_DEFAULT], ["$default_monitor"], [Default memory monitor]) + ], + []) AH_BOTTOM([ #if defined(__linux__) && (defined(__x86_64__) || defined(__amd64__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)) && ENABLE_MEMHOOKS_MONITOR diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 746cf50fb59..10a6b4e2795 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -211,6 +211,30 @@ static void cleanup_monitor_list() { monitor_list_size = 0; } +static void set_default_monitor(const char *monitor) +{ + if (!monitor) + return; + + if (!strcmp(monitor, "userfaultfd") || !strcmp(monitor, "uffd")) { +#if HAVE_UFFD_MONITOR + default_monitor = uffd_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "userfaultfd 
monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(monitor, "memhooks")) { +#if HAVE_MEMHOOKS_MONITOR + default_monitor = memhooks_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(monitor, "disabled")) { + default_monitor = NULL; + } +} + /* * Initialize all available memory monitors */ @@ -247,9 +271,14 @@ void ofi_monitors_init(void) " address changes. Options are: userfaultfd, memhooks" " and disabled. Userfaultfd is a Linux kernel feature." " Memhooks operates by intercepting memory allocation" - " and free calls. Userfaultfd is the default if" - " available on the system. 'disabled' option disables" - " memory caching."); + " and free calls." +#if defined(HAVE_MR_CACHE_MONITOR_DEFAULT) + " " HAVE_MR_CACHE_MONITOR_DEFAULT +#else + " Userfaultfd" +#endif + " is the default if available on the system. 'disabled'" + " option disables memory caching."); fi_param_define(NULL, "mr_cuda_cache_monitor_enabled", FI_PARAM_BOOL, "Enable or disable the CUDA cache memory monitor." 
"Enabled by default."); @@ -278,7 +307,9 @@ void ofi_monitors_init(void) * do not override */ if (!default_monitor) { -#if HAVE_MEMHOOKS_MONITOR +#if defined(HAVE_MR_CACHE_MONITOR_DEFAULT) + set_default_monitor(HAVE_MR_CACHE_MONITOR_DEFAULT); +#elif HAVE_MEMHOOKS_MONITOR default_monitor = memhooks_monitor; #elif HAVE_UFFD_MONITOR default_monitor = uffd_monitor; @@ -287,25 +318,8 @@ void ofi_monitors_init(void) #endif } - if (cache_params.monitor != NULL) { - if (!strcmp(cache_params.monitor, "userfaultfd")) { -#if HAVE_UFFD_MONITOR - default_monitor = uffd_monitor; -#else - FI_WARN(&core_prov, FI_LOG_MR, "userfaultfd monitor not available\n"); - default_monitor = NULL; -#endif - } else if (!strcmp(cache_params.monitor, "memhooks")) { -#if HAVE_MEMHOOKS_MONITOR - default_monitor = memhooks_monitor; -#else - FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); - default_monitor = NULL; -#endif - } else if (!strcmp(cache_params.monitor, "disabled")) { - default_monitor = NULL; - } - } + if (cache_params.monitor != NULL) + set_default_monitor(cache_params.monitor); FI_INFO(&core_prov, FI_LOG_MR, "Default memory monitor is: %s\n", From 877221e27c2de9fd5dee0062d311e564db687d63 Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Mon, 11 Dec 2023 15:48:35 -0600 Subject: [PATCH 176/393] prov/util: Integrate kdreg2 into libfabric kdreg2 is a Linux kernel module used to enabled the libfabric MR cache for FI_HMEM_SYSTEM. 
Signed-off-by: Mike Uttormark Signed-off-by: Ian Ziemba --- Makefile.am | 1 + configure.ac | 51 +++- include/ofi_mr.h | 38 ++- libfabric.vcxproj | 1 + man/fi_mr.3.md | 5 +- prov/util/src/kdreg2_mem_monitor.c | 367 +++++++++++++++++++++++++++++ prov/util/src/util_mem_monitor.c | 17 +- 7 files changed, 472 insertions(+), 8 deletions(-) create mode 100644 prov/util/src/kdreg2_mem_monitor.c diff --git a/Makefile.am b/Makefile.am index 00242c7d65e..de2158c5fc1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -91,6 +91,7 @@ common_srcs = \ prov/util/src/rocr_ipc_monitor.c \ prov/util/src/ze_ipc_monitor.c \ prov/util/src/xpmem_monitor.c \ + prov/util/src/kdreg2_mem_monitor.c \ prov/util/src/util_profile.c \ prov/coll/src/coll_attr.c \ prov/coll/src/coll_av.c \ diff --git a/configure.ac b/configure.ac index 7e9985991e4..2b476f7f9d2 100644 --- a/configure.ac +++ b/configure.ac @@ -598,6 +598,53 @@ AC_ARG_ENABLE([restricted_dl], AC_DEFINE_UNQUOTED([HAVE_RESTRICTED_DL], [$restricted_dl], [Define to 1 to only look for dl providers under default location if FI_PROVIDER_PATH is not set]) +dnl Check kdreg2 support +kdreg2_enabled=1 +have_kdreg2=0 +have_kdreg2_include_path=0 + +AC_ARG_ENABLE([kdreg2], + [AC_HELP_STRING([--disable-kdreg2], + [Determine whether kdreg2 memory monitor is disabled.])], + [AS_IF([test "$enable_kdreg2" = "no"], [kdreg2_enabled=0])], + []) + +AS_IF([test $kdreg2_enabled -ne 0 ], + [AC_CHECK_HEADER([linux/kdreg2.h], [have_kdreg2=1], [], []) + + AC_ARG_WITH([kdreg2], + [AS_HELP_STRING([--with-kdreg2=DIR], + [Enable KDREG2 memory monitor. + Optional=.])], + [AS_CASE(["$with_kdreg2"], + ["no"], [kdreg2_enabled=0], + ["yes"], [], + [""], [], + [CPPFLAGS="$CPPFLAGS -I$with_kdreg2" + AC_CHECK_HEADER([kdreg2.h], + [have_kdreg2=1 + have_kdreg2_include_path=1], + [have_kdreg2=0], + [])]) + AS_IF([test $have_kdreg2 -eq 0 ], + [AC_MSG_ERROR([KDREG2 header not found in $with_kdreg2. 
Cannot enable KDREG2 memory monitor.])]) + ]) + ]) + +AS_IF([test $kdreg2_enabled -eq 0], + [AC_MSG_NOTICE([kdreg2 monitor disabled])], + [AS_IF([test $have_kdreg2 -ne 0], + [AC_MSG_NOTICE([kdreg2 present and enabled])])]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2, [$have_kdreg2], + [Define to 1 if kdreg2.h is available.]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_INCLUDE_PATH, [$have_kdreg2_include_path], + [Define to 1 if kdreg2.h path is not .]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_MONITOR, [$have_kdreg2], + [Define to 1 to enable kdreg2 memory monitor]) + dnl Check support to intercept syscalls AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h) @@ -888,16 +935,18 @@ AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd], default_monitor="" bad_default="0" AC_ARG_WITH([default-monitor], - [AS_HELP_STRING([--with-default-monitor=], + [AS_HELP_STRING([--with-default-monitor=], [Select the default memory monitor.])], [AS_CASE([$with_default_monitor], [memhooks],[default_monitor=memhooks], [uffd],[default_monitor=uffd], + [kdreg2],[default_monitor=kdreg2], [disabled], [default_monitor=disabled], [AC_MSG_ERROR([Unknown monitor specified: $with_default_monitor. Choices are memhooks, uffd, or disabled.])]) AS_CASE([$default_monitor], [memhooks], [AS_IF([test "$enable_memhooks" != "1"], [bad_default=1])], [uffd], [AS_IF([test "$enable_uffd" != "1"], [bad_default=1])], + [kdreg2], [AS_IF([test "$kdreg2_enabled" != "1"], [bad_default=1])], []) AS_IF([test "$bad_default" != "0"], [AC_MSG_ERROR(["Default memory monitor is not available: $default_monitor."])]) diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 64bae5f0755..6f85e07eadd 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -2,7 +2,7 @@ * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. 
- * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,6 +40,8 @@ # include #endif /* HAVE_CONFIG_H */ +struct ofi_mr; + #include #include @@ -48,6 +50,15 @@ #include #include #include +#include + +#if HAVE_KDREG2_MONITOR +#if HAVE_KDREG2_INCLUDE_PATH +#include "kdreg2.h" +#else +#include +#endif +#endif int ofi_open_mr_cache(uint32_t version, void *attr, size_t attr_len, uint64_t flags, struct fid **fid, void *context); @@ -128,6 +139,12 @@ struct ofi_mr_cache; union ofi_mr_hmem_info { uint64_t cuda_id; uint64_t ze_id; +#if HAVE_KDREG2_MONITOR + struct { + kdreg2_cookie_t cookie; + struct kdreg2_monitoring_params monitoring_params; + } kdreg2; +#endif }; struct ofi_mr_entry { @@ -228,6 +245,23 @@ struct ofi_memhooks { extern struct ofi_mem_monitor *memhooks_monitor; +/* + * Kdreg2 monitor + */ + +struct kdreg2_status_data; + +struct ofi_kdreg2 { + struct ofi_mem_monitor monitor; + pthread_t thread; + int fd; + int exit_pipe[2]; + const struct kdreg2_status_data *status_data; + ofi_atomic64_t next_cookie; +}; + +extern struct ofi_mem_monitor *kdreg2_monitor; + extern struct ofi_mem_monitor *cuda_monitor; extern struct ofi_mem_monitor *cuda_ipc_monitor; extern struct ofi_mem_monitor *rocr_monitor; @@ -368,7 +402,7 @@ struct ofi_mr_cache { struct ofi_rbmap tree; struct dlist_entry lru_list; struct dlist_entry dead_region_list; - pthread_mutex_t lock; + pthread_mutex_t lock; size_t cached_cnt; size_t cached_size; diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 1bc35fb93b5..b4e8dc9cbd3 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -759,6 +759,7 @@ + diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index 7e13d587c47..be43f409c8e 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -1054,12 +1054,13 @@ configure registration caches. 
: The cache monitor is responsible for detecting system memory (FI_HMEM_SYSTEM) changes made between the virtual addresses used by an application and the underlying physical pages. Valid monitor options are: userfaultfd, memhooks, - and disabled. Selecting disabled will turn off the registration cache. + kdreg2, and disabled. Selecting disabled will turn off the registration cache. Userfaultfd is a Linux kernel feature used to report virtual to physical address mapping changes to user space. Memhooks operates by intercepting relevant memory allocation and deallocation calls which may result in the mappings changing, such as malloc, mmap, free, etc. Note that memhooks - operates at the elf linker layer, and does not use glibc memory hooks. + operates at the elf linker layer, and does not use glibc memory hooks. Kdreg2 + is supplied as a loadable Linux kernel module. *FI_MR_CUDA_CACHE_MONITOR_ENABLED* : The CUDA cache monitor is responsible for detecting CUDA device memory diff --git a/prov/util/src/kdreg2_mem_monitor.c b/prov/util/src/kdreg2_mem_monitor.c new file mode 100644 index 00000000000..ba7c2a21d31 --- /dev/null +++ b/prov/util/src/kdreg2_mem_monitor.c @@ -0,0 +1,367 @@ +/* + * (C) Copyright 2022-2023 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ofi_mr.h" + +#if HAVE_KDREG2_MONITOR + +#include "ofi_hmem.h" + +#define EVICTOR_THREAD_ATTR NULL +#define INFINITE_TIMEOUT -1 + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + uint64_t cookie = ofi_atomic_inc64(&kdreg2->next_cookie); + struct kdreg2_ioctl_monitor ioctl_monitor = { + .addr = addr, + .length = len, + .cookie = (kdreg2_cookie_t) cookie, + }; + int ret; + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_MONITOR, &ioctl_monitor); + if (ret) + return ret; + + hmem_info->kdreg2.cookie = ioctl_monitor.cookie; + hmem_info->kdreg2.monitoring_params = ioctl_monitor.monitoring_params; + + return 0; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_ioctl_unmonitor ioctl_unmonitor = { + .cookie = hmem_info->kdreg2.cookie, + .monitoring_params = hmem_info->kdreg2.monitoring_params, + }; + + ioctl(kdreg2->fd, KDREG2_IOCTL_UNMONITOR, &ioctl_unmonitor); +} + +static bool 
kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_monitoring_params *params = + &entry->hmem_info.kdreg2.monitoring_params; + + return !kdreg2_mapping_changed(kdreg2->status_data, params); +} + +static int kdreg2_read_evictions(struct ofi_kdreg2 *kdreg2) +{ + struct kdreg2_event event; + ssize_t bytes; + int err; + + while (kdreg2_read_counter(&kdreg2->status_data->pending_events) > 0) { + + /* The read should return a multiple of sizeof(event) or + * an error. There should be no partial reads. + */ + + bytes = read(kdreg2->fd, &event, sizeof(event)); + if (bytes < 0) { + err = errno; + + /* EINTR means we caught a signal. */ + if (err == EINTR) + continue; + + /* Nothing left */ + if ((err == EAGAIN) || + (err == EWOULDBLOCK)) + return 0; + + /* All other errors */ + return err; + } + + switch (event.type) { + case KDREG2_EVENT_MAPPING_CHANGE: + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + + ofi_monitor_notify(&kdreg2->monitor, + event.u.mapping_change.addr, + event.u.mapping_change.len); + + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + + break; + + default: + + return -ENOMSG; + } + } + + return 0; +} + +static void kdreg2_close_pipe(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->exit_pipe[0]); + close(kdreg2->exit_pipe[1]); + kdreg2->exit_pipe[0] = -1; + kdreg2->exit_pipe[1] = -1; +} + +static void kdreg2_close_fd(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->fd); + kdreg2->fd = -1; + kdreg2->status_data = NULL; +} + +static void *kdreg2_evictor(void *arg) +{ + struct ofi_kdreg2 *kdreg2 = (struct ofi_kdreg2 *) arg; + int ret; + struct pollfd pollfd[2] = { + { + .fd = kdreg2->fd, + .events = POLLIN, + }, + { .fd = kdreg2->exit_pipe[0], + .events = POLLIN, + }, + }; + int n; + + while (1) { + + /* wait until there are events to read */ + 
n = poll(pollfd, 2, INFINITE_TIMEOUT); + if (n == 0) /* timeout(?) */ + continue; + + if (n < 0) { + switch (errno) { + case EINTR: /* interrupted */ + continue; + default: + ret = -errno; + goto error_ret; + } + } + + /* look for exit message on second fd */ + if (pollfd[1].revents) { + ret = 0; + goto error_ret; + } + + ret = kdreg2_read_evictions(kdreg2); + if (ret) + goto error_ret; + } + +error_ret: + + return (void *) (intptr_t) ret; +} + + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + int ret = 0; + struct kdreg2_config_data config_data; + + /* see if already started */ + if (kdreg2->fd >= 0) + return 0; + + ofi_atomic_initialize64(&kdreg2->next_cookie, 1); + + ret = pipe(kdreg2->exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to create pipe for kdreg2: %s\n", + strerror(errno)); + return -errno; + } + + kdreg2->fd = open(KDREG2_DEVICE_NAME, O_RDWR); + if (kdreg2->fd < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to open %s for monitor kdreg2: %s.\n", + KDREG2_DEVICE_NAME, strerror(errno)); + ret = -errno; + goto close_pipe; + } + + /* configure the monitor with the maximum number of entries */ + + config_data.max_regions = cache_params.max_cnt; + if (!config_data.max_regions) { + ret = -FI_ENOSPC; + goto close_fd; + } + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_CONFIG_DATA, &config_data); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to get module config data for kdreg2 monitor: %d.\n", + errno); + ret = -errno; + goto close_fd; + } + + /* Configuring the monitor allocates the status data. Save the address. 
*/ + + kdreg2->status_data = config_data.status_data; + + ret = pthread_create(&kdreg2->thread, EVICTOR_THREAD_ATTR, + kdreg2_evictor, kdreg2); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to start thread for kdreg2 monitor: %d.\n", + ret); + goto close_fd; + } + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor started.\n"); + + return 0; + +close_fd: + + kdreg2_close_fd(kdreg2); + +close_pipe: + + kdreg2_close_pipe(kdreg2); + + FI_WARN(&core_prov, FI_LOG_MR, + "Kdreg2 memory monitor failed to start: %i.\n", ret); + + return ret; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + ssize_t num_written; + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + + /* see if it's really running */ + if (kdreg2->fd < 0) + return; + + num_written = write(kdreg2->exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to write to kdreg2 exit pipe: %s\n", + strerror(errno)); + /* We could call pthread cancel here. The thread + * has probably already exited. Cancelling would be + * benign. But calling join on an exited thread is + * also legal. 
+ */ + } + + pthread_join(kdreg2->thread, NULL); + + kdreg2_close_fd(kdreg2); + kdreg2_close_pipe(kdreg2); + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor stopped.\n"); +} + +#else /* !HAVE_KDREG2_MONITOR */ + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, + size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + return false; +} + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + /* no-op */ +} + +#endif /* HAVE_KDREG2_MONITOR */ + +static struct ofi_kdreg2 kdreg2_mm = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = kdreg2_monitor_start, + .monitor.stop = kdreg2_monitor_stop, + .monitor.subscribe = kdreg2_monitor_subscribe, + .monitor.unsubscribe = kdreg2_monitor_unsubscribe, + .monitor.valid = kdreg2_monitor_valid, + .monitor.name = "kdreg2", + .fd = -1, + .exit_pipe = { -1, -1 }, + .status_data = NULL, +}; + +struct ofi_mem_monitor *kdreg2_monitor = &kdreg2_mm.monitor; diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 10a6b4e2795..b725a90bc6d 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -194,6 +194,7 @@ static void initialize_monitor_list() ze_monitor, ze_ipc_monitor, import_monitor, + kdreg2_monitor, }; monitor_list_size = ARRAY_SIZE(monitors); @@ -229,6 +230,13 @@ static void set_default_monitor(const char *monitor) #else FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); default_monitor = NULL; +#endif + } else if 
(!strcmp(monitor, "kdreg2")) {
+#if HAVE_KDREG2_MONITOR
+		default_monitor = kdreg2_monitor;
+#else
+	FI_WARN(&core_prov, FI_LOG_MR, "kdreg2 monitor not available\n");
+	default_monitor = NULL;
 #endif
 	} else if (!strcmp(monitor, "disabled")) {
 		default_monitor = NULL;
@@ -269,9 +277,10 @@ void ofi_monitors_init(void)
 			"Define a default memory registration monitor."
 			" The monitor checks for virtual to physical memory"
 			" address changes. Options are: userfaultfd, memhooks"
-			" and disabled. Userfaultfd is a Linux kernel feature."
-			" Memhooks operates by intercepting memory allocation"
-			" and free calls."
+			", kdreg2, and disabled. Userfaultfd is a Linux kernel"
+			" feature. Memhooks operates by intercepting memory"
+			" allocation and free calls. kdreg2 is supplied as a"
+			" loadable Linux kernel module."
 #if defined(HAVE_MR_CACHE_MONITOR_DEFAULT)
 			" " HAVE_MR_CACHE_MONITOR_DEFAULT
 #else
@@ -313,6 +322,8 @@ void ofi_monitors_init(void)
 		default_monitor = memhooks_monitor;
 #elif HAVE_UFFD_MONITOR
 		default_monitor = uffd_monitor;
+#elif HAVE_KDREG2_MONITOR
+		default_monitor = kdreg2_monitor;
 #else
 		default_monitor = NULL;
 #endif

From 708610d1b094d86b3e0b882b7dcaadcbbd6e8802 Mon Sep 17 00:00:00 2001
From: OFIWG Bot <ofiwg@lists.openfabrics.org>
Date: Tue, 22 Oct 2024 16:45:58 +0000
Subject: [PATCH 177/393] Updated nroff-generated man pages

Signed-off-by: OFIWG Bot <ofiwg@lists.openfabrics.org>
---
 man/man3/fi_mr.3 | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3
index 3c11b40fe5f..e2797a7b7c9 100644
--- a/man/man3/fi_mr.3
+++ b/man/man3/fi_mr.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_mr" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_mr" "3" "2024\-10\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -1129,7 +1129,7 @@ Setting this to zero will disable registration caching.
The cache monitor is responsible for detecting system memory (FI_HMEM_SYSTEM) changes made between the virtual addresses used by an application and the underlying physical pages. -Valid monitor options are: userfaultfd, memhooks, and disabled. +Valid monitor options are: userfaultfd, memhooks, kdreg2, and disabled. Selecting disabled will turn off the registration cache. Userfaultfd is a Linux kernel feature used to report virtual to physical address mapping changes to user space. @@ -1138,6 +1138,7 @@ deallocation calls which may result in the mappings changing, such as malloc, mmap, free, etc. Note that memhooks operates at the elf linker layer, and does not use glibc memory hooks. +Kdreg2 is supplied as a loadable Linux kernel module. .TP \f[I]FI_MR_CUDA_CACHE_MONITOR_ENABLED\f[R] The CUDA cache monitor is responsible for detecting CUDA device memory From 6d8cba93308f8334c85871b5355679551682fbd8 Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Sun, 10 Dec 2023 19:52:41 -0600 Subject: [PATCH 178/393] prov/util: Change uffd stop routine to use pipe Without this change, the following segfault can happen. 
0 0x00001555546a66de in malloc () from /lib64/libc.so.6 1 0x00001555555351f8 in _dl_new_object () from /lib64/ld-linux-x86-64.so.2 2 0x000015555552f413 in _dl_map_object_from_fd () from /lib64/ld-linux-x86-64.so.2 3 0x000015555553252a in _dl_map_object () from /lib64/ld-linux-x86-64.so.2 4 0x000015555553d700 in dl_open_worker () from /lib64/ld-linux-x86-64.so.2 5 0x000015555476002d in _dl_catch_exception () from /lib64/libc.so.6 6 0x000015555553d28b in _dl_open () from /lib64/ld-linux-x86-64.so.2 7 0x000015555475f37d in do_dlopen () from /lib64/libc.so.6 8 0x000015555476002d in _dl_catch_exception () from /lib64/libc.so.6 9 0x00001555547600bf in _dl_catch_error () from /lib64/libc.so.6 10 0x000015555475f497 in dlerror_run () from /lib64/libc.so.6 11 0x000015555475f567 in __libc_dlopen_mode () from /lib64/libc.so.6 12 0x0000155555409fbb in pthread_cancel_init () from /lib64/libpthread.so.0 13 0x00001555554062b7 in pthread_cancel () from /lib64/libpthread.so.0 14 0x00000000005b66eb in ofi_uffd_stop (monitor=0x70f3e0 ) at prov/util/src/util_mem_monitor.c:887 15 0x00000000005b4c25 in ofi_monitors_update (monitors=0x7fffffffe070) at prov/util/src/util_mem_monitor.c:154 16 0x00000000005b5828 in ofi_monitors_del_cache (cache=0x74a428) at prov/util/src/util_mem_monitor.c:485 17 0x00000000005b9956 in ofi_mr_cache_cleanup (cache=0x74a428) at prov/util/src/util_mr_cache.c:502 18 0x00000000005e8cb1 in cxip_iomm_fini (dom=0x746250) at prov/cxi/src/cxip_iomm.c:368 19 0x0000000000617494 in cxip_domain_disable (dom=0x746250) at prov/cxi/src/cxip_dom.c:540 20 0x000000000061755f in cxip_dom_close (fid=0x746250) at prov/cxi/src/cxip_dom.c:568 21 0x000000000054da04 in fi_close (fid=0x746250) at ./include/rdma/fabric.h:632 22 0x0000000000570df9 in av_auth_key_test_rx_ep_fini () at prov/cxi/test/auth_key.c:2495 23 0x0000000000575efa in data_transfer_av_auth_key_av_user_id_source_err_auth_key_user_id_impl () at prov/cxi/test/auth_key.c:2802 24 0x0000155555017f50 in 
criterion_internal_test_main (fn=0x5751ad ) at ../src/core/test.c:94 25 0x00000000005751a5 in data_transfer_av_auth_key_av_user_id_source_err_auth_key_user_id_jmp () at prov/cxi/test/auth_key.c:2754 26 0x0000155555016823 in run_test_child () at ../src/core/runner_coroutine.c:230 27 0x00001555550268b1 in bxfi_main () at ../subprojects/boxfort/src/sandbox.c:57 28 0x000015555463e24d in __libc_start_main () from /lib64/libc.so.6 29 0x0000000000405bea in _start () at ../sysdeps/x86_64/start.S:120 Signed-off-by: Mike Uttormark --- include/ofi_mr.h | 1 + prov/util/src/util_mem_monitor.c | 94 ++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 6f85e07eadd..12383413110 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -231,6 +231,7 @@ struct ofi_uffd { struct ofi_mem_monitor monitor; pthread_t thread; int fd; + int exit_pipe[2]; }; extern struct ofi_mem_monitor *uffd_monitor; diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index b725a90bc6d..2b6cc9b0ed5 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -61,6 +61,8 @@ static struct ofi_uffd uffd = { .monitor.start = ofi_uffd_start, .monitor.stop = ofi_uffd_stop, .monitor.name = "uffd", + .fd = -1, + .exit_pipe = { -1, -1 }, }; struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; @@ -594,14 +596,17 @@ static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); static void *ofi_uffd_handler(void *arg) { struct uffd_msg msg; - struct pollfd fds; + struct pollfd fds[2]; int ret; - fds.fd = uffd.fd; - fds.events = POLLIN; + fds[0].fd = uffd.fd; + fds[0].events = POLLIN; + fds[1].fd = uffd.exit_pipe[0]; + fds[1].events = POLLIN; + for (;;) { - ret = poll(&fds, 1, -1); - if (ret != 1) + ret = poll(fds, 2, -1); + if (ret < 0 || fds[1].revents) break; pthread_rwlock_rdlock(&mm_list_rwlock); @@ -832,24 +837,45 @@ static bool ofi_uffd_valid(struct ofi_mem_monitor 
*monitor, return true; } +static void ofi_uffd_close_fd(struct ofi_uffd *monitor) +{ + close(monitor->fd); + monitor->fd = -1; +} + +static void ofi_uffd_close_pipe(struct ofi_uffd *monitor) +{ + close(monitor->exit_pipe[0]); + close(monitor->exit_pipe[1]); + monitor->exit_pipe[0] = -1; + monitor->exit_pipe[1] = -1; +} + static int ofi_uffd_start(struct ofi_mem_monitor *monitor) { struct uffdio_api api; int ret; - uffd.monitor.subscribe = ofi_uffd_subscribe; - uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; - uffd.monitor.valid = ofi_uffd_valid; + if (uffd.fd >= 0) + return 0; if (!num_page_sizes) return -FI_ENODATA; + ret = pipe(uffd.exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/pipe: %s\n", strerror(errno)); + return -errno; + } + uffd.fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd.fd < 0) { FI_WARN(&core_prov, FI_LOG_MR, "syscall/userfaultfd %s\n", strerror(errno)); - return -errno; + ret = -errno; + goto close_pipe; } api.api = UFFD_API; @@ -860,13 +886,13 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) FI_WARN(&core_prov, FI_LOG_MR, "ioctl/uffdio: %s\n", strerror(errno)); ret = -errno; - goto closefd; + goto close_fd; } if (api.api != UFFD_API) { FI_WARN(&core_prov, FI_LOG_MR, "uffd features not supported\n"); ret = -FI_ENOSYS; - goto closefd; + goto close_fd; } ret = pthread_create(&uffd.thread, NULL, ofi_uffd_handler, &uffd); @@ -874,20 +900,56 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) FI_WARN(&core_prov, FI_LOG_MR, "failed to create handler thread %s\n", strerror(ret)); ret = -ret; - goto closefd; + goto close_fd; } + + uffd.monitor.subscribe = ofi_uffd_subscribe; + uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; + uffd.monitor.valid = ofi_uffd_valid; + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd started.\n"); + return 0; -closefd: - close(uffd.fd); +close_fd: + + ofi_uffd_close_fd(&uffd); + +close_pipe: + + ofi_uffd_close_pipe(&uffd); + + 
FI_WARN(&core_prov, FI_LOG_MR, + "Memory monitor uffd failed to start: %s.\n", + strerror(-ret)); + return ret; } static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) { - pthread_cancel(uffd.thread); + ssize_t num_written; + + if (uffd.fd < 0) + return; + + /* tell the thread to exit with the exit_pipe */ + + num_written = write(uffd.exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/close: unable to write to exit pipe: %s", + strerror(errno)); + } + pthread_join(uffd.thread, NULL); - close(uffd.fd); + + ofi_uffd_close_fd(&uffd); + ofi_uffd_close_pipe(&uffd); + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd stopped.\n"); } #else /* HAVE_UFFD_MONITOR */ From 627e76402b06779db2c6cda94ff2af354a704bce Mon Sep 17 00:00:00 2001 From: Zhuo Zhi Date: Tue, 13 Aug 2024 20:14:11 +0800 Subject: [PATCH 179/393] prov/opx: use page_sizes[OFI_PAGE_SIZE] instead of PAGE_SIZE Some arch like riscv does not have a standard defined PAGE_SIZE in sys/user.h. 
Signed-off-by: Zhuo Zhi --- prov/opx/include/opa_user_gen1.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/prov/opx/include/opa_user_gen1.h b/prov/opx/include/opa_user_gen1.h index 99b4c141146..ed1ff675eb4 100644 --- a/prov/opx/include/opa_user_gen1.h +++ b/prov/opx/include/opa_user_gen1.h @@ -83,6 +83,7 @@ #include "opa_udebug.h" #include "opa_service.h" #include "opa_user.h" +#include "ofi_mem.h" #define HFI_RHF_USE_EGRBFR_MASK 0x1 #define HFI_RHF_USE_EGRBFR_SHIFT 15 @@ -570,7 +571,7 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, #endif FI_DBG(&fi_opx_provider, FI_LOG_MR, "OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %lu)\n", - (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / PAGE_SIZE); + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE]); cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; @@ -589,9 +590,9 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, FI_WARN(&fi_opx_provider, FI_LOG_MR, "PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %lu), OUTPUTS vaddr [%p - %p] length %u (pages %lu), tidcnt %u\n", errno, strerror(errno), (void*)vaddr, - (void*)(vaddr + *length), *length, (*length) / PAGE_SIZE, + (void*)(vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE], (void*)rettidinfo->vaddr,(void*)(rettidinfo->vaddr + rettidinfo->length), - rettidinfo->length, rettidinfo->length / PAGE_SIZE, + rettidinfo->length, rettidinfo->length / page_sizes[OFI_PAGE_SIZE], rettidinfo->tidcnt); } /* Always update outputs, even on soft errors */ @@ -601,7 +602,7 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, FI_DBG(&fi_opx_provider, FI_LOG_MR, "TID UPDATE IOCTL returned %d errno %d \"%s\" vaddr [%p - %p] length %u (pages %lu), tidcnt %u\n", err, errno, strerror(errno), (void*)vaddr, - (void*)(vaddr + *length), *length, (*length) / PAGE_SIZE, *tidcnt); + (void*)(vaddr + *length), 
*length, (*length) / page_sizes[OFI_PAGE_SIZE], *tidcnt); return 0; } @@ -609,13 +610,13 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, if (errno == ENOSPC) { FI_DBG(&fi_opx_provider, FI_LOG_MR, "IOCTL FAILED : No TIDs available, requested range=%p-%p (%u bytes, %lu pages)\n", - (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / PAGE_SIZE); + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE]); err = -FI_ENOSPC; } else { FI_WARN(&fi_opx_provider, FI_LOG_MR, "IOCTL FAILED ERR %d errno %d \"%s\" requested range=%p-%p (%u bytes, %lu pages)\n", err, errno, strerror(errno), - (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / PAGE_SIZE); + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / page_sizes[OFI_PAGE_SIZE]); } /* Hard error, we can't trust these */ From b534edcc3b24a4d70efceecb7521335e712cf408 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Wed, 23 Oct 2024 16:41:05 -0700 Subject: [PATCH 180/393] NEWS.md: Update for 2.0.0 beta Signed-off-by: Jianxin Xiong --- NEWS.md | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/NEWS.md b/NEWS.md index c257d7b5dbc..242872c4f0b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,191 @@ bug fixes (and other actions) for each version of Libfabric since version 1.0. New major releases include all fixes from minor releases with earlier release dates. 
+v2.0.0 beta, Fri Oct 25, 2024 +============================== + +## Core + +- Change the xpmem log level to info +- Clarify FI_HMEM support of inject calls +- Introduce Sub-MR +- Define capbility for directed receive without wildcard src_addr +- Define capability for tagged message only directed recv +- Define capability bit for tagged multi receive +- Define flag for single use MR +- Move flags only used for memory registration calls to fi_domain.h +- windows/osd.h: fix and refactor logical operations on complex numbers +- man/fi_peer: update peer fid initialization language +- Remove CURRENT_SYMVER() macro +- 1.8 ABI compat + +## CXI + +- Update provider man page +- Update version to 2.0 +- Remove setting total_buffered_recv +- Update CXI provider + +## EFA + +- Remove unused fields from various data structures +- Update efa shm implementation to allocate fi_peer_srx_context +- Avoid gdr_pin/gdr_map for dmabuf mrs +- Only do dmabuf reg when FI_MR_DMABUF is set +- Report correct inject_msg_size for zcpy rx +- Add setopt/getopt support for remaining EP sizes +- Split RDM EP inject size field into MSG,RMA variants +- Use tclass to prioritize the messages from an ep +- Remove tx_size and rx_size from efa_rdm_ep +- Remove tx_iov_limit and rx_iov_limit from efa_rdm_ep +- Remove DC NACK packet from rxe map after recv completed +- Correctly handle fallback longcts-rtw send completion +- Differentiate unresponsive receiver errors following rdma-core +- Make NACK protocol fall back to DC longCTS when DC is requested +- Update help message for inter_min_read_write_size +- Adjust log level for setopt/getopt +- Add dependency header file in fi_ext_efa.h +- Test: Disable shm via fi_setopt +- Rename p2p_available to mr_p2p_available +- Always use p2p for system memory +- Test: Use correct qp num in the mock +- Shrink the size of extra_info array +- Improve the zero-copy recv error message. 
+- Update read nack protocol docs +- Receiver send NACK if p2p is unavailable +- Sender switch to emulated long CTS write if p2p unavailable +- Adjust log level for shm disabling. +- Check p2p support to use rdma read +- Add device to host copy for inject rdma write +- Copy user buffer for fi_sendmsg with FI_INJECT +- Respect FI_MR_LOCAL in transport path + +## HOOK + +- Trace: Add trace log for domain_attr. + +## LNX + +- Initial addition + +## OPX + +- Use page_sizes[OFI_PAGE_SIZE] instead of PAGE_SIZE +- Set immediate ACK requested bit when sending last packet of RMA PUT +- Add debug check for zero-byte length data packets +- Conditionally set FI_REMOTE_CQ_DATA on receive +- Include less immediate data in RTS packet to improve rendezvous performance +- Investigate and address indeterminate behavior or segfault resulting from ignored context creation error +- fi_info -e fix for FI_OPX_UUID env var +- Fix last_bytes field for replay over sdma +- Fix eager and mp eager +- Fix payload copy +- Add FI_OPX_TID_MIN_PAYLOAD_BYTES param +- Fix incorrect calculation of immediate block offset in send rendezvous +- Initialize nic info in fi_info +- Simplify fi_opx_check_rma() function. +- added OPX Tracer points to RMA code paths +- Fix credit return +- Remove polling call from internal rma write +- Support 16B SDMA CTS work +- Fix uepkt 16B headers +- 16B SDMA header support +- Man: Document OPX max ping envvars +- Link bounce support for OPX WFR +- Scb/hdr changes +- Updated configure.m4 for ROCR +- Capitalized env var used for production override, also added opx to the front. +- Remove FI_CONTEXT2 requirement +- Only posting one completion for rzv truncation receives. +- Fixing bug for credit check in inject code path. 
+- Resolve coverity scan defects uncovered after upstream +- Replace fi_opx_context_slist with slist +- Remove assert from find pkt by tag +- Add OPX Tracer EP lock and Recv entries +- CN5000/JKR: Changes needed to get RMA working in 16B +- Added GDRCopy logging and failure path +- Initial 16B header support +- Fix wrong function used when copying from HMEM/rocr. +- Create GPU-specific SDMA/RZV thresholds +- Don't try to get HMEM iface for NULL pointers +- Limit the number of reliability pings on credit-constrained flows +- Remove function table entries for reliability types other than ONLOAD + +## PSM3 + +- Fix logical atomic function calls +- Check atomic op error code +- Disable complex comparison combinations + +## SHM + +- Use owner-allocated srx +- Fix incorrect capability set +- Make progress errors ints instead of unit64 +- Remove unused err path from progress_iov +- Refactor initialization process +- Put smr_map memory into av + +## TCP + +- Fix incorrect usage of av insert apis when multiplexing +- Initialize addr_size when duplicating an av + +## Util + +- Change uffd stop routine to use pipe +- Integrate kdreg2 into libfabric +- mr_cache: Support compile default monitor +- Handle page faults in uffd monitor +- Allow providers to update cache MR IOV +- Log AV insert with AV's specified address format +- Add uffd user mode flag for kernels + +## Fabtests + +- Fix complex fill cast +- efa: Remove rnr cq error message check +- efa: Loose assertion for read request counters +- runfabtests.cmd: add atomic tests to windows testing +- runfabtests.sh: add rdm_atomic validation tests +- rdm_atomic: add data validation +- Change ZE memset to use uint8 +- Change sync message to be 0 bytes instead of 1 byte +- Fix atomic buffer +- Add hmem support to common atomic validation +- Move ubertest atomic validation code to common +- Use new synapse api +- Update fi_multinode test +- Update runmultinode.py with args +- Added inband sync to ft_init_fabric_cm +- lpp: remove 
deprecated FI_MR_BASIC +- Add option for conditionally building lpp +- Make building efa conditional +- Call provider specific configure +- efa: Skip inter_min_write_write_size test when rdma write is on +- efa: Add efa_rdma_checker +- lpp: remove invalid condition in fi_tsenddata +- Support no prepost RX pingpong test +- Split out ft_sync logic +- Define common run pingpong function +- Move pingpong logic into pre-posted func +- lpp: update version and protocol in fi_getinfo +- lpp: fix compile warnings +- Remove multi_ep from tcp exclude +- runfabtests.sh: add more multi_ep tests +- Add common threading option +- multi_ep: use common long ops, switch shared-av and cq opts +- multi_ep: add closing and reopening of MRs +- multi_ep: add RMA validation +- Create common raw key functions +- multi_ep: separate MR resources per EP +- efa: Skip memory registration that hit device limit +- efa: Avoid testing duplicate mixed memory type workload +- lpp: Fix compiler warning about unused variables +- Remove deprecated MR modes +- Remove fi_poll and fi_dgram_waitset tests (deprecated feature) + + v2.0.0 alpha, Fri Aug 30, 2024 ============================== From f68230eadf181c1d36fa5ab13fc69024db174ffe Mon Sep 17 00:00:00 2001 From: Nikhil Nanal Date: Wed, 16 Oct 2024 15:49:20 -0700 Subject: [PATCH 181/393] fabtests: change xfer-method variable to xfer_method in runmultinode.sh The '-' in the xfer-method option in the runmultinode.sh script causes method and xfer as separate instead of a single bash variable xfer-method. This supplies invalid inputs to the fi_multinode test. changing the bash variable xfer-method to xfer_method fixes this issue. 
Signed-off-by: Nikhil Nanal --- fabtests/scripts/runmultinode.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fabtests/scripts/runmultinode.sh b/fabtests/scripts/runmultinode.sh index d4491de48b8..77b9dceeceb 100755 --- a/fabtests/scripts/runmultinode.sh +++ b/fabtests/scripts/runmultinode.sh @@ -1,7 +1,7 @@ #!/bin/bash Options=$(getopt --options h:,n:,p:,I:,-x:,z: \ - --longoptions hosts:,processes-per-node:,provider:,xfer-method:,iterations:,ci:,cleanup,help \ + --longoptions hosts:,processes-per-node:,provider:,xfer-method:,iterations:,ci:,cleanup,help \ -- "$@") eval set -- "$Options" @@ -10,7 +10,7 @@ hosts=[] ppn=1 iterations=1 pattern="" -xfer-method="msg" +xfer_method="msg" cleanup=false help=false ci="" @@ -19,7 +19,7 @@ while true; do case "$1" in -h|--hosts) IFS=',' read -r -a hosts <<< "$2"; shift 2 ;; - -n|--processes-per-node) + -n|--processes-per-node) ppn=$2; shift 2 ;; -p|--provider) provider="$2"; shift 2 ;; @@ -30,10 +30,10 @@ while true; do --cleanup) cleanup=true; shift ;; -x|--xfer-method) - xfer-method="$2"; shift 2 ;; + xfer_method="$2"; shift 2 ;; --ci) ci="$2"; shift 2 ;; - --help) + --help) help=true; shift ;; --) shift; break ;; @@ -41,21 +41,21 @@ while true; do done if $help ; then - echo "Run the multinode test suite on the nodes provided for many procceses" + echo "Run the multinode test suite on the nodes provided for many procceses" echo "multinode tests are run in performance mode" echo "Options" echo "\t-h,--hosts list of host names to run the tests on" echo "\t-n,--processes-per-node number of processes to be run on each node.\ Total number of fi_mulinode tests run will be n*number of hosts" echo "\t-p,--provider libfabric provider to run the multinode tests on" - echo "\t-C,--cabability multinode cabability to use (rma or default: msg)" + echo "\t-x,--xfer-method multinode transfer method/capability to use (rma or default: msg)" echo "\t-I,-- iterations number of iterations for the 
multinode test \ to run each pattern on" echo "\t--cleanup end straggling processes. Does not rerun tests" echo "\t--help show this message" exit 1 fi - + num_hosts=${#hosts[@]} max_ranks=$(($num_hosts*$ppn)) ranks=$max_ranks; @@ -65,7 +65,7 @@ output="multinode_server_${num_hosts}_${ppn}.log" ret=0 if ! $cleanup ; then - cmd="${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer-method $pattern -I $iterations -T" + cmd="${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer_method $pattern -I $iterations -T" echo $cmd for node in "${hosts[@]}"; do for i in $(seq 1 $ppn); do @@ -73,7 +73,7 @@ if ! $cleanup ; then echo STARTING SERVER if [ "$ci" == "" ]; then ssh $node $cmd &> $output & - else + else ssh $node $cmd | tee $output & fi server_pid=$! @@ -104,4 +104,4 @@ if ! $cleanup ; then echo "Output: $PWD/$output" fi -exit $ret +exit $ret From 038719018a3c489b2a91be14086027a72c6abe54 Mon Sep 17 00:00:00 2001 From: Nikhil Nanal Date: Tue, 22 Oct 2024 12:01:15 -0700 Subject: [PATCH 182/393] fabtests: Added -E/env option to multinode test script Currently no way exists to specify env variables to the multinode scripts. Added option to runmultinode.sh. 
Changes similar to runfabtests.sh Signed-off-by: Nikhil Nanal --- fabtests/scripts/runmultinode.sh | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fabtests/scripts/runmultinode.sh b/fabtests/scripts/runmultinode.sh index 77b9dceeceb..ebe564d0a8e 100755 --- a/fabtests/scripts/runmultinode.sh +++ b/fabtests/scripts/runmultinode.sh @@ -1,7 +1,7 @@ #!/bin/bash -Options=$(getopt --options h:,n:,p:,I:,-x:,z: \ - --longoptions hosts:,processes-per-node:,provider:,xfer-method:,iterations:,ci:,cleanup,help \ +Options=$(getopt --options h:,n:,p:,I:,-x:-E:,z: \ + --longoptions hosts:,processes-per-node:,provider:,xfer-method:,env:,iterations:,ci:,cleanup,help \ -- "$@") eval set -- "$Options" @@ -31,6 +31,13 @@ while true; do cleanup=true; shift ;; -x|--xfer-method) xfer_method="$2"; shift 2 ;; + -E|--env) + delimiter="=" + value=${2#*$delimiter} + var=${2:0:$(( ${#2} - ${#value} - ${#delimiter} ))} + EXPORT_STRING="export $var=\"$value\"" + EXPORT_ENV="${EXPORT_ENV}${EXPORT_STRING}; " + shift 2 ;; --ci) ci="$2"; shift 2 ;; --help) @@ -45,11 +52,11 @@ if $help ; then echo "multinode tests are run in performance mode" echo "Options" echo "\t-h,--hosts list of host names to run the tests on" - echo "\t-n,--processes-per-node number of processes to be run on each node.\ - Total number of fi_mulinode tests run will be n*number of hosts" + echo "\t-n,--processes-per-node number of processes to be run on each node. Total number of fi_mulinode tests run will be n*number of hosts" echo "\t-p,--provider libfabric provider to run the multinode tests on" echo "\t-x,--xfer-method multinode transfer method/capability to use (rma or default: msg)" - echo "\t-I,-- iterations number of iterations for the multinode test \ + echo "\t-E,--env export provided variable name and value" + echo "\t-I,--iterations number of iterations for the multinode test \ to run each pattern on" echo "\t--cleanup end straggling processes. 
Does not rerun tests" echo "\t--help show this message" @@ -65,7 +72,7 @@ output="multinode_server_${num_hosts}_${ppn}.log" ret=0 if ! $cleanup ; then - cmd="${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer_method $pattern -I $iterations -T" + cmd="${EXPORT_ENV} ${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer_method $pattern -I $iterations -T" echo $cmd for node in "${hosts[@]}"; do for i in $(seq 1 $ppn); do From 8457ba6b58288d0e8fca7f1d2d44920b01689812 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Wed, 23 Oct 2024 10:18:11 -0700 Subject: [PATCH 183/393] fabtest: Fix compilation error about CMPLX with C99 CMPLX is introduced in C11, it may be undefined in C99. Compilation error was observed with Coverity scan. Construct the complex number directly. It also handles float complex and long double complex better. Signed-off-by: Jianxin Xiong --- fabtests/include/unix/osd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabtests/include/unix/osd.h b/fabtests/include/unix/osd.h index 66956aeb79f..0c3200b0468 100644 --- a/fabtests/include/unix/osd.h +++ b/fabtests/include/unix/osd.h @@ -125,7 +125,7 @@ static inline void OFI_COMPLEX_OP(name, set)(OFI_COMPLEX(name) *v1, OFI_COMPLEX( } \ static inline void OFI_COMPLEX_OP(name, fill)(OFI_COMPLEX(name) *v1, name v2) \ { \ - *v1 = CMPLX(v2, v2); \ + *v1 = (OFI_COMPLEX(name))((name)(v2) + I * (name)(v2)); \ } OFI_COMPLEX_OPS(float) From 0fbf4f40c72d0cfce15b1fe7f4fb6fd297f8acdd Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 2 May 2024 18:48:10 -0400 Subject: [PATCH 184/393] prov/lnx: Introducing the LINKx (lnx) provider The LINKx (lnx) provider offers a framework by which multiple providers can be linked together and presented as one provider to the application. This abstracts away the details of the traffic providers from the application. This iteration of the provider allows linking only two providers, shm and another provider, ex; CXI or RXM. 
The composite providers which are linked together need to support the
peer infrastructure.

Currently the provider supports creating a unique chain of
fabric->domain->ep. It doesn't support creating multiple domains per
fabric and multiple endpoints per domain. This will be addressed in
followup updates to the provider.

This iteration mainly focuses on supporting open MPI's MTL path
which uses the libfabric tagged APIs. It has been tested with
linking shm and cxi and shm and rxm.

Future work will include:
- Supporting 1:N of fabric:domain and domain:endpoint, etc
- Hardware offload support
- Arbitrary provider linking
- Memory caching and registration
- Full libfabric API support
- Multi-Rail feature

In order to use the lnx provider the user needs to:

export FI_LNX_PROV_LINKS="shm+<prov>"
ex: export FI_LNX_PROV_LINKS="shm+cxi"
or
export FI_LNX_PROV_LINKS="shm+tcp;ofi_rxm"

This results in the lnx provider returning all available links to the
application, which can then select the most appropriate one to use.
Signed-off-by: Amir Shehata --- Makefile.am | 1 + configure.ac | 1 + include/ofi.h | 1 + include/ofi_prov.h | 11 + include/ofi_util.h | 15 +- include/rdma/fabric.h | 1 + include/rdma/fi_errno.h | 2 +- man/fi_lnx.7.md | 157 +++++ man/man7/fi_lnx.7 | 173 ++++++ prov/lnx/Makefile.include | 61 ++ prov/lnx/configure.m4 | 15 + prov/lnx/include/lnx.h | 477 +++++++++++++++ prov/lnx/src/lnx_av.c | 702 ++++++++++++++++++++++ prov/lnx/src/lnx_cq.c | 234 ++++++++ prov/lnx/src/lnx_domain.c | 581 ++++++++++++++++++ prov/lnx/src/lnx_ep.c | 1181 +++++++++++++++++++++++++++++++++++++ prov/lnx/src/lnx_init.c | 884 +++++++++++++++++++++++++++ prov/lnx/src/lnx_ops.c | 1036 ++++++++++++++++++++++++++++++++ prov/util/src/util_attr.c | 15 +- src/fabric.c | 20 +- src/fi_tostr.c | 1 + 21 files changed, 5562 insertions(+), 7 deletions(-) create mode 100644 man/fi_lnx.7.md create mode 100644 man/man7/fi_lnx.7 create mode 100644 prov/lnx/Makefile.include create mode 100644 prov/lnx/configure.m4 create mode 100644 prov/lnx/include/lnx.h create mode 100644 prov/lnx/src/lnx_av.c create mode 100644 prov/lnx/src/lnx_cq.c create mode 100644 prov/lnx/src/lnx_domain.c create mode 100644 prov/lnx/src/lnx_ep.c create mode 100644 prov/lnx/src/lnx_init.c create mode 100644 prov/lnx/src/lnx_ops.c diff --git a/Makefile.am b/Makefile.am index de2158c5fc1..204352db93b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -485,6 +485,7 @@ include prov/sm2/Makefile.include include prov/tcp/Makefile.include include prov/ucx/Makefile.include include prov/lpp/Makefile.include +include prov/lnx/Makefile.include include prov/hook/Makefile.include include prov/hook/perf/Makefile.include include prov/hook/trace/Makefile.include diff --git a/configure.ac b/configure.ac index 2b476f7f9d2..8e222be2e38 100644 --- a/configure.ac +++ b/configure.ac @@ -1125,6 +1125,7 @@ FI_PROVIDER_SETUP([hook_debug]) FI_PROVIDER_SETUP([hook_hmem]) FI_PROVIDER_SETUP([dmabuf_peer_mem]) FI_PROVIDER_SETUP([opx]) +FI_PROVIDER_SETUP([lnx]) 
FI_PROVIDER_FINI dnl Configure the .pc file FI_PROVIDER_SETUP_PC diff --git a/include/ofi.h b/include/ofi.h index 7592281c766..9661a7553d9 100644 --- a/include/ofi.h +++ b/include/ofi.h @@ -297,6 +297,7 @@ enum ofi_prov_type { OFI_PROV_UTIL, OFI_PROV_HOOK, OFI_PROV_OFFLOAD, + OFI_PROV_LNX, }; /* Restrict to size of struct fi_provider::context (struct fi_context) */ diff --git a/include/ofi_prov.h b/include/ofi_prov.h index ccb3fbf616d..7ffcda76268 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -211,6 +211,17 @@ MRAIL_INI ; # define MRAIL_INIT NULL #endif +#if (HAVE_LNX) && (HAVE_LNX_DL) +# define LNX_INI FI_EXT_INI +# define LNX_INIT NULL +#elif (HAVE_LNX) +# define LNX_INI INI_SIG(fi_lnx_ini) +# define LNX_INIT fi_lnx_ini() +LNX_INI ; +#else +# define LNX_INIT NULL +#endif + #if (HAVE_PERF) && (HAVE_PERF_DL) # define HOOK_PERF_INI FI_EXT_INI # define HOOK_PERF_INIT NULL diff --git a/include/ofi_util.h b/include/ofi_util.h index 911a69893ba..dda5c903e6e 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -1172,9 +1172,11 @@ void ofi_fabric_remove(struct util_fabric *fabric); * Utility Providers */ -#define OFI_NAME_DELIM ';' +#define OFI_NAME_LNX_DELIM ':' +#define OFI_NAME_DELIM ';' #define OFI_UTIL_PREFIX "ofi_" #define OFI_OFFLOAD_PREFIX "off_" +#define OFI_LNX "lnx" static inline int ofi_has_util_prefix(const char *str) { @@ -1186,6 +1188,16 @@ static inline int ofi_has_offload_prefix(const char *str) return !strncasecmp(str, OFI_OFFLOAD_PREFIX, strlen(OFI_OFFLOAD_PREFIX)); } +static inline int ofi_is_lnx(const char *str) +{ + return !strncasecmp(str, OFI_LNX, strlen(OFI_LNX)); +} + +static inline int ofi_is_linked(const char *str) +{ + return (strcasestr(str, OFI_LNX)) ? 
1 : 0; +} + int ofi_get_core_info(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, const struct fi_info *util_hints, @@ -1201,6 +1213,7 @@ int ofi_get_core_info_fabric(const struct fi_provider *prov, struct fi_info **core_info); +char *ofi_strdup_link_append(const char *head, const char *tail); char *ofi_strdup_append(const char *head, const char *tail); // char *ofi_strdup_head(const char *str); // char *ofi_strdup_tail(const char *str); diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index c96d2c79ddc..366e6b0402b 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -340,6 +340,7 @@ enum { FI_PROTO_SM2, FI_PROTO_CXI_RNR, FI_PROTO_LPP, + FI_PROTO_LNX, }; enum { diff --git a/include/rdma/fi_errno.h b/include/rdma/fi_errno.h index f5af121ec79..b90dbd5f42d 100644 --- a/include/rdma/fi_errno.h +++ b/include/rdma/fi_errno.h @@ -114,7 +114,7 @@ extern "C" { //#define FI_EADV EADV /* Advertise error */ //#define FI_ESRMNT ESRMNT /* Srmount error */ //#define FI_ECOMM ECOMM /* Communication error on send */ -//#define FI_EPROTO EPROTO /* Protocol error */ +#define FI_EPROTO EPROTO /* Protocol error */ //#define FI_EMULTIHOP EMULTIHOP /* Multihop attempted */ //#define FI_EDOTDOT EDOTDOT /* RFS specific error */ //#define FI_EBADMSG EBADMSG /* Not a data message */ diff --git a/man/fi_lnx.7.md b/man/fi_lnx.7.md new file mode 100644 index 00000000000..f52a08840dc --- /dev/null +++ b/man/fi_lnx.7.md @@ -0,0 +1,157 @@ +--- +layout: page +title: fi_lnx(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_lnx \- The LINKx (LNX) Provider + +# OVERVIEW + +The LNX provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. This provider uses +the libfabric peer infrastructure to aid in the use of the underlying providers. 
+This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. Future releases of the provider will allow linking any +number of providers and provide the users with the ability to influence +the way the providers are utilized for traffic load. + +# SUPPORTED FEATURES + +This release contains an initial implementation of the LNX provider that +offers the following support: + +*Endpoint types* +: The provider supports only endpoint type *FI_EP_RDM*. + +*Endpoint capabilities* +: LNX is a passthrough layer on the send path. On the receive path LNX + utilizes the peer infrastructure to create shared receive queues (SRQ). + Receive requests are placed on the SRQ instead of on the core provider + receive queue. When the provider receives a message it queries the SRQ for + a match. If one is found the receive request is completed, otherwise the + message is placed on the LNX shared unexpected queue (SUQ). Further receive + requests query the SUQ for matches. + The first release of the provider only supports tagged and RMA operations. + Other message types will be supported in future releases. + +*Modes* +: The provider does not require the use of any mode bits. + +*Progress* +: LNX utilizes the peer infrastructure to provide a shared completion + queue. Each linked provider still needs to handle its own progress. + Completion events will however be placed on the shared completion queue, + which is passed to the application for access. + +*Address Format* +: LNX wraps the linked providers addresses in one common binary blob. + It does not alter or change the linked providers address format. It wraps + them into a LNX structure which is then flattened and returned to the + application. This is passed between different nodes. The LNX provider + is able to parse the flattened format and operate on the different links. 
+ This assumes that nodes in the same group are all using the same version of + the provider with the exact same links. IE: you can't have one node linking + SHM+CXI while another linking SHM+RXM. + +*Message Operations* +: LNX is designed to intercept message operations such as fi_tsenddata + and based on specific criteria forward the operation to the appropriate + provider. For the first release, LNX will only support linking SHM + provider for intra-node traffic and another provider (ex: CXI) for inter + node traffic. LNX send operation looks at the destination and based on + whether the destination is local or remote it will select the provider to + forward the operation to. The receive case has been described earlier. + +*Using the Provider* +: In order to use the provider the user needs to set FI_LNX_PROV_LINKS + environment variable to the linked providers in the following format + shm+. This will allow LNX to report back to the application in the + fi_getinfo() call the different links which can be selected. Since there are + multiple domains per provider LNX reports a permutation of all the + possible links. For example if there are two CXI interfaces on the machine + LNX will report back shm+cxi0 and shm+cxi1. The application can then + select based on its own criteria the link it wishes to use. + The application typically uses the PCI information in the fi_info + structure to select the interface to use. A common selection criteria is + the interface nearest the core the process is bound to. In order to make + this determination, the application requires the PCI information about the + interface. For this reason LNX forwards the PCI information for the + inter-node provider in the link to the application. + +# LIMITATIONS AND FUTURE WORK + +*Hardware Support* +: LNX doesn't support hardware offload; ex hardware tag matching. This is + an inherit limitation when using the peer infrastructure. 
Due to the use + of a shared receive queue which linked providers need to query when + a message is received, any hardware offload which requires sending the + receive buffers to the hardware directly will not work with the shared + receive queue. The shared receive queue provides two advantages; 1) reduce + memory usage, 2) coordinate the receive operations. For #2 this is needed + when receiving from FI_ADDR_UNSPEC. In this case both providers which are + part of the link can race to gain access to the receive buffer. It is + a future effort to determine a way to use hardware tag matching and other + hardware offload capability with LNX + +*Limited Linking* +: This release of the provider supports linking SHM provider for intra-node + operations and another provider which supports the FI_PEER capability for + inter-node operations. It is a future effort to expand to link any + multiple sets of providers. + +*Memory Registration* +: As part of the memory registration operation, varying hardware can perform + hardware specific steps such as memory pinning. Due to the fact that + memory registration APIs do not specify the source or destination + addresses it is not possible for LNX to determine which provider to + forward the memory registration to. LNX, therefore, registers the memory + with all linked providers. This might not be efficient and might have + unforeseen side effects. A better method is needed to support memory + registration. One option is to have memory registration cache in lnx + to avoid expensive operations. + +*Operation Types* +: This release of LNX supports tagged and RMA operations only. Future + releases will expand the support to other operation types. + +*Multi-Rail* +: Future design effort is being planned to support utilizing multiple interfaces + for traffic simultaneously. This can be over homogeneous interfaces or over + heterogeneous interfaces. 
+ +# RUNTIME PARAMETERS + +The *LNX* provider checks for the following environment variables: + +*FI_LNX_PROV_LINKS* +: This environment variable is used to specify which providers to link. This + must be set in order for the LNX provider to return a list of fi_info + blocks in the fi_getinfo() call. The format which must be used is: + ++... As mentioned earlier currently LNX supports linking + only two providers the first of which is SHM followed by one other + provider for inter-node operations + +*FI_LNX_DISABLE_SHM* +: By default this environment variable is set to 0. However, the user can + set it to one and then the SHM provider will not be used. This can be + useful for debugging and performance analysis. The SHM provider will + naturally be used for all intra-node operations. Therefore, to test SHM in + isolation with LNX, the processes can be limited to the same node only. + +*FI_LNX_USE_SRQ* +: Shared Receive Queues are integral part of the peer infrastructure, but + they have the limitation of not using hardware offload, such as tag + matching. SRQ is needed to support the FI_ADDR_UNSPEC case. If the application + is sure this will never be the case, then it can turn off SRQ support by + setting this environment variable to 0. It is 1 by default. + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_provider`(7)](fi_provider.7.html), +[`fi_getinfo`(3)](fi_getinfo.3.html) diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 new file mode 100644 index 00000000000..b30876e24e4 --- /dev/null +++ b/man/man7/fi_lnx.7 @@ -0,0 +1,173 @@ +.\" Automatically generated by Pandoc 2.9.2.1 +.\" +.TH "fi_lnx" "7" "" "" "" +.hy +.PP +{% include JB/setup %} +.SH NAME +.PP +fi_lnx - The LINKx (lnx) Provider +.SH OVERVIEW +.PP +The lnx provider is designed to link two or more providers, allowing +applications to seamlessly use multiple providers or NICs. +This provider uses the libfabric peer infrastructure to aid in the use +of the underlying providers. 
+This version of the provider currently supports linking the libfabric +shared memory provider for intra-node traffic and another provider for +inter-node traffic. +Future releases of the provider will allow linking any number of +providers and provide the users with the ability to influence the way +the providers are utilized for traffic load. +.SH SUPPORTED FEATURES +.PP +This release contains an initial implementation of the lnx provider +that offers the following support: +.TP +\f[I]Endpoint types\f[R] +The provider supports only endpoint type \f[I]FI_EP_RDM\f[R]. +.TP +\f[I]Endpoint capabilities\f[R] +lnx is a passthrough layer on the send path. +On the receive path lnx utilizes the peer infrastructure to create +shared receive queues (SRQ). +Receive requests are placed on the SRQ instead of on the core provider +receive queue. +When the provider receives a message it queries the SRQ for a match. +If one is found the receive request is completed, otherwise the message +is placed on the lnx shared unexpected queue (SUQ). +Further receive requests query the SUQ for matches. +The first release of the provider only supports tagged and RMA +operations. +Other message types will be supported in future releases. +.TP +\f[I]Modes\f[R] +The provider does not require the use of any mode bits. +.TP +\f[I]Progress\f[R] +lnx utilizes the peer infrastructure to provide a shared completion +queue. +Each linked provider still needs to handle its own progress. +Completion events will however be placed on the shared completion queue, +which is passed to the application for access. +.TP +\f[I]Address Format\f[R] +lnx wraps the linked providers addresses in one common binary blob. +It does not alter or change the linked providers address format. +It wraps them into a lnx structure which is then flattened and +returned to the application. +This is passed between different nodes. +The lnx provider is able to parse the flattened format and operate on +the different links. 
+This assumes that nodes in the same group are all using the same version +of the provider with the exact same links. +IE: you can\[cq]t have one node linking SHM+CXI while another linking +SHM+RXM. +.TP +\f[I]Message Operations\f[R] +lnx is designed to intercept message operations such as fi_tsenddata +and based on specific criteria forward the operation to the appropriate +provider. +For the first release, lnx will only support linking SHM provider for +intra-node traffic and another provider (ex: CXI) for inter node +traffic. +lnx send operation looks at the destination and based on whether the +destination is local or remote it will select the provider to forward +the operation to. +The receive case has been described earlier. +.TP +\f[I]Using the Provider\f[R] +In order to use the provider the user needs to set FI_LNX_PROV_LINKS +environment variable to the linked providers in the following format +shm+. +This will allow lnx to report back to the application in the +fi_getinfo() call the different links which can be selected. +Since there are multiple domains per provider lnx reports a +permutation of all the possible links. +For example if there are two CXI interfaces on the machine lnx will +report back shm+cxi0 and shm+cxi1. +The application can then select based on its own criteria the link it +wishes to use. +The application typically uses the PCI information in the fi_info +structure to select the interface to use. +A common selection criteria is the interface nearest the core the +process is bound to. +In order to make this determination, the application requires the PCI +information about the interface. +For this reason lnx forwards the PCI information for the inter-node +provider in the link to the application. +.SH LIMITATIONS AND FUTURE WORK +.TP +\f[I]Hardware Support\f[R] +lnx doesn\[cq]t support hardware offload; ex hardware tag matching. +This is an inherit limitation when using the peer infrastructure. 
+Due to the use of a shared receive queue which linked providers need to +query when a message is received, any hardware offload which requires +sending the receive buffers to the hardware directly will not work with +the shared receive queue. +The shared receive queue provides two advantages; 1) reduce memory +usage, 2) coordinate the receive operations. +For #2 this is needed when receiving from FI_ADDR_UNSPEC. +In this case both providers which are part of the link can race to gain +access to the receive buffer. +It is a future effort to determine a way to use hardware tag matching +and other hardware offload capability with lnx +.TP +\f[I]Limited Linking\f[R] +This release of the provider supports linking SHM provider for +intra-node operations and another provider which supports the FI_PEER +capability for inter-node operations. +It is a future effort to expand to link any multiple sets of providers. +.TP +\f[I]Memory Registration\f[R] +As part of the memory registration operation, varying hardware can +perform hardware specific steps such as memory pinning. +Due to the fact that memory registration APIs do not specify the source +or destination addresses it is not possible for lnx to determine which +provider to forward the memory registration to. +LINkx, therefore, registers the memory with all linked providers. +This might not be efficient and might have unforeseen side effects. +A better method is needed to support memory registration. +.TP +\f[I]Operation Types\f[R] +This release of lnx supports tagged and RMA operations only. +Future releases will expand the support to other operation types. +.TP +\f[I]Multi-Rail\f[R] +Future design effort is being planned to support utilizing multiple +interfaces for traffic simultaneously. +This can be over homogeneous interfaces or over heterogeneous +interfaces. 
+.SH RUNTIME PARAMETERS +.PP +The \f[I]lnx\f[R] provider checks for the following environment +variables: +.TP +\f[I]FI_LNX_PROV_LINKS\f[R] +This environment variable is used to specify which providers to link. +This must be set in order for the lnx provider to return a list of +fi_info blocks in the fi_getinfo() call. +The format which must be used is: ++\&... As mentioned earlier currently +lnx supports linking only two providers the first of which is SHM +followed by one other provider for inter-node operations +.TP +\f[I]FI_LNX_DISABLE_SHM\f[R] +By default this environment variable is set to 0. +However, the user can set it to one and then the SHM provider will not +be used. +This can be useful for debugging and performance analysis. +The SHM provider will naturally be used for all intra-node operations. +Therefore, to test SHM in isolation with lnx, the processes can be +limited to the same node only. +.TP +\f[I]FI_LNX_USE_SRQ\f[R] +Shared Receive Queues are integral part of the peer infrastructure, but +they have the limitation of not using hardware offload, such as tag +matching. +SRQ is needed to support the FI_ADDR_UNSPEC case. +If the application is sure this will never be the case, then it can turn +off SRQ support by setting this environment variable to 0. +It is 1 by default. +.SH SEE ALSO +.PP +\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) diff --git a/prov/lnx/Makefile.include b/prov/lnx/Makefile.include new file mode 100644 index 00000000000..cd23049e845 --- /dev/null +++ b/prov/lnx/Makefile.include @@ -0,0 +1,61 @@ +# +# Copyright (c) 2022 ORNL. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + + +if HAVE_LNX +_lnx_files = \ + prov/lnx/src/lnx_cq.c \ + prov/lnx/src/lnx_domain.c \ + prov/lnx/src/lnx_ep.c \ + prov/lnx/src/lnx_init.c \ + prov/lnx/src/lnx_ops.c \ + prov/lnx/src/lnx_av.c + +_lnx_headers = \ + prov/lnx/include/lnx.h + +if HAVE_LNX_DL +pkglib_LTLIBRARIES += liblnx-fi.la +liblnx_fi_la_SOURCES = $(_lnx_files) $(_lnx_headers) +liblnx_fi_la_LIBADD = $(linkback) $(lnx_LIBS) +liblnx_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic +liblnx_fi_la_DEPENDENCIES = $(linkback) +else +src_libfabric_la_SOURCES += $(_lnx_files) $(_lnx_headers) +src_libfabric_la_CPPFLAGS += -I$(top_srcdir)/prov/lnx/include +endif + +prov_install_man_pages += man/man7/fi_lnx.7 + +endif HAVE_LNX + +prov_dist_man_pages += man/man7/fi_lnx.7 diff --git a/prov/lnx/configure.m4 b/prov/lnx/configure.m4 new file mode 100644 index 00000000000..737b62bc46d --- /dev/null +++ b/prov/lnx/configure.m4 @@ -0,0 +1,15 @@ +dnl Configury specific to the libfabric lnx provider + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl +AC_DEFUN([FI_LNX_CONFIGURE],[ + # Determine if we can support the lnx provider + lnx_happy=0 + AS_IF([test x"$enable_lnx" != x"no"], [lnx_happy=1]) + AS_IF([test $lnx_happy -eq 1], [$1], [$2]) +]) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h new file mode 100644 index 00000000000..b40c9ea3eca --- /dev/null +++ b/prov/lnx/include/lnx.h @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef LNX_H +#define LNX_H + +#define LNX_DEF_AV_SIZE 1024 +#define LNX_MAX_LOCAL_EPS 16 +#define LNX_IOV_LIMIT 4 + +#define lnx_ep_rx_flags(lnx_ep) ((lnx_ep)->le_ep.rx_op_flags) + +struct local_prov_ep; + +struct lnx_match_attr { + fi_addr_t lm_addr; + uint64_t lm_tag; + uint64_t lm_ignore; + struct lnx_peer *lm_peer; + struct local_prov_ep *lm_cep; +}; + +struct lnx_peer_cq { + struct lnx_cq *lpc_shared_cq; + struct fid_peer_cq lpc_cq; + struct fid_cq *lpc_core_cq; +}; + +struct lnx_queue { + struct dlist_entry lq_queue; + dlist_func_t *lq_match_func; + ofi_spin_t lq_qlock; +}; + +struct lnx_qpair { + struct lnx_queue lqp_recvq; + struct lnx_queue lqp_unexq; +}; + +struct lnx_peer_srq { + struct lnx_qpair lps_trecv; + struct lnx_qpair lps_recv; +}; + +struct local_prov_ep { + struct dlist_entry entry; + bool lpe_local; + char lpe_fabric_name[FI_NAME_MAX]; + struct fid_fabric *lpe_fabric; + struct fid_domain *lpe_domain; + struct fid_ep *lpe_ep; + struct fid_ep **lpe_txc; + struct fid_ep **lpe_rxc; + struct fid_av *lpe_av; + struct lnx_peer_cq lpe_cq; + struct fi_info *lpe_fi_info; + struct fid_peer_srx lpe_srx; + struct ofi_bufpool *lpe_recv_bp; + ofi_spin_t lpe_bplock; + struct local_prov *lpe_parent; +}; + +struct lnx_rx_entry { + /* the entry which will be passed to the core provider */ + struct fi_peer_rx_entry rx_entry; + /* iovec to use to point to receive buffers */ + struct iovec rx_iov[LNX_IOV_LIMIT]; + /* desc array to be used to point to the descs passed by the user */ + void *rx_desc[LNX_IOV_LIMIT]; + /* peer we expect messages from. + * This is available if the receive request provided a source address. 
+ * Otherwise it will be NULL + */ + struct lnx_peer *rx_peer; + /* local prov endpoint receiving the message if this entry is + * added to the SUQ + */ + struct local_prov_ep *rx_cep; + /* match information which will be given to us by the core provider */ + struct fi_peer_match_attr rx_match_info; + /* ignore bit passed in by the user */ + uint64_t rx_ignore; + /* which pool this rx_entry came from. It's either from the global + * pool or some core provider pool + */ + bool rx_global; +}; + +OFI_DECLARE_FREESTACK(struct lnx_rx_entry, lnx_recv_fs); + +struct local_prov { + struct dlist_entry lpv_entry; + char lpv_prov_name[FI_NAME_MAX]; + int lpv_ep_count; + struct dlist_entry lpv_prov_eps; +}; + +struct lnx_address_prov { + char lap_prov[FI_NAME_MAX]; + /* an array of addresses of size count. */ + /* entry 0 is shm if available */ + /* array can't be larger than LNX_MAX_LOCAL_EPS */ + int lap_addr_count; + /* size as specified by the provider */ + int lap_addr_size; + /* payload */ + char lap_addrs[]; +}; + +struct lnx_addresses { + /* used to determine if the address is node local or node remote */ + char la_hostname[FI_NAME_MAX]; + /* number of providers <= LNX_MAX_LOCAL_EPS */ + int la_prov_count; + struct lnx_address_prov la_addr_prov[]; +}; + +struct lnx_local2peer_map { + struct dlist_entry entry; + struct local_prov_ep *local_ep; + int addr_count; + fi_addr_t peer_addrs[LNX_MAX_LOCAL_EPS]; +}; + +struct lnx_peer_prov { + struct dlist_entry entry; + + /* provider name */ + char lpp_prov_name[FI_NAME_MAX]; + + uint64_t lpp_flags; + + /* pointer to the local endpoint information to be used for + * communication with this peer. + * + * If the peer is on-node, then lp_endpoints[0] = shm + * + * if peer is off-node, then there could be up to LNX_MAX_LOCAL_EPS + * local endpoints we can use to reach that peer. 
+ */ + struct local_prov *lpp_prov; + + /* each peer can be reached from any of the local provider endpoints + * on any of the addresses which are given to us. It's an N:N + * relationship + */ + struct dlist_entry lpp_map; +}; + +struct lnx_peer { + /* true if peer can be reached over shared memory, false otherwise */ + bool lp_local; + + /* Each provider that we can reach the peer on will have an entry + * below. Each entry will contain all the local provider endpoints we + * can reach the peer through, as well as all the peer addresses on that + * provider. + * + * We can potentially multi-rail between the interfaces on the same + * provider, both local and remote. + * + * Or we can multi-rail across different providers. Although this + * might be more complicated due to the differences in provider + * capabilities. + */ + struct lnx_peer_prov *lp_shm_prov; + struct dlist_entry lp_provs; +}; + +struct lnx_peer_table { + struct util_av lpt_av; + int lpt_max_count; + int lpt_count; + struct lnx_domain *lpt_domain; + /* an array of peer entries */ + struct lnx_peer **lpt_entries; +}; + +struct lnx_ctx { + struct dlist_entry ctx_head; + int ctx_idx; + struct lnx_ep *ctx_parent; + struct fid_ep ctx_ep; +}; + +struct lnx_ep { + struct util_ep le_ep; + struct dlist_entry le_tx_ctx; + struct dlist_entry le_rx_ctx; + struct lnx_domain *le_domain; + size_t le_fclass; + struct lnx_peer_table *le_peer_tbl; + struct lnx_peer_srq le_srq; +}; + +struct lnx_srx_context { + struct lnx_ep *srx_lep; + struct local_prov_ep *srx_cep; +}; + +struct lnx_mem_desc_prov { + struct local_prov *prov; + struct fid_mr *core_mr; +}; + +struct lnx_mem_desc { + struct lnx_mem_desc_prov desc[LNX_MAX_LOCAL_EPS]; + int desc_count; +}; + +struct lnx_mr { + struct ofi_mr mr; + struct lnx_mem_desc desc; +}; + +struct lnx_domain { + struct util_domain ld_domain; + struct lnx_fabric *ld_fabric; + bool ld_srx_supported; + struct ofi_mr_cache ld_mr_cache; +}; + +struct lnx_cq { + struct util_cq util_cq; 
+ struct lnx_domain *lnx_domain; +}; + +struct lnx_fabric { + struct util_fabric util_fabric; + /* providers linked by this fabric */ + struct dlist_entry local_prov_table; + /* memory registration buffer pool */ + struct ofi_bufpool *mem_reg_bp; + /* shared memory provider used in this link */ + struct local_prov *shm_prov; + /* peers associated with this link */ + struct lnx_peer_table *lnx_peer_tbl; +}; + +extern struct util_prov lnx_util_prov; +extern struct fi_provider lnx_prov; +extern struct ofi_bufpool *global_recv_bp; +extern ofi_spin_t global_bplock; + +struct fi_info *lnx_get_link_by_dom(char *domain_name); + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info); + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); +int lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context); + +void lnx_fini(void); + +int lnx_fabric_close(struct fid *fid); + +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context); + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); + +int lnx_cq2ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags); + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry); +int lnx_queue_msg(struct fi_peer_rx_entry *entry); +int lnx_queue_tag(struct fi_peer_rx_entry *entry); +void 
lnx_free_entry(struct fi_peer_rx_entry *entry); +void lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)); + +static inline struct lnx_peer * +lnx_get_peer(struct lnx_peer **peers, fi_addr_t addr) +{ + if (!peers || addr == FI_ADDR_UNSPEC) + return NULL; + + return peers[addr]; +} + +static inline +void lnx_get_core_desc(struct lnx_mem_desc *desc, void **mem_desc) +{ + if (desc && desc->desc[0].core_mr) { + if (mem_desc) + *mem_desc = desc->desc[0].core_mr->mem_desc; + return; + } + + *mem_desc = NULL; +} + +static inline +int lnx_create_mr(const struct iovec *iov, fi_addr_t addr, + struct lnx_domain *lnx_dom, struct ofi_mr_entry **mre) +{ + struct ofi_mr *mr; + struct fi_mr_attr attr = {}; + struct fi_mr_attr cur_abi_attr; + struct ofi_mr_info info = {}; + uint64_t flags; + int rc; + + attr.iov_count = 1; + attr.mr_iov = iov; + *mre = ofi_mr_cache_find(&lnx_dom->ld_mr_cache, &attr, 0); + if (*mre) { + mr = (struct ofi_mr *)(*mre)->data; + goto out; + } + + attr.iface = ofi_get_hmem_iface(iov->iov_base, + &attr.device.reserved, &flags); + info.iov = *iov; + info.iface = attr.iface; + rc = ofi_hmem_dev_register(attr.iface, iov->iov_base, iov->iov_len, + (uint64_t *) &attr.hmem_data); + if (rc) + return rc; + + rc = ofi_mr_cache_search(&lnx_dom->ld_mr_cache, &info, mre); + if (rc) { + ofi_hmem_dev_unregister(attr.iface, (uint64_t)attr.hmem_data); + return rc; + } + + mr = (struct ofi_mr *)(*mre)->data; + ofi_mr_update_attr(lnx_dom->ld_domain.fabric->fabric_fid.api_version, + lnx_dom->ld_domain.info_domain_caps, &attr, &cur_abi_attr, 0); + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr.context; + mr->domain = &lnx_dom->ld_domain; + mr->flags = flags; + mr->iface = cur_abi_attr.iface; + mr->device = cur_abi_attr.device.reserved; + mr->hmem_data = cur_abi_attr.hmem_data; + mr->mr_fid.mem_desc = (void*) mr; + +out: + return FI_SUCCESS; +} + +static inline +int lnx_select_send_pathway(struct 
lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + struct ofi_mr_entry **mre, void **mem_desc, uint64_t *rkey) +{ + int idx = 0; + int rc; + struct lnx_peer_prov *prov; + struct lnx_local2peer_map *lpm; + struct ofi_mr *mr = NULL; + + if (lp->lp_local) { + prov = lp->lp_shm_prov; + } else { + prov = dlist_first_entry_or_null( + &lp->lp_provs, struct lnx_peer_prov, entry); + idx = 1; + } + + /* TODO when we support multi-rail we can have multiple maps */ + lpm = dlist_first_entry_or_null(&prov->lpp_map, + struct lnx_local2peer_map, entry); + *addr = lpm->peer_addrs[0]; + + /* TODO this will need to be expanded to handle Multi-Rail. For now + * the assumption is that local peers can be reached on shm and remote + * peers have only one interface, hence indexing on 0 and 1 + * + * If we did memory registration, then we've already figured out the + * pathway + */ + if (desc && desc->desc[idx].core_mr) { + *cep = dlist_first_entry_or_null( + &desc->desc[idx].prov->lpv_prov_eps, + struct local_prov_ep, entry); + if (mem_desc) + *mem_desc = fi_mr_desc(desc->desc[idx].core_mr); + if (rkey) + *rkey = fi_mr_key(desc->desc[idx].core_mr); + return 0; + } + + *cep = lpm->local_ep; + if (mem_desc) + *mem_desc = NULL; + + if (!lp->lp_local || !mem_desc || (mem_desc && *mem_desc) || + !iov || (iov && iov->iov_base == NULL)) + return 0; + + /* Look up the address in the cache: + * - if it's found then use the cached fid_mr + * - This will include the iface, which is really all we need + * - if it's not then lookup the iface, create the fid_mr and + * cache it. 
+ */ + rc = lnx_create_mr(iov, *addr, lnx_dom, mre); + if (!rc && mre) { + mr = (struct ofi_mr *)(*mre)->data; + *mem_desc = mr->mr_fid.mem_desc; + } + + return rc; +} + +static inline +int lnx_select_recv_pathway(struct lnx_peer *lp, struct lnx_domain *lnx_dom, + struct lnx_mem_desc *desc, struct local_prov_ep **cep, + fi_addr_t *addr, const struct iovec *iov, size_t iov_count, + struct ofi_mr_entry **mre, void **mem_desc) +{ + /* if the src address is FI_ADDR_UNSPEC, then we'll need to trigger + * all core providers to listen for a receive, since we don't know + * which one will endup getting the message. + * + * For each core provider we're tracking, trigger the recv operation + * on it. + * + * if the src address is specified then we just need to select and + * exact core endpoint to trigger the recv on. + */ + if (!lp) + return -FI_ENOSYS; + + return lnx_select_send_pathway(lp, lnx_dom, desc, cep, addr, iov, + iov_count, mre, mem_desc, NULL); +} + +#endif /* LNX_H */ diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c new file mode 100644 index 00000000000..4e6ac0bebaf --- /dev/null +++ b/prov/lnx/src/lnx_av.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static void lnx_free_peer(struct lnx_peer *lp) +{ + struct lnx_peer_prov *lpp; + struct dlist_entry *tmp, *tmp2; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container_safe(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry, tmp) { + dlist_foreach_container_safe(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry, tmp2) { + dlist_remove(&lpm->entry); + free(lpm); + } + dlist_remove(&lpp->entry); + free(lpp); + } + + free(lp); +} + +#if ENABLE_DEBUG +static void lnx_print_peer(int idx, struct lnx_peer *lp) +{ + int k; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "%d: lnx_peer[%d] is %s\n", getpid(), idx, + (lp->lp_local) ? 
"local" : "remote"); + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "%d: peer[%p] provider %s\n", getpid(), lpp, + lpp->lpp_prov_name); + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + " %d: peer has %d mapped addrs\n", + getpid(), lpm->addr_count); + for (k = 0; k < lpm->addr_count; k++) + FI_DBG(&lnx_prov, FI_LOG_CORE, + " %d: addr = %lu\n", + getpid(), lpm->peer_addrs[k]); + } + } +} +#endif /* ENABLE_DEBUG */ + +static int lnx_peer_insert(struct lnx_peer_table *tbl, + struct lnx_peer *lp) +{ + int i; + + if (tbl->lpt_max_count == 0 || + tbl->lpt_count >= tbl->lpt_max_count) + return -FI_ENOENT; + + for (i = 0; i < tbl->lpt_max_count; i++) { + if (!tbl->lpt_entries[i]) { + tbl->lpt_entries[i] = lp; +#if ENABLE_DEBUG + lnx_print_peer(i, lp); +#endif + tbl->lpt_count++; + return i; + } + } + + return -FI_ENOENT; +} + +static int lnx_peer_av_remove(struct lnx_peer *lp) +{ + int rc, frc = 0; + struct lnx_peer_prov *lpp; + struct lnx_local2peer_map *lpm; + + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, lpp, entry) { + /* if this is a remote peer then we didn't insert its shm address + * into our local shm endpoint, so no need to remove it + */ + if (!strncasecmp(lpp->lpp_prov_name, "shm", 3) && + !lp->lp_local) + continue; + + /* remove these address from all local providers */ + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, lpm, entry) { + if (lpm->addr_count > 0) { + rc = fi_av_remove(lpm->local_ep->lpe_av, lpm->peer_addrs, + lpm->addr_count, lpp->lpp_flags); + if (rc) + frc = rc; + } + } + } + + return frc; +} + +static int lnx_peer_remove(struct lnx_peer_table *tbl, int idx) +{ + struct lnx_peer *lp = tbl->lpt_entries[idx]; + int rc = 0; + + if (!lp) + return 0; + + rc = lnx_peer_av_remove(lp); + + tbl->lpt_entries[idx] = NULL; + tbl->lpt_count--; + + return rc; +} + +static int 
lnx_cleanup_avs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_av->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static inline void lnx_free_peer_tbl(struct lnx_peer_table *peer_tbl) +{ + free(peer_tbl->lpt_entries); + free(peer_tbl); +} + +int lnx_av_close(struct fid *fid) +{ + int rc; + struct local_prov *entry; + struct lnx_fabric *fabric; + struct lnx_peer_table *peer_tbl; + + peer_tbl = container_of(fid, struct lnx_peer_table, lpt_av.av_fid.fid); + fabric = peer_tbl->lpt_domain->ld_fabric; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_avs(entry); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to close av for %s\n", + entry->lpv_prov_name); + } + } + + ofi_av_close_lightweight(&peer_tbl->lpt_av); + + free(peer_tbl); + + return 0; +} + +static struct fi_ops lnx_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_av_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int lnx_get_or_create_peer_prov(struct dlist_entry *prov_table, + struct lnx_peer *lp, char *prov_name, + struct lnx_peer_prov **lpp) +{ + bool shm = false; + struct local_prov *entry; + struct lnx_peer_prov *peer_prov; + + if (!strcmp(prov_name, "shm")) { + if (lp->lp_shm_prov) + return -FI_ENOENT; + shm = true; + goto insert_prov; + } + + /* check if we already have a peer provider */ + dlist_foreach_container(&lp->lp_provs, + struct lnx_peer_prov, peer_prov, entry) { + if (!strncasecmp(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX)) { + *lpp = peer_prov; + return 0; + } + } + +insert_prov: + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, 
prov_name, FI_NAME_MAX)) { + peer_prov = calloc(sizeof(*peer_prov), 1); + if (!peer_prov) + return -FI_ENOMEM; + + dlist_init(&peer_prov->entry); + dlist_init(&peer_prov->lpp_map); + + strncpy(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX); + + peer_prov->lpp_prov = entry; + + if (shm) + lp->lp_shm_prov = peer_prov; + else + dlist_insert_tail(&peer_prov->entry, &lp->lp_provs); + + *lpp = peer_prov; + return 0; + } + } + + return -FI_ENOENT; +} + +static inline struct lnx_address_prov * +next_prov(struct lnx_address_prov *prov) +{ + uint8_t *ptr; + + ptr = (uint8_t*) prov; + + ptr += (sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size)); + + return (struct lnx_address_prov*)ptr; +} + +static inline size_t +get_lnx_addresses_size(struct lnx_addresses *addrs) +{ + int i; + size_t s = sizeof(*addrs); + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + s += sizeof(*prov) + (prov->lap_addr_count * prov->lap_addr_size); + prov = next_prov(prov); + } + + return s; +} + +static inline struct lnx_addresses * +next_peer(struct lnx_addresses *addrs) +{ + uint8_t *ptr; + + ptr = (uint8_t*)addrs + get_lnx_addresses_size(addrs); + + return (struct lnx_addresses *)ptr; +} + +static struct lnx_address_prov * +lnx_get_peer_shm_addr(struct lnx_addresses *addrs) +{ + int i; + struct lnx_address_prov *prov; + + prov = addrs->la_addr_prov; + for (i = 0; i < addrs->la_prov_count; i++) { + if (!strcmp(prov->lap_prov, "shm")) + return prov; + prov = next_prov(prov); + } + + return NULL; +} + +static int is_local_addr(struct local_prov **shm_prov, struct lnx_addresses *la) +{ + int rc; + char hostname[FI_NAME_MAX]; + struct lnx_address_prov *lap_shm; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + lap_shm = lnx_get_peer_shm_addr(la); + if (!lap_shm) + return -FI_EOPNOTSUPP; + + /* Shared memory address not provided or not local*/ + if ((lap_shm->lap_addr_count == 0) || + strncasecmp(hostname, la->la_hostname, FI_NAME_MAX)) + return -FI_EOPNOTSUPP; + + /* badly formed address */ + if (*shm_prov && (lap_shm->lap_addr_count > 1 || + lap_shm->lap_addr_count < 0)) + return -FI_EPROTO; + + return 0; +} + +static void +lnx_update_msg_entries(struct lnx_qpair *qp, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_queue *q = &qp->lqp_unexq; + struct lnx_rx_entry *rx_entry; + struct dlist_entry *item; + + ofi_spin_lock(&q->lq_qlock); + dlist_foreach(&q->lq_queue, item) { + rx_entry = (struct lnx_rx_entry *) item; + if (rx_entry->rx_entry.addr == FI_ADDR_UNSPEC) + rx_entry->rx_entry.addr = get_addr(&rx_entry->rx_entry); + } + ofi_spin_unlock(&q->lq_qlock); +} + +void +lnx_foreach_unspec_addr(struct fid_peer_srx *srx, + fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)) +{ + struct lnx_srx_context *ctxt; + + ctxt = (struct lnx_srx_context *) srx->ep_fid.fid.context; + + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_trecv, get_addr); + lnx_update_msg_entries(&ctxt->srx_lep->le_srq.lps_recv, get_addr); +} + +static int lnx_peer_map_addrs(struct dlist_entry *prov_table, + struct lnx_peer *lp, struct lnx_addresses *la, + uint64_t flags, void *context) +{ + int i, j, rc; + struct lnx_peer_prov *lpp; + struct lnx_address_prov *lap; + struct local_prov_ep *lpe; + struct dlist_entry *eps; + + lap = &la->la_addr_prov[0]; + + for (i = 0; i < la->la_prov_count; i++) { + if (lap->lap_addr_count > LNX_MAX_LOCAL_EPS) + return -FI_EPROTO; + + rc = lnx_get_or_create_peer_prov(prov_table, lp, lap->lap_prov, + &lpp); + if (rc) + return rc; + + lpp->lpp_flags = flags; + + eps = &lpp->lpp_prov->lpv_prov_eps; + 
dlist_foreach_container(eps, struct local_prov_ep, lpe, + entry) { + struct lnx_local2peer_map *lpm; + + /* if this is a remote peer, don't insert the shm address + * since we will never talk to that peer over shm + */ + if (!strncasecmp(lpe->lpe_fabric_name, "shm", 3) && + !lp->lp_local) + continue; + + lpm = calloc(sizeof(*lpm), 1); + if (!lpm) + return -FI_ENOMEM; + + dlist_init(&lpm->entry); + dlist_insert_tail(&lpm->entry, &lpp->lpp_map); + + lpm->local_ep = lpe; + lpm->addr_count = lap->lap_addr_count; + for (j = 0; j < LNX_MAX_LOCAL_EPS; j++) + lpm->peer_addrs[j] = FI_ADDR_NOTAVAIL; + /* fi_av_insert returns the number of addresses inserted */ + rc = fi_av_insert(lpe->lpe_av, (void*)lap->lap_addrs, + lap->lap_addr_count, + lpm->peer_addrs, flags, context); + if (rc < 0) + return rc; + + /* should only insert the number of addresses indicated */ + assert(rc == lap->lap_addr_count); + } + + lap = next_prov(lap); + } + + return 0; +} + +/* + * count: number of LNX addresses + * addr: an array of addresses + * fi_addr: an out array of fi_addr_t + * + * Each LNX address can have multiple core provider addresses + * Check the hostname provided in each address to see if it's the same as + * me. If so, then we'll use the SHM address if available. + * + * ASSUMPTION: fi_av_insert() is called exactly once per peer. + * We're not handling multiple av_inserts on the same peer. If that + * happens then we will create multiple peers entries. 
+ */ +int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + int i, rc, idx; + int disable_shm = 0; + struct lnx_peer *lp; + struct dlist_entry *prov_table; + struct lnx_peer_table *peer_tbl; + struct lnx_addresses *la = (struct lnx_addresses *)addr; + + fi_param_get_bool(&lnx_prov, "disable_shm", &disable_shm); + + peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid); + prov_table = &peer_tbl->lpt_domain->ld_fabric->local_prov_table; + + /* each entry represents a separate peer */ + for (i = 0; i < count; i++) { + /* can't have more providers than LNX_MAX_LOCAL_EPS */ + if (la->la_prov_count >= LNX_MAX_LOCAL_EPS || + la->la_prov_count <= 0) + return -FI_EPROTO; + + /* this is a local peer */ + lp = calloc(sizeof(*lp), 1); + if (!lp) + return -FI_ENOMEM; + + dlist_init(&lp->lp_provs); + + rc = is_local_addr(&peer_tbl->lpt_domain->ld_fabric->shm_prov, + la); + if (!rc) { + lp->lp_local = !disable_shm; + } else if (rc == -FI_EOPNOTSUPP) { + lp->lp_local = false; + } else if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "failed to identify address\n"); + return rc; + } + + rc = lnx_peer_map_addrs(prov_table, lp, la, flags, context); + if (rc) { + free(lp); + return rc; + } + + idx = lnx_peer_insert(peer_tbl, lp); + if (idx == -1) { + rc = lnx_peer_av_remove(lp); + lnx_free_peer(lp); + FI_INFO(&lnx_prov, FI_LOG_CORE, + "Peer table size exceeded. 
Removed = %d\n", rc); + return -FI_ENOENT; + } + + fi_addr[i] = (fi_addr_t) idx; + + la = next_peer(la); + } + + return i; +} + +int lnx_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, + uint64_t flags) +{ + struct lnx_peer_table *peer_tbl; + int frc = 0, rc, i; + + peer_tbl = container_of(av, struct lnx_peer_table, lpt_av.av_fid.fid); + + for (i = 0; i < count; i++) { + rc = lnx_peer_remove(peer_tbl, (int)fi_addr[i]); + if (rc) + frc = rc; + } + + return frc; +} + +static const char * +lnx_av_straddr(struct fid_av *av, const void *addr, + char *buf, size_t *len) +{ + /* TODO: implement */ + return NULL; +} + +static int +lnx_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, + size_t *addrlen) +{ + /* TODO: implement */ + return -FI_EOPNOTSUPP; +} + +static struct fi_ops_av lnx_av_ops = { + .size = sizeof(struct fi_ops_av), + .insert = lnx_av_insert, + .remove = lnx_av_remove, + .insertsvc = fi_no_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .lookup = lnx_av_lookup, + .straddr = lnx_av_straddr, +}; + +static void lnx_get_core_av_attr(struct local_prov_ep *ep, + struct fi_av_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->type = ep->lpe_fi_info->domain_attr->av_type; +} + +static int lnx_open_avs(struct local_prov *prov, struct fi_av_attr *attr, + void *context) +{ + int rc = 0; + struct local_prov_ep *ep; + struct fi_av_attr core_attr; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + lnx_get_core_av_attr(ep, &core_attr); + if (ep->lpe_local) + core_attr.count = ep->lpe_fi_info->domain_attr->ep_cnt; + else + core_attr.count = attr->count; + rc = fi_av_open(ep->lpe_domain, &core_attr, + &ep->lpe_av, context); + if (rc) + return rc; + } + + return 0; +} + +int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context) +{ + struct lnx_fabric *fabric; + struct lnx_domain *lnx_domain; + struct lnx_peer_table *peer_tbl; + struct local_prov *entry; + 
size_t table_sz = LNX_DEF_AV_SIZE; + int rc = 0; + + if (!attr) + return -FI_EINVAL; + + if (attr->name) + return -FI_ENOSYS; + + if (attr->type != FI_AV_UNSPEC && + attr->type != FI_AV_TABLE) + return -FI_ENOSYS; + + if (attr->type == FI_AV_UNSPEC) + attr->type = FI_AV_TABLE; + + peer_tbl = calloc(sizeof(*peer_tbl), 1); + if (!peer_tbl) + return -FI_ENOMEM; + + if (attr->count != 0) + table_sz = attr->count; + + peer_tbl->lpt_entries = + calloc(sizeof(struct lnx_peer *) * table_sz, 1); + if (!peer_tbl->lpt_entries) { + rc = -FI_ENOMEM; + goto failed; + } + + lnx_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid.fid); + fabric = lnx_domain->ld_fabric; + + rc = ofi_av_init_lightweight(&lnx_domain->ld_domain, attr, + &peer_tbl->lpt_av, context); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "failed to initialize AV: %d\n", rc); + goto failed; + } + + peer_tbl->lpt_max_count = table_sz; + peer_tbl->lpt_domain = lnx_domain; + peer_tbl->lpt_av.av_fid.fid.ops = &lnx_av_fi_ops; + peer_tbl->lpt_av.av_fid.ops = &lnx_av_ops; + + assert(fabric->lnx_peer_tbl == NULL); + + /* need this to handle memory registration vi fi_mr_regattr(). 
We need + * to be able to access the peer table to determine which endpoint + * we'll be using based on the source/destination address */ + fabric->lnx_peer_tbl = peer_tbl; + + /* walk through the rest of the core providers and open their + * respective address vector tables + */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_avs(entry, attr, context); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close; + } + } + + *av = &peer_tbl->lpt_av.av_fid; + + return 0; + +close: + ofi_av_close_lightweight(&peer_tbl->lpt_av); +failed: + lnx_free_peer_tbl(peer_tbl); + return rc; +} + + diff --git a/prov/lnx/src/lnx_cq.c b/prov/lnx/src/lnx_cq.c new file mode 100644 index 00000000000..6aebc8f4c5a --- /dev/null +++ b/prov/lnx/src/lnx_cq.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +ssize_t lnx_peer_cq_write(struct fid_peer_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write(&lnx_cq->lpc_shared_cq->util_cq, context, + flags, len, buf, data, tag); + + return rc; +} + +ssize_t lnx_peer_cq_writeerr(struct fid_peer_cq *cq, + const struct fi_cq_err_entry *err_entry) +{ + struct lnx_peer_cq *lnx_cq; + int rc; + + lnx_cq = container_of(cq, struct lnx_peer_cq, lpc_cq); + + rc = ofi_cq_write_error(&lnx_cq->lpc_shared_cq->util_cq, err_entry); + + return rc; +} + +static int lnx_cleanup_cqs(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_close(&ep->lpe_cq.lpc_core_cq->fid); + if (rc) + frc = rc; + ep->lpe_cq.lpc_core_cq = NULL; + } + + return frc; +} + +static int lnx_cq_close(struct fid *fid) +{ + int rc; + struct lnx_cq *lnx_cq; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(fid, struct lnx_cq, util_cq.cq_fid); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* close all the open core cqs */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_cqs(entry); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, 
"Failed to close domain for %s\n", + entry->lpv_prov_name); + return rc; + } + } + + rc = ofi_cq_cleanup(&lnx_cq->util_cq); + if (rc) + return rc; + + free(lnx_cq); + return 0; +} + +struct fi_ops_cq_owner lnx_cq_write = { + .size = sizeof(lnx_cq_write), + .write = lnx_peer_cq_write, + .writeerr = lnx_peer_cq_writeerr, +}; + +static struct fi_ops lnx_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_cq_close, + .bind = fi_no_bind, + .control = ofi_cq_control, + .ops_open = fi_no_ops_open, +}; + +static void lnx_cq_progress(struct util_cq *cq) +{ + struct lnx_cq *lnx_cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table; + + lnx_cq = container_of(cq, struct lnx_cq, util_cq); + prov_table = &lnx_cq->lnx_domain->ld_fabric->local_prov_table; + + /* Kick the core provider endpoints to progress */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + fi_cq_read(ep->lpe_cq.lpc_core_cq, NULL, 0); + } +} + +static int lnx_cq_open_core_prov(struct lnx_cq *cq, struct fi_cq_attr *attr) +{ + int rc; + struct local_prov_ep *ep; + struct local_prov *entry; + struct dlist_entry *prov_table = + &cq->lnx_domain->ld_fabric->local_prov_table; + + /* tell the core providers to import my CQ */ + attr->flags |= FI_PEER; + + /* create all the core provider completion queues */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + struct fid_cq *core_cq; + struct fi_peer_cq_context cq_ctxt; + + ep->lpe_cq.lpc_shared_cq = cq; + ep->lpe_cq.lpc_cq.owner_ops = &lnx_cq_write; + + cq_ctxt.size = sizeof(cq_ctxt); + cq_ctxt.cq = &ep->lpe_cq.lpc_cq; + + /* pass my CQ into the open and get back the core's cq */ + rc = fi_cq_open(ep->lpe_domain, attr, &core_cq, &cq_ctxt); + if (rc) + return rc; + + /* before the fi_cq_open() returns 
the core provider should + * have called fi_export_fid() and got a pointer to the peer + * CQ which we have allocated for this core provider + */ + + ep->lpe_cq.lpc_core_cq = core_cq; + } + } + + return 0; +} + +int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context) +{ + struct lnx_cq *lnx_cq; + struct lnx_domain *lnx_dom; + int rc; + + lnx_cq = calloc(1, sizeof(*lnx_cq)); + if (!lnx_cq) + return -FI_ENOMEM; + + /* this is going to be a standard CQ from the read side. From the + * write side, it'll use the peer_cq callbacks to write + */ + rc = ofi_cq_init(&lnx_prov, domain, attr, &lnx_cq->util_cq, + &lnx_cq_progress, context); + if (rc) + goto free; + + lnx_dom = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + + lnx_cq->lnx_domain = lnx_dom; + lnx_cq->util_cq.cq_fid.fid.ops = &lnx_cq_fi_ops; + (*cq_fid) = &lnx_cq->util_cq.cq_fid; + + /* open core CQs and tell them to import my CQ */ + rc = lnx_cq_open_core_prov(lnx_cq, attr); + + return rc; + +free: + free(lnx_cq); + return rc; +} diff --git a/prov/lnx/src/lnx_domain.c b/prov/lnx/src/lnx_domain.c new file mode 100644 index 00000000000..1d898319225 --- /dev/null +++ b/prov/lnx/src/lnx_domain.c @@ -0,0 +1,581 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +static struct fi_ops_domain lnx_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = lnx_av_open, + .cq_open = lnx_cq_open, + .endpoint = lnx_endpoint, + .scalable_ep = lnx_scalable_ep, + .cntr_open = fi_no_cntr_open, + .poll_open = fi_no_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, +}; + +static int lnx_cleanup_domains(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (!ep->lpe_domain) + continue; + rc = fi_close(&ep->lpe_domain->fid); + if (rc) + frc = rc; + } + + return frc; +} + +static int lnx_domain_close(fid_t fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *domain; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + + /* close all the open core domains */ + 
dlist_foreach_container(&domain->ld_fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + rc = lnx_cleanup_domains(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to close domain for %s\n", + entry->lpv_prov_name); + } + + ofi_mr_cache_cleanup(&domain->ld_mr_cache); + + rc = ofi_domain_close(&domain->ld_domain); + + free(domain); + + return rc; +} + +static int +lnx_mr_regattrs_all(struct local_prov *prov, const struct fi_mr_attr *attr, + uint64_t flags, struct lnx_mem_desc_prov *desc) +{ + int rc = 0; + struct local_prov_ep *ep; + + desc->prov = prov; + + /* TODO: This is another issue here because MR registration can happen + * quite often + */ + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_regattr(ep->lpe_domain, attr, + flags, &desc->core_mr); + + /* TODO: SHM provider returns FI_ENOKEY if requested_key is the + * same as the previous call. Application, like OMPI, might not + * specify the requested key in fi_mr_attr, so for now ignore that + * error. + * We need a better way of handling this. + * if (rc == -FI_ENOKEY) + * rc = 0; + * I made a change in SHM to support FI_MR_PROV_KEY if set by the + * application. 
This tells ofi to generate its own requested_key + * for each fi_mr_regattr call + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_regattr() failed: %d\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + return rc; +} + +static int +lnx_mr_close_all(struct lnx_mem_desc *mem_desc) +{ + int i, rc, frc = 0; + struct fid_mr *mr; + + for (i = 0; i < mem_desc->desc_count; i++) { + mr = mem_desc->desc[i].core_mr; + if (!mr) + continue; + rc = fi_close(&mr->fid); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s mr_close() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +int lnx_mr_close(struct fid *fid) +{ + struct lnx_mr *lnx_mr; + struct ofi_mr *mr; + int rc, frc = 0; + + mr = container_of(fid, struct ofi_mr, mr_fid.fid); + lnx_mr = container_of(mr, struct lnx_mr, mr); + + rc = lnx_mr_close_all(mr->mr_fid.mem_desc); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "Failed to complete Memory Deregistration\n"); + frc = rc; + } + + ofi_atomic_dec32(&mr->domain->ref); + + ofi_buf_free(lnx_mr); + + return frc; +} + +static int lnx_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int i, rc, frc = 0; + struct local_prov_ep *ep; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + dlist_foreach_container(&desc->prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_mr_bind(cmr, &ep->lpe_ep->fid, flags); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s lnx_mr_bind() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + } + + return frc; +} + +static int lnx_mr_control(struct fid *fid, int command, void *arg) +{ + int i, rc, 
frc = 0; + struct fid_mr *mr, *cmr; + struct lnx_mem_desc *mem_desc; + struct lnx_mem_desc_prov *desc; + + if (command != FI_ENABLE) + return -FI_ENOSYS; + + mr = container_of(fid, struct fid_mr, fid); + + mem_desc = mr->mem_desc; + + /* TODO: This is another issue here because MR registration can happen + * quiet often + */ + for (i = 0; i < mem_desc->desc_count; i++) { + desc = &mem_desc->desc[i]; + cmr = desc->core_mr; + if (!cmr) + continue; + rc = fi_mr_enable(cmr); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s lnx_mr_control() failed: %d\n", + mem_desc->desc[i].prov->lpv_prov_name, rc); + frc = rc; + } + } + + return frc; +} + +static struct fi_ops lnx_mr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_mr_close, + .bind = lnx_mr_bind, + .control = lnx_mr_control, + .ops_open = fi_no_ops_open +}; + +static int +lnx_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid) +{ + /* + * If the address is specified then use it to find out which + * domain to register the memory against. LNX can be managing + * multiple underlying core provider endpoints, I need to register the + * memory against the correct one. + * + * Once the domain is determined, I need to set the mr->mem_desc to + * point to a structure which contains my local endpoint I'll end up + * using (which is the same one that I registered the memory against) + * and the associate fid_mr which the core provider set for me. + * + * I return that to the application. + * + * When the application calls back into the data operations API it'll + * pass the mr. I can then pull out a pointer to my local endpoint + * which I'll use in the data operation and pass it the correct mr. 
+ * + * If the address is not provided, then I'll register the memory + * buffer against all my core domains, store those and return them to + * the user + */ + + struct lnx_domain *domain; + struct lnx_fabric *fabric; + struct lnx_mr *lnx_mr = NULL;; + struct ofi_mr *mr; + struct lnx_mem_desc *mem_desc; + struct local_prov *entry; + int rc = 0, i = 1; + bool shm = false; + + if (fid->fclass != FI_CLASS_DOMAIN || !attr || attr->iov_count <= 0) + return -FI_EINVAL; + + domain = container_of(fid, struct lnx_domain, ld_domain.domain_fid.fid); + fabric = domain->ld_fabric; + + lnx_mr = ofi_buf_alloc(fabric->mem_reg_bp); + if (!lnx_mr) { + rc = -FI_ENOMEM; + goto fail; + } + + mr = &lnx_mr->mr; + mem_desc = &lnx_mr->desc; + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr->context; + mr->mr_fid.fid.ops = &lnx_mr_fi_ops; + mr->mr_fid.mem_desc = mem_desc; + mr->domain = &domain->ld_domain; + mr->flags = flags; + + /* TODO: What's gonna happen if you try to register the same piece + * of memory via multiple providers? + * TODO 2: We need a better way to handle memory registration. + * This is simply not very good. We need to have a peer interface + * to memory registration + */ + /* register against all domains */ + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + if (!strcmp(entry->lpv_prov_name, "shm")) + shm = true; + else + shm = false; + if (i >= LNX_MAX_LOCAL_EPS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Exceeded number of allowed memory registrations %s\n", + entry->lpv_prov_name); + rc = -FI_ENOSPC; + goto fail; + } + rc = lnx_mr_regattrs_all(entry, attr, flags, + (shm) ? 
&mem_desc->desc[0] : + &mem_desc->desc[i]); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to complete Memory Registration %s\n", + entry->lpv_prov_name); + goto fail; + } + if (!shm) + i++; + } + + mem_desc->desc_count = i; + if (shm) + mr->mr_fid.key = mem_desc->desc[0].core_mr->key; + else + mr->mr_fid.key = mem_desc->desc[1].core_mr->key; + *mr_fid = &mr->mr_fid; + ofi_atomic_inc32(&domain->ld_domain.ref); + + return 0; + +fail: + if (lnx_mr) + ofi_buf_free(lnx_mr); + return rc; +} + +static struct fi_ops lnx_domain_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_domain_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_mr lnx_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = fi_no_mr_reg, + .regv = fi_no_mr_regv, + .regattr = lnx_mr_regattr, +}; + +static int lnx_setup_core_domain(struct local_prov_ep *ep, struct fi_info *info) +{ + struct fi_info *fi, *itr; + + fi = lnx_get_link_by_dom(info->domain_attr->name); + if (!fi) + return -FI_ENODATA; + + for (itr = fi; itr; itr = itr->next) { + if (!strcmp(itr->fabric_attr->name, ep->lpe_fabric_name)) { + ep->lpe_fi_info = fi_dupinfo(itr); + return FI_SUCCESS; + } + } + + ep->lpe_fi_info = NULL; + + return -FI_ENOENT; +} + +static struct fi_ops_srx_owner lnx_srx_ops = { + .size = sizeof(struct fi_ops_srx_owner), + .get_msg = lnx_get_msg, + .get_tag = lnx_get_tag, + .queue_msg = lnx_queue_msg, + .queue_tag = lnx_queue_tag, + .free_entry = lnx_free_entry, + .foreach_unspec_addr = lnx_foreach_unspec_addr, +}; + +static int lnx_open_core_domains(struct local_prov *prov, + void *context, struct lnx_domain *lnx_domain, + struct fi_info *info) +{ + int rc; + struct local_prov_ep *ep; + struct fi_rx_attr attr = {0}; + struct fi_peer_srx_context peer_srx; + struct dlist_entry *tmp; + int srq_support = 1; + + fi_param_get_bool(&lnx_prov, "use_srq", &srq_support); + + attr.op_flags = FI_PEER; + peer_srx.size = sizeof(peer_srx); + + if 
(srq_support) + lnx_domain->ld_srx_supported = true; + else + lnx_domain->ld_srx_supported = false; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + /* the fi_info we setup when we created the fabric might not + * necessarily be the correct one. It'll have the same fabric + * information, since the fabric information is common among all + * the domains the provider manages. However at this point we need + * to get the fi_info that the application is requesting */ + rc = lnx_setup_core_domain(ep, info); + if (rc) + return rc; + + if (srq_support) { + /* special case for CXI provider. We need to turn off tag + * matching HW offload if we're going to support shared + * receive queues. + */ + if (strstr(ep->lpe_fabric_name, "cxi")) + setenv("FI_CXI_RX_MATCH_MODE", "software", 1); + } + + rc = fi_domain(ep->lpe_fabric, ep->lpe_fi_info, + &ep->lpe_domain, context); + + if (!rc && srq_support) { + ep->lpe_srx.owner_ops = &lnx_srx_ops; + peer_srx.srx = &ep->lpe_srx; + rc = fi_srx_context(ep->lpe_domain, &attr, NULL, &peer_srx); + } + + /* if one of the constituent endpoints doesn't support shared + * receive context, then fail, as we can't continue with this + * inconsistency + */ + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "%s does not support shared" + " receive queues. Failing\n", ep->lpe_fabric_name); + return rc; + } + } + + return 0; +} + +static int lnx_addr_add_region_noop(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + return FI_SUCCESS; +} + +static void lnx_addr_del_region(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + struct ofi_mr *mr = (struct ofi_mr *)entry->data; + + ofi_hmem_dev_unregister(mr->iface, (uint64_t) mr->hmem_data); +} + +/* + * provider: shm+cxi:lnx + * fabric: ofi_lnx_fabric + * domain: shm+cxi3:ofi_lnx_domain + * version: 120.0 + * type: FI_EP_RDM + * protocol: FI_PROTO_LNX + * + * Parse out the provider name. 
It should be shm+ + * + * Create a fabric for shm and one for the other provider. + * + * When fi_domain() is called, we get the fi_info for the + * second provider, which we should've returned as part of the + * fi_getinfo() call. + */ +int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_domain *lnx_domain; + struct util_domain *lnx_domain_info; + struct lnx_fabric *lnx_fab = container_of(fabric, struct lnx_fabric, + util_fabric.fabric_fid); + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + [FI_HMEM_ZE] = default_ze_monitor, + }; + + /* create a new entry for shm. + * Create its fabric. + * insert fabric in the global table + */ + rc = lnx_setup_core_fabrics(info->domain_attr->name, lnx_fab, context); + if (rc) + goto fail; + + rc = -FI_ENOMEM; + lnx_domain = calloc(sizeof(*lnx_domain), 1); + if (!lnx_domain) + goto fail; + + lnx_domain_info = &lnx_domain->ld_domain; + lnx_domain->ld_fabric = lnx_fab; + + rc = ofi_domain_init(fabric, info, lnx_domain_info, context, + OFI_LOCK_SPINLOCK); + if (rc) + goto fail; + + dlist_foreach_container(&lnx_domain->ld_fabric->local_prov_table, + struct local_prov, entry, lpv_entry) { + rc = lnx_open_core_domains(entry, context, lnx_domain, info); + if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Failed to initialize domain for %s\n", + entry->lpv_prov_name); + goto close_domain; + } + } + + lnx_domain_info->domain_fid.fid.ops = &lnx_domain_fi_ops; + lnx_domain_info->domain_fid.ops = &lnx_domain_ops; + lnx_domain_info->domain_fid.mr = &lnx_mr_ops; + + lnx_domain->ld_mr_cache.add_region = lnx_addr_add_region_noop; + lnx_domain->ld_mr_cache.delete_region = lnx_addr_del_region; + lnx_domain->ld_mr_cache.entry_data_size = sizeof(struct ofi_mr); + rc = 
ofi_mr_cache_init(&lnx_domain->ld_domain, memory_monitors, + &lnx_domain->ld_mr_cache); + if (rc) + goto close_domain; + + *domain = &lnx_domain_info->domain_fid; + + return 0; + +close_domain: + lnx_domain_close(&(lnx_domain_info->domain_fid.fid)); +fail: + return rc; +} + diff --git a/prov/lnx/src/lnx_ep.c b/prov/lnx/src/lnx_ep.c new file mode 100644 index 00000000000..cd4b83d099f --- /dev/null +++ b/prov/lnx/src/lnx_ep.c @@ -0,0 +1,1181 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +extern struct fi_ops_cm lnx_cm_ops; +extern struct fi_ops_msg lnx_msg_ops; +extern struct fi_ops_tagged lnx_tagged_ops; +extern struct fi_ops_rma lnx_rma_ops; +extern struct fi_ops_atomic lnx_atomic_ops; + +static void lnx_init_ctx(struct fid_ep *ctx, size_t fclass); + +static int lnx_close_ceps(struct local_prov *prov) +{ + int rc, frc = 0; + struct local_prov_ep *ep; + + dlist_foreach_container(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + + if (ep->lpe_srx.ep_fid.fid.context) + free(ep->lpe_srx.ep_fid.fid.context); + + rc = fi_close(&ep->lpe_ep->fid); + if (rc) + frc = rc; + ofi_bufpool_destroy(ep->lpe_recv_bp); + } + + return frc; +} + +int lnx_ep_close(struct fid *fid) +{ + int rc = 0; + struct local_prov *entry; + struct lnx_ep *ep; + struct lnx_fabric *fabric; + + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = ep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, + struct local_prov, + entry, lpv_entry) { + lnx_close_ceps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close endpoint for %s\n", + entry->lpv_prov_name); + } + + ofi_endpoint_close(&ep->le_ep); + free(ep); + + return rc; +} + +static int lnx_enable_core_eps(struct lnx_ep *lep) +{ + int rc; + struct local_prov *entry; + struct local_prov_ep *ep; + int srq_support = 1; + struct lnx_fabric *fabric = lep->le_domain->ld_fabric; + + fi_param_get_bool(&lnx_prov, "use_srq", &srq_support); + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (srq_support) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_srx.ep_fid.fid, 0); + 
if (rc) { + FI_INFO(&lnx_prov, FI_LOG_CORE, + "%s doesn't support SRX (%d)\n", + ep->lpe_fabric_name, rc); + return rc; + } + } + + rc = fi_enable(ep->lpe_ep); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_control(struct fid *fid, int command, void *arg) +{ + struct lnx_ep *ep; + int rc; + + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + + switch (command) { + case FI_ENABLE: + if (ep->le_fclass == FI_CLASS_EP && + ((ofi_needs_rx(ep->le_ep.caps) && !ep->le_ep.rx_cq) || + (ofi_needs_tx(ep->le_ep.caps) && !ep->le_ep.tx_cq))) + return -FI_ENOCQ; + if (!ep->le_peer_tbl) + return -FI_ENOAV; + rc = lnx_enable_core_eps(ep); + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +int lnx_cq_bind_core_prov(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct util_cq *cq; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + cq = container_of(bfid, struct util_cq, cq_fid.fid); + fabric = lep->le_domain->ld_fabric; + + rc = ofi_ep_bind_cq(&lep->le_ep, cq, flags); + if (rc) + return rc; + + /* bind the core providers to their respective CQs */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, + &ep->lpe_cq.lpc_core_cq->fid, flags); + if (rc) + return rc; + } + } + + return 0; +} + +static int lnx_ep_bind_core_prov(struct lnx_fabric *fabric, uint64_t flags) +{ + struct local_prov *entry; + struct local_prov_ep *ep; + int rc; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_ep_bind(ep->lpe_ep, &ep->lpe_av->fid, flags); + if (rc) + return rc; + } + } + + return rc; +} + +static int +lnx_ep_bind(struct fid 
*fid, struct fid *bfid, uint64_t flags) +{ + int rc = 0; + struct lnx_ep *ep; + struct lnx_peer_table *peer_tbl; + + switch (fid->fclass) { + case FI_CLASS_EP: /* Standard EP */ + case FI_CLASS_SEP: /* Scalable EP */ + ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + + default: + return -FI_EINVAL; + } + + switch (bfid->fclass) { + case FI_CLASS_EQ: + return -FI_ENOSYS; + + case FI_CLASS_CQ: + rc = lnx_cq_bind_core_prov(fid, bfid, flags); + break; + + case FI_CLASS_CNTR: + return -FI_ENOSYS; + + case FI_CLASS_AV: + peer_tbl = container_of(bfid, struct lnx_peer_table, + lpt_av.av_fid.fid); + if (peer_tbl->lpt_domain != ep->le_domain) + return -FI_EINVAL; + ep->le_peer_tbl = peer_tbl; + /* forward the bind to the core provider endpoints */ + rc = lnx_ep_bind_core_prov(ep->le_domain->ld_fabric, flags); + break; + + case FI_CLASS_STX_CTX: /* shared TX context */ + return -FI_ENOSYS; + + case FI_CLASS_SRX_CTX: /* shared RX context */ + return -FI_ENOSYS; + + default: + return -FI_EINVAL; + } + + return rc; +} + +int lnx_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct local_prov *entry; + size_t size = sizeof(struct lnx_addresses); + /* initial location to put the address */ + char ep_addr[FI_NAME_MAX]; + char *tmp = NULL; + struct lnx_addresses *la; + struct lnx_address_prov *lap; + char hostname[FI_NAME_MAX]; + size_t prov_addrlen; + size_t addrlen_list[LNX_MAX_LOCAL_EPS]; + int rc, j = 0; + struct lnx_ep *lnx_ep; + struct lnx_fabric *fabric; + struct local_prov_ep *ep; + + lnx_ep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lnx_ep->le_domain->ld_fabric; + + /* check the hostname and compare it to mine + * TODO: Is this good enough? or do we need a better way of + * determining if the address is local? 
+ */ + rc = gethostname(hostname, FI_NAME_MAX); + if (rc == -1) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "failed to get hostname\n"); + return -FI_EPERM; + } + + addrlen_list[0] = 0; + + /* calculate the size of the address */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + size += sizeof(struct lnx_address_prov); + prov_addrlen = 0; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_getname(&ep->lpe_ep->fid, (void*)ep_addr, &prov_addrlen); + if (rc == -FI_ETOOSMALL) { + size += prov_addrlen * entry->lpv_ep_count; + addrlen_list[j] = prov_addrlen; + j++; + break; + } else { + return -FI_EINVAL; + } + } + } + + if (!addr || *addrlen < size) { + *addrlen = size; + return -FI_ETOOSMALL; + } + + la = addr; + + lap = (struct lnx_address_prov *)((char*)la + sizeof(*la)); + + j = 0; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + memcpy(lap->lap_prov, entry->lpv_prov_name, FI_NAME_MAX - 1); + lap->lap_addr_count = entry->lpv_ep_count; + lap->lap_addr_size = addrlen_list[j]; + + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + tmp = (char*)lap + sizeof(*lap); + + rc = fi_getname(&ep->lpe_ep->fid, (void*)tmp, &addrlen_list[j]); + if (rc) + return rc; + + if (lap->lap_addr_size != addrlen_list[j]) + return -FI_EINVAL; + + tmp += addrlen_list[j]; + } + + lap = (struct lnx_address_prov *)tmp; + j++; + } + + la->la_prov_count = j; + memcpy(la->la_hostname, hostname, FI_NAME_MAX - 1); + + return 0; +} + +static ssize_t lnx_ep_cancel(fid_t fid, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + switch (fid->fclass) { + case FI_CLASS_EP: + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + break; + case FI_CLASS_RX_CTX: + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = 
ctx->ctx_parent; + break; + case FI_CLASS_TX_CTX: + return -FI_ENOENT; + default: + return -FI_EINVAL; + } + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_cancel(&ep->lpe_ep->fid, context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. " + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + +static int lnx_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + int rc = 0; + struct lnx_ep *lep; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = fi_setopt(&ep->lpe_ep->fid, level, optname, + optval, optlen); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + return rc; +} + + +static int lnx_ep_txc(struct fid_ep *fid, int index, struct fi_tx_attr *attr, + struct fid_ep **tx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->tx_ctx_cnt) + continue; + + rc = fi_tx_context(ep->lpe_ep, index, attr, + &ep->lpe_txc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_TX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_tx_ctx); + /* set the callbacks for the transmit context */ + *tx_ep = &ctx->ctx_ep; + + return rc; +} + +static int lnx_ep_rxc(struct fid_ep *fid, int index, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + int rc = 0; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + ctx = calloc(sizeof(*ctx), 1); + if (!ctx) + return -FI_ENOMEM; + + lep = container_of(fid, struct lnx_ep, le_ep.ep_fid.fid); + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (index >= ep->lpe_fi_info->ep_attr->rx_ctx_cnt) + continue; + + rc = fi_rx_context(ep->lpe_ep, index, attr, + &ep->lpe_rxc[index], context); + if (rc == -FI_ENOSYS) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "%s: Operation not supported by provider. 
" + "Ignoring\n", ep->lpe_fabric_name); + rc = 0; + continue; + } else if (rc != FI_SUCCESS) { + return rc; + } + } + } + + dlist_init(&ctx->ctx_head); + ctx->ctx_idx = index; + ctx->ctx_parent = lep; + lnx_init_ctx(&ctx->ctx_ep, FI_CLASS_RX_CTX); + dlist_insert_tail(&ctx->ctx_head, &lep->le_rx_ctx); + /* set the callbacks for the receive context */ + *rx_ep = &ctx->ctx_ep; + + return rc; +} + +struct fi_ops_ep lnx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + /* can't get opt, because there is no way to report multiple + * options for the different links */ + .getopt = fi_no_getopt, + .setopt = lnx_ep_setopt, + .tx_ctx = lnx_ep_txc, + .rx_ctx = lnx_ep_rxc, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +struct fi_ops lnx_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ep_close, + .bind = lnx_ep_bind, + .control = lnx_ep_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_cm lnx_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = lnx_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, +}; + +static int lnx_open_eps(struct local_prov *prov, struct fi_info *info, + void *context, size_t fclass, struct lnx_ep *lep) +{ + int rc = 0; + struct local_prov_ep *ep; + struct dlist_entry *tmp; + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_srx_context *ctxt; + + ctxt = calloc(1, sizeof(*ctxt)); + if (!ctxt) + return -FI_ENOMEM; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + if (fclass == FI_CLASS_EP) { + rc = fi_endpoint(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } else { + /* update endpoint attributes with whatever is being + * passed from the application + */ + if (ep->lpe_fi_info && info) { + ep->lpe_fi_info->ep_attr->tx_ctx_cnt = + info->ep_attr->tx_ctx_cnt; 
+ ep->lpe_fi_info->ep_attr->rx_ctx_cnt = + info->ep_attr->rx_ctx_cnt; + } + + ep->lpe_txc = calloc(info->ep_attr->tx_ctx_cnt, + sizeof(*ep->lpe_txc)); + ep->lpe_rxc = calloc(info->ep_attr->rx_ctx_cnt, + sizeof(*ep->lpe_rxc)); + if (!ep->lpe_txc || !ep->lpe_rxc) + return -FI_ENOMEM; + + rc = fi_scalable_ep(ep->lpe_domain, ep->lpe_fi_info, + &ep->lpe_ep, context); + } + if (rc) + return rc; + + ctxt->srx_lep = lep; + ctxt->srx_cep = ep; + + ep->lpe_srx.ep_fid.fid.context = ctxt; + ep->lpe_srx.ep_fid.fid.fclass = FI_CLASS_SRX_CTX; + ofi_spin_init(&ep->lpe_bplock); + /* create a buffer pool for the receive requests */ + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &ep->lpe_recv_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return -FI_ENOMEM; + } + } + + return 0; +} + +static void +lnx_ep_nosys_progress(struct util_ep *util_ep) +{ + assert(0); +} + +static inline int +match_tag(uint64_t tag, uint64_t match_tag, uint64_t ignore) +{ + return ((tag | ignore) == (match_tag | ignore)); +} + +static inline bool +lnx_addr_match(fi_addr_t addr1, fi_addr_t addr2) +{ + return (addr1 == addr2); +} + +static inline bool +lnx_search_addr_match(fi_addr_t cep_addr, struct lnx_peer_prov *lpp) +{ + struct lnx_local2peer_map *lpm; + fi_addr_t peer_addr; + int i; + + dlist_foreach_container(&lpp->lpp_map, + struct lnx_local2peer_map, + lpm, entry) { + for (i = 0; i < LNX_MAX_LOCAL_EPS; i++) { + peer_addr = lpm->peer_addrs[i]; + if (peer_addr == FI_ADDR_NOTAVAIL) + break; + if (lnx_addr_match(peer_addr, cep_addr)) + return true; + } + } + + return false; +} + +static int lnx_match_common(uint64_t tag1, uint64_t tag2, uint64_t ignore, + fi_addr_t cep_addr, fi_addr_t lnx_addr, struct lnx_peer *peer, + struct local_prov_ep *cep) +{ + struct lnx_peer_prov *lpp; + struct 
local_prov *lp; + bool tmatch; + + /* if a request has no address specified it'll match against any + * rx_entry with a matching tag + * or + * if an rx_entry has no address specified, it'll match against any + * request with a matching tag + * + * for non tagged messages tags will be set to TAG_ANY so they will + * always match and decision will be made on address only. + */ + tmatch = match_tag(tag1, tag2, ignore); + if (!tmatch) + return tmatch; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "tag1=%lx tag2=%lx ignore=%lx cep_addr=%lx lnx_addr=%lx tmatch=%d\n", + tag1, tag2, ignore, cep_addr, lnx_addr, tmatch); + + /* if we're requested to receive from any peer, then tag maching is + * enough. None tagged message will match irregardless. + */ + if (lnx_addr == FI_ADDR_UNSPEC) + return tmatch; + + /* if the address is specified, then we should have a peer and + * a receiving core endpoint and a provider parent + */ + assert(peer && cep && cep->lpe_parent); + + lp = cep->lpe_parent; + + /* if this is a shm core provider, then only go through lnx + * shm provider + */ + if (cep->lpe_local) + return lnx_search_addr_match(cep_addr, peer->lp_shm_prov); + + /* check if we already have a peer provider. + * A peer can receive messages from multiple providers, we need to + * find the provider which maps to the provider we're currently + * checking. The map looked up can have multiple addresses which + * we can receive from, so we need to check which one of those is + * the correct match. + * + * Note: we're trying to make this loop as efficient as possible, + * because it's executed on the message matching path, which is + * heavily hit. + * + * The theory is in most use cases: + * - There will be only two providers to check + * - Each provider will have 1 endpoint, and therefore only one map + * - Each peer will only have 1 address. 
+ * + */ + dlist_foreach_container(&peer->lp_provs, + struct lnx_peer_prov, lpp, entry) { + if (lpp->lpp_prov == lp) + return lnx_search_addr_match(cep_addr, lpp); + } + + return false; +} + +static int lnx_match_unexq(struct dlist_entry *item, const void *args) +{ + /* this entry is placed on the SUQ via the lnx_get_tag() path + * and examined in the lnx_process_tag() path */ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + struct lnx_peer *peer = match_attr->lm_peer; + + /* entry refers to the unexpected message received + * entry->rx_entry.tag will be the tag of the message or TAG_UNSPEC + * otherwise + * + * entry->rx_entry.addr will be the address of the peer which sent the + * message or ADDR_UNSPEC if the core provider didn't do a reverse + * lookup. + * + * entry->rx_cep will be set to the core endpoint which received the + * message. + * + * match_attr is filled in by the lnx_process_tag() and contains + * information passed to us by the application + * + * match_attr->lm_peer is the peer looked up via the addr passed by + * the application to LNX. It is NULL if the addr is ADDR_UNSPEC. + * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed by the application to LNX via the receive API. + * + * match_attr->lm_addr is the only significant if it's set to + * FI_ADDR_UNSPEC, otherwise it's not used in matching because it's + * the LNX level address and we need to compare the core level address. 
+ */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + match_attr->lm_ignore, entry->rx_entry.addr, + match_attr->lm_addr, peer, entry->rx_cep); +} + +static int lnx_match_recvq(struct dlist_entry *item, const void *args) +{ + struct lnx_match_attr *match_attr = (struct lnx_match_attr *) args; + /* this entry is placed on the recvq via the lnx_process_tag() path + * and examined in the lnx_get_tag() path */ + struct lnx_rx_entry *entry = (struct lnx_rx_entry *) item; + + /* entry refers to the receive request waiting for a message + * entry->rx_entry.tag is the tag passed in by the application. + * + * entry->rx_entry.addr is the address passed in by the application. + * This is the LNX level address. It's only significant if it's set + * to ADDR_UNSPEC. Otherwise, it has already been used to look up the + * peer. + * + * entry->rx_cep is always NULL in this case, as this will only be + * known when the message is received. + * + * entry->rx_peer is the LNX peer looked up if a valid address is + * given by the application, otherwise it's NULL. + * + * match_attr information is filled by the lnx_get_tag() callback and + * contains information passed to us by the core endpoint receiving + * the message. + * + * match_attr->rx_peer is not significant because at the lnx_get_tag() + * call there isn't enough information to find what the peer is. + * + * match_attr->lm_tag, match_attr->lm_ignore are the tag and ignore + * bits passed up by the core endpoint receiving the message. + * + * match_attr->lm_addr is the address of the peer which sent the + * message. Set if the core endpoint has done a reverse lookup, + * otherwise set to ADDR_UNSPEC. + * + * match_attr->lm_cep is the core endpoint which received the message. 
+ */ + return lnx_match_common(entry->rx_entry.tag, match_attr->lm_tag, + entry->rx_ignore, match_attr->lm_addr, + entry->rx_entry.addr, entry->rx_peer, match_attr->lm_cep); +} + +static inline int +lnx_init_queue(struct lnx_queue *q, dlist_func_t *match_func) +{ + int rc; + + rc = ofi_spin_init(&q->lq_qlock); + if (rc) + return rc; + + dlist_init(&q->lq_queue); + + q->lq_match_func = match_func; + + return 0; +} + +static inline int +lnx_init_qpair(struct lnx_qpair *qpair, dlist_func_t *recvq_match_func, + dlist_func_t *unexq_match_func) +{ + int rc = 0; + + rc = lnx_init_queue(&qpair->lqp_recvq, recvq_match_func); + if (rc) + goto out; + rc = lnx_init_queue(&qpair->lqp_unexq, unexq_match_func); + if (rc) + goto out; + +out: + return rc; +} + +static inline int +lnx_init_srq(struct lnx_peer_srq *srq) +{ + int rc; + + rc = lnx_init_qpair(&srq->lps_trecv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + rc = lnx_init_qpair(&srq->lps_recv, lnx_match_recvq, lnx_match_unexq); + if (rc) + return rc; + + return rc; +} + +static int lnx_get_ctx(struct local_prov_ep *ep, size_t fclass, + struct fid_ep ***ep_ctx, size_t *size) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + *ep_ctx = ep->lpe_rxc; + *size = ep->lpe_fi_info->ep_attr->rx_ctx_cnt; + break; + case FI_CLASS_TX_CTX: + *ep_ctx = ep->lpe_txc; + *size = ep->lpe_fi_info->ep_attr->tx_ctx_cnt; + break; + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static void lnx_close_ep_ctx(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return; + + for (i = 0; i < size; i++) { + rc = fi_close(&ep_ctx[i]->fid); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close ep context %lu with %d\n", + fclass, rc); + } +} + +static int lnx_ctx_close(struct fid *fid) +{ + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct 
lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) + lnx_close_ep_ctx(ep, fid->fclass); + } + + return FI_SUCCESS; +} + +static int lnx_ctx_bind_cq(struct local_prov_ep *ep, size_t fclass, + struct fid *bfid, uint64_t flags) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = fi_ep_bind(ep_ctx[i], bfid, flags); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + if (bfid->fclass == FI_CLASS_CQ) + /* bind the context to the shared cq */ + rc = lnx_ctx_bind_cq(ep, fid->fclass, + &ep->lpe_cq.lpc_core_cq->fid, + flags); + else + return -FI_ENOSYS; + + if (rc) + return rc; + } + } + + return FI_SUCCESS; +} + +static int +lnx_enable_ctx_eps(struct local_prov_ep *ep, size_t fclass) +{ + struct fid_ep **ep_ctx; + size_t size; + size_t i; + int rc; + + rc = lnx_get_ctx(ep, fclass, &ep_ctx, &size); + if (rc) + return rc; + + for (i = 0; i < size; i++) { + rc = 
fi_enable(ep_ctx[i]); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +static int +lnx_ctx_control(struct fid *fid, int command, void *arg) +{ + int rc; + struct lnx_ep *lep; + struct lnx_ctx *ctx; + struct local_prov_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + + if (fid->fclass != FI_CLASS_RX_CTX && + fid->fclass != FI_CLASS_TX_CTX) + return -FI_EINVAL; + + ctx = container_of(fid, struct lnx_ctx, ctx_ep.fid); + lep = ctx->ctx_parent; + + fabric = lep->le_domain->ld_fabric; + + switch (command) { + case FI_ENABLE: + if (!lep->le_peer_tbl) + return -FI_ENOAV; + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + dlist_foreach_container(&entry->lpv_prov_eps, + struct local_prov_ep, ep, entry) { + rc = lnx_enable_ctx_eps(ep, fid->fclass); + if (rc) + return rc; + } + } + break; + default: + return -FI_ENOSYS; + } + + return rc; +} + +static struct fi_ops lnx_ctx_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_ctx_close, + .bind = lnx_ctx_bind, + .control = lnx_ctx_control, + .ops_open = fi_no_ops_open, +}; + +struct fi_ops_ep lnx_ctx_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = lnx_ep_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +static void +lnx_init_ctx(struct fid_ep *ctx, size_t fclass) +{ + ctx->fid.fclass = fclass; + ctx->fid.ops = &lnx_ctx_ops; + ctx->ops = &lnx_ctx_ep_ops; + ctx->msg = &lnx_msg_ops; + ctx->tagged = &lnx_tagged_ops; + ctx->rma = &lnx_rma_ops; + ctx->atomic = &lnx_atomic_ops; +} + +static int +lnx_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, + struct lnx_ep **out_ep, void *context, size_t fclass) +{ + int rc; + struct lnx_ep *ep; + struct local_prov *entry; + struct lnx_fabric *fabric; + uint64_t mr_mode; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + ep->le_fclass = 
fclass; + ep->le_ep.ep_fid.fid.fclass = fclass; + + ep->le_ep.ep_fid.fid.ops = &lnx_ep_fi_ops; + ep->le_ep.ep_fid.ops = &lnx_ep_ops; + ep->le_ep.ep_fid.cm = &lnx_cm_ops; + ep->le_ep.ep_fid.msg = &lnx_msg_ops; + ep->le_ep.ep_fid.tagged = &lnx_tagged_ops; + ep->le_ep.ep_fid.rma = &lnx_rma_ops; + ep->le_ep.ep_fid.atomic = &lnx_atomic_ops; + ep->le_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid); + lnx_init_srq(&ep->le_srq); + + dlist_init(&ep->le_rx_ctx); + dlist_init(&ep->le_tx_ctx); + + fabric = ep->le_domain->ld_fabric; + + /* create all the core provider endpoints */ + dlist_foreach_container(&fabric->local_prov_table, struct local_prov, + entry, lpv_entry) { + rc = lnx_open_eps(entry, info, context, fclass, ep); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to create ep for %s\n", + entry->lpv_prov_name); + goto fail; + } + } + + mr_mode = lnx_util_prov.info->domain_attr->mr_mode; + lnx_util_prov.info->domain_attr->mr_mode = 0; + rc = ofi_endpoint_init(domain, (const struct util_prov *)&lnx_util_prov, + (struct fi_info *)lnx_util_prov.info, &ep->le_ep, + context, lnx_ep_nosys_progress); + if (rc) + goto fail; + + lnx_util_prov.info->domain_attr->mr_mode = mr_mode; + *out_ep = ep; + + return 0; + +fail: + free(ep); + return rc; +} + +int lnx_scalable_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_SEP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + +int lnx_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + int rc; + struct lnx_ep *my_ep; + + rc = lnx_alloc_endpoint(domain, info, &my_ep, context, FI_CLASS_EP); + if (rc) + return rc; + + *ep = &my_ep->le_ep.ep_fid; + + return 0; +} + + diff --git a/prov/lnx/src/lnx_init.c b/prov/lnx/src/lnx_init.c new file mode 100644 index 00000000000..94c7a7e14cd --- /dev/null 
+++ b/prov/lnx/src/lnx_init.c @@ -0,0 +1,884 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "rdma/fi_ext.h" +#include "lnx.h" + +#define LNX_PASSTHRU_TX_OP_FLAGS (FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | \ + FI_DELIVERY_COMPLETE) +#define LNX_PASSTHRU_RX_OP_FLAGS (0ULL) +#define LNX_TX_OP_FLAGS (FI_INJECT_COMPLETE | FI_COMPLETION | \ + FI_DELIVERY_COMPLETE | FI_TRANSMIT_COMPLETE) +#define LNX_RX_OP_FLAGS (FI_COMPLETION) + +ofi_spin_t global_bplock; +struct ofi_bufpool *global_recv_bp = NULL; + +struct util_fabric lnx_fabric_info; + +struct fi_tx_attr lnx_tx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_TX_OP_FLAGS | LNX_TX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .inject_size = SIZE_MAX, + .size = SIZE_MAX, + .iov_limit = LNX_IOV_LIMIT, + .rma_iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_rx_attr lnx_rx_attr = { + .caps = ~0x0ULL, + .op_flags = LNX_PASSTHRU_RX_OP_FLAGS | LNX_RX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = 0, + .total_buffered_recv = 0, + .size = 1024, + .iov_limit = LNX_IOV_LIMIT, +}; + +struct fi_ep_attr lnx_ep_attr = { + .type = FI_EP_UNSPEC, + .protocol = FI_PROTO_LNX, + .protocol_version = 1, + .max_msg_size = SIZE_MAX, + .msg_prefix_size = SIZE_MAX, + .max_order_raw_size = SIZE_MAX, + .max_order_war_size = SIZE_MAX, + .max_order_waw_size = SIZE_MAX, + .mem_tag_format = FI_TAG_GENERIC, + .tx_ctx_cnt = SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .auth_key = NULL, + .auth_key_size = 0, +}; + +struct fi_domain_attr lnx_domain_attr = { + .name = "ofi_lnx_domain", + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_RAW, + .mr_key_size = SIZE_MAX, + .cq_data_size = SIZE_MAX, + .cq_cnt = SIZE_MAX, + .ep_cnt = SIZE_MAX, + .tx_ctx_cnt = 
SIZE_MAX, + .rx_ctx_cnt = SIZE_MAX, + .max_ep_tx_ctx = SIZE_MAX, + .max_ep_rx_ctx = SIZE_MAX, + .max_ep_stx_ctx = SIZE_MAX, + .max_ep_srx_ctx = SIZE_MAX, + .cntr_cnt = SIZE_MAX, + .mr_iov_limit = SIZE_MAX, + .caps = ~0x0ULL, + .auth_key_size = SIZE_MAX, + .max_err_data = SIZE_MAX, + .mr_cnt = SIZE_MAX, +}; + +struct fi_fabric_attr lnx_fabric_attr = { + .prov_version = OFI_VERSION_DEF_PROV, + .name = "ofi_lnx_fabric", +}; + +struct fi_info lnx_info = { + .caps = ~0x0ULL, + .tx_attr = &lnx_tx_attr, + .rx_attr = &lnx_rx_attr, + .ep_attr = &lnx_ep_attr, + .domain_attr = &lnx_domain_attr, + .fabric_attr = &lnx_fabric_attr +}; + +static struct fi_ops lnx_fabric_fi_ops = { + .size = sizeof(struct fi_ops), + .close = lnx_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_fabric lnx_fabric_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = lnx_domain_open, + .passive_ep = fi_no_passive_ep, + .eq_open = fi_no_eq_open, + .wait_open = fi_no_wait_open, + .trywait = fi_no_trywait +}; + +struct fi_provider lnx_prov = { + .name = OFI_LNX, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, + .getinfo = lnx_getinfo, + .fabric = lnx_fabric, + .cleanup = lnx_fini +}; + +struct util_prov lnx_util_prov = { + .prov = &lnx_prov, + .info = &lnx_info, + .flags = 0 +}; + +/* + * For the fi_getinfo() -> fi_fabric() -> fi_domain() path, we need to + * keep track of the fi_info in case we need them later on when linking in + * the fi_fabric() function. + * + * This cache gets cleared after we use the ones we need, or when the + * library exists, if LNX is never used. 
+ */ +struct dlist_entry lnx_fi_info_cache; +/* this is a list of all possible links */ +struct dlist_entry lnx_links; +struct dlist_entry lnx_links_meta; + +struct lnx_fi_cache_entry { + struct dlist_entry entry; + struct fi_info *fi; +}; + +struct lnx_fi_info_meta { + struct dlist_entry entry; + struct fi_info *lnx_rep; + struct fi_info *lnx_link; +}; + +static int lnx_get_cache_meta(struct dlist_entry *head, int *size) +{ + int num_prov = 0; + struct dlist_entry *e; + + dlist_foreach(head, e) + num_prov++; + + *size = num_prov; + + return FI_SUCCESS; +} + +static void lnx_free_meta(void) +{ + struct lnx_fi_info_meta *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry, tmp) { + dlist_remove(&e->entry); + free(e); + } +} + +static void lnx_free_info_cache(struct dlist_entry *head, bool meta) +{ + struct lnx_fi_cache_entry *e; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(head, struct lnx_fi_cache_entry, e, + entry, tmp) { + fi_freeinfo(e->fi); + dlist_remove(&e->entry); + free(e); + } + + if (meta) + lnx_free_meta(); +} + +static int lnx_cache_info(struct dlist_entry *head, + struct fi_info *info) +{ + struct lnx_fi_cache_entry *e = calloc(1, sizeof(*e)); + + if (!e) + return -FI_ENOMEM; + dlist_init(&e->entry); + e->fi = info; + + dlist_insert_tail(&e->entry, head); + + return 0; +} + +struct fi_info * +lnx_get_link_by_dom(char *domain_name) +{ + struct fi_info *info; + struct lnx_fi_info_meta *e; + + dlist_foreach_container(&lnx_links_meta, struct lnx_fi_info_meta, e, + entry) { + info = e->lnx_rep; + if (info && info->domain_attr) { + if (!strcmp(domain_name, + info->domain_attr->name)) { + FI_INFO(&lnx_prov, FI_LOG_CORE, "Found %s\n", + info->fabric_attr->prov_name); + return e->lnx_link; + } + } + } + + return NULL; +} + +static void lnx_insert_tail(struct fi_info *head, struct fi_info *item) +{ + struct fi_info *itr = head; + + while (itr->next) + itr = itr->next; + itr->next = 
item; +} + +static void lnx_remove_tail(struct fi_info **head) +{ + struct fi_info *itr = *head, *prev = NULL; + + while (itr->next) { + prev = itr; + itr = itr->next; + } + + if (prev) + prev->next = NULL; + else + *head = NULL; + free(itr); +} + +static struct fi_info *lnx_dupinfo_list(struct fi_info *l) +{ + struct fi_info *itr, *new, *prev = NULL, *head = NULL; + + for (itr = l; itr; itr = itr->next) { + new = fi_dupinfo(itr); + if (!new) { + if (head) + fi_freeinfo(head); + return NULL; + } + + if (!head) + head = new; + + if (prev) { + prev->next = new; + prev = new; + } else { + prev = new; + } + } + + return head; +} + +static int gen_links_rec(struct dlist_entry *current, struct dlist_entry *head, + struct dlist_entry *result, struct fi_info *l, + int depth, int target_depth) +{ + int rc; + struct fi_info *itr; + struct fi_info *fi_copy, *dup; + struct lnx_fi_cache_entry *e, *new; + + while(current->next != head) { + e = container_of(current->next, struct lnx_fi_cache_entry, entry); + for (itr = e->fi; itr; itr = itr->next) { + fi_copy = fi_dupinfo(itr); + if (l) { + lnx_insert_tail(l, fi_copy); + } else { + l = fi_copy; + } + if (current->next->next == head && + depth == target_depth) { + dup = lnx_dupinfo_list(l); + if (!dup) + return -FI_ENOMEM; + new = calloc(1, sizeof(*new)); + if (!new) + return -FI_ENOMEM; + new->fi = dup; + dlist_init(&new->entry); + dlist_insert_tail(&new->entry, result); + } + rc = gen_links_rec(current->next, head, result, l, + depth+1, target_depth); + lnx_remove_tail(&l); + if (rc) + return rc; + } + current = current->next; + } + + return FI_SUCCESS; +} + +static int gen_links(struct dlist_entry *head, struct dlist_entry *result, + int target_depth) +{ + return gen_links_rec(head, head, result, NULL, 1, target_depth); +} + +static int lnx_form_info(struct fi_info *fi, struct fi_info **out) +{ + int size_prov = 0, size_dom = 0, rc = FI_SUCCESS; + struct lnx_fi_info_meta *meta = NULL; + char *lnx_prov, *lnx_dom, *s; + struct 
fi_info *itr, *r = NULL; + bool copy = false; + uint64_t min_inject_size = SIZE_MAX; + + for (itr = fi; itr; itr = itr->next) { + size_prov += strlen(itr->fabric_attr->prov_name)+1; + size_dom += strlen(itr->domain_attr->name)+1; + if (itr->tx_attr && itr->tx_attr->inject_size < min_inject_size) + min_inject_size = itr->tx_attr->inject_size; + } + + lnx_dom = calloc(size_dom, sizeof(char)); + lnx_prov = calloc(size_prov, sizeof(char)); + if (!lnx_prov || !lnx_dom) + return -FI_ENOMEM; + + for (itr = fi; itr; itr = itr->next) { + strcat(lnx_prov, itr->fabric_attr->prov_name); + strcat(lnx_dom, itr->domain_attr->name); + if (itr->next) { + strcat(lnx_dom, "+"); + strcat(lnx_prov, "+"); + } + if (!strncmp(itr->fabric_attr->prov_name, "shm", 3)) + continue; + + if (!copy) { + meta = calloc(1, sizeof(*meta)); + r = fi_dupinfo(itr); + if (!r || !meta) { + rc = -FI_ENOMEM; + goto fail; + } + meta->lnx_rep = r; + meta->lnx_link = fi; + if (r->tx_attr) + r->tx_attr->inject_size = min_inject_size; + dlist_init(&meta->entry); + dlist_insert_tail(&meta->entry, &lnx_links_meta); + copy = true; + } + } + + if (!r) { + rc = -FI_ENODATA; + goto fail; + } + + free(r->fabric_attr->prov_name); + free(r->fabric_attr->name); + free(r->domain_attr->name); + + r->fabric_attr->name = NULL; + r->domain_attr->name = NULL; + r->fabric_attr->prov_name = lnx_prov; + + if (asprintf(&s, "%s", lnx_info.fabric_attr->name) < 0) + goto fail; + r->fabric_attr->name = s; + + if (asprintf(&s, "%s:%s", lnx_dom, lnx_info.domain_attr->name) < 0) + goto fail; + r->domain_attr->name = s; + free(lnx_dom); + + *out = r; + return FI_SUCCESS; + +fail: + if (meta) + free(meta); + if (r) + fi_freeinfo(r); + free(lnx_dom); + return rc; +} + +static int lnx_generate_info(struct fi_info **info) +{ + struct fi_info *fi = NULL, *head = NULL, *prev = NULL; + struct lnx_fi_cache_entry *e; + int rc, size; + + /* we need at least 2 providers to link */ + rc = lnx_get_cache_meta(&lnx_fi_info_cache, &size); + if (rc || size 
< 2) + return -FI_ENODATA; + + rc = gen_links(&lnx_fi_info_cache, &lnx_links, size); + if (rc) + return rc; + + /* + * 1. Iterate over the links and create a linked list of fi_infos + * each fi_info in the list represents one of the links + * 2. Have metadata associated with each fi_info to refer back to + * an entry in the lnx_links cache. + * 3. When the application selects one of these fi_infos, we can + * then find the appropriate link in the cache and be able to + * create the underlying core providers correctly. + */ + dlist_foreach_container(&lnx_links, struct lnx_fi_cache_entry, e, + entry) { + rc = lnx_form_info(e->fi, &fi); + if (rc) + goto err; + + if (prev) { + prev->next = fi; + prev = fi; + } else { + prev = fi; + head = fi; + } + } + + *info = head; + + return FI_SUCCESS; + +err: + if (fi) + fi_freeinfo(fi); + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + return -FI_ENODATA; +} + +int lnx_getinfo_helper(uint32_t version, char *prov, struct fi_info *lnx_hints) +{ + int rc; + char *orig_prov_name = NULL; + struct fi_info *core_info; + uint64_t caps, mr_mode; + bool shm = false; + + caps = lnx_hints->caps; + mr_mode = lnx_hints->domain_attr->mr_mode; + + if (lnx_hints->fabric_attr->prov_name) { + orig_prov_name = lnx_hints->fabric_attr->prov_name; + lnx_hints->fabric_attr->prov_name = NULL; + } + + lnx_hints->fabric_attr->prov_name = prov; + if (!strncmp(prov, "shm", 3)) { + shm = true; + /* make sure we get the correct shm provider */ + lnx_hints->caps &= ~(FI_REMOTE_COMM | FI_LOCAL_COMM); + lnx_hints->caps |= FI_HMEM; + lnx_hints->domain_attr->mr_mode |= (FI_MR_VIRT_ADDR | FI_MR_HMEM + | FI_MR_PROV_KEY); + } + rc = fi_getinfo(version, NULL, NULL, OFI_GETINFO_INTERNAL, + lnx_hints, &core_info); + + lnx_hints->fabric_attr->prov_name = orig_prov_name; + if (rc) + return rc; + + if (shm) { + lnx_hints->caps = caps; + lnx_hints->domain_attr->mr_mode = mr_mode; + } + + rc = lnx_cache_info(&lnx_fi_info_cache, 
core_info); + + return rc; +} + +int lnx_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info) +{ + int rc; + struct fi_info *lnx_hints; + char *linked_provs, *linked_provs_cp, *token, *exclude = NULL; + + rc = fi_param_get_str(&lnx_prov, "prov_links", + &linked_provs); + if (rc) + return rc; + + if (strstr(linked_provs, "lnx")) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Can't specify the lnx provider as part of the link: %s\n", + linked_provs); + return -FI_EINVAL; + } + + linked_provs_cp = strdup(linked_provs); + if (!linked_provs_cp) + return -FI_ENOMEM; + + /* The assumption is that the entire series of + * lnx_getinfo()->lnx_fabric()->lnx_domain()->lnx_endpoint() are + * going to be called before another lnx_getinfo() is called again. + * Based on this assumption, we will free the cache whenever + * lnx_getinfo() is called + */ + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + + /* If the hints are not provided then we endup with a new block */ + lnx_hints = fi_dupinfo(hints); + if (!lnx_hints) + return -FI_ENOMEM; + + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, lnx_prov.name); + if (rc) + return rc; + + /* get the providers which support peer functionality. These are + * the only ones we can link*/ + lnx_hints->caps |= FI_PEER; + + token = strtok(linked_provs_cp, "+"); + while (token) { + lnx_getinfo_helper(version, token, lnx_hints); + rc = ofi_exclude_prov_name(&lnx_hints->fabric_attr->prov_name, token); + if (rc) + goto free_hints; + token = strtok(NULL, "+"); + } + free(linked_provs_cp); + + /* Generate the lnx info which represents all possible combination + * of domains which are to be linked. 
+ */ + rc = lnx_generate_info(info); + +free_hints: + free(exclude); + fi_freeinfo(lnx_hints); + return rc; +} + +static struct local_prov * +lnx_get_local_prov(struct dlist_entry *prov_table, char *prov_name) +{ + struct local_prov *entry; + + /* close all the open core fabrics */ + dlist_foreach_container(prov_table, struct local_prov, + entry, lpv_entry) { + if (!strncasecmp(entry->lpv_prov_name, prov_name, FI_NAME_MAX)) + return entry; + } + + return NULL; +} + +static int +lnx_add_ep_to_prov(struct local_prov *prov, struct local_prov_ep *ep) +{ + dlist_insert_tail(&ep->entry, &prov->lpv_prov_eps); + ep->lpe_parent = prov; + prov->lpv_ep_count++; + + return FI_SUCCESS; +} + +static int +lnx_setup_core_prov(struct fi_info *info, struct dlist_entry *prov_table, + struct local_prov **shm_prov, void *context) +{ + int rc = -FI_EINVAL; + struct local_prov_ep *ep = NULL; + struct local_prov *lprov, *new_lprov = NULL; + + ep = calloc(sizeof(*ep), 1); + if (!ep) + return -FI_ENOMEM; + + new_lprov = calloc(sizeof(*new_lprov), 1); + if (!new_lprov) + goto free_entry; + + dlist_init(&new_lprov->lpv_prov_eps); + + rc = fi_fabric(info->fabric_attr, &ep->lpe_fabric, context); + if (rc) + return rc; + + ep->lpe_fi_info = info; + strncpy(ep->lpe_fabric_name, info->fabric_attr->name, + FI_NAME_MAX - 1); + + lprov = lnx_get_local_prov(prov_table, info->fabric_attr->prov_name); + if (!lprov) { + lprov = new_lprov; + new_lprov = NULL; + strncpy(lprov->lpv_prov_name, info->fabric_attr->prov_name, + FI_NAME_MAX - 1); + } else { + free(new_lprov); + } + + /* indicate that this fabric can be used for on-node communication */ + if (!strncasecmp(lprov->lpv_prov_name, "shm", 3)) { + *shm_prov = lprov; + ep->lpe_local = true; + } + + dlist_init(&ep->entry); + rc = lnx_add_ep_to_prov(lprov, ep); + if (rc) + goto free_all; + + dlist_insert_after(&lprov->lpv_entry, prov_table); + + return 0; + +free_all: + if (new_lprov) + free(new_lprov); +free_entry: + if (ep) + free(ep); + + return rc; +} 
+ +int +lnx_setup_core_fabrics(char *name, struct lnx_fabric *lnx_fab, + void *context) +{ + int rc; + struct fi_info *link, *itr; + + link = lnx_get_link_by_dom(name); + if (!link) + return -FI_ENODATA; + + for (itr = link; itr; itr = itr->next) { + rc = lnx_setup_core_prov(itr, &lnx_fab->local_prov_table, + &lnx_fab->shm_prov, context); + if (rc) + return rc; + } + + return FI_SUCCESS; +} + +int lnx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context) +{ + struct ofi_bufpool_attr bp_attrs = {}; + struct lnx_fabric *lnx_fab; + int rc; + + lnx_fab = calloc(sizeof(*lnx_fab), 1); + if (!lnx_fab) + return -FI_ENOMEM; + + bp_attrs.size = sizeof(struct lnx_mr); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT32_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + rc = ofi_bufpool_create_attr(&bp_attrs, &lnx_fab->mem_reg_bp); + if (rc) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create memory registration buffer pool"); + free(lnx_fab); + return -FI_ENOMEM; + } + + /* initialize the provider table */ + dlist_init(&lnx_fab->local_prov_table); + + rc = ofi_fabric_init(&lnx_prov, lnx_info.fabric_attr, + lnx_info.fabric_attr, + &lnx_fab->util_fabric, context); + if (rc) + goto fail; + + lnx_fab->util_fabric.fabric_fid.fid.ops = &lnx_fabric_fi_ops; + lnx_fab->util_fabric.fabric_fid.ops = &lnx_fabric_ops; + *fabric = &lnx_fab->util_fabric.fabric_fid; + + return 0; + +fail: + return rc; +} + +void lnx_fini(void) +{ + lnx_free_info_cache(&lnx_fi_info_cache, false); + lnx_free_info_cache(&lnx_links, true); + ofi_bufpool_destroy(global_recv_bp); +} + +static int lnx_free_ep(struct local_prov *prov, struct local_prov_ep *ep) +{ + int rc; + + if (!prov || !ep) + return FI_SUCCESS; + + rc = fi_close(&ep->lpe_fabric->fid); + fi_freeinfo(ep->lpe_fi_info); + free(ep); + prov->lpv_ep_count--; + + if (prov->lpv_ep_count == 0) + dlist_remove(&prov->lpv_entry); + + return rc; +} + +static int lnx_free_eps(struct local_prov *prov) 
+{ + int rc, frc = 0; + struct dlist_entry *tmp; + struct local_prov_ep *ep; + + dlist_foreach_container_safe(&prov->lpv_prov_eps, + struct local_prov_ep, ep, entry, tmp) { + dlist_remove(&ep->entry); + rc = lnx_free_ep(prov, ep); + if (rc) + frc = rc; + } + + return frc; +} + +int lnx_fabric_close(struct fid *fid) +{ + int rc = 0; + struct util_fabric *fabric; + struct lnx_fabric *lnx_fab; + struct local_prov *entry; + struct dlist_entry *tmp; + + fabric = container_of(fid, struct util_fabric, fabric_fid.fid); + lnx_fab = container_of(fabric, struct lnx_fabric, util_fabric); + + /* close all the open core fabrics */ + dlist_foreach_container_safe(&lnx_fab->local_prov_table, + struct local_prov, entry, lpv_entry, tmp) { + dlist_remove(&entry->lpv_entry); + rc = lnx_free_eps(entry); + if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Failed to close provider %s\n", + entry->lpv_prov_name); + + free(entry); + } + + /* free mr registration pool */ + ofi_bufpool_destroy(lnx_fab->mem_reg_bp); + + rc = ofi_fabric_close(fabric); + + return rc; +} + +void ofi_link_fini(void) +{ + lnx_prov.cleanup(); +} + +LNX_INI +{ + struct ofi_bufpool_attr bp_attrs = {}; + int ret; + + fi_param_define(&lnx_prov, "prov_links", FI_PARAM_STRING, + "Specify which providers LNX will link together. Format: " + "++...+. EX: shm+cxi"); + + fi_param_define(&lnx_prov, "disable_shm", FI_PARAM_BOOL, + "Turn off SHM support. Defaults to 0"); + + fi_param_define(&lnx_prov, "use_srq", FI_PARAM_BOOL, + "Turns shared receive queue support on and off. By default it is on. " + "When SRQ is turned on some Hardware offload capability will not " + "work. 
EX: Hardware Tag matching"); + + dlist_init(&lnx_fi_info_cache); + dlist_init(&lnx_links); + dlist_init(&lnx_links_meta); + + if (!global_recv_bp) { + bp_attrs.size = sizeof(struct lnx_rx_entry); + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.flags = OFI_BUFPOOL_NO_TRACK; + ret = ofi_bufpool_create_attr(&bp_attrs, &global_recv_bp); + if (ret) { + FI_WARN(&lnx_prov, FI_LOG_FABRIC, + "Failed to create receive buffer pool"); + return NULL; + } + ofi_spin_init(&global_bplock); + } + + return &lnx_prov; +} diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c new file mode 100644 index 00000000000..3750e27f2a6 --- /dev/null +++ b/prov/lnx/src/lnx_ops.c @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2022 ORNL. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "ofi_util.h" +#include "ofi.h" +#include "ofi_str.h" +#include "ofi_prov.h" +#include "ofi_perf.h" +#include "ofi_hmem.h" +#include "ofi_lock.h" +#include "rdma/fi_ext.h" +#include "ofi_iov.h" +#include "lnx.h" + +int lnx_get_msg(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry) +{ + return -FI_ENOSYS; +} + +int lnx_queue_msg(struct fi_peer_rx_entry *entry) +{ + return -FI_ENOSYS; +} + +void lnx_free_entry(struct fi_peer_rx_entry *entry) +{ + struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *) entry; + ofi_spin_t *bplock; + + if (rx_entry->rx_global) + bplock = &global_bplock; + else + bplock = &rx_entry->rx_cep->lpe_bplock; + + ofi_spin_lock(bplock); + ofi_buf_free(rx_entry); + ofi_spin_unlock(bplock); +} + +static struct lnx_ep *lnx_get_lep(struct fid_ep *ep, struct lnx_ctx **ctx) +{ + struct lnx_ep *lep; + + if (ctx) + *ctx = NULL; + + switch (ep->fid.fclass) { + case FI_CLASS_RX_CTX: + case FI_CLASS_TX_CTX: + *ctx = container_of(ep, struct lnx_ctx, ctx_ep.fid); + lep = (*ctx)->ctx_parent; + break; + case FI_CLASS_EP: + case FI_CLASS_SEP: + lep = container_of(ep, struct lnx_ep, le_ep.ep_fid.fid); + break; + default: + lep = NULL; + } + + return lep; +} + +static struct fid_ep *lnx_get_core_ep(struct local_prov_ep *cep, int idx, + size_t fclass) +{ + switch (fclass) { + case FI_CLASS_RX_CTX: + return cep->lpe_rxc[idx]; + case FI_CLASS_TX_CTX: + return cep->lpe_txc[idx]; + case FI_CLASS_EP: + case FI_CLASS_SEP: + return cep->lpe_ep; + default: + return NULL; + } + + return NULL; +} + +static void +lnx_init_rx_entry(struct lnx_rx_entry 
*entry, struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags) +{ + memcpy(&entry->rx_iov, iov, sizeof(*iov) * count); + if (desc) + memcpy(entry->rx_desc, desc, sizeof(*desc) * count); + + entry->rx_entry.iov = entry->rx_iov; + entry->rx_entry.desc = entry->rx_desc; + entry->rx_entry.count = count; + entry->rx_entry.addr = addr; + entry->rx_entry.context = context; + entry->rx_entry.tag = tag; + entry->rx_entry.flags = flags; + entry->rx_ignore = ignore; +} + +static struct lnx_rx_entry * +get_rx_entry(struct local_prov_ep *cep, struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags) +{ + struct lnx_rx_entry *rx_entry = NULL; + ofi_spin_t *bplock; + struct ofi_bufpool *bp; + + /* if lp is NULL, then we don't know where the message is going to + * come from, so allocate the rx_entry from a global pool + */ + if (!cep) { + bp = global_recv_bp; + bplock = &global_bplock; + } else { + bp = cep->lpe_recv_bp; + bplock = &cep->lpe_bplock; + } + + ofi_spin_lock(bplock); + rx_entry = (struct lnx_rx_entry *)ofi_buf_alloc(bp); + ofi_spin_unlock(bplock); + if (rx_entry) { + memset(rx_entry, 0, sizeof(*rx_entry)); + if (!cep) + rx_entry->rx_global = true; + rx_entry->rx_cep = cep; + lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, + ignore, context, flags); + } + + return rx_entry; +} + +static inline struct lnx_rx_entry * +lnx_remove_first_match(struct lnx_queue *q, struct lnx_match_attr *match) +{ + struct lnx_rx_entry *rx_entry; + + ofi_spin_lock(&q->lq_qlock); + rx_entry = (struct lnx_rx_entry *) dlist_remove_first_match( + &q->lq_queue, q->lq_match_func, match); + ofi_spin_unlock(&q->lq_qlock); + + return rx_entry; +} + +static inline void +lnx_insert_rx_entry(struct lnx_queue *q, struct lnx_rx_entry *entry) +{ + ofi_spin_lock(&q->lq_qlock); + dlist_insert_tail((struct dlist_entry *)(&entry->rx_entry), + &q->lq_queue); + 
ofi_spin_unlock(&q->lq_qlock); +} + +int lnx_queue_tag(struct fi_peer_rx_entry *entry) +{ + struct lnx_rx_entry *rx_entry = (struct lnx_rx_entry *)entry; + struct lnx_peer_srq *lnx_srq = (struct lnx_peer_srq*)entry->owner_context; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 found\n", + entry->addr, entry->tag); + + lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_unexq, rx_entry); + + return 0; +} + +int lnx_get_tag(struct fid_peer_srx *srx, struct fi_peer_match_attr *match, + struct fi_peer_rx_entry **entry) +{ + struct lnx_match_attr match_attr; + struct lnx_peer_srq *lnx_srq; + struct local_prov_ep *cep; + struct lnx_ep *lep; + struct lnx_rx_entry *rx_entry; + fi_addr_t addr = match->addr; + struct lnx_srx_context *srx_ctxt; + uint64_t tag = match->tag; + int rc = 0; + + /* get the endpoint */ + cep = container_of(srx, struct local_prov_ep, lpe_srx); + srx_ctxt = cep->lpe_srx.ep_fid.fid.context; + cep = srx_ctxt->srx_cep; + lep = srx_ctxt->srx_lep; + lnx_srq = &lep->le_srq; + + /* The fi_addr_t is a generic address returned by the provider. It's usually + * just an index or id in their AV table. When I get it here, I could have + * duplicates if multiple providers are using the same scheme to + * insert in the AV table. I need to be able to identify the provider + * in this function so I'm able to correctly match this message to + * a possible rx entry on my receive queue. That's why we need to make + * sure we use the core endpoint as part of the matching key. + */ + memset(&match_attr, 0, sizeof(match_attr)); + + match_attr.lm_addr = addr; + match_attr.lm_ignore = 0; + match_attr.lm_tag = tag; + match_attr.lm_cep = cep; + + /* 1. Find a matching request to the message received. + * 2. Return the receive request. + * 3. If there are no matching requests, then create a new one + * and return it to the core provider. The core provider will turn + * around and tell us to queue it. Return -FI_ENOENT. 
+ */ + rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_recvq, + &match_attr); + if (rx_entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 found\n", + addr, tag); + + goto assign; + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = 0 not found\n", + addr, tag); + + rx_entry = get_rx_entry(cep, NULL, NULL, 0, addr, tag, 0, NULL, + lnx_ep_rx_flags(lep)); + if (!rx_entry) { + rc = -FI_ENOMEM; + goto out; + } + + rx_entry->rx_match_info = *match; + rx_entry->rx_entry.owner_context = lnx_srq; + rx_entry->rx_entry.msg_size = match->msg_size; + + rc = -FI_ENOENT; + +assign: + rx_entry->rx_entry.msg_size = MIN(rx_entry->rx_entry.msg_size, + match->msg_size); + *entry = &rx_entry->rx_entry; + +out: + return rc; +} + +/* + * if lp is NULL, then we're attempting to receive from any peer so + * matching the tag is the only thing that matters. + * + * if lp != NULL, then we're attempting to receive from a particular + * peer. This peer can have multiple endpoints serviced by different core + * providers. + * + * Therefore when we check the unexpected queue, we need to check + * if we received any messages from any of the peer's addresses. If we + * find one, then we kick the core provider associated with that + * address to receive the message. 
+ * + * If nothing is found on the unexpected messages, then add a receive + * request on the SRQ; happens in the lnx_process_recv() + */ +static int lnx_process_recv(struct lnx_ep *lep, struct iovec *iov, void **desc, + fi_addr_t addr, size_t count, struct lnx_peer *lp, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged) +{ + struct lnx_peer_srq *lnx_srq = &lep->le_srq; + struct local_prov_ep *cep; + struct lnx_rx_entry *rx_entry; + struct lnx_match_attr match_attr; + int rc = 0; + + match_attr.lm_addr = addr; + match_attr.lm_ignore = ignore; + match_attr.lm_tag = tag; + match_attr.lm_cep = NULL; + match_attr.lm_peer = lp; + + /* if support is turned off, don't go down the SRQ path */ + if (!lep->le_domain->ld_srx_supported) + return -FI_ENOSYS; + + rx_entry = lnx_remove_first_match(&lnx_srq->lps_trecv.lqp_unexq, + &match_attr); + if (!rx_entry) { + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx not found\n", + addr, tag, ignore, iov->iov_base, iov->iov_len); + + goto nomatch; + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr=%lx tag=%lx ignore=%lx buf=%p len=%lx found\n", + addr, tag, ignore, iov->iov_base, iov->iov_len); + + cep = rx_entry->rx_cep; + + /* match is found in the unexpected queue. call into the core + * provider to complete this message + */ + lnx_init_rx_entry(rx_entry, iov, desc, count, addr, tag, ignore, + context, lnx_ep_rx_flags(lep)); + rx_entry->rx_entry.msg_size = MIN(ofi_total_iov_len(iov, count), + rx_entry->rx_entry.msg_size); + if (tagged) + rc = cep->lpe_srx.peer_ops->start_tag(&rx_entry->rx_entry); + else + rc = cep->lpe_srx.peer_ops->start_msg(&rx_entry->rx_entry); + + if (rc == -FI_EINPROGRESS) { + /* this is telling me that more messages can match the same + * rx_entry. 
So keep it on the queue + */ + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = %lx start_tag() in progress\n", + addr, tag, ignore); + + goto insert_recvq; + } else if (rc) { + FI_WARN(&lnx_prov, FI_LOG_CORE, "start tag failed with %d\n", rc); + } + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "addr = %lx tag = %lx ignore = %lx start_tag() success\n", + addr, tag, ignore); + + return 0; + +nomatch: + /* nothing on the unexpected queue, then allocate one and put it on + * the receive queue + */ + rx_entry = get_rx_entry(NULL, iov, desc, count, addr, tag, ignore, + context, lnx_ep_rx_flags(lep)); + rx_entry->rx_entry.msg_size = ofi_total_iov_len(iov, count); + if (!rx_entry) { + rc = -FI_ENOMEM; + goto out; + } + rx_entry->rx_peer = lp; + +insert_recvq: + lnx_insert_rx_entry(&lnx_srq->lps_trecv.lqp_recvq, rx_entry); + +out: + return rc; +} + +ssize_t lnx_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep = NULL; + fi_addr_t core_addr = FI_ADDR_UNSPEC; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct iovec iov = {.iov_base = buf, .iov_len = len}; + struct lnx_peer *lp; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lnx_get_core_desc(desc, &mem_desc); + + /* addr is an index into the peer table. + * This gets us to a peer. Each peer can be reachable on + * multiple endpoints. Each endpoint has its own fi_addr_t which is + * core provider specific. 
+ */ + lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + if (lp) { + rc = lnx_select_recv_pathway(lp, lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc); + if (rc) + goto out; + } + + rc = lnx_process_recv(lep, &iov, &mem_desc, src_addr, 1, lp, tag, ignore, + context, 0, true); + if (rc == -FI_ENOSYS) + goto do_recv; + else if (rc) + FI_WARN(&lnx_prov, FI_LOG_CORE, "lnx_process_recv failed with %d\n", rc); + + goto out; + +do_recv: + if (lp) + rc = fi_trecv(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, ignore, context); + +out: + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep = NULL; + fi_addr_t core_addr = FI_ADDR_UNSPEC; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct lnx_peer *lp; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + lnx_get_core_desc(*desc, &mem_desc); + + lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + if (lp) { + rc = lnx_select_recv_pathway(lp, lep->le_domain, *desc, &cep, + &core_addr, iov, count, &mre, &mem_desc); + if (rc) + goto out; + } + + rc = lnx_process_recv(lep, (struct iovec *)iov, &mem_desc, src_addr, + 1, lp, tag, ignore, context, 0, true); + if (rc == -FI_ENOSYS) + goto do_recv; + + goto out; + +do_recv: + if (lp) + rc = fi_trecvv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, ignore, context); + +out: + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep = NULL; + fi_addr_t core_addr = FI_ADDR_UNSPEC; + struct lnx_peer_table *peer_tbl; + void *mem_desc; 
+ struct lnx_peer *lp; + struct fi_msg_tagged core_msg; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + lp = lnx_get_peer(peer_tbl->lpt_entries, msg->addr); + if (lp) { + rc = lnx_select_recv_pathway(lp, lep->le_domain, *msg->desc, + &cep, &core_addr, msg->msg_iov, + msg->iov_count, &mre, &mem_desc); + if (rc) + goto out; + } + lnx_get_core_desc(*msg->desc, &mem_desc); + + rc = lnx_process_recv(lep, (struct iovec *)msg->msg_iov, &mem_desc, + msg->addr, msg->iov_count, lp, msg->tag, msg->ignore, + msg->context, flags, true); + if (rc == -FI_ENOSYS) + goto do_recv; + + goto out; + +do_recv: + if (lp) { + memcpy(&core_msg, msg, sizeof(*msg)); + + core_msg.desc = mem_desc; + core_msg.addr = core_addr; + + rc = fi_trecvmsg(cep->lpe_ep, &core_msg, flags); + } + +out: + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*) buf, .iov_len = len}; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tsend(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, uint64_t tag, void *context) +{ 
+ int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + void *mem_desc; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, (desc) ? *desc : NULL, &cep, + &core_addr, iov, count, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_addr, tag); + + rc = fi_tsendv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct fi_msg_tagged core_msg; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[msg->addr], + lep->le_domain, + (msg->desc) ? 
*msg->desc : NULL, &cep, + &core_addr, msg->msg_iov, + msg->iov_count, &mre, &mem_desc, NULL); + if (rc) + return rc; + + memcpy(&core_msg, msg, sizeof(*msg)); + + core_msg.desc = mem_desc; + core_msg.addr = core_addr; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx\n", core_msg.addr, core_msg.tag); + + rc = fi_tsendmsg(cep->lpe_ep, &core_msg, flags); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, &mre, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinject(cep->lpe_ep, buf, len, core_addr, tag); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = 
fi_tsenddata(cep->lpe_ep, buf, len, mem_desc, + data, core_addr, tag, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + int rc; + struct lnx_ep *lep; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + struct ofi_mr_entry *mre = NULL; + + lep = lnx_get_lep(ep, NULL); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, NULL, &cep, + &core_addr, NULL, 0, &mre, NULL, NULL); + if (rc) + return rc; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx tag %lx buf %p len %ld\n", + core_addr, tag, buf, len); + + rc = fi_tinjectdata(cep->lpe_ep, buf, len, data, core_addr, tag); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + + return rc; +} + +static inline ssize_t +lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[src_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "rma read from %lx key %lx buf %p len %ld\n", + core_addr, key, buf, len); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_read(core_ep, buf, len, mem_desc, + core_addr, addr, key, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, 
mre); +out: + return rc; +} + +static inline ssize_t +lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = len}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "rma write to %lx key %lx buf %p len %ld\n", + core_addr, key, buf, len); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_write(core_ep, buf, len, mem_desc, + core_addr, addr, key, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_atomic_write(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = 
lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_atomic(core_ep, buf, count, mem_desc, + core_addr, addr, key, datatype, op, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_atomic_readwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, .iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, + &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_fetch_atomic(core_ep, buf, count, desc, + result, mem_desc, core_addr, addr, key, + datatype, op, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); +out: + return rc; +} + +static inline ssize_t +lnx_atomic_compwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + int rc; + struct lnx_ep *lep; + struct fid_ep *core_ep; + struct lnx_ctx *ctx; + struct local_prov_ep *cep; + fi_addr_t core_addr; + struct lnx_peer_table *peer_tbl; + void *mem_desc; + uint64_t rkey; + struct ofi_mr_entry *mre = NULL; + struct iovec iov = {.iov_base = (void*)buf, 
.iov_len = count}; + + lep = lnx_get_lep(ep, &ctx); + if (!lep) + return -FI_ENOSYS; + + peer_tbl = lep->le_peer_tbl; + + rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], + lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, + &mre, &mem_desc, &rkey); + if (rc) + goto out; + + FI_DBG(&lnx_prov, FI_LOG_CORE, + "sending to %lx\n", core_addr); + + core_ep = lnx_get_core_ep(cep, ctx->ctx_idx, ep->fid.fclass); + + rc = fi_compare_atomic(core_ep, buf, count, desc, + compare, compare_desc, result, mem_desc, + core_addr, addr, key, datatype, op, context); + + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + +out: + return rc; +} + +struct fi_ops_tagged lnx_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = lnx_trecv, + .recvv = lnx_trecvv, + .recvmsg = lnx_trecvmsg, + .send = lnx_tsend, + .sendv = lnx_tsendv, + .sendmsg = lnx_tsendmsg, + .inject = lnx_tinject, + .senddata = lnx_tsenddata, + .injectdata = lnx_tinjectdata, +}; + +struct fi_ops_msg lnx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_rma lnx_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = lnx_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = lnx_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .writedata = fi_no_rma_writedata, + .injectdata = fi_no_rma_injectdata, +}; + +struct fi_ops_atomic lnx_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = lnx_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = lnx_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = 
fi_no_atomic_readwritemsg, + .compwrite = lnx_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; + + diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index 634af1e5e82..ffe1bb87b5f 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -93,7 +93,8 @@ char *ofi_strdup_tail(const char *str) } */ -char *ofi_strdup_append(const char *head, const char *tail) +static char *ofi_strdup_append_internal(const char *head, const char *tail, + char delim) { char *str; size_t len; @@ -101,10 +102,20 @@ char *ofi_strdup_append(const char *head, const char *tail) len = strlen(head) + strlen(tail) + 2; str = malloc(len); if (str) - sprintf(str, "%s%c%s", head, OFI_NAME_DELIM, tail); + sprintf(str, "%s%c%s", head, delim, tail); return str; } +char *ofi_strdup_link_append(const char *head, const char *tail) +{ + return ofi_strdup_append_internal(head, tail, OFI_NAME_LNX_DELIM); +} + +char *ofi_strdup_append(const char *head, const char *tail) +{ + return ofi_strdup_append_internal(head, tail, OFI_NAME_DELIM); +} + int ofi_exclude_prov_name(char **prov_name_list, const char *util_prov_name) { char *exclude, *name, *temp; diff --git a/src/fabric.c b/src/fabric.c index b1a735638bb..13b529ea95c 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -262,6 +262,11 @@ static int ofi_is_hook_prov(const struct fi_provider *provider) return ofi_prov_ctx(provider)->type == OFI_PROV_HOOK; } +static int ofi_is_lnx_prov(const struct fi_provider *provider) +{ + return ofi_prov_ctx(provider)->type == OFI_PROV_LNX; +} + int ofi_apply_filter(struct ofi_filter *filter, const char *name) { if (!filter->names) @@ -500,6 +505,8 @@ static void ofi_set_prov_type(struct fi_provider *provider) ofi_prov_ctx(provider)->type = OFI_PROV_UTIL; else if (ofi_has_offload_prefix(provider->name)) 
ofi_prov_ctx(provider)->type = OFI_PROV_OFFLOAD; + else if (ofi_is_lnx(provider->name)) + ofi_prov_ctx(provider)->type = OFI_PROV_LNX; else ofi_prov_ctx(provider)->type = OFI_PROV_CORE; } @@ -988,6 +995,7 @@ void fi_ini(void) ofi_register_provider(SOCKETS_INIT, NULL); ofi_register_provider(TCP_INIT, NULL); + ofi_register_provider(LNX_INIT, NULL); ofi_register_provider(HOOK_PERF_INIT, NULL); ofi_register_provider(HOOK_TRACE_INIT, NULL); ofi_register_provider(HOOK_PROFILE_INIT, NULL); @@ -1207,8 +1215,12 @@ static void ofi_set_prov_attr(struct fi_fabric_attr *attr, core_name = attr->prov_name; if (core_name) { - assert(ofi_is_util_prov(prov)); - attr->prov_name = ofi_strdup_append(core_name, prov->name); + if (ofi_is_lnx_prov(prov)) { + attr->prov_name = ofi_strdup_link_append(core_name, prov->name); + } else { + assert(ofi_is_util_prov(prov)); + attr->prov_name = ofi_strdup_append(core_name, prov->name); + } free(core_name); } else { attr->prov_name = strdup(prov->name); @@ -1557,7 +1569,9 @@ int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr, fi_ini(); - top_name = strrchr(attr->prov_name, OFI_NAME_DELIM); + ret = ofi_is_linked(attr->prov_name); + top_name = strrchr(attr->prov_name, + ret ? 
OFI_NAME_LNX_DELIM : OFI_NAME_DELIM); if (top_name) top_name++; else diff --git a/src/fi_tostr.c b/src/fi_tostr.c index 910dfd1214b..420f0cca2f6 100644 --- a/src/fi_tostr.c +++ b/src/fi_tostr.c @@ -259,6 +259,7 @@ static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol) CASEENUMSTRN(FI_PROTO_SM2, len); CASEENUMSTRN(FI_PROTO_CXI_RNR, len); CASEENUMSTRN(FI_PROTO_LPP, len); + CASEENUMSTRN(FI_PROTO_LNX, len); default: ofi_strncatf(buf, len, "Unknown"); break; From 54d5444051e795826492c79fbb71ac2291e7ea31 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Thu, 24 Oct 2024 15:06:03 +0000 Subject: [PATCH 185/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_lnx.7 | 68 ++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 index b30876e24e4..8caddfcebf2 100644 --- a/man/man7/fi_lnx.7 +++ b/man/man7/fi_lnx.7 @@ -1,15 +1,13 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_lnx" "7" "" "" "" +.TH "fi_lnx" "7" "2024\-10\-24" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy -.PP -{% include JB/setup %} .SH NAME .PP -fi_lnx - The LINKx (lnx) Provider +fi_lnx - The LINKx (LNX) Provider .SH OVERVIEW .PP -The lnx provider is designed to link two or more providers, allowing +The LNX provider is designed to link two or more providers, allowing applications to seamlessly use multiple providers or NICs. This provider uses the libfabric peer infrastructure to aid in the use of the underlying providers. @@ -21,21 +19,21 @@ providers and provide the users with the ability to influence the way the providers are utilized for traffic load. 
.SH SUPPORTED FEATURES .PP -This release contains an initial implementation of the lnx provider -that offers the following support: +This release contains an initial implementation of the LNX provider that +offers the following support: .TP \f[I]Endpoint types\f[R] The provider supports only endpoint type \f[I]FI_EP_RDM\f[R]. .TP \f[I]Endpoint capabilities\f[R] -lnx is a passthrough layer on the send path. -On the receive path lnx utilizes the peer infrastructure to create +LNX is a passthrough layer on the send path. +On the receive path LNX utilizes the peer infrastructure to create shared receive queues (SRQ). Receive requests are placed on the SRQ instead of on the core provider receive queue. When the provider receives a message it queries the SRQ for a match. If one is found the receive request is completed, otherwise the message -is placed on the lnx shared unexpected queue (SUQ). +is placed on the LNX shared unexpected queue (SUQ). Further receive requests query the SUQ for matches. The first release of the provider only supports tagged and RMA operations. @@ -45,19 +43,19 @@ Other message types will be supported in future releases. The provider does not require the use of any mode bits. .TP \f[I]Progress\f[R] -lnx utilizes the peer infrastructure to provide a shared completion +LNX utilizes the peer infrastructure to provide a shared completion queue. Each linked provider still needs to handle its own progress. Completion events will however be placed on the shared completion queue, which is passed to the application for access. .TP \f[I]Address Format\f[R] -lnx wraps the linked providers addresses in one common binary blob. +LNX wraps the linked providers addresses in one common binary blob. It does not alter or change the linked providers address format. -It wraps them into a lnx structure which is then flattened and -returned to the application. +It wraps them into a LNX structure which is then flattened and returned +to the application. 
This is passed between different nodes. -The lnx provider is able to parse the flattened format and operate on +The LNX provider is able to parse the flattened format and operate on the different links. This assumes that nodes in the same group are all using the same version of the provider with the exact same links. @@ -65,13 +63,13 @@ IE: you can\[cq]t have one node linking SHM+CXI while another linking SHM+RXM. .TP \f[I]Message Operations\f[R] -lnx is designed to intercept message operations such as fi_tsenddata -and based on specific criteria forward the operation to the appropriate +LNX is designed to intercept message operations such as fi_tsenddata and +based on specific criteria forward the operation to the appropriate provider. -For the first release, lnx will only support linking SHM provider for +For the first release, LNX will only support linking SHM provider for intra-node traffic and another provider (ex: CXI) for inter node traffic. -lnx send operation looks at the destination and based on whether the +LNX send operation looks at the destination and based on whether the destination is local or remote it will select the provider to forward the operation to. The receive case has been described earlier. @@ -80,11 +78,11 @@ The receive case has been described earlier. In order to use the provider the user needs to set FI_LNX_PROV_LINKS environment variable to the linked providers in the following format shm+. -This will allow lnx to report back to the application in the +This will allow LNX to report back to the application in the fi_getinfo() call the different links which can be selected. -Since there are multiple domains per provider lnx reports a -permutation of all the possible links. -For example if there are two CXI interfaces on the machine lnx will +Since there are multiple domains per provider LNX reports a permutation +of all the possible links. 
+For example if there are two CXI interfaces on the machine LNX will report back shm+cxi0 and shm+cxi1. The application can then select based on its own criteria the link it wishes to use. @@ -94,12 +92,12 @@ A common selection criteria is the interface nearest the core the process is bound to. In order to make this determination, the application requires the PCI information about the interface. -For this reason lnx forwards the PCI information for the inter-node +For this reason LNX forwards the PCI information for the inter-node provider in the link to the application. .SH LIMITATIONS AND FUTURE WORK .TP \f[I]Hardware Support\f[R] -lnx doesn\[cq]t support hardware offload; ex hardware tag matching. +LNX doesn\[cq]t support hardware offload; ex hardware tag matching. This is an inherit limitation when using the peer infrastructure. Due to the use of a shared receive queue which linked providers need to query when a message is received, any hardware offload which requires @@ -111,7 +109,7 @@ For #2 this is needed when receiving from FI_ADDR_UNSPEC. In this case both providers which are part of the link can race to gain access to the receive buffer. It is a future effort to determine a way to use hardware tag matching -and other hardware offload capability with lnx +and other hardware offload capability with LNX .TP \f[I]Limited Linking\f[R] This release of the provider supports linking SHM provider for @@ -123,14 +121,16 @@ It is a future effort to expand to link any multiple sets of providers. As part of the memory registration operation, varying hardware can perform hardware specific steps such as memory pinning. Due to the fact that memory registration APIs do not specify the source -or destination addresses it is not possible for lnx to determine which +or destination addresses it is not possible for LNX to determine which provider to forward the memory registration to. -LINkx, therefore, registers the memory with all linked providers. 
+One option is to have a memory registration cache in LNX to avoid
+expensive operations.
Remove lock from map_to_region and unmap_region functions and require
lock acquisition before calling those functions. This is necessary
because on av removal path, map will be double locked if the functions
also process locking the map. The map_to_region function is updated to
mirror this policy.
Signed-off-by: Zach Dworkin --- prov/shm/src/smr_av.c | 8 +-- prov/shm/src/smr_ep.c | 2 + prov/shm/src/smr_progress.c | 11 ++--- prov/shm/src/smr_util.c | 98 +++++++++++++++++++++++++------------ prov/shm/src/smr_util.h | 2 + 5 files changed, 80 insertions(+), 41 deletions(-) diff --git a/prov/shm/src/smr_av.c b/prov/shm/src/smr_av.c index de12e152545..61e4344bde5 100644 --- a/prov/shm/src/smr_av.c +++ b/prov/shm/src/smr_av.c @@ -69,9 +69,12 @@ static void smr_map_cleanup(struct smr_map *map) { int64_t i; - for (i = 0; i < SMR_MAX_PEERS; i++) - smr_map_del(map, i); + for (i = 0; i < SMR_MAX_PEERS; i++) { + if (map->peers[i].peer.id < 0) + continue; + smr_map_del(map, i); + } ofi_rbmap_cleanup(&map->rbmap); } @@ -210,7 +213,6 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count dlist_foreach(&util_av->ep_list, av_entry) { util_ep = container_of(av_entry, struct util_ep, av_entry); smr_ep = container_of(util_ep, struct smr_ep, util_ep); - smr_unmap_from_endpoint(smr_ep->region, id); if (smr_av->smr_map.num_peers > 0) smr_ep->region->max_sar_buf_per_peer = SMR_MAX_PEERS / diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index 8ad190711fb..dd3d7f53f07 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -223,7 +223,9 @@ int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr) return id; if (!ep->region->map->peers[id].region) { + ofi_spin_lock(&ep->region->map->lock); ret = smr_map_to_region(&smr_prov, ep->region->map, id); + ofi_spin_unlock(&ep->region->map->lock); if (ret) return -1; } diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index c5315aa4b1f..5059f576eb2 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -878,7 +878,9 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd) peer_smr = smr_peer_region(ep->region, idx); if (!peer_smr) { + ofi_spin_lock(&ep->region->map->lock); ret = smr_map_to_region(&smr_prov, ep->region->map, idx); 
+ ofi_spin_unlock(&ep->region->map->lock); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "Could not map peer region\n"); @@ -891,14 +893,11 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd) if (peer_smr->pid != (int) cmd->msg.hdr.data) { /* TODO track and update/complete in error any transfers * to or from old mapping - * - * TODO create smr_unmap_region - * this needs to close peer_smr->map->peers[idx].pid_fd - * This case will also return an unmapped region because the idx - * is valid but the region was unmapped */ - munmap(peer_smr, peer_smr->total_size); + ofi_spin_lock(&ep->region->map->lock); + smr_unmap_region(&smr_prov, ep->region->map, idx, false); smr_map_to_region(&smr_prov, ep->region->map, idx); + ofi_spin_unlock(&ep->region->map->lock); peer_smr = smr_peer_region(ep->region, idx); } diff --git a/prov/shm/src/smr_util.c b/prov/shm/src/smr_util.c index 2924ddaa6f2..0c5de80e2a0 100644 --- a/prov/shm/src/smr_util.c +++ b/prov/shm/src/smr_util.c @@ -367,16 +367,15 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, } pthread_mutex_unlock(&ep_list_lock); - ofi_spin_lock(&map->lock); if (peer_buf->region) - goto unlock; + return FI_SUCCESS; + assert(ofi_spin_held(&map->lock)); fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); if (fd < 0) { - ret = -errno; FI_WARN_ONCE(prov, FI_LOG_AV, "shm_open error: name %s errno %d\n", name, errno); - goto unlock; + return -errno; } memset(tmp, 0, sizeof(tmp)); @@ -437,8 +436,6 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, out: close(fd); -unlock: - ofi_spin_unlock(&map->lock); return ret; } @@ -448,6 +445,7 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id) struct smr_region *peer_smr; struct smr_peer_data *local_peers; + assert(ofi_spin_held(®ion->map->lock)); peer_smr = smr_peer_region(region, id); if (region->map->peers[id].peer.id < 0 || !peer_smr) return; @@ -479,32 +477,81 @@ void smr_map_to_endpoint(struct smr_region 
*region, int64_t id) return; } +void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map, + int64_t peer_id, bool local) +{ + struct smr_region *peer_region; + struct smr_peer *peer; + struct util_ep *util_ep; + struct smr_ep *smr_ep; + struct smr_av *av; + int ret = 0; + + assert(ofi_spin_held(&map->lock)); + peer_region = map->peers[peer_id].region; + if (!peer_region) + return; + + peer = &map->peers[peer_id]; + av = container_of(map, struct smr_av, smr_map); + dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep, + av_entry) { + smr_ep = container_of(util_ep, struct smr_ep, util_ep); + smr_unmap_from_endpoint(smr_ep->region, peer_id); + } + + /* Don't unmap memory owned by this pid because the endpoint it belongs + * to might still be active. + */ + if (local) + return; + + if (map->flags & SMR_FLAG_HMEM_ENABLED) { + ret = ofi_hmem_host_unregister(peer_region); + if (ret) + FI_WARN(prov, FI_LOG_EP_CTRL, + "unable to unregister shm with iface\n"); + + if (peer->pid_fd != -1) { + close(peer->pid_fd); + peer->pid_fd = -1; + } + } + + munmap(peer_region, peer_region->total_size); + peer->region = NULL; +} + void smr_unmap_from_endpoint(struct smr_region *region, int64_t id) { struct smr_region *peer_smr; struct smr_peer_data *local_peers, *peer_peers; int64_t peer_id; - local_peers = smr_peer_data(region); if (region->map->peers[id].peer.id < 0) return; peer_smr = smr_peer_region(region, id); - peer_id = smr_peer_data(region)[id].addr.id; - + assert(peer_smr); peer_peers = smr_peer_data(peer_smr); + peer_id = smr_peer_data(region)[id].addr.id; peer_peers[peer_id].addr.id = -1; peer_peers[peer_id].name_sent = 0; + local_peers = smr_peer_data(region); ofi_xpmem_release(&local_peers[peer_id].xpmem); } void smr_exchange_all_peers(struct smr_region *region) { int64_t i; + + ofi_spin_lock(®ion->map->lock); for (i = 0; i < SMR_MAX_PEERS; i++) smr_map_to_endpoint(region, i); + + ofi_spin_unlock(®ion->map->lock); } int smr_map_add(const 
struct fi_provider *prov, struct smr_map *map, @@ -546,37 +593,24 @@ int smr_map_add(const struct fi_provider *prov, struct smr_map *map, void smr_map_del(struct smr_map *map, int64_t id) { - struct dlist_entry *entry; + struct smr_ep_name *name; + bool local = false; assert(id >= 0 && id < SMR_MAX_PEERS); - pthread_mutex_lock(&ep_list_lock); - entry = dlist_find_first_match(&ep_name_list, smr_match_name, - smr_no_prefix(map->peers[id].peer.name)); + dlist_foreach_container(&ep_name_list, struct smr_ep_name, name, entry) { + if (strcmp(name->name, map->peers[id].peer.name)) { + local = true; + break; + } + } pthread_mutex_unlock(&ep_list_lock); - ofi_spin_lock(&map->lock); - (void) ofi_rbmap_find_delete(&map->rbmap, - (void *) map->peers[id].peer.name); - + smr_unmap_region(&smr_prov, map, id, local); map->peers[id].fiaddr = FI_ADDR_NOTAVAIL; map->peers[id].peer.id = -1; map->num_peers--; - - if (!map->peers[id].region) - goto unlock; - - if (!entry) { - if (map->flags & SMR_FLAG_HMEM_ENABLED) { - if (map->peers[id].pid_fd != -1) - close(map->peers[id].pid_fd); - - (void) ofi_hmem_host_unregister(map->peers[id].region); - } - munmap(map->peers[id].region, map->peers[id].region->total_size); - map->peers[id].region = NULL; - } -unlock: + ofi_rbmap_find_delete(&map->rbmap, map->peers[id].peer.name); ofi_spin_unlock(&map->lock); } diff --git a/prov/shm/src/smr_util.h b/prov/shm/src/smr_util.h index c5bf8124873..7ed4e1e426f 100644 --- a/prov/shm/src/smr_util.h +++ b/prov/shm/src/smr_util.h @@ -356,6 +356,8 @@ void smr_cleanup(void); int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, int64_t id); void smr_map_to_endpoint(struct smr_region *region, int64_t id); +void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map, + int64_t id, bool found); void smr_unmap_from_endpoint(struct smr_region *region, int64_t id); void smr_exchange_all_peers(struct smr_region *region); int smr_map_add(const struct fi_provider *prov, struct smr_map 
*map, From 28d41eabe543167c52e2bd95240fe9f5ae87bf68 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Thu, 24 Oct 2024 09:40:10 -0700 Subject: [PATCH 188/393] v2.0.0 beta Signed-off-by: Jianxin Xiong --- AUTHORS | 3 +++ NEWS.md | 5 +++++ configure.ac | 2 +- fabtests/configure.ac | 2 +- include/windows/config.h | 2 +- man/fi_provider.7.md | 17 +++++++++++++++++ 6 files changed, 28 insertions(+), 3 deletions(-) diff --git a/AUTHORS b/AUTHORS index 6efa2e1831c..89f9c5bfa91 100644 --- a/AUTHORS +++ b/AUTHORS @@ -76,6 +76,7 @@ Dmitry Durnov Dmitry Gladkov Doug Oucharek Edgar Gabriel +Elias Kozah Elias Kozah Elias Kozah Eric Raut @@ -108,6 +109,7 @@ Ignacio Hernandez Ira Weiny Itai Masuari iziemba <57813515+iziemba@users.noreply.github.com> +Jack Morrison Jaime Arteaga James Dinan James Shimek @@ -298,4 +300,5 @@ Zach Tiffany zdworkin Zhaojuan Guo zhngaj +Zhuo Zhi ztaylor diff --git a/NEWS.md b/NEWS.md index 242872c4f0b..f8f46be8e35 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ v2.0.0 beta, Fri Oct 25, 2024 ## Core +- xpmem: Fix compilation warning - Change the xpmem log level to info - Clarify FI_HMEM support of inject calls - Introduce Sub-MR @@ -124,6 +125,7 @@ v2.0.0 beta, Fri Oct 25, 2024 ## SHM +- Add unmap_region function - Use owner-allocated srx - Fix incorrect capability set - Make progress errors ints instead of unit64 @@ -148,6 +150,9 @@ v2.0.0 beta, Fri Oct 25, 2024 ## Fabtests +- Fix compilation error about CMPLX with C99 +- Added -E/env option to multinode test script +- Change xfer-method variable to xfer_method in runmultinode.sh - Fix complex fill cast - efa: Remove rnr cq error message check - efa: Loose assertion for read request counters diff --git a/configure.ac b/configure.ac index 8e222be2e38..ef368ef9377 100644 --- a/configure.ac +++ b/configure.ac @@ -9,7 +9,7 @@ dnl dnl Process this file with autoconf to produce a configure script. 
+*LPP*
+: A provider that runs on FabreX PCIe networks. See
+  [`fi_lpp`(7)](fi_lpp.7.html) for more information.
+\f[I]LPP\f[R]
+A provider that runs on FabreX PCIe networks.
+See \f[C]fi_lpp\f[R](7) for more information.
EFA device doesn't support any ordering for RMA and atomic operations.
RDM endpoint supports an emulated ordered atomic protocol with the max
order size equal to the max atomic size. The EFA provider should only
return non-zero max order sizes for the RDM ep type when ordered
atomics are requested via the FI_ORDER_ATOMIC_* flags.
Signed-off-by: Shi Jin --- prov/efa/src/efa_prov_info.c | 6 ++- prov/efa/src/efa_user_info.c | 13 ++--- prov/efa/test/efa_unit_test_info.c | 83 ++++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 3 ++ prov/efa/test/efa_unit_tests.h | 3 ++ 5 files changed, 100 insertions(+), 8 deletions(-) diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index bddc965d53a..be3221cb791 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -145,7 +145,9 @@ const struct fi_ep_attr efa_ep_attr = { .protocol = FI_PROTO_EFA, .protocol_version = 1, .msg_prefix_size = 0, + .max_order_raw_size = 0, .max_order_war_size = 0, + .max_order_waw_size = 0, .mem_tag_format = 0, .tx_ctx_cnt = 1, .rx_ctx_cnt = 1, @@ -187,8 +189,6 @@ void efa_prov_info_set_ep_attr(struct fi_info *prov_info, } prov_info->ep_attr->max_msg_size = device->ibv_port_attr.max_msg_sz; - prov_info->ep_attr->max_order_raw_size = device->ibv_port_attr.max_msg_sz; - prov_info->ep_attr->max_order_waw_size = device->ibv_port_attr.max_msg_sz; } /** @@ -579,6 +579,8 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, - device->rdm_info->src_addrlen - EFA_RDM_IOV_LIMIT * sizeof(struct fi_rma_iov); prov_info_rdm->ep_attr->max_order_raw_size = max_atomic_size; + prov_info_rdm->ep_attr->max_order_war_size = max_atomic_size; + prov_info_rdm->ep_attr->max_order_waw_size = max_atomic_size; } /* update tx_attr */ diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 919a1cacb97..e152f2adc23 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -361,8 +361,6 @@ bool efa_user_info_should_support_hmem(int version) static int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_info *hints) { - uint64_t atomic_ordering; - if (hints && (hints->caps & FI_HMEM)) { /* * FI_HMEM is a primary capability, therefore only check @@ -418,11 +416,14 @@ int efa_user_info_alter_rdm(int version, struct fi_info 
*info, const struct fi_i * the default message order supported by the provider is returned. */ info->tx_attr->msg_order &= hints->tx_attr->msg_order; - atomic_ordering = FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | - FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW; - if (!(hints->tx_attr->msg_order & atomic_ordering)) { + + /* If no atomic ordering is requested, set the max_order_*_size as 0 */ + if (!(hints->tx_attr->msg_order & FI_ORDER_ATOMIC_RAW)) info->ep_attr->max_order_raw_size = 0; - } + if (!(hints->tx_attr->msg_order & FI_ORDER_ATOMIC_WAR)) + info->ep_attr->max_order_war_size = 0; + if (!(hints->tx_attr->msg_order & FI_ORDER_ATOMIC_WAW)) + info->ep_attr->max_order_waw_size = 0; } if (hints->rx_attr) { diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index 6a53ea381a8..1380e36976c 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -157,6 +157,89 @@ void test_info_tx_rx_msg_order_dgram_order_sas(struct efa_resource **state) test_info_tx_rx_msg_order_from_hints(resource->hints, -FI_ENODATA); } +/** + * @brief Verify max order size is set correctly according to hints + * + * @param hints hints + * @param expected_ret expected rc of fi_getinfo + * @param expected_size expected value of max_order_*_size. Ignored when expected_ret is non-zero. 
+ */ +static void +test_info_max_order_size_from_hints(struct fi_info *hints, int expected_ret, size_t expected_size) +{ + struct fi_info *info; + int err; + + err = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0ULL, hints, &info); + + assert_int_equal(err, expected_ret); + + if (expected_ret == FI_SUCCESS) { + assert_true(info->ep_attr->max_order_raw_size == expected_size); + assert_true(info->ep_attr->max_order_war_size == expected_size); + assert_true(info->ep_attr->max_order_waw_size == expected_size); + } + + fi_freeinfo(info); +} + +/** + * DGRAM ep type doesn't support FI_ATOMIC, fi_getinfo should return + * ENODATA when FI_ATOMIC is requested in hints. + */ +void test_info_max_order_size_dgram_with_atomic(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + assert_non_null(resource->hints); + + resource->hints->caps = FI_ATOMIC; + + test_info_max_order_size_from_hints(resource->hints, -FI_ENODATA, 0); +} + +/** + * RDM ep type supports FI_ATOMIC. When FI_ORDER_ATOMIC_* is NOT requested, + * max_order_*_size should be 0 + */ +void test_info_max_order_size_rdm_with_atomic_no_order(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + + resource->hints->caps = FI_ATOMIC; + resource->hints->domain_attr->mr_mode |= FI_MR_VIRT_ADDR | FI_MR_PROV_KEY; + + test_info_max_order_size_from_hints(resource->hints, FI_SUCCESS, 0); +} + +/** + * RDM ep type supports FI_ATOMIC. 
When FI_ORDER_ATOMIC_* is requested, + * max_order_*_size should be the max atomic size derived from mtu and headers + */ +void test_info_max_order_size_rdm_with_atomic_order(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + size_t max_atomic_size = g_device_list[0].rdm_info->ep_attr->max_msg_size + - sizeof(struct efa_rdm_rta_hdr) + - g_device_list[0].rdm_info->src_addrlen + - EFA_RDM_IOV_LIMIT * sizeof(struct fi_rma_iov); + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->caps = FI_ATOMIC; + resource->hints->domain_attr->mr_mode |= FI_MR_VIRT_ADDR | FI_MR_PROV_KEY; + resource->hints->tx_attr->msg_order |= FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW; + resource->hints->rx_attr->msg_order = resource->hints->tx_attr->msg_order; + + test_info_max_order_size_from_hints(resource->hints, FI_SUCCESS, max_atomic_size); +} + void test_info_tx_rx_op_flags_rdm(struct efa_resource **state) { struct efa_resource *resource = *state; diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index e6b7a324d81..1e2f2087fa2 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -130,6 +130,9 @@ int main(void) cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_max_order_size_dgram_with_atomic, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_max_order_size_rdm_with_atomic_no_order, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + 
cmocka_unit_test_setup_teardown(test_info_max_order_size_rdm_with_atomic_order, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_op_flags_rdm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_size_rdm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_check_shm_info_hmem, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 2e62473a717..52670e8af9c 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -145,6 +145,9 @@ void test_info_tx_rx_msg_order_rdm_order_none(); void test_info_tx_rx_msg_order_rdm_order_sas(); void test_info_tx_rx_msg_order_dgram_order_none(); void test_info_tx_rx_msg_order_dgram_order_sas(); +void test_info_max_order_size_dgram_with_atomic(); +void test_info_max_order_size_rdm_with_atomic_no_order(); +void test_info_max_order_size_rdm_with_atomic_order(); void test_info_tx_rx_op_flags_rdm(); void test_info_tx_rx_size_rdm(); void test_info_check_shm_info_hmem(); From f236201bd383be1765b8e28effed9ad23d778bb4 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 24 Oct 2024 14:13:28 -0700 Subject: [PATCH 191/393] contrib/intel/jenkins: Migrate multinode tests to use new CI Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 30e0089c3d1..ccd1390ec03 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -748,12 +748,11 @@ pipeline { stage ('multinode_performance') { steps { script { - dir (RUN_LOCATION) { - run_middleware([["tcp", null],["sockets", null]], - "multinode_performance", "multinode", "grass", - "bulbasaur,chikorita", "2") - run_middleware([["verbs", "rxm"]], 
"multinode_performance", - "multinode", "water", "totodile", "2") + dir (CI_LOCATION) { + run_ci("CI_multinode_performance_grass", + "pr_multinode_performance_grass.json") + run_ci("CI_multinode_performance_water", + "pr_multinode_performance_water.json") } } } From 0aa486b89883277417dab3b9d4e5a88fc82fffa3 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Fri, 8 Apr 2022 18:17:30 -0700 Subject: [PATCH 192/393] prov/cxi: Implement shared Completion Queues Implement the shared completion queues in CXI. This implementation mainly relies on the util CQ common code. The util code set the correct completion queue callbacks based on the FI_PEER flag Signed-off-by: Amir Shehata --- prov/cxi/src/cxip_cq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c index 5ca5f41abff..1c6504c90bc 100644 --- a/prov/cxi/src/cxip_cq.c +++ b/prov/cxi/src/cxip_cq.c @@ -34,9 +34,9 @@ int cxip_cq_req_complete(struct cxip_req *req) return FI_SUCCESS; } - return ofi_cq_write(&req->cq->util_cq, (void *)req->context, - req->flags, req->data_len, (void *)req->buf, - req->data, req->tag); + return ofi_peer_cq_write(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag, FI_ADDR_NOTAVAIL); } /* @@ -50,9 +50,9 @@ int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src) return FI_SUCCESS; } - return ofi_cq_write_src(&req->cq->util_cq, (void *)req->context, - req->flags, req->data_len, (void *)req->buf, - req->data, req->tag, src); + return ofi_peer_cq_write(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag, src); } /* @@ -94,7 +94,7 @@ int cxip_cq_req_error(struct cxip_req *req, size_t olen, err_entry.buf = (void *)(uintptr_t)req->buf; err_entry.src_addr = src_addr; - return ofi_cq_write_error(&req->cq->util_cq, &err_entry); + return ofi_peer_cq_write_error(&req->cq->util_cq, &err_entry); } /* From 
Implement the shared completion queues in CXI. This implementation
mainly relies on the util CQ common code. The util code sets the
correct completion queue callbacks based on the FI_PEER flag
*/ bool av_auth_key; @@ -3247,6 +3256,11 @@ double cxip_rep_sum(size_t count, double *values); int cxip_check_auth_key_info(struct fi_info *info); int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key); +static inline struct fid_peer_srx *cxip_get_owner_srx(struct cxip_rxc *rxc) +{ + return rxc->ep_obj->owner_srx; +} + #define CXIP_FC_SOFTWARE_INITIATED -1 /* cxip_fc_reason() - Returns the event reason for portal state @@ -3291,6 +3305,13 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr); +static inline int cxip_no_discard(struct fi_peer_rx_entry *rx_entry) +{ + return -FI_ENOSYS; +} + +int cxip_unexp_start(struct fi_peer_rx_entry *entry); + /* * Request variants: * CXIP_RQ_AMO @@ -3702,7 +3723,9 @@ int cxip_set_recv_match_id(struct cxip_rxc *rxc, fi_addr_t src_addr, return FI_SUCCESS; } -fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req); +fi_addr_t cxip_recv_req_src_addr(struct cxip_rxc *rxc, + uint32_t init, uint16_t vni, + bool force); int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, struct cxip_md *md, struct cxip_req **cxip_req, int (*recv_cb)(struct cxip_req *req, @@ -3754,4 +3777,74 @@ int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni, struct c_dma_amo_cmd *amo, uint64_t flags, bool fetching, bool flush); +static inline void cxip_set_env_rx_match_mode(void) +{ + char *param_str = NULL; + + fi_param_get_str(&cxip_prov, "rx_match_mode", ¶m_str); + /* Parameters to tailor hybrid hardware to software transitions + * that are initiated by software. 
+ */ + fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive UX transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_preemptive", + &cxip_env.hybrid_preemptive); + fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive recv transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", + &cxip_env.hybrid_recv_preemptive); + fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", + &cxip_env.hybrid_unexpected_msg_preemptive); + fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", + &cxip_env.hybrid_posted_recv_preemptive); + + if (param_str) { + if (!strcasecmp(param_str, "hardware")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } else if (!strcmp(param_str, "software")) { + cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; + cxip_env.msg_offload = false; + } else if (!strcmp(param_str, "hybrid")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; + cxip_env.msg_offload = true; + } else { + _CXIP_WARN(FI_LOG_FABRIC, "Unrecognized rx_match_mode: %s\n", + param_str); + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_preemptive) { + cxip_env.hybrid_preemptive = false; + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignoring preemptive\n"); + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_recv_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid 
mode, ignore LE recv preemptive\n"); + cxip_env.hybrid_recv_preemptive = 0; + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_posted_recv_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); + cxip_env.hybrid_posted_recv_preemptive = 0; + } + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive) { + _CXIP_WARN(FI_LOG_FABRIC, "Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); + cxip_env.hybrid_unexpected_msg_preemptive = 0; + } +} + #endif diff --git a/prov/cxi/src/cxip_av.c b/prov/cxi/src/cxip_av.c index 6dd4aa4e415..031bf0fb22e 100644 --- a/prov/cxi/src/cxip_av.c +++ b/prov/cxi/src/cxip_av.c @@ -229,6 +229,18 @@ struct cxip_addr *(*cxip_av_addr_in)(const void *addr) = insert_in; void (*cxip_av_addr_out)(struct cxip_addr *addr_out, struct cxip_addr *addr) = insert_out; +static fi_addr_t cxip_get_addr(struct fi_peer_rx_entry *entry) +{ + uint32_t ux_init; + uint16_t vni; + struct cxip_ux_send *ux = entry->peer_context; + + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + vni = ux->put_ev.tgt_long.vni; + + return cxip_recv_req_src_addr(ux->rxc, ux_init, vni, true); +} + static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { @@ -236,6 +248,7 @@ static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, size_t i; size_t success_cnt = 0; int ret; + struct fid_peer_srx *owner_srx; ret = cxip_av_insert_validate_args(fid, addr_in, count, fi_addr, flags, context); @@ -253,6 +266,10 @@ static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, cxip_av_unlock(av); + owner_srx = av->domain->owner_srx; + if (owner_srx) + owner_srx->owner_ops->foreach_unspec_addr(owner_srx, &cxip_get_addr); + return success_cnt; } diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c index 
07cdb4a498a..798a0c89fa5 100644 --- a/prov/cxi/src/cxip_dom.c +++ b/prov/cxi/src/cxip_dom.c @@ -1556,6 +1556,85 @@ static int cxip_query_atomic(struct fid_domain *domain, return FI_SUCCESS; } +struct fi_ops_srx_peer cxip_srx_peer_ops = { + .size = sizeof(struct fi_ops_srx_peer), + .start_msg = cxip_unexp_start, + .start_tag = cxip_unexp_start, + .discard_msg = cxip_no_discard, + .discard_tag = cxip_no_discard, +}; + +static int cxip_srx_close(struct fid *fid) +{ + struct cxip_domain *dom; + + dom = container_of(fid, struct cxip_domain, rx_ep.fid); + + ofi_atomic_dec32(&dom->util_domain.ref); + + return FI_SUCCESS; +} + +static struct fi_ops cxip_srx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_srx_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_msg cxip_srx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops_tagged cxip_srx_tagged_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + .inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +static int cxip_srx_context(struct fid_domain *fid, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct cxip_domain *dom; + + if (!context || ! 
attr || !fid) + return -FI_EINVAL; + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + if (attr->op_flags & FI_PEER) { + dom->owner_srx = ((struct fi_peer_srx_context *) context)->srx; + dom->owner_srx->peer_ops = &cxip_srx_peer_ops; + dom->rx_ep.msg = &cxip_srx_msg_ops; + dom->rx_ep.tagged = &cxip_srx_tagged_ops; + dom->rx_ep.fid.ops = &cxip_srx_fi_ops; + dom->rx_ep.fid.fclass = FI_CLASS_SRX_CTX; + *rx_ep = &dom->rx_ep; + ofi_atomic_inc32(&dom->util_domain.ref); + return FI_SUCCESS; + } + + return -FI_ENOSYS; +} + static int cxip_query_collective(struct fid_domain *domain, enum fi_collective_op coll, struct fi_collective_attr *attr, @@ -1695,7 +1774,7 @@ static struct fi_ops_domain cxip_dom_ops = { .cntr_open = cxip_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_no_srx_context, + .srx_ctx = cxip_srx_context, .query_atomic = cxip_query_atomic, .query_collective = cxip_query_collective }; diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index 50ff7b9bd96..7be36c0d56d 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -925,6 +925,10 @@ int cxip_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) break; + case FI_CLASS_SRX_CTX: + ep->ep_obj->owner_srx = ep->ep_obj->domain->owner_srx; + break; + default: return -FI_EINVAL; } diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 6681aae0480..c0e0fc278b2 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -836,27 +836,8 @@ static void cxip_env_init(void) fi_param_define(&cxip_prov, "rx_match_mode", FI_PARAM_STRING, "Sets RX message match mode (hardware | software | hybrid)."); - fi_param_get_str(&cxip_prov, "rx_match_mode", ¶m_str); - if (param_str) { - if (!strcasecmp(param_str, "hardware")) { - cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; - cxip_env.msg_offload = true; - } else if (!strcmp(param_str, "software")) { - cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; - 
cxip_env.msg_offload = false; - } else if (!strcmp(param_str, "hybrid")) { - cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; - cxip_env.msg_offload = true; - } else { - CXIP_WARN("Unrecognized rx_match_mode: %s\n", - param_str); - cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; - cxip_env.msg_offload = true; - } - - param_str = NULL; - } + cxip_set_env_rx_match_mode(); fi_param_define(&cxip_prov, "rdzv_threshold", FI_PARAM_SIZE_T, "Message size threshold for rendezvous protocol."); @@ -1044,54 +1025,6 @@ static void cxip_env_init(void) fi_param_get_size_t(&cxip_prov, "req_buf_max_cached", &cxip_env.req_buf_max_cached); - /* Parameters to tailor hybrid hardware to software transitions - * that are initiated by software. - */ - fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, - "Enable/Disable low LE preemptive UX transitions."); - fi_param_get_bool(&cxip_prov, "hybrid_preemptive", - &cxip_env.hybrid_preemptive); - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_preemptive) { - cxip_env.hybrid_preemptive = false; - CXIP_WARN("Not in hybrid mode, ignoring preemptive\n"); - } - - fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, - "Enable/Disable low LE preemptive recv transitions."); - fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", - &cxip_env.hybrid_recv_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_recv_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore LE recv preemptive\n"); - cxip_env.hybrid_recv_preemptive = 0; - } - - fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", - FI_PARAM_BOOL, - "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); - fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", - &cxip_env.hybrid_posted_recv_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_posted_recv_preemptive) { - CXIP_WARN("Not in hybrid 
mode, ignore hybrid_posted_recv_preemptive\n"); - cxip_env.hybrid_posted_recv_preemptive = 0; - } - - fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", - FI_PARAM_BOOL, - "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); - fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", - &cxip_env.hybrid_unexpected_msg_preemptive); - - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_unexpected_msg_preemptive) { - CXIP_WARN("Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); - cxip_env.hybrid_unexpected_msg_preemptive = 0; - } - if (cxip_software_pte_allowed()) { min_free = CXIP_REQ_BUF_HEADER_MAX_SIZE + cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index 29c9589baa0..ef8356943c2 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -23,26 +23,25 @@ /* * cxip_recv_req_src_addr() - Translate request source address to FI address. */ -fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req) +fi_addr_t cxip_recv_req_src_addr(struct cxip_rxc *rxc, + uint32_t init, uint16_t vni, + bool force) { - struct cxip_rxc *rxc = req->recv.rxc; - /* If the FI_SOURCE capability is enabled, convert the initiator's * address to an FI address to be reported in a CQ event. If * application AVs are symmetric, the match_id in the EQ event is * logical and translation is not needed. Otherwise, translate the * physical address in the EQ event to logical FI address. 
*/ - if (rxc->attr.caps & FI_SOURCE) { + if ((rxc->attr.caps & FI_SOURCE) || force) { struct cxip_addr addr = {}; if (rxc->ep_obj->av->symmetric) - return CXI_MATCH_ID_EP(rxc->pid_bits, - req->recv.initiator); + return CXI_MATCH_ID_EP(rxc->pid_bits, init); - addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); - addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, req->recv.initiator); - addr.vni = req->recv.vni; + addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + addr.vni = vni; return cxip_av_lookup_fi_addr(rxc->ep_obj->av, &addr); } @@ -118,6 +117,7 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, void cxip_recv_req_free(struct cxip_req *req) { struct cxip_rxc *rxc = req->recv.rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(rxc); assert(req->type == CXIP_REQ_RECV); assert(dlist_empty(&req->recv.children)); @@ -128,6 +128,9 @@ void cxip_recv_req_free(struct cxip_req *req) if (req->recv.recv_md && !req->recv.hybrid_md) cxip_unmap(req->recv.recv_md); + if (owner_srx && req->rx_entry) + owner_srx->owner_ops->free_entry(req->rx_entry); + cxip_evtq_req_free(req); } @@ -150,7 +153,8 @@ static inline int recv_req_event_success(struct cxip_rxc *rxc, } if (req->recv.rxc->attr.caps & FI_SOURCE) { - src_addr = cxip_recv_req_src_addr(req); + src_addr = cxip_recv_req_src_addr(req->recv.rxc, req->recv.initiator, + req->recv.vni, false); if (src_addr != FI_ADDR_NOTAVAIL || !(rxc->attr.caps & FI_SOURCE_ERR)) return cxip_cq_req_complete_addr(req, src_addr); diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index 89de6e4f01b..c6e0bcc35fd 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -2058,7 +2058,7 @@ static void cxip_ux_onload_complete(struct cxip_req *req) rxc->sw_pending_ux_list_len = 0; RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", - rxc->sw_ux_list_len); + rxc->sw_ux_list_len); if (rxc->base.state == 
RXC_PENDING_PTLTE_SOFTWARE_MANAGED) cxip_post_ux_onload_sw(rxc); @@ -2125,6 +2125,7 @@ static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) struct cxip_deferred_event *def_ev; struct cxip_ux_send *ux_send; bool matched; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL || rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || @@ -2179,8 +2180,13 @@ static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) } rxc->cur_ule_offsets++; - dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); - rxc->sw_ux_list_len++; + /* TODO: support onloading in peer mode */ + if (owner_srx) { + RXC_FATAL(rxc, "Software onloading is currently not supported in peer mode\n"); + } else { + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + } RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); @@ -3033,7 +3039,9 @@ static void cxip_set_ux_dump_entry(struct cxip_req *req, } if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE) - *src_addr = cxip_recv_req_src_addr(req); + *src_addr = cxip_recv_req_src_addr(req->recv.rxc, + req->recv.initiator, + req->recv.vni, false); } } @@ -3317,6 +3325,192 @@ static int cxip_recv_sw_matcher(struct cxip_rxc_hpc *rxc, struct cxip_req *req, return ret; } +static int +cxip_recv_req_init(struct cxip_rxc *rxc, void *buf, size_t len, fi_addr_t addr, + uint64_t tag, uint64_t ignore, uint64_t flags, bool tagged, + void *context, struct cxip_cntr *comp_cntr, + struct cxip_req **req_out) +{ + struct cxip_req *req; + uint32_t match_id; + int ret; + uint16_t vni; + + if (len && !buf) { + ret = -FI_EINVAL; + goto err; + } + + if (rxc->state == RXC_DISABLED) { + ret = -FI_EOPBADSTATE; + goto err; + } + + /* HW to SW PtlTE transition, ensure progress is made */ + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(rxc->recv_cq); + ret = -FI_EAGAIN; + goto err; + } + + if (tagged) { + if 
(tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_TAG_MASK); + ret = -FI_EINVAL; + goto err; + } + flags &= ~FI_MULTI_RECV; + } + + ret = cxip_set_recv_match_id(rxc, addr, rxc->ep_obj->av_auth_key && + (flags & FI_AUTH_KEY), &match_id, &vni); + if (ret) { + RXC_WARN(rxc, "Error setting match_id: %d %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + ofi_genlock_unlock(&rxc->ep_obj->lock); + if (ret) + return ret; + + /* req->data_len, req->tag, req->data must be set later. req->buf may + * be overwritten later. + */ + req->context = (uint64_t)context; + + req->flags = FI_RECV | (flags & FI_COMPLETION); + if (tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + + req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; + req->recv.match_id = match_id; + req->recv.tag = tag; + req->recv.ignore = ignore; + req->recv.flags = flags; + req->recv.tagged = tagged; + req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + *req_out = req; + + return FI_SUCCESS; + +err: + return ret; +} + +int cxip_unexp_start(struct fi_peer_rx_entry *rx_entry) +{ + int ret; + struct cxip_ux_send *ux; + union cxip_match_bits ux_mb; + struct cxip_req *req; + struct cxip_rxc *rxc; + + ux = rx_entry->peer_context; + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + rxc = ux->rxc; + + ret = cxip_recv_req_init(rxc, rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len, rx_entry->addr, + rx_entry->tag, 0, rx_entry->flags, + ux_mb.tagged, rx_entry->context, NULL, &req); + if (ret) + return ret; + + req->rx_entry = rx_entry; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return ret; + + /* FI_EINPROGRESS is return for a multi-recv match. 
*/ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + if (ux->req && ux->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux); + else + free(ux); + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p\n", req, ux); + + return ret; +} + +static int cxip_process_srx_ux_matcher(struct cxip_rxc *rxc, + struct fid_peer_srx *owner_srx, struct cxip_ux_send *ux) +{ + int ret; + uint32_t ux_init; + union cxip_match_bits ux_mb; + struct fi_peer_rx_entry *rx_entry = NULL; + struct cxip_req *req; + uint16_t vni; + struct fi_peer_match_attr match = {0}; + + /* stash the rxc because we're going to need it if the peer + * address isn't already inserted into the AV table. + */ + ux->rxc = rxc; + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + vni = ux->put_ev.tgt_long.vni; + + match.addr = cxip_recv_req_src_addr(rxc, ux_init, vni, true); + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + + if (ux_mb.tagged) { + match.tag = ux_mb.tag; + ret = owner_srx->owner_ops->get_tag(owner_srx, &match, &rx_entry); + } else { + ret = owner_srx->owner_ops->get_msg(owner_srx, &match, &rx_entry); + } + + /* return it back to the caller */ + ux->rx_entry = rx_entry; + + if (ret == -FI_ENOENT) { + /* this is used when the owner calls start_msg */ + rx_entry->peer_context = ux; + return -FI_ENOMSG; + } else if (ret) { + return ret; + } + + ret = cxip_recv_req_init(rxc, rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len, rx_entry->addr, + rx_entry->tag, 0, rx_entry->flags, + ux_mb.tagged, rx_entry->context, NULL, &req); + if (ret) + return ret; + + req->rx_entry = rx_entry; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + + /* FI_EINPROGRESS is return for a multi-recv match. 
*/ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + if (ux->req && ux->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux); + else + free(ux); + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p\n", req, ux); + + return ret; +} + /* * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user * posted receive. @@ -3327,10 +3521,17 @@ int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) { struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; struct cxip_rxc_hpc *rxc = rbuf->rxc; + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); struct cxip_req *req; struct dlist_entry *tmp; int ret; + if (owner_srx) { + /* we never add anything on the sw_ux_list */ + rxc->sw_ux_list_len--; + return cxip_process_srx_ux_matcher(&rxc->base, owner_srx, ux); + } + if (dlist_empty(&rxc->sw_recv_queue)) return -FI_ENOMSG; @@ -3987,71 +4188,16 @@ cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, int ret; struct cxip_req *req; struct cxip_ux_send *ux_msg; - uint32_t match_id; - uint16_t vni; assert(rxc_hpc->base.protocol == FI_PROTO_CXI); - if (len && !buf) - return -FI_EINVAL; - - if (rxc->state == RXC_DISABLED) - return -FI_EOPBADSTATE; - - /* HW to SW PtlTE transition, ensure progress is made */ - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - cxip_cq_progress(rxc->recv_cq); - return -FI_EAGAIN; - } - - if (tagged) { - if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { - RXC_WARN(rxc, - "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", - tag, ignore, CXIP_TAG_MASK); - return -FI_EINVAL; - } - } - - ret = cxip_set_recv_match_id(rxc, src_addr, rxc->ep_obj->av_auth_key && - (flags & FI_AUTH_KEY), &match_id, &vni); - if (ret) { - RXC_WARN(rxc, "Error setting match_id: %d %s\n", - ret, fi_strerror(-ret)); - return ret; - } - - ofi_genlock_lock(&rxc->ep_obj->lock); - ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + ret = cxip_recv_req_init(rxc, buf, len, src_addr, tag, 
ignore, flags, + tagged, context, comp_cntr, &req); if (ret) goto err; - /* req->data_len, req->tag, req->data must be set later. req->buf may - * be overwritten later. - */ - req->context = (uint64_t)context; - - req->flags = FI_RECV | (flags & FI_COMPLETION); - if (tagged) - req->flags |= FI_TAGGED; - else - req->flags |= FI_MSG; - - req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; - req->recv.match_id = match_id; - req->recv.tag = tag; - req->recv.ignore = ignore; - req->recv.flags = flags; - req->recv.tagged = tagged; - req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); - - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - ret = -FI_EAGAIN; - goto err_free_request; - } - + ofi_genlock_lock(&rxc->ep_obj->lock); if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { - ret = cxip_recv_req_queue(req, false); /* Match made in software? */ if (ret == -FI_EALREADY) { @@ -4110,9 +4256,8 @@ cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, err_free_request: cxip_recv_req_free(req); -err: ofi_genlock_unlock(&rxc->ep_obj->lock); - +err: return ret; } diff --git a/prov/cxi/src/cxip_req_buf.c b/prov/cxi/src/cxip_req_buf.c index 4a4624c59b7..09b1432b214 100644 --- a/prov/cxi/src/cxip_req_buf.c +++ b/prov/cxi/src/cxip_req_buf.c @@ -150,10 +150,22 @@ static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, "rbuf=%p ux=%p sw_pending_ux_list_len=%u\n", buf, ux, buf->rxc->sw_pending_ux_list_len); } else { - dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + struct fid_peer_srx *owner_srx = cxip_get_owner_srx(&rxc->base); - RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", - buf, ux, buf->rxc->sw_ux_list_len); + if (owner_srx) { + union cxip_match_bits ux_mb; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + + if (ux_mb.tagged) + owner_srx->owner_ops->queue_tag(ux->rx_entry); + else + owner_srx->owner_ops->queue_msg(ux->rx_entry); + } else { + dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + 
RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_ux_list_len); + } } break; diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c index 72b1ed92c43..cc3f3e9f91a 100644 --- a/prov/cxi/src/cxip_rxc.c +++ b/prov/cxi/src/cxip_rxc.c @@ -402,6 +402,13 @@ struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context) { struct cxip_rxc *rxc = NULL; + /* + * It's possible the owner provider decides to turn off + * hardware offload in cxi. If that happens we need to update the + * rx_match_mode. + */ + cxip_set_env_rx_match_mode(); + switch (ep_obj->protocol) { case FI_PROTO_CXI: rxc = calloc(1, sizeof(struct cxip_rxc_hpc)); From f71668a69e0ed78957e0daefca5e4cb9c0026a3b Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 2 May 2024 15:21:20 -0400 Subject: [PATCH 194/393] prov/cxi: Add FI_PEER capability bit Add the FI_PEER capability bit to the CXI provider fi_info Signed-off-by: Amir Shehata --- prov/cxi/include/cxip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 5ac64578697..61fd43be6b2 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -138,7 +138,7 @@ (FI_SOURCE | FI_SOURCE_ERR | FI_LOCAL_COMM | \ FI_REMOTE_COMM | FI_RMA_EVENT | FI_MULTI_RECV | FI_FENCE | FI_TRIGGER) #define CXIP_EP_CAPS (CXIP_EP_PRI_CAPS | CXIP_EP_SEC_CAPS) -#define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) +#define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID | FI_PEER) #define CXIP_CAPS (CXIP_DOM_CAPS | CXIP_EP_CAPS) #define CXIP_MSG_ORDER (FI_ORDER_SAS | \ FI_ORDER_WAW | \ From 871da0fd3adf2545a518c20a067e7e740f9f6a1a Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 28 Oct 2024 16:47:45 -0500 Subject: [PATCH 195/393] prov/cxi: Remove srx unittests The cxi provider shared RX unittests verify that the cxi provider returns -FI_ENOSYS. With linkx support, this is not the case. Remove these now invalid tests. 
Signed-off-by: Ian Ziemba --- prov/cxi/test/ep.c | 58 ---------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/prov/cxi/test/ep.c b/prov/cxi/test/ep.c index 3292138f049..c49029ef808 100644 --- a/prov/cxi/test/ep.c +++ b/prov/cxi/test/ep.c @@ -295,17 +295,6 @@ Test(ep, ep_bind_stx_ctx) "TODO Add test for STX CTXs binding to the endpoint when implemented"); } -Test(ep, ep_bind_srx_ctx) -{ - int ret; - struct fi_rx_attr *attr = NULL; - void *context = NULL; - - ret = fi_srx_context(cxit_domain, attr, NULL, context); - cr_assert_eq(ret, -FI_ENOSYS, - "TODO Add test for SRX CTXs binding to the endpoint when implemented"); -} - Test(ep, ep_bind_unhandled) { int ret; @@ -970,53 +959,6 @@ Test(ep, stx_ctx) cr_assert_eq(ret, FI_SUCCESS, "fi_close stx_ep. %d", ret); } -Test(ep, srx_ctx_null_srx) -{ - int ret; - struct fi_rx_attr *attr = NULL; - void *context = NULL; - - ret = fi_srx_context(cxit_domain, attr, NULL, context); - /* TODO Fix when fi_srx_context is implemented, should be -FI_EINVAL */ - cr_assert_eq(ret, -FI_ENOSYS, "fi_srx_context null srx. %d", ret); -} - -Test(ep, srx_ctx) -{ - int ret; - struct fi_rx_attr *attr = NULL; - struct fid_ep *srx; - struct cxip_ep *srx_ep; - void *context = &ret; - struct cxip_domain *dom; - struct cxip_rxc *rxc; - int refs; - - dom = container_of(cxit_domain, struct cxip_domain, - util_domain.domain_fid); - refs = ofi_atomic_get32(&dom->ref); - - ret = fi_srx_context(cxit_domain, attr, &srx, context); - /* TODO Fix when fi_srx_context is implemented, should be FI_SUCCESS */ - cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context failed. 
%d", ret); - if (ret == -FI_ENOSYS) - return; - - srx_ep = container_of(srx, struct cxip_ep, ep); - rxc = srx_ep->ep_obj->rxc; - - /* Validate stx */ - cr_assert_eq(rxc->domain, dom); - cr_assert_eq(ofi_atomic_inc32(&dom->ref), refs + 1); - cr_assert_eq(srx_ep->ep.fid.fclass, FI_CLASS_RX_CTX); - cr_assert_eq(srx_ep->ep.fid.context, context); - cr_assert_eq(rxc->state, RXC_ENABLED); - cr_assert_eq(rxc->min_multi_recv, CXIP_EP_MIN_MULTI_RECV); - - ret = fi_close(&srx->fid); - cr_assert_eq(ret, FI_SUCCESS, "fi_close srx_ep. %d", ret); -} - TestSuite(ep_init, .timeout = CXIT_DEFAULT_TIMEOUT); Test(ep_init, auth_key) From 9f99aef697fcef1824df42a39c8f0e17f66c4839 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:35:49 +0000 Subject: [PATCH 196/393] build(deps): bump actions/checkout from 4.2.1 to 4.2.2 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.1 to 4.2.2. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871...11bd71901bbe5b1630ceea73d27597364c9af683) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/clang-format-check.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/coverity.yml | 2 +- .github/workflows/gh-man.yaml | 2 +- .github/workflows/nroff-elves.yaml | 2 +- .github/workflows/pr-ci.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 47ca4512d60..704b1c91ceb 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -11,7 +11,7 @@ jobs: path: - 'prov/sm2' steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run clang-format style check for C/C++/Protobuf programs. uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 with: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 934c372aa2b..1f865e2a2fe 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -48,7 +48,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 3eb28e6a0e8..4cad165bc2b 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -52,7 +52,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Download Coverity tools run: | wget https://scan.coverity.com/download/linux64 --post-data "token=${{ secrets.COVERITY_SCAN_TOKEN }}&project=ofiwg%2Flibfabric" -O coverity_tool.tgz diff --git a/.github/workflows/gh-man.yaml b/.github/workflows/gh-man.yaml index 754604332a1..44ad72bb2ca 100644 --- a/.github/workflows/gh-man.yaml +++ b/.github/workflows/gh-man.yaml @@ -25,7 +25,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Update the man pages in branch gh-pages run: .github/workflows/gh-man.sh diff --git a/.github/workflows/nroff-elves.yaml b/.github/workflows/nroff-elves.yaml index 5d72b4d1ea8..d8dea720789 100644 --- a/.github/workflows/nroff-elves.yaml +++ b/.github/workflows/nroff-elves.yaml @@ -23,7 +23,7 @@ jobs: echo "$GITHUB_DATA" - name: Check out the git repo - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Get the required packages run: sudo apt install -y pandoc diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 1a73450e5de..9d595f5f844 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -56,7 +56,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + 
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Build Check run: | set -x @@ -96,7 +96,7 @@ jobs: sudo apt-add-repository 'deb [arch=amd64] https://repositories.intel.com/graphics/ubuntu focal main' sudo apt-get update sudo apt-get install -y level-zero level-zero-dev - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: HMEM Checks run: | set -x @@ -126,7 +126,7 @@ jobs: run: | brew install automake brew install --quiet libtool - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Build Check run: | ./autogen.sh diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 2fb410afcfe..7bc2aa0c4d3 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -33,7 +33,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false From 0dd07314f82d18dc75234317089b540ebb7f9be5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:35:59 +0000 Subject: [PATCH 197/393] build(deps): bump github/codeql-action from 3.26.13 to 3.27.0 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.13 to 3.27.0. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/f779452ac5af1c261dce0346a8f964149f49322b...662472033e021d55d94146f66f6058822b0b39fd) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 1f865e2a2fe..6ab9813507f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/autobuild@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 # â„šī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 7bc2aa0c4d3..7d8974b3e3d 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: sarif_file: results.sarif From 2942e545408ff967cd2b7177a8130df64b48f931 Mon Sep 17 00:00:00 2001 From: Steve Welch Date: Tue, 29 Oct 2024 09:00:01 -0500 Subject: [PATCH 198/393] prov/cxi: Report RMA order used in debug output Report order used for an RMA operation in debug and error output. Signed-off-by: Steve Welch --- prov/cxi/src/cxip_rma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c index 7b691c17e20..9aa1ace679f 100644 --- a/prov/cxi/src/cxip_rma.c +++ b/prov/cxi/src/cxip_rma.c @@ -658,12 +658,14 @@ ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, if (ret) TXC_WARN(txc, - "%s RMA %s failed: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + "%s %s RMA %s failed: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + unr ? "Ordered" : "Un-ordered", idc ? "IDC" : "DMA", write ? 
"write" : "read", buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); else TXC_DBG(txc, - "%s RMA %s emitted: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + "%s %s RMA %s emitted: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + unr ? "Ordered" : "Un-ordered", idc ? "IDC" : "DMA", write ? "write" : "read", buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); From f16e8be24aba3a4cafe2c7e544aa0d8e27b15d3c Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 28 Oct 2024 17:56:23 +0000 Subject: [PATCH 199/393] src/common.c: Update ofi_vrb_speed 1. Fixes the QDR and FDR10 ib speed (8->10) 2. Support HDR (64) and NDR (128) ib speed 3. Support width enum 16 All the values are referenced to upstream rdma-core: https://github.com/linux-rdma/rdma-core/blob/master/libibverbs/examples/devinfo.c#L126C20-L154 Signed-off-by: Shi Jin --- src/common.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/common.c b/src/common.c index a5f5ba5a22e..0d641ac74a1 100644 --- a/src/common.c +++ b/src/common.c @@ -2385,7 +2385,7 @@ size_t ofi_vrb_speed(uint8_t speed, uint8_t width) break; case 4: case 8: - speed_val = 8 * gbit_2_bit_coef; + speed_val = 10 * gbit_2_bit_coef; break; case 16: speed_val = 14 * gbit_2_bit_coef; @@ -2393,6 +2393,12 @@ size_t ofi_vrb_speed(uint8_t speed, uint8_t width) case 32: speed_val = 25 * gbit_2_bit_coef; break; + case 64: + speed_val = 50 * gbit_2_bit_coef; + break; + case 128: + speed_val = 100 * gbit_2_bit_coef; + break; default: speed_val = 0; break; @@ -2411,6 +2417,9 @@ size_t ofi_vrb_speed(uint8_t speed, uint8_t width) case 8: width_val = 12; break; + case 16: + width_val = 2; + break; default: width_val = 0; break; From 5ae97ef2043d4ad7378764e7624c576e2c6b199e Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 24 Oct 2024 15:41:50 -0700 Subject: [PATCH 200/393] prov/efa: Make efa_hmem_info a global variable Currently efa_hmem_info is part of efa_domain and created for every efa 
domain. hmem_info init involves several operations like device memory allocation / free, and trial ibv reg mr, which is expensive and can potentially cause more memory usage. Make efa_hmem_info a global variable and call it only once per process. Remove p2p_disabled_by_user and p2p_required_by_impl from efa_hmem_info since they are only used for ep level operations. Signed-off-by: Jessie Yang --- prov/efa/src/efa_domain.c | 7 --- prov/efa/src/efa_domain.h | 1 - prov/efa/src/efa_hmem.c | 70 +++++++++++------------------ prov/efa/src/efa_hmem.h | 8 ++-- prov/efa/src/efa_mr.c | 4 +- prov/efa/src/efa_prov.c | 4 ++ prov/efa/src/rdm/efa_rdm_ep.h | 2 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 36 +++++++++------ prov/efa/src/rdm/efa_rdm_msg.c | 12 +++-- prov/efa/src/rdm/efa_rdm_peer.c | 6 +-- prov/efa/src/rdm/efa_rdm_rma.c | 8 ++-- prov/efa/test/efa_unit_test_ep.c | 23 +++++----- prov/efa/test/efa_unit_test_hmem.c | 47 ++++++------------- prov/efa/test/efa_unit_test_runt.c | 8 +--- 14 files changed, 96 insertions(+), 140 deletions(-) diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 130cfc052a9..2e81aafa666 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -297,13 +297,6 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, goto err_free; } - err = efa_domain_hmem_info_init_all(efa_domain); - if (err) { - ret = err; - EFA_WARN(FI_LOG_DOMAIN, "Failed to check hmem support status. 
err: %d\n", ret); - goto err_free; - } - dlist_insert_tail(&efa_domain->list_entry, &g_efa_domain_list); return 0; diff --git a/prov/efa/src/efa_domain.h b/prov/efa/src/efa_domain.h index 2eaf7fc06ed..6fa13e0bd8d 100644 --- a/prov/efa/src/efa_domain.h +++ b/prov/efa/src/efa_domain.h @@ -22,7 +22,6 @@ struct efa_domain { struct ofi_mr_cache *cache; struct efa_qp **qp_table; size_t qp_table_sz_m1; - struct efa_hmem_info hmem_info[OFI_HMEM_MAX]; size_t mtu_size; size_t addrlen; bool mr_local; diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 15f2513bf79..3c713221711 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -5,16 +5,18 @@ #include "efa_hmem.h" #include "rdm/efa_rdm_pkt_type.h" +struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; + #if HAVE_CUDA || HAVE_NEURON -static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_domain) { +static size_t efa_max_eager_msg_size_with_largest_header() { int mtu_size; - mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size; + mtu_size = g_device_list[0].rdm_info->ep_attr->max_msg_size; return mtu_size - efa_rdm_pkt_type_get_max_hdr_size(); } #else -static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_domain) { +static size_t efa_max_eager_msg_size_with_largest_header() { return 0; } #endif @@ -23,14 +25,13 @@ static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_ * @brief Initialize the various protocol thresholds tracked in efa_hmem_info * according to the given FI_HMEM interface. 
* - * @param[in,out] efa_domain Pointer to struct efa_domain * @param[in] iface The FI_HMEM interface to initialize * * @return 0 */ -static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_domain, enum fi_hmem_iface iface) +static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) { - struct efa_hmem_info *info = &efa_domain->hmem_info[iface]; + struct efa_hmem_info *info = &g_efa_hmem_info[iface]; size_t tmp_value; /* Fall back to FI_HMEM_SYSTEM initialization logic when p2p is @@ -53,8 +54,8 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ case FI_HMEM_CUDA: info->runt_size = EFA_DEFAULT_RUNT_SIZE; info->max_medium_msg_size = 0; - info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; - info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; + info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1; + info->min_read_write_size = efa_max_eager_msg_size_with_largest_header() + 1; fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size); fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); @@ -68,8 +69,8 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ case FI_HMEM_NEURON: info->runt_size = EFA_NEURON_RUNT_SIZE; info->max_medium_msg_size = 0; - info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; - info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; + info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1; + info->min_read_write_size = efa_max_eager_msg_size_with_largest_header() + 1; fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size); fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); 
fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); @@ -105,7 +106,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ return 0; } -static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info) { +static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info) { #if HAVE_CUDA cudaError_t cuda_ret; void *ptr = NULL; @@ -168,7 +169,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i return; } -static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) { +static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) { #if HAVE_NEURON struct ibv_mr *ibv_mr = NULL; int ibv_access = IBV_ACCESS_LOCAL_WRITE; @@ -239,13 +240,12 @@ static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem /** * @brief Initialize the efa_hmem_info state for iface * - * @param[in,out] efa_domain Pointer to struct efa_domain * @param[in] iface HMEM interface */ static void -efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_iface iface) +efa_hmem_info_init_iface(enum fi_hmem_iface iface) { - struct efa_hmem_info *info = &efa_domain->hmem_info[iface]; + struct efa_hmem_info *info = &g_efa_hmem_info[iface]; if (!ofi_hmem_is_initialized(iface)) { EFA_INFO(FI_LOG_DOMAIN, "%s is not initialized\n", @@ -262,41 +262,27 @@ efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_ifac } info->initialized = true; - info->p2p_disabled_by_user = (iface == FI_HMEM_SYSTEM) ? 
false : ofi_hmem_p2p_disabled(); if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) { info->p2p_supported_by_device = true; - } else if (info->p2p_disabled_by_user) { + } else if (ofi_hmem_p2p_disabled()) { info->p2p_supported_by_device = false; } else { if (iface == FI_HMEM_CUDA) - efa_domain_hmem_info_check_p2p_support_cuda(info); + efa_hmem_info_check_p2p_support_cuda(info); if (iface == FI_HMEM_NEURON) - efa_domain_hmem_info_check_p2p_support_neuron(info); + efa_hmem_info_check_p2p_support_neuron(info); if (!info->p2p_supported_by_device) EFA_INFO(FI_LOG_DOMAIN, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); } - info->p2p_required_by_impl = true; - /* If user is using libfabric API 1.18 or later, by default EFA - * provider is permitted to use CUDA library to support CUDA - * memory, therefore p2p is not required. - */ - if (iface == FI_HMEM_CUDA && - FI_VERSION_GE(efa_domain->util_domain.fabric->fabric_fid.api_version, FI_VERSION(1, 18))) - info->p2p_required_by_impl = !hmem_ops[iface].initialized; - if (iface == FI_HMEM_SYSTEM) - info->p2p_required_by_impl = false; - - efa_domain_hmem_info_init_protocol_thresholds(efa_domain, iface); + efa_domain_hmem_info_init_protocol_thresholds(iface); } /** * @brief Validate an FI_OPT_FI_HMEM_P2P (FI_OPT_ENDPOINT) option for a * specified HMEM interface. - * Also update hmem_info[iface]->p2p_disabled_by_user accordingly. 
* - * @param[in,out] domain The efa_domain struct which contains an efa_hmem_info array * @param[in] iface The fi_hmem_iface enum of the FI_HMEM interface to validate * @param[in] p2p_opt The P2P option to validate * @@ -305,9 +291,9 @@ efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_ifac * -FI_ENODATA if the given HMEM interface was not initialized * -FI_EINVAL if p2p_opt is not a valid FI_OPT_FI_HMEM_P2P option */ -int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem_iface iface, int p2p_opt) +int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version) { - struct efa_hmem_info *info = &efa_domain->hmem_info[iface]; + struct efa_hmem_info *info = &g_efa_hmem_info[iface]; if (OFI_UNLIKELY(!info->initialized)) return -FI_ENODATA; @@ -317,7 +303,6 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem if (OFI_UNLIKELY(ofi_hmem_p2p_disabled()) || !info->p2p_supported_by_device) return -FI_EOPNOTSUPP; - info->p2p_disabled_by_user = false; return 0; /* * According to fi_setopt() document: @@ -334,14 +319,13 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem if (OFI_UNLIKELY(ofi_hmem_p2p_disabled())) return -FI_EOPNOTSUPP; - info->p2p_disabled_by_user = false; return 0; case FI_HMEM_P2P_DISABLED: - if (info->p2p_required_by_impl) + /* return -FI_EOPNOTSUPP if p2p is required by implementation */ + if (iface != FI_HMEM_CUDA || FI_VERSION_LT(api_version, FI_VERSION(1, 18))) return -FI_EOPNOTSUPP; - info->p2p_disabled_by_user = true; return 0; } @@ -354,12 +338,10 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem * struct will be used to determine which efa transfer * protocol should be selected. 
* - * @param[in,out] efa_domain Pointer to struct efa_domain to be initialized - * * @return 0 on success * negative libfabric error code on an unexpected error */ -int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain) +int efa_hmem_info_initialize() { int ret = 0, i = 0; @@ -367,10 +349,10 @@ int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain) return -FI_ENODEV; } - memset(efa_domain->hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info)); + memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info)); EFA_HMEM_IFACE_FOREACH(i) { - efa_domain_hmem_info_init_iface(efa_domain, efa_hmem_ifaces[i]); + efa_hmem_info_init_iface(efa_hmem_ifaces[i]); } return ret; diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index e18c0e4c534..858b7035883 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -23,8 +23,6 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = { struct efa_hmem_info { bool initialized; /* do we support it at all */ - bool p2p_disabled_by_user; /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */ - bool p2p_required_by_impl; /* Is p2p required for this interface? 
*/ bool p2p_supported_by_device; /* do we support p2p with this device */ size_t max_medium_msg_size; @@ -33,10 +31,12 @@ struct efa_hmem_info { size_t min_read_write_size; }; +extern struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; + struct efa_domain; -int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem_iface iface, int p2p_opt); -int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain); +int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version); +int efa_hmem_info_initialize(); /** * @brief Copy data from a hmem device to a system buffer diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index 0307914aff2..1e1f803b777 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -192,7 +192,7 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, } if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM) { - if (efa_mr->domain->hmem_info[attr->iface].initialized) { + if (g_efa_hmem_info[attr->iface].initialized) { efa_mr->peer.iface = attr->iface; } else { EFA_WARN(FI_LOG_MR, @@ -813,7 +813,7 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, const void *at * For FI_HMEM_CUDA iface when p2p is unavailable, skip ibv_reg_mr() and * generate proprietary mr_fid key. 
*/ - if (mr_attr.iface == FI_HMEM_CUDA && !efa_mr->domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) { + if (mr_attr.iface == FI_HMEM_CUDA && !g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) { efa_mr->mr_fid.key = efa_mr_cuda_non_p2p_keygen(); } else { efa_mr->ibv_mr = efa_mr_reg_ibv_mr(efa_mr, &mr_attr, fi_ibv_access, flags); diff --git a/prov/efa/src/efa_prov.c b/prov/efa/src/efa_prov.c index 85a71aa2c41..2dd5b42fecb 100644 --- a/prov/efa/src/efa_prov.c +++ b/prov/efa/src/efa_prov.c @@ -164,6 +164,10 @@ EFA_INI if (err) goto err_free; + err = efa_hmem_info_initialize(); + if (err) + goto err_free; + dlist_init(&g_efa_domain_list); return &efa_prov; diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 316bab93d98..cebf968439c 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -307,7 +307,7 @@ int efa_rdm_ep_use_p2p(struct efa_rdm_ep *efa_rdm_ep, struct efa_mr *efa_mr) if (!efa_mr || efa_mr->peer.iface == FI_HMEM_SYSTEM) return 1; - if (efa_rdm_ep_domain(efa_rdm_ep)->hmem_info[efa_mr->peer.iface].p2p_supported_by_device) + if (g_efa_hmem_info[efa_mr->peer.iface].p2p_supported_by_device) return (efa_rdm_ep->hmem_p2p_opt != FI_HMEM_P2P_DISABLED); if (efa_rdm_ep->hmem_p2p_opt == FI_HMEM_P2P_REQUIRED) { diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 01a4b3fd909..014ade78b46 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -434,7 +434,6 @@ static inline void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) { enum fi_hmem_iface iface; - struct efa_hmem_info *hmem_info; uint64_t unsupported_caps = FI_DIRECTED_RECV | FI_TAGGED | FI_ATOMIC; ep->use_zcpy_rx = true; @@ -482,11 +481,11 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) } /* Zero-copy receive requires P2P support. Disable it if any initialized HMEM iface does not support P2P. 
*/ - for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; ++iface) { - hmem_info = &ep->base_ep.domain->hmem_info[iface]; - if (hmem_info->initialized && - !hmem_info->p2p_disabled_by_user && - !hmem_info->p2p_supported_by_device) { + EFA_HMEM_IFACE_FOREACH(iface) { + if (g_efa_hmem_info[iface].initialized && + !ofi_hmem_p2p_disabled() && + ep->hmem_p2p_opt != FI_HMEM_P2P_DISABLED && + !g_efa_hmem_info[iface].p2p_supported_by_device) { EFA_INFO(FI_LOG_EP_CTRL, "%s does not support P2P, zero-copy receive " "protocol will be disabled\n", @@ -530,6 +529,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, struct efa_domain *efa_domain = NULL; struct efa_rdm_ep *efa_rdm_ep = NULL; int ret, retv, i; + enum fi_hmem_iface iface; efa_rdm_ep = calloc(1, sizeof(*efa_rdm_ep)); if (!efa_rdm_ep) @@ -606,6 +606,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep_init_linked_lists(efa_rdm_ep); + efa_rdm_ep->cuda_api_permitted = (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18))); /* Set hmem_p2p_opt */ efa_rdm_ep->hmem_p2p_opt = FI_HMEM_P2P_DISABLED; @@ -615,16 +616,21 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, * tighter requirements for the default p2p opt */ EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - if (efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].initialized && - efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].p2p_supported_by_device) { - efa_rdm_ep->hmem_p2p_opt = efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].p2p_required_by_impl - ? FI_HMEM_P2P_REQUIRED - : FI_HMEM_P2P_PREFERRED; + iface = efa_hmem_ifaces[i]; + if (g_efa_hmem_info[iface].initialized && + g_efa_hmem_info[iface].p2p_supported_by_device) { + /* If user is using libfabric API 1.18 or later, by default EFA + * provider is permitted to use CUDA library to support CUDA + * memory, therefore p2p is not required. 
+ */ + efa_rdm_ep->hmem_p2p_opt = + (iface == FI_HMEM_CUDA && efa_rdm_ep->cuda_api_permitted) ? + FI_HMEM_P2P_PREFERRED : + FI_HMEM_P2P_REQUIRED; break; } } - efa_rdm_ep->cuda_api_permitted = (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18))); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = false; efa_rdm_ep->write_in_order_aligned_128_bytes = false; @@ -1413,7 +1419,9 @@ static int efa_rdm_ep_set_fi_hmem_p2p_opt(struct efa_rdm_ep *efa_rdm_ep, int opt * tighter restrictions on valid p2p options. */ EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - err = efa_domain_hmem_validate_p2p_opt(efa_rdm_ep_domain(efa_rdm_ep), efa_hmem_ifaces[i], opt); + err = efa_hmem_validate_p2p_opt( + efa_hmem_ifaces[i], opt, + efa_rdm_ep->base_ep.info->fabric_attr->api_version); if (err == -FI_ENODATA) continue; @@ -1449,7 +1457,7 @@ static int efa_rdm_ep_set_cuda_api_permitted(struct efa_rdm_ep *ep, bool cuda_ap /* CUDA memory can be supported by using either peer to peer or CUDA API. If neither is * available, we cannot support CUDA memory */ - if (!efa_rdm_ep_domain(ep)->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) + if (!g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) return -FI_EOPNOTSUPP; ep->cuda_api_permitted = false; diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 839cde917f0..cdbabe128c1 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -60,7 +60,6 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx int tagged; int eager_rtm, medium_rtm, longcts_rtm, readbase_rtm, iface; size_t eager_rtm_max_data_size; - struct efa_hmem_info *hmem_info; bool delivery_complete_requested; assert(txe->op == ofi_op_msg || txe->op == ofi_op_tagged); @@ -68,7 +67,6 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx assert(tagged == 0 || tagged == 1); iface = txe->desc[0] ? 
((struct efa_mr*) txe->desc[0])->peer.iface : FI_HMEM_SYSTEM; - hmem_info = efa_rdm_ep_domain(efa_rdm_ep)->hmem_info; if (txe->fi_flags & FI_INJECT || efa_both_support_zero_hdr_data_transfer(efa_rdm_ep, txe->peer)) delivery_complete_requested = false; @@ -88,16 +86,16 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx readbase_rtm = efa_rdm_peer_select_readbase_rtm(txe->peer, efa_rdm_ep, txe); - if (use_p2p && - txe->total_len >= hmem_info[iface].min_read_msg_size && - efa_rdm_interop_rdma_read(efa_rdm_ep, txe->peer) && - (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(efa_rdm_ep)))) + if (use_p2p && + txe->total_len >= g_efa_hmem_info[iface].min_read_msg_size && + efa_rdm_interop_rdma_read(efa_rdm_ep, txe->peer) && + (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(efa_rdm_ep)))) return readbase_rtm; if (txe->total_len <= eager_rtm_max_data_size) return eager_rtm; - if (txe->total_len <= hmem_info[iface].max_medium_msg_size) + if (txe->total_len <= g_efa_hmem_info[iface].max_medium_msg_size) return medium_rtm; return longcts_rtm; diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 9674a642be6..3e8e3dff774 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -330,18 +330,16 @@ void efa_rdm_peer_proc_pending_items_in_robuf(struct efa_rdm_peer *peer, struct size_t efa_rdm_peer_get_runt_size(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_rdm_ope *ope) { - struct efa_hmem_info *hmem_info; size_t runt_size; size_t memory_alignment; int iface; - hmem_info = efa_rdm_ep_domain(ep)->hmem_info; iface = ope->desc[0] ? 
((struct efa_mr*) ope->desc[0])->peer.iface : FI_HMEM_SYSTEM; - if (hmem_info[iface].runt_size < peer->num_runt_bytes_in_flight) + if (g_efa_hmem_info[iface].runt_size < peer->num_runt_bytes_in_flight) return 0; - runt_size = MIN(hmem_info[iface].runt_size - peer->num_runt_bytes_in_flight, ope->total_len); + runt_size = MIN(g_efa_hmem_info[iface].runt_size - peer->num_runt_bytes_in_flight, ope->total_len); memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface); /* * runt size must be aligned because: diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index ae04af66e1e..720788c8757 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -399,10 +399,10 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) iface = txe->desc[0] ? ((struct efa_mr*) txe->desc[0])->peer.iface : FI_HMEM_SYSTEM; - if (use_p2p && - txe->total_len >= efa_rdm_ep_domain(ep)->hmem_info[iface].min_read_write_size && - efa_rdm_interop_rdma_read(ep, txe->peer) && - (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(ep)))) { + if (use_p2p && + txe->total_len >= g_efa_hmem_info[iface].min_read_write_size && + efa_rdm_interop_rdma_read(ep, txe->peer) && + (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(ep)))) { err = efa_rdm_ope_post_send(txe, EFA_RDM_LONGREAD_RTW_PKT); if (err != -FI_ENOMEM) return err; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index f01efc72560..adc1ba64255 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -363,7 +363,7 @@ void test_efa_rdm_pke_get_available_copy_methods_align128(struct efa_resource ** efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; /* p2p is available */ - efa_rdm_ep_domain(efa_rdm_ep)->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = true; + g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = true; efa_rdm_ep->hmem_p2p_opt = FI_HMEM_P2P_ENABLED; /* RDMA read is supported */ 
@@ -921,35 +921,30 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, bool cuda_p2p_supported, bool expected_use_zcpy_rx) { - struct efa_domain *efa_domain; struct efa_rdm_ep *ep; size_t max_msg_size = 1000; size_t inject_msg_size = 0; size_t inject_rma_size = 0; bool shm_permitted = false; + ofi_hmem_disable_p2p = cuda_p2p_disabled; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), resource->hints, false, true); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - /* System memory P2P should always be enabled */ - assert_true(efa_domain->hmem_info[FI_HMEM_SYSTEM].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_SYSTEM].p2p_disabled_by_user); - assert_true(efa_domain->hmem_info[FI_HMEM_SYSTEM].p2p_supported_by_device); + assert_true(g_efa_hmem_info[FI_HMEM_SYSTEM].initialized); + assert_true(g_efa_hmem_info[FI_HMEM_SYSTEM].p2p_supported_by_device); /** * We want to be able to run this test on any platform: * 1. Fake CUDA support. * 2. Disable all other hmem ifaces. 
*/ - efa_domain->hmem_info[FI_HMEM_CUDA].initialized = true; - efa_domain->hmem_info[FI_HMEM_CUDA].p2p_disabled_by_user = cuda_p2p_disabled; - efa_domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = cuda_p2p_supported; + g_efa_hmem_info[FI_HMEM_CUDA].initialized = true; + g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = cuda_p2p_supported; - efa_domain->hmem_info[FI_HMEM_NEURON].initialized = false; - efa_domain->hmem_info[FI_HMEM_SYNAPSEAI].initialized = false; + g_efa_hmem_info[FI_HMEM_NEURON].initialized = false; + g_efa_hmem_info[FI_HMEM_SYNAPSEAI].initialized = false; ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -983,6 +978,8 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, assert_int_equal(inject_msg_size, resource->info->tx_attr->inject_size); assert_int_equal(inject_rma_size, resource->info->tx_attr->inject_size); } + /* restore global variable */ + ofi_hmem_disable_p2p = 0; } /** diff --git a/prov/efa/test/efa_unit_test_hmem.c b/prov/efa/test/efa_unit_test_hmem.c index 55734af286a..90a366f7064 100644 --- a/prov/efa/test/efa_unit_test_hmem.c +++ b/prov/efa/test/efa_unit_test_hmem.c @@ -7,8 +7,7 @@ #if HAVE_NEURON /** * @brief Verify when neuron_alloc failed (return null), - * efa_domain_open, which call efa_hmem_info_update_neuron - * when HAVE_NEURON=1, will still return 0 but leave + * efa_hmem_info_initialize will still return 0 but leave * efa_hmem_info[FI_HMEM_NEURON].initialized and * efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device as false. 
* @@ -18,7 +17,6 @@ void test_efa_hmem_info_update_neuron(struct efa_resource **state) { int ret; struct efa_resource *resource = *state; - struct efa_domain *efa_domain; uint32_t efa_device_caps_orig; bool neuron_initialized_orig; @@ -28,26 +26,21 @@ void test_efa_hmem_info_update_neuron(struct efa_resource **state) ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); assert_int_equal(ret, 0); - ret = fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL); - assert_int_equal(ret, 0); - neuron_initialized_orig = hmem_ops[FI_HMEM_NEURON].initialized; hmem_ops[FI_HMEM_NEURON].initialized = true; efa_device_caps_orig = g_device_list[0].device_caps; g_device_list[0].device_caps |= EFADV_DEVICE_ATTR_CAPS_RDMA_READ; g_efa_unit_test_mocks.neuron_alloc = &efa_mock_neuron_alloc_return_null; - ret = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + ret = efa_hmem_info_initialize(); /* recover the modified global variables before doing check */ hmem_ops[FI_HMEM_NEURON].initialized = neuron_initialized_orig; g_device_list[0].device_caps = efa_device_caps_orig; assert_int_equal(ret, 0); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - assert_false(efa_domain->hmem_info[FI_HMEM_NEURON].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); + assert_false(g_efa_hmem_info[FI_HMEM_NEURON].initialized); + assert_false(g_efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); } /** @@ -60,19 +53,17 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) { int ret; struct efa_resource *resource = *state; - struct efa_domain *efa_domain; uint32_t efa_device_caps_orig; bool neuron_initialized_orig; + ofi_hmem_disable_p2p = 1; + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); 
assert_int_equal(ret, 0); - ret = fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL); - assert_int_equal(ret, 0); - neuron_initialized_orig = hmem_ops[FI_HMEM_NEURON].initialized; hmem_ops[FI_HMEM_NEURON].initialized = true; efa_device_caps_orig = g_device_list[0].device_caps; @@ -80,8 +71,7 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) /* neuron_alloc should not be called when p2p is disabled. efa_mock_neuron_alloc_return_mock will fail the test when it is called. */ g_efa_unit_test_mocks.neuron_alloc = efa_mock_neuron_alloc_return_mock; - ofi_hmem_disable_p2p = 1; - ret = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + ret = efa_hmem_info_initialize(); /* recover the modified global variables before doing check */ ofi_hmem_disable_p2p = 0; @@ -89,11 +79,8 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) hmem_ops[FI_HMEM_NEURON].initialized = neuron_initialized_orig; assert_int_equal(ret, 0); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - assert_true(efa_domain->hmem_info[FI_HMEM_NEURON].p2p_disabled_by_user); - assert_true(efa_domain->hmem_info[FI_HMEM_NEURON].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); + assert_true(g_efa_hmem_info[FI_HMEM_NEURON].initialized); + assert_false(g_efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device); } #else void test_efa_hmem_info_update_neuron() @@ -118,36 +105,30 @@ void test_efa_hmem_info_disable_p2p_cuda(struct efa_resource **state) { int ret; struct efa_resource *resource = *state; - struct efa_domain *efa_domain; bool cuda_initialized_orig; + ofi_hmem_disable_p2p = 1; + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); assert_int_equal(ret, 0); - ret = fi_fabric(resource->info->fabric_attr, 
&resource->fabric, NULL); - assert_int_equal(ret, 0); - cuda_initialized_orig = hmem_ops[FI_HMEM_CUDA].initialized; hmem_ops[FI_HMEM_CUDA].initialized = true; /* ofi_cudaMalloc should not be called when p2p is disabled. efa_mock_ofi_cudaMalloc_return_mock will fail the test when it is called. */ g_efa_unit_test_mocks.ofi_cudaMalloc = efa_mock_ofi_cudaMalloc_return_mock; - ofi_hmem_disable_p2p = 1; - ret = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + ret = efa_hmem_info_initialize(); /* recover the modified global variables before doing check */ ofi_hmem_disable_p2p = 0; hmem_ops[FI_HMEM_CUDA].initialized = cuda_initialized_orig; assert_int_equal(ret, 0); - efa_domain = container_of(resource->domain, struct efa_domain, - util_domain.domain_fid.fid); - assert_true(efa_domain->hmem_info[FI_HMEM_CUDA].p2p_disabled_by_user); - assert_true(efa_domain->hmem_info[FI_HMEM_CUDA].initialized); - assert_false(efa_domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device); + assert_true(g_efa_hmem_info[FI_HMEM_CUDA].initialized); + assert_false(g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device); } #else void test_efa_hmem_info_disable_p2p_cuda() diff --git a/prov/efa/test/efa_unit_test_runt.c b/prov/efa/test/efa_unit_test_runt.c index ab7537061c0..ae09f0a1c0e 100644 --- a/prov/efa/test/efa_unit_test_runt.c +++ b/prov/efa/test/efa_unit_test_runt.c @@ -27,12 +27,10 @@ void test_efa_rdm_peer_get_runt_size_impl( struct efa_mr mock_mr; struct efa_rdm_ope mock_txe; size_t runt_size; - struct efa_domain *efa_domain; int ret; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_domain = efa_rdm_ep_domain(efa_rdm_ep); - efa_domain->hmem_info[iface].runt_size = total_runt_size; + g_efa_hmem_info[iface].runt_size = total_runt_size; /* insert a fake peer */ ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -296,13 +294,11 @@ void test_efa_rdm_peer_select_readbase_rtm_impl( fi_addr_t addr; struct efa_mr 
mock_mr; struct efa_rdm_ope mock_txe; - struct efa_domain *efa_domain; int readbase_rtm; int ret; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_domain = efa_rdm_ep_domain(efa_rdm_ep); - efa_domain->hmem_info[iface].runt_size = total_runt_size; + g_efa_hmem_info[iface].runt_size = total_runt_size; /* insert a fake peer */ ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); From 6cdfb7eee1430208a90c57c0be31fecbc1aaea61 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Tue, 29 Oct 2024 11:22:30 -0700 Subject: [PATCH 201/393] fabtests: Fix compiler warning about unitialized variable The compiler can't properly follow the def-use chain guarded by the same condition and complains about the variable could be used w/o initialization: if (atomic == FI_ATOMIC_COMPARE) { check_cmp = ...; } ...... if (atomic == FI_ATOMIC_COMPARE) { use check_cmp in macro } Signed-off-by: Jianxin Xiong --- fabtests/common/shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 40f53c84f6b..646a2ea1eab 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -3843,7 +3843,7 @@ int ft_check_atomic(enum ft_atomic_opcodes atomic, enum fi_op op, void *cmp, void *res, size_t count) { int ret = 0; - void *check_res = res, *check_buf, *check_comp; + void *check_res = res, *check_buf, *check_comp = cmp; /* * If we don't have the test function, return > 0 to indicate From 07fb34525c0639e2e2f462fecfcf6e7b45c871e1 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Tue, 29 Oct 2024 11:40:18 -0700 Subject: [PATCH 202/393] prov/verbs: Enable implicit dmabuf mr reg for more HMEM ifaces Now all the non-system HMEM ifaces have get_dmabuf_fd method defined, it's safe to always try the dmabuf based memory registration first. If it fails for any reason (e.g. CUDA is not configured with dmabuf support), the failover path will try the peer memory approach when available. 
Signed-off-by: Jianxin Xiong --- prov/verbs/src/verbs_mr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/prov/verbs/src/verbs_mr.c b/prov/verbs/src/verbs_mr.c index 9c7199d99eb..47ccf9d97ce 100644 --- a/prov/verbs/src/verbs_mr.c +++ b/prov/verbs/src/verbs_mr.c @@ -142,12 +142,9 @@ vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *base_addr md->mr = ibv_reg_dmabuf_mr(md->domain->pd, (uintptr_t) buf, len, (uintptr_t) base_addr + (uintptr_t) buf, (int) device, vrb_access); - else if (vrb_gl_data.dmabuf_support && - (iface == FI_HMEM_ZE || - iface == FI_HMEM_SYNAPSEAI || - iface == FI_HMEM_ROCR)) + else if (vrb_gl_data.dmabuf_support && iface != FI_HMEM_SYSTEM) md->mr = vrb_reg_hmem_dmabuf(iface, md->domain->pd, buf, len, - vrb_access); + vrb_access); else #endif md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, From c9e4315afc6af569b17a94d42776cc604badf3e6 Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Mon, 28 Oct 2024 14:06:49 -0500 Subject: [PATCH 203/393] prov/cxi: remove use of deprecated FI_ORDER_NONE Signed-off-by: Chuck Fossen --- prov/cxi/src/cxip_info.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index c0e0fc278b2..2b0d33d7617 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -270,7 +270,6 @@ struct fi_rx_attr cxip_rx_attr = { .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS, .op_flags = CXIP_RX_OP_FLAGS, .msg_order = CXIP_MSG_ORDER, - .comp_order = FI_ORDER_NONE, .size = CXIP_MAX_RX_SIZE, .iov_limit = 1, }; @@ -289,7 +288,6 @@ struct fi_rx_attr cxip_multi_auth_key_rx_attr = { .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS & ~FI_DIRECTED_RECV, .op_flags = CXIP_RX_OP_FLAGS, .msg_order = CXIP_MSG_ORDER, - .comp_order = FI_ORDER_NONE, .size = CXIP_MAX_RX_SIZE, .iov_limit = 1, }; From 97422ba0ec1158008b7568e7d7d2d00748657fbc Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Tue, 29 Oct 2024 17:17:46 -0500 Subject: [PATCH 204/393] 
prov/cxi: Disable use of dmabuf by default for cuda Set FI_HMEM_CUDA_USE_DMABUF to disabled by default. NETCASSINI-6844 Signed-off-by: Chuck Fossen --- prov/cxi/src/cxip_info.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 2b0d33d7617..e195a459624 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -782,6 +782,12 @@ static void cxip_env_init(void) fi_param_get_bool(&cxip_prov, "disable_dmabuf_rocr", &cxip_env.disable_dmabuf_rocr); + /* Disable cuda DMABUF by default - honors the env if already set */ + ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "0", 0); + if (ret) + CXIP_INFO("Could not disable FI_HMEM_CUDA_USE_DMABUF ret:%d %s\n", + ret, fi_strerror(errno)); + fi_param_define(&cxip_prov, "ats_mlock_mode", FI_PARAM_STRING, "Sets ATS mlock mode (off | all)."); fi_param_get_str(&cxip_prov, "ats_mlock_mode", ¶m_str); From 15b4487772efc0d674df3c25b457e81e2cbfb26c Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Wed, 30 Oct 2024 14:03:09 -0500 Subject: [PATCH 205/393] prov/cxi: Remove disable_dmabuf_cuda and disable_dmabuf_rocr Remove unneeded disable_dmabuf_cuda and disable_dmabuf_rocr environment variables. 
NETCASSINI-6844 Signed-off-by: Chuck Fossen --- prov/cxi/src/cxip_info.c | 13 ------------- prov/cxi/src/cxip_iomm.c | 6 ------ 2 files changed, 19 deletions(-) diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index e195a459624..0033afda593 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -608,8 +608,6 @@ struct cxip_environment cxip_env = { .force_odp = false, .ats = false, .iotlb = true, - .disable_dmabuf_cuda = false, - .disable_dmabuf_rocr = false, .ats_mlock_mode = CXIP_ATS_MLOCK_ALL, .fork_safe_requested = false, .rx_match_mode = CXIP_PTLTE_DEFAULT_MODE, @@ -770,17 +768,6 @@ static void cxip_env_init(void) "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); - fi_param_define(&cxip_prov, "disable_dmabuf_cuda", FI_PARAM_BOOL, - "Disables the DMABUF interface for CUDA (default %d).", - cxip_env.disable_dmabuf_cuda); - fi_param_get_bool(&cxip_prov, "disable_dmabuf_cuda", - &cxip_env.disable_dmabuf_cuda); - - fi_param_define(&cxip_prov, "disable_dmabuf_rocr", FI_PARAM_BOOL, - "Disables the DMABUF interface for ROCR (default %d).", - cxip_env.disable_dmabuf_rocr); - fi_param_get_bool(&cxip_prov, "disable_dmabuf_rocr", - &cxip_env.disable_dmabuf_rocr); /* Disable cuda DMABUF by default - honors the env if already set */ ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "0", 0); diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c index 69975cfb06a..b998bd34aee 100644 --- a/prov/cxi/src/cxip_iomm.c +++ b/prov/cxi/src/cxip_iomm.c @@ -28,12 +28,6 @@ static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base, return -FI_ENOSYS; } - if (iface == FI_HMEM_CUDA && cxip_env.disable_dmabuf_cuda) - return FI_SUCCESS; - - if (iface == FI_HMEM_ROCR && cxip_env.disable_dmabuf_rocr) - return FI_SUCCESS; - ret = ofi_hmem_get_base_addr(iface, iov_base, len, (void*)&base, &size); if (ret) return ret; From 19d58286c8d02645b9a437c55c837fabe195675b Mon Sep 17 00:00:00 2001 
From: Chuck Fossen Date: Wed, 30 Oct 2024 14:05:08 -0500 Subject: [PATCH 206/393] prov/cxi: Enable dmabuf for ROCR by default. NETCASSINI-6844 Signed-off-by: Chuck Fossen --- prov/cxi/src/cxip_info.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 0033afda593..af94964ab5f 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -768,6 +768,11 @@ static void cxip_env_init(void) "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); + /* Use ROCR DMABUF by default - honors the env if already set */ + ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 0); + if (ret) + CXIP_INFO("Could not enable FI_HMEM_ROCR_USE_DMABUF ret:%d %s\n", + ret, fi_strerror(errno)); /* Disable cuda DMABUF by default - honors the env if already set */ ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "0", 0); From 6ab2666a46ab8b930a54a55ec7e3eef1f3a46863 Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Wed, 30 Oct 2024 14:06:11 -0500 Subject: [PATCH 207/393] prov/cxi: Update CXI man page. NETCASSINI-6844 Signed-off-by: Chuck Fossen --- man/fi_cxi.7.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 3be247ae462..384026f0192 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -445,14 +445,14 @@ faults but requires all buffers to be backed by physical memory. Copy-on-write semantics are broken when using pinned memory. See the Fork section for more information. -The CXI provider supports DMABUF for device memory registration. If the ROCR -and CUDA libraries support it, the CXI provider will default to use DMA-buf. +The CXI provider supports DMABUF for device memory registration. +DMABUF is supported in ROCm 5.6+ and Cuda 11.7+ with nvidia open source driver +525+. 
+Both *FI_HMEM_ROCR_USE_DMABUF* and *FI_HMEM_CUDA_USE_DMABUF are disabled by +default in libfabric core but the CXI provider enables +*FI_HMEM_ROCR_USE_DMABUF* by default if not specifically set. There may be situations with CUDA that may double the BAR consumption. -Until this is fixed in the CUDA stack, the environment variable -*FI_CXI_DISABLE_DMABUF_CUDA* can be used to fall back to the nvidia -peer-memory interface. -Also, *FI_CXI_DISABLE_DMABUF_ROCR* can be used to fall back to the amdgpu -peer-memory interface. +Until this is fixed in the CUDA stack, CUDA DMABUF will be disabled by default. ## Translation Cache From ae01547e03c874b63f0230ed05467c8f7cae8c1c Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Thu, 31 Oct 2024 02:52:34 +0000 Subject: [PATCH 208/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_cxi.7 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index a81b6dd18b7..adf11179bde 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,7 +1,7 @@ .\"t .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_cxi" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cxi" "7" "2024\-10\-31" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -529,14 +529,14 @@ Copy-on-write semantics are broken when using pinned memory. See the Fork section for more information. .PP The CXI provider supports DMABUF for device memory registration. -If the ROCR and CUDA libraries support it, the CXI provider will default -to use DMA-buf. +DMABUF is supported in ROCm 5.6+ and Cuda 11.7+ with nvidia open source +driver 525+. +Both \f[I]FI_HMEM_ROCR_USE_DMABUF\f[R] and \f[I]FI_HMEM_CUDA_USE_DMABUF +are disabled by default in libfabric core but the CXI provider enables +\f[R]FI_HMEM_ROCR_USE_DMABUF* by default if not specifically set. There may be situations with CUDA that may double the BAR consumption. 
-Until this is fixed in the CUDA stack, the environment variable -\f[I]FI_CXI_DISABLE_DMABUF_CUDA\f[R] can be used to fall back to the -nvidia peer-memory interface. -Also, \f[I]FI_CXI_DISABLE_DMABUF_ROCR\f[R] can be used to fall back to -the amdgpu peer-memory interface. +Until this is fixed in the CUDA stack, CUDA DMABUF will be disabled by +default. .SS Translation Cache .PP Mapping a buffer for use by the NIC is an expensive operation. From f31e94dd78ef3df9edf1c5df86469a1381ea3fdd Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Mon, 28 Oct 2024 14:29:57 -0500 Subject: [PATCH 209/393] hmem/cuda: Add env variable to enable/disable CUDA DMABUF Enabled by default Signed-off-by: Chuck Fossen --- src/hmem_cuda.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c index 1c8abb03285..0580bdb24f1 100644 --- a/src/hmem_cuda.c +++ b/src/hmem_cuda.c @@ -763,6 +763,9 @@ int cuda_hmem_init(void) "If libfabric is not compiled with gdrcopy support, " "this variable is not checked. (default: true)"); + fi_param_define(NULL, "hmem_cuda_use_dmabuf", FI_PARAM_BOOL, + "Use dma-buf for sharing buffer with hardware. (default:true)"); + ret = cuda_hmem_dl_init(); if (ret != FI_SUCCESS) return ret; @@ -936,7 +939,11 @@ bool cuda_is_gdrcopy_enabled(void) bool cuda_is_dmabuf_supported(void) { - return cuda_attr.dmabuf_supported; + int use_dmabuf = 1; + + fi_param_get_bool(NULL, "hmem_cuda_use_dmabuf", &use_dmabuf); + + return use_dmabuf && cuda_attr.dmabuf_supported; } #else From f8b419400eccfc65ae404f1494a8ca7721e6ed2c Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 30 Oct 2024 09:05:58 -0700 Subject: [PATCH 210/393] test_configs/ofi_rxm/tcp.test: remove cntr RMA testing ubertest implementation currently requires FI_RMA_EVENT when using RMA and counters This will cause tcp to return ENODATA for these combinations and cause runfabtests to fail. 
This should get updated in ubertest to not require it but remove testing for now Signed-off-by: Alexia Ingerson --- fabtests/test_configs/ofi_rxm/tcp.test | 1 - 1 file changed, 1 deletion(-) diff --git a/fabtests/test_configs/ofi_rxm/tcp.test b/fabtests/test_configs/ofi_rxm/tcp.test index 6087f7ed588..f03c74e1a19 100644 --- a/fabtests/test_configs/ofi_rxm/tcp.test +++ b/fabtests/test_configs/ofi_rxm/tcp.test @@ -59,7 +59,6 @@ ], comp_type: [ FT_COMP_QUEUE, - FT_COMP_CNTR, ], mr_mode: [], progress: [ From 637782f749f6c8bc1a90d36884c2f12b3f29d35f Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 31 Oct 2024 13:13:47 -0700 Subject: [PATCH 211/393] prov/efa: Update efa_hmem and efa_fork_support log to FI_LOG_CORE efa_hmem_info and efa_fork_support are global and not associated with domain any more. Update log to FI_LOG_CORE. Signed-off-by: Jessie Yang --- prov/efa/src/efa_fork_support.c | 14 +++++++------- prov/efa/src/efa_hmem.c | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/prov/efa/src/efa_fork_support.c b/prov/efa/src/efa_fork_support.c index 82db3505987..891f9aca785 100644 --- a/prov/efa/src/efa_fork_support.c +++ b/prov/efa/src/efa_fork_support.c @@ -96,7 +96,7 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) page_size = ofi_get_page_size(); if (page_size <= 0) { - EFA_WARN(FI_LOG_DOMAIN, "Unable to determine page size %ld\n", + EFA_WARN(FI_LOG_CORE, "Unable to determine page size %ld\n", page_size); return -FI_EINVAL; } @@ -125,14 +125,14 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) if(buf) free(buf); if(mr) ibv_dereg_mr(mr); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Unexpected error during ibv_reg_mr in " "efa_fork_support_is_enabled(): %s\n",strerror(ret)); return -FI_EINVAL; } if (ret_init == 0) return 0; if (ret_init == EINVAL) return 1; - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Unexpected error during ibv_fork_init in " 
"efa_fork_support_is_enabled(): %s\n",strerror(ret_init)); return -FI_EINVAL; @@ -236,7 +236,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) if (g_efa_fork_status == EFA_FORK_SUPPORT_ON) { ret = -ibv_fork_init(); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Fork support requested but ibv_fork_init failed: %s\n", strerror(-ret)); return ret; @@ -257,7 +257,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) g_efa_fork_status = EFA_FORK_SUPPORT_ON; if (g_efa_fork_status == EFA_FORK_SUPPORT_ON && getenv("RDMAV_HUGEPAGES_SAFE")) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Using libibverbs fork support and huge pages is not" " supported by the EFA provider.\n"); return -FI_EINVAL; @@ -278,7 +278,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) } if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Unable to register atfork callback: %s\n", strerror(-ret)); return ret; @@ -302,7 +302,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) int efa_fork_support_enable_if_requested(struct domain_fid* domain_fid) { if (g_efa_fork_status == EFA_FORK_SUPPORT_ON) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Using fork support is not supported by the EFA provider on Windows\n"); return -FI_EINVAL; } diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 3c713221711..61eca026219 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -60,7 +60,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface ifac fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); if (-FI_ENODATA != fi_param_get(&efa_prov, "inter_max_medium_message_size", &tmp_value)) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "The environment variable FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE was 
set, " "but EFA HMEM via Cuda API only supports eager and runting read protocols. " "The variable will not modify CUDA memory run config.\n"); @@ -75,7 +75,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface ifac fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); if (-FI_ENODATA != fi_param_get(&efa_prov, "inter_max_medium_message_size", &tmp_value)) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "The environment variable FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE was set, " "but EFA HMEM via Neuron API only supports eager and runting read protocols. " "The variable will not modify CUDA memory run config.\n"); @@ -90,7 +90,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface ifac -FI_ENODATA != fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &tmp_value) || -FI_ENODATA != fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &tmp_value) || -FI_ENODATA != fi_param_get_size_t(&efa_prov, "runt_size", &tmp_value)) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "One or more of the following environment variable(s) were set: [" "FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE, " "FI_EFA_INTER_MIN_READ_MESSAGE_SIZE, " @@ -120,7 +120,7 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in cuda_ret = ofi_cudaMalloc(&ptr, len); if (cuda_ret != cudaSuccess) { info->initialized = false; - EFA_WARN(FI_LOG_DOMAIN, "Failed to allocate CUDA buffer: %s\n", + EFA_WARN(FI_LOG_CORE, "Failed to allocate CUDA buffer: %s\n", ofi_cudaGetErrorString(cuda_ret)); return; } @@ -130,13 +130,13 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset, len, (uint64_t)ptr, dmabuf_fd, ibv_access); if (!ibv_mr) { - EFA_INFO(FI_LOG_DOMAIN, + EFA_INFO(FI_LOG_CORE, "Unable to register 
CUDA device buffer via dmabuf: %s. " "Fall back to ibv_reg_mr\n", fi_strerror(-errno)); ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); } } else { - EFA_INFO(FI_LOG_DOMAIN, + EFA_INFO(FI_LOG_CORE, "Unable to retrieve dmabuf fd of CUDA device buffer: %d. " "Fall back to ibv_reg_mr\n", ret); ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); @@ -147,7 +147,7 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in if (!ibv_mr) { info->p2p_supported_by_device = false; - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to register CUDA buffer with the EFA device, FI_HMEM transfers that require peer to peer support will fail.\n"); ofi_cudaFree(ptr); return; @@ -156,7 +156,7 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in ret = ibv_dereg_mr(ibv_mr); ofi_cudaFree(ptr); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to deregister CUDA buffer: %s\n", fi_strerror(-ret)); return; @@ -192,7 +192,7 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info * */ if (!ptr) { info->initialized = false; - EFA_INFO(FI_LOG_DOMAIN, "Cannot allocate Neuron buffer\n"); + EFA_INFO(FI_LOG_CORE, "Cannot allocate Neuron buffer\n"); return; } @@ -215,7 +215,7 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info * if (!ibv_mr) { info->p2p_supported_by_device = false; /* We do not expect to support Neuron on non p2p systems */ - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to register Neuron buffer with the EFA device, " "FI_HMEM transfers that require peer to peer support will fail.\n"); neuron_free(&handle); @@ -225,7 +225,7 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info * ret = ibv_dereg_mr(ibv_mr); neuron_free(&handle); if (ret) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "Failed to deregister Neuron buffer: %s\n", fi_strerror(-ret)); return; @@ -248,14 
+248,14 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface) struct efa_hmem_info *info = &g_efa_hmem_info[iface]; if (!ofi_hmem_is_initialized(iface)) { - EFA_INFO(FI_LOG_DOMAIN, "%s is not initialized\n", + EFA_INFO(FI_LOG_CORE, "%s is not initialized\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); return; } if ((iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_NEURON) && !efa_device_support_rdma_read()) { - EFA_WARN(FI_LOG_DOMAIN, + EFA_WARN(FI_LOG_CORE, "No EFA RDMA read support, transfers using %s will fail.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); return; @@ -273,7 +273,7 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface) if (iface == FI_HMEM_NEURON) efa_hmem_info_check_p2p_support_neuron(info); if (!info->p2p_supported_by_device) - EFA_INFO(FI_LOG_DOMAIN, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + EFA_INFO(FI_LOG_CORE, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); } efa_domain_hmem_info_init_protocol_thresholds(iface); From a9796d9e0afb22c4df18a579b1b7768a0f26f48d Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 31 Oct 2024 13:17:47 -0700 Subject: [PATCH 212/393] prov/efa: Initialize efa fork support in EFA_INI Move efa_fork_support_enable_if_requested before initializing efa_hmem_info because ibv_fork_init should be called before ibv_reg_mr. Fix the bug of wrong return value of efa_fork_support_is_enabled. Install fork handler in efa_user_info because fork() is currently called by nvml_init in other provider's ini (which calls ofi_hmem_init) after efa provider init. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_domain.c | 7 --- prov/efa/src/efa_fork_support.c | 81 +++++++++++++++++---------------- prov/efa/src/efa_fork_support.h | 4 +- prov/efa/src/efa_prov.c | 5 ++ prov/efa/src/efa_user_info.c | 18 ++++++++ 5 files changed, 67 insertions(+), 48 deletions(-) diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 2e81aafa666..e6cab857af3 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -290,13 +290,6 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, efa_domain->util_domain.domain_fid.ops = &efa_ops_domain_dgram; } - err = efa_fork_support_enable_if_requested(*domain_fid); - if (err) { - ret = err; - EFA_WARN(FI_LOG_DOMAIN, "Failed to initialize fork support. err: %d\n", ret); - goto err_free; - } - dlist_insert_tail(&efa_domain->list_entry, &g_efa_domain_list); return 0; diff --git a/prov/efa/src/efa_fork_support.c b/prov/efa/src/efa_fork_support.c index 891f9aca785..41ac57bd5f6 100644 --- a/prov/efa/src/efa_fork_support.c +++ b/prov/efa/src/efa_fork_support.c @@ -70,14 +70,13 @@ void efa_fork_support_request_initialize() * * This relies on internal behavior in rdma-core and is a temporary workaround. * - * @param domain_fid domain fid so we can register memory * @return 1 if fork support is enabled * 0 if not enabled * -FI_EINVAL/-FI_NOMEM on errors. */ -static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) +static int efa_fork_support_is_enabled() { - /* If ibv_is_fork_initialized is availble, check if the function + /* If ibv_is_fork_initialized is available, check if the function * can exit early. */ #if HAVE_IBV_IS_FORK_INITIALIZED == 1 @@ -86,14 +85,11 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) /* If fork support is ENABLED or UNNEEDED, return 1. 
*/ return fork_status != IBV_FORK_DISABLED; #else - struct efa_domain *efa_domain; struct ibv_mr *mr = NULL; char *buf = NULL; int ret=0, ret_init=0; long page_size; - efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid); - page_size = ofi_get_page_size(); if (page_size <= 0) { EFA_WARN(FI_LOG_CORE, "Unable to determine page size %ld\n", @@ -105,8 +101,7 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) if (!buf) return -FI_ENOMEM; - - mr = ibv_reg_mr(efa_domain->ibv_pd, buf, page_size, 0); + mr = ibv_reg_mr(g_device_list[0].ibv_pd, buf, page_size, 0); if (mr == NULL) { ret = errno; goto out; @@ -130,8 +125,8 @@ static int efa_fork_support_is_enabled(struct fid_domain *domain_fid) "efa_fork_support_is_enabled(): %s\n",strerror(ret)); return -FI_EINVAL; } - if (ret_init == 0) return 0; - if (ret_init == EINVAL) return 1; + if (ret_init == 0) return 1; + if (ret_init == EINVAL) return 0; EFA_WARN(FI_LOG_CORE, "Unexpected error during ibv_fork_init in " "efa_fork_support_is_enabled(): %s\n",strerror(ret_init)); @@ -221,12 +216,10 @@ void efa_atfork_callback_flush_mr_cache() * library or process initiates a fork and we determined from previous logic * that we cannot support that. * - * @param domain_fid domain fid so we can check register memory during initialization. * @return error number if we failed to initialize, 0 otherwise */ -int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) +int efa_fork_support_enable_if_requested() { - static int fork_handler_installed = 0; int ret; int is_enabled; @@ -249,7 +242,7 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) * this variable was set to ON during provider init. Huge pages for * bounce buffers will not be used if fork support is on. 
*/ - ret = efa_fork_support_is_enabled(domain_fid); + ret = efa_fork_support_is_enabled(); if (ret < 0) return ret; is_enabled = ret; @@ -263,30 +256,6 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) return -FI_EINVAL; } - /* - * It'd be better to install this during provider init (since that's - * only invoked once) but we need to do a memory registration for the - * fork check above. This can move to the provider init once that check - * is gone. - */ - if (!fork_handler_installed && g_efa_fork_status != EFA_FORK_SUPPORT_UNNEEDED) { - if (g_efa_fork_status == EFA_FORK_SUPPORT_OFF) { - ret = pthread_atfork(efa_atfork_callback_warn_and_abort, NULL, NULL); - } else { - assert(g_efa_fork_status == EFA_FORK_SUPPORT_ON); - ret = pthread_atfork(efa_atfork_callback_flush_mr_cache, NULL, NULL); - } - - if (ret) { - EFA_WARN(FI_LOG_CORE, - "Unable to register atfork callback: %s\n", - strerror(-ret)); - return ret; - } - - fork_handler_installed = 1; - } - return 0; } @@ -296,10 +265,9 @@ int efa_fork_support_enable_if_requested(struct fid_domain* domain_fid) * * We check if fork is requested and return failure as fork is not supported on Windows * - * @param domain_fid domain unused * @return error number if fork is requested, 0 otherwise */ -int efa_fork_support_enable_if_requested(struct domain_fid* domain_fid) +int efa_fork_support_enable_if_requested() { if (g_efa_fork_status == EFA_FORK_SUPPORT_ON) { EFA_WARN(FI_LOG_CORE, @@ -311,3 +279,36 @@ int efa_fork_support_enable_if_requested(struct domain_fid* domain_fid) #endif +/* @brief + * + * install a fork handler to ensure that we abort if another + * library or process initiates a fork and we determined from previous logic + * that we cannot support that. 
+ * + * @return error number if we failed to install, 0 otherwise + */ +int efa_fork_support_install_fork_handler() +{ + static int fork_handler_installed = 0; + int ret; + + if (!fork_handler_installed && g_efa_fork_status != EFA_FORK_SUPPORT_UNNEEDED) { + if (g_efa_fork_status == EFA_FORK_SUPPORT_OFF) { + ret = pthread_atfork(efa_atfork_callback_warn_and_abort, NULL, NULL); + } else { + assert(g_efa_fork_status == EFA_FORK_SUPPORT_ON); + ret = pthread_atfork(efa_atfork_callback_flush_mr_cache, NULL, NULL); + } + + if (ret) { + EFA_WARN(FI_LOG_CORE, + "Unable to register atfork callback: %s\n", + strerror(-ret)); + return ret; + } + + fork_handler_installed = 1; + } + + return 0; +} diff --git a/prov/efa/src/efa_fork_support.h b/prov/efa/src/efa_fork_support.h index ef16c23d577..13e692c0fdf 100644 --- a/prov/efa/src/efa_fork_support.h +++ b/prov/efa/src/efa_fork_support.h @@ -17,8 +17,10 @@ enum efa_fork_support_status { }; extern enum efa_fork_support_status g_efa_fork_status; -int efa_fork_support_enable_if_requested(struct fid_domain *domain_fid); +int efa_fork_support_enable_if_requested(); void efa_fork_support_request_initialize(); +int efa_fork_support_install_fork_handler(); + #endif diff --git a/prov/efa/src/efa_prov.c b/prov/efa/src/efa_prov.c index 2dd5b42fecb..1f805c6742b 100644 --- a/prov/efa/src/efa_prov.c +++ b/prov/efa/src/efa_prov.c @@ -164,6 +164,11 @@ EFA_INI if (err) goto err_free; + err = efa_fork_support_enable_if_requested(); + if (err) { + goto err_free; + } + err = efa_hmem_info_initialize(); if (err) goto err_free; diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index e152f2adc23..129c038ee21 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -610,6 +610,24 @@ int efa_getinfo(uint32_t version, const char *node, struct fi_info *dgram_info_list, *rdm_info_list; int err; +#ifndef _WIN32 + /* + * TODO: + * It'd be better to install this during provider init (since that's + * only invoked 
once) but fork() is currently called by nvml_init in + * other provider's ini (which calls ofi_hmem_init) after efa provider init. + * This can move to the provider init after we get rid of that fork() in + * ofi_hmem_init(). + */ + err = efa_fork_support_install_fork_handler(); + if (err) { + EFA_WARN(FI_LOG_CORE, + "Unable to install fork handler: %s\n", + strerror(-err)); + return err; + } +#endif + if (hints && hints->ep_attr && hints->ep_attr->type == FI_EP_DGRAM) return efa_user_info_get_dgram(version, node, service, flags, hints, info); From c0225bd67bddf4397c4475b8857ef9bcb7af944e Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 1 Nov 2024 11:32:01 -0700 Subject: [PATCH 213/393] prov/efa: Disable zero copy receive if p2p is not available Fix the bug of using zcpy_rx for an iface that does not support p2p. Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 8 ++++---- prov/efa/test/efa_unit_test_ep.c | 9 ++++++--- prov/efa/test/efa_unit_tests.c | 2 +- prov/efa/test/efa_unit_tests.h | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 014ade78b46..56e80bc146d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -481,11 +481,11 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) } /* Zero-copy receive requires P2P support. Disable it if any initialized HMEM iface does not support P2P. 
*/ - EFA_HMEM_IFACE_FOREACH(iface) { + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { if (g_efa_hmem_info[iface].initialized && - !ofi_hmem_p2p_disabled() && - ep->hmem_p2p_opt != FI_HMEM_P2P_DISABLED && - !g_efa_hmem_info[iface].p2p_supported_by_device) { + (ofi_hmem_p2p_disabled() || + ep->hmem_p2p_opt == FI_HMEM_P2P_DISABLED || + !g_efa_hmem_info[iface].p2p_supported_by_device)) { EFA_INFO(FI_LOG_EP_CTRL, "%s does not support P2P, zero-copy receive " "protocol will be disabled\n", diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index adc1ba64255..f8dd2073df4 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -948,6 +948,9 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + if (cuda_p2p_supported) + ep->hmem_p2p_opt = FI_HMEM_P2P_ENABLED; + /* Set sufficiently small max_msg_size */ assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &max_msg_size, sizeof max_msg_size), 0); @@ -1003,9 +1006,9 @@ void test_efa_rdm_ep_user_zcpy_rx_disabled(struct efa_resource **state) } /** - * @brief Verify zcpy_rx is enabled if CUDA P2P is explictly disabled + * @brief Verify zcpy_rx is disabled if CUDA P2P is explictly disabled */ -void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy(struct efa_resource **state) +void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled(struct efa_resource **state) { struct efa_resource *resource = *state; @@ -1015,7 +1018,7 @@ void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy(struct efa_resource **state) resource->hints->mode = FI_MSG_PREFIX; resource->hints->caps = FI_MSG; - test_efa_rdm_ep_use_zcpy_rx_impl(resource, true, false, true); + test_efa_rdm_ep_use_zcpy_rx_impl(resource, true, false, false); } /** diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 1e2f2087fa2..cf3bc976884 100644 --- 
a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -105,7 +105,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_atomic_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_disabled, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 52670e8af9c..5422295f1b3 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -120,7 +120,7 @@ void test_efa_rdm_ep_setopt_shared_memory_permitted(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(); void test_efa_rdm_ep_user_zcpy_rx_disabled(); -void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_happy(); +void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(); void test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local(); From b5ebb82f223d4f9cfe12aaf9e961ed957255626e Mon Sep 17 
00:00:00 2001 From: Shi Jin Date: Fri, 1 Nov 2024 16:59:28 +0000 Subject: [PATCH 214/393] prov/hook: Fix the preprocessor ifdef should be if here, as HAVE_HOOK_HMEM_DL is defined anyway. Signed-off-by: Shi Jin --- prov/hook/hook_hmem/src/hook_hmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/hook/hook_hmem/src/hook_hmem.c b/prov/hook/hook_hmem/src/hook_hmem.c index 0a9a3c2b84d..44daa4fb621 100644 --- a/prov/hook/hook_hmem/src/hook_hmem.c +++ b/prov/hook/hook_hmem/src/hook_hmem.c @@ -1914,7 +1914,7 @@ static int hook_hmem_domain(struct fid_fabric *fabric, struct fi_info *info, HOOK_HMEM_INI { -#ifdef HAVE_HOOK_HMEM_DL +#if HAVE_HOOK_HMEM_DL ofi_hmem_init(); #endif hook_hmem_fabric_ops = hook_fabric_ops; From 5c701e50fa4c9ff383f92a80f897e3f6590c254a Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 4 Nov 2024 20:19:01 +0000 Subject: [PATCH 215/393] contrib/aws: Use lockable resources to limit the number of jobs run in parallel Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 84 +++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index ad3a5391095..a4cb883614c 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -81,24 +81,26 @@ def get_cluster_name(build_tag, os, instance_type) { return cluster_name } -def get_single_node_windows_test_stage(stage_name) { +def get_single_node_windows_test_stage_with_lock(stage_name, lock_label) { /* * Get Windows Stage */ return { stage("${stage_name}") { - sh """ - . venv/bin/activate; - cd PortaFiducia/scripts; - export PULL_REQUEST_ID=${env.CHANGE_ID}; - env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; - """ + lock(label: lock_label, quantity: 1) { + sh """ + . 
venv/bin/activate; + cd PortaFiducia/scripts; + export PULL_REQUEST_ID=${env.CHANGE_ID}; + env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; + """ + } } } } -def get_test_stage(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) { +def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_count, region, test_config, lock_label, addl_args) { /* * Generate a single test stage that run test_orchestrator.py with the given parameters. * param@ stage_name: the name of the stage @@ -113,7 +115,9 @@ def get_test_stage(stage_name, build_tag, os, instance_type, instance_count, reg */ return { stage("${stage_name}") { - this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) + lock(label: lock_label, quantity: instance_count) { + this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) + } } } } @@ -167,45 +171,53 @@ pipeline { // This needs the extra space at the end // Set 9 hour timeout for all clusters def addl_args_pr = "--timeout 540 --test-libfabric-pr $env.CHANGE_ID " + // Use lockable resources to limit the number of jobs that can get executed in parallel + def g4dn8x_lock_label = "g4dn8x" + def g4dn12x_lock_label = "g4dn12x" + def c52x_lock_label = "c52x" + def hpc6a48x_lock_label = "hpc6a48x" + def c6gn16x_lock_label = "c6gn16x" + def c5n18x_lock_label = "c5n18x" + def c6g2x_lock_label = "c6g2x" // Single Node Tests - EFA - stages["1_g4dn_alinux2-efa"] = get_test_stage("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["1_g4dn_alinux2023-efa"] = get_test_stage("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - 
stages["1_g4dn_ubuntu2004-efa"] = get_test_stage("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["1_g4dn_rhel8-efa"] = get_test_stage("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) + stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) + stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) + stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) // Single Node Tests - SHM - stages["1_g4dn_alinux2_shm"] = get_test_stage("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") - stages["1_g4dn_alinux2023_shm"] = get_test_stage("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") - stages["1_g4dn_ubuntu2004_shm"] = get_test_stage("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") - stages["1_c5_rhel8_shm"] = get_test_stage("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm --enable-efa false") - 
stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm --enable-cma false --enable-efa false") + stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm") + stages["1_g4dn_alinux2023_shm"] = get_test_stage_with_lock("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm") + stages["1_g4dn_ubuntu2004_shm"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm") + stages["1_c5_rhel8_shm"] = get_test_stage_with_lock("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", c52x_lock_label, addl_args_pr + "--test-libfabric-provider shm --enable-efa false") + stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage_with_lock("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", c52x_lock_label, addl_args_pr + "--test-libfabric-provider shm --enable-cma false --enable-efa false") // Single Node Windows Test - stages["EFA_Windows_Test"] = get_single_node_windows_test_stage("EFA_Windows_Test") + stages["EFA_Windows_Test"] = get_single_node_windows_test_stage_with_lock("EFA_Windows_Test", c5n18x_lock_label) // Multi Node Tests - EFA - stages["2_hpc6a_alinux2_efa"] = get_test_stage("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_hpc6a_alinux2023_efa"] = 
get_test_stage("2_hpc6a_alinux2023_efa", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c6gn_alinux2_efa"] = get_test_stage("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c6gn_alinux2023_efa"] = get_test_stage("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c5n_alinux2_efa"] = get_test_stage("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_c5n_alinux2023_efa"] = get_test_stage("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) - stages["2_hpc6a_rhel8_efa"] = get_test_stage("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["2_hpc6a_alinux2_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) + stages["2_hpc6a_alinux2023_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) + stages["2_c6gn_alinux2_efa"] = get_test_stage_with_lock("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6gn16x_lock_label, addl_args_pr) + stages["2_c6gn_alinux2023_efa"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6gn16x_lock_label, 
addl_args_pr) + stages["2_c5n_alinux2_efa"] = get_test_stage_with_lock("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", c5n18x_lock_label, addl_args_pr) + stages["2_c5n_alinux2023_efa"] = get_test_stage_with_lock("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", c5n18x_lock_label, addl_args_pr) + stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) + stages["2_hpc6a_rhel8_efa"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) // Multi Node Tests - TCP - stages["2_c6g_alinux2_tcp"] = get_test_stage("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_alinux2023_tcp"] = get_test_stage("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_ubuntu2004_tcp"] = get_test_stage("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_rhel8_tcp"] = get_test_stage("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["3_g4dn_alinux2_tcp"] = get_test_stage("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp --enable-efa false --test-list test_nccl_tests") + 
stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") + stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") + stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") + stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") + stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", "libfabric_pr_test.yaml", g4dn12x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false --test-list test_nccl_tests") // Multi Node Tests - SOCKETS - stages["2_c6g_alinux2_sockets"] = get_test_stage("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_alinux2023_sockets"] = get_test_stage("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_ubuntu2004_sockets"] = get_test_stage("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets 
--enable-efa false") - stages["2_c6g_rhel8_sockets"] = get_test_stage("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") + stages["2_c6g_alinux2_sockets"] = get_test_stage_with_lock("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") + stages["2_c6g_alinux2023_sockets"] = get_test_stage_with_lock("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") + stages["2_c6g_ubuntu2004_sockets"] = get_test_stage_with_lock("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") + stages["2_c6g_rhel8_sockets"] = get_test_stage_with_lock("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") parallel stages } From a03c5a71fef61a79648005c88db820790fdc5570 Mon Sep 17 00:00:00 2001 From: John Heemstra Date: Fri, 1 Nov 2024 11:35:59 -0400 Subject: [PATCH 216/393] prov/cxi: ignore FLT_OVERFLOW and FLT_INVALID errors Don't treat CXIP_COLL_RC_FLT_OVERFLOW and CXIP_COLL_RC_FLT_INVALID as errors. Add logging for when one of these two or CXIP_COLL_RC_FLT_INEXACT is encountered. 
Signed-off-by: John Heemstra --- prov/cxi/src/cxip_coll.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index 0b121496c33..9d9c6d73316 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -1820,16 +1820,25 @@ static void _post_coll_complete(struct cxip_coll_reduction *reduction) /* convert Rosetta return codes to CXIP return codes */ if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS || - reduction->accum.red_rc == CXIP_COLL_RC_FLT_INEXACT) { - ret = cxip_cq_req_complete(req); - } else { + reduction->accum.red_rc == CXIP_COLL_RC_FLT_INEXACT || + reduction->accum.red_rc == CXIP_COLL_RC_FLT_INVALID || + reduction->accum.red_rc == CXIP_COLL_RC_FLT_OVERFLOW) { switch (reduction->accum.red_rc) { - case CXIP_COLL_RC_FLT_OVERFLOW: - prov = FI_CXI_ERRNO_RED_FLT_OVERFLOW; + case CXIP_COLL_RC_FLT_INEXACT: + CXIP_WARN("coll reduce FLT result was rounded\n"); break; case CXIP_COLL_RC_FLT_INVALID: - prov = FI_CXI_ERRNO_RED_FLT_INVALID; + CXIP_WARN("coll reduce FLT invalid\n"); break; + case CXIP_COLL_RC_FLT_OVERFLOW: + CXIP_WARN("coll reduce FLT overflow\n"); + break; + default: + break; + } + ret = cxip_cq_req_complete(req); + } else { + switch (reduction->accum.red_rc) { case CXIP_COLL_RC_INT_OVERFLOW: prov = FI_CXI_ERRNO_RED_INT_OVERFLOW; break; From 230b840d833ab02fef23a6e4c201c0e3a3c2f518 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 6 Nov 2024 19:43:27 +0000 Subject: [PATCH 217/393] prov/efa: Fall back to zero sl when non-zero sl qp creation failed When driver doesn't support non-zero sl, the qp creation will fail. In this case, fallback to zero sl qp. 
Signed-off-by: Shi Jin --- prov/efa/src/efa_base_ep.c | 13 ++++++++++++- prov/efa/src/efa_base_ep.h | 3 ++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index d022c0e3ef6..db75afa01e0 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -186,14 +186,25 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, #endif efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; #if HAVE_EFADV_SL + efa_attr.sl = EFA_QP_DEFAULT_SERVICE_LEVEL; if (tclass == FI_TC_LOW_LATENCY) - efa_attr.sl = EFA_QP_DEFAULT_SERVICE_LEVEL; + efa_attr.sl = EFA_QP_LOW_LATENCY_SERVICE_LEVEL; #endif (*qp)->ibv_qp = efadv_create_qp_ex( init_attr_ex->pd->context, init_attr_ex, &efa_attr, sizeof(struct efadv_qp_init_attr)); } +#if HAVE_EFADV_SL + if (!(*qp)->ibv_qp && tclass == FI_TC_LOW_LATENCY) { + EFA_INFO(FI_LOG_EP_CTRL, "ibv_create_qp failed with sl %u, errno: %d. Retrying with default sl.\n", efa_attr.sl, errno); + efa_attr.sl = EFA_QP_DEFAULT_SERVICE_LEVEL; + (*qp)->ibv_qp = efadv_create_qp_ex( + init_attr_ex->pd->context, init_attr_ex, &efa_attr, + sizeof(struct efadv_qp_init_attr)); + } +#endif + if (!(*qp)->ibv_qp) { EFA_WARN(FI_LOG_EP_CTRL, "ibv_create_qp failed. 
errno: %d\n", errno); free(*qp); diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 6cde8f9f4bf..6a761ce4dc0 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -12,7 +12,8 @@ #include "ofi_util.h" #include "rdm/efa_rdm_protocol.h" -#define EFA_QP_DEFAULT_SERVICE_LEVEL 8 +#define EFA_QP_DEFAULT_SERVICE_LEVEL 0 +#define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8 struct efa_qp { struct ibv_qp *ibv_qp; From 6aa358db3863c93b345550e6097d984f329a960e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:05:53 +0000 Subject: [PATCH 218/393] build(deps): bump github/codeql-action from 3.27.0 to 3.27.1 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.27.0 to 3.27.1. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/662472033e021d55d94146f66f6058822b0b39fd...4f3212b61783c3c68e8309a0f18a699764811cda) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 6ab9813507f..c3f0ceba04b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/init@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. 
@@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/autobuild@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/analyze@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 7d8974b3e3d..40dcc2a5da8 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/upload-sarif@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 with: sarif_file: results.sarif From 4feb9c35c0580494e4efe0d5ab99d91c0753e4eb Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 6 Nov 2024 15:09:25 -0600 Subject: [PATCH 219/393] prov/cxi: Fix broken client key check Clients could provide an RKEY greater than 4 bytes and the provider would not return -FI_EKEYREJECTED. 
Signed-off-by: Ian Ziemba --- prov/cxi/src/cxip_mr.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index 6a53ea87af5..b52e6d22d1a 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -758,6 +758,14 @@ static void cxip_mr_domain_remove(struct cxip_mr *mr) ofi_spin_unlock(&mr->domain->mr_domain.lock); } +static bool cxip_is_valid_mr_key(uint64_t key) +{ + if (key & ~CXIP_MR_KEY_MASK) + return false; + + return true; +} + /* * cxip_mr_domain_insert() - Validate uniqueness and insert * client key in the domain hash table. @@ -777,7 +785,7 @@ static int cxip_mr_domain_insert(struct cxip_mr *mr) mr->key = mr->attr.requested_key; - if (!cxip_generic_is_valid_mr_key(mr->key)) + if (!cxip_is_valid_mr_key(mr->key)) return -FI_EKEYREJECTED; bucket = fasthash64(&mr->key, sizeof(mr->key), 0) % @@ -851,14 +859,6 @@ static int cxip_prov_cache_init_mr_key(struct cxip_mr *mr, return FI_SUCCESS; } -static bool cxip_is_valid_mr_key(uint64_t key) -{ - if (key & ~CXIP_MR_KEY_MASK) - return false; - - return true; -} - static bool cxip_is_valid_prov_mr_key(uint64_t key) { struct cxip_mr_key cxip_key = { From 50b9a42fb0545784025b7e7c2e22a5865d0b782f Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 6 Nov 2024 15:20:54 -0600 Subject: [PATCH 220/393] prov/cxi: Add test for invalid client RKEY Signed-off-by: Ian Ziemba --- prov/cxi/test/mr.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/prov/cxi/test/mr.c b/prov/cxi/test/mr.c index 0c21f1e3c5d..3699506c20c 100644 --- a/prov/cxi/test/mr.c +++ b/prov/cxi/test/mr.c @@ -51,6 +51,25 @@ Test(mr, invalid_fi_directed_recv_flag) cr_assert_eq(ret, -FI_EINVAL, "fi_mr_regattr failed: %d", ret); } +Test(mr, invalid_client_rkey) +{ + int ret; + struct fi_mr_attr attr = {}; + struct iovec iov = {}; + struct fid_mr *mr; + + iov.iov_len = sizeof(ret); + iov.iov_base = (void *)&ret; + + attr.mr_iov = &iov; + attr.iov_count = 1; + 
attr.access = FI_REMOTE_READ | FI_REMOTE_WRITE; + attr.requested_key = ~1; + + ret = fi_mr_regattr(cxit_domain, &attr, 0, &mr); + cr_assert_eq(ret, -FI_EKEYREJECTED, "fi_mr_regattr failed: %d", ret); +} + Test(mr, std_mrs, .timeout = 600, .disabled = true) { int std_mr_cnt = 16*1024; From 33e8e868d036fcc607c1da9b91c77944e33d87f1 Mon Sep 17 00:00:00 2001 From: wey Date: Wed, 6 Nov 2024 10:03:41 +0800 Subject: [PATCH 221/393] core: Remove redundant windows.h The windows.h header is not necessary for ssize_t definition on the Windows platform since BaseTsd.h already provides the required SSIZE_T type definition. Remove this redundant include to reduce compilation dependencies. Signed-off-by: wey --- include/rdma/fabric.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 366e6b0402b..42c50532797 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -65,7 +65,6 @@ #if defined(_WIN32) #include -#include typedef SSIZE_T ssize_t; #endif From 39c2f970ac0574bba264a3c7937937faea3caef1 Mon Sep 17 00:00:00 2001 From: John Heemstra Date: Tue, 12 Nov 2024 13:58:20 -0500 Subject: [PATCH 222/393] prov/cxi: update unit test for collectives Update the unit tests for collectives to account for previously-merged PR: https://github.com/ofiwg/libfabric/pull/10513 Signed-off-by: John Heemstra --- prov/cxi/test/coll.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/prov/cxi/test/coll.c b/prov/cxi/test/coll.c index 4dbcfcf26b8..d88c95efa98 100644 --- a/prov/cxi/test/coll.c +++ b/prov/cxi/test/coll.c @@ -2032,7 +2032,7 @@ Test(coll_reduce_ops, fmin) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + cr_assert(ret, "rc NAN succeeded\n"); data[1].fval[1] = _snan64(); _predict_fmin(nodes, data, &check, true); @@ -2041,7 +2041,7 @@ Test(coll_reduce_ops, fmin) 
ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2070,7 +2070,7 @@ Test(coll_reduce_ops, fmax) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + cr_assert(ret, "rc NAN succeeded\n"); data[1].fval[1] = _snan64(); _predict_fmax(nodes, data, &check, true); @@ -2079,7 +2079,7 @@ Test(coll_reduce_ops, fmax) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2134,7 +2134,7 @@ Test(coll_reduce_ops, fminmaxloc) ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2166,7 +2166,7 @@ Test(coll_reduce_ops, fminnum) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + cr_assert(ret, "rc NAN succeeded\n"); /* number is given preference over NAN */ data[1].fval[1] = _snan64(); @@ -2177,7 +2177,7 @@ Test(coll_reduce_ops, fminnum) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2209,7 +2209,7 @@ Test(coll_reduce_ops, fmaxnum) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed NAN\n"); ret = _check_rc(nodes, context, 
FI_CXI_ERRNO_RED_FLT_OVERFLOW); - cr_assert(!ret, "rc failed NAN\n"); + cr_assert(ret, "rc NAN succeeded\n"); /* SNAN is given preference over number */ data[1].fval[1] = _snan64(); @@ -2220,7 +2220,7 @@ Test(coll_reduce_ops, fmaxnum) ret = _check_fval(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } @@ -2275,7 +2275,7 @@ Test(coll_reduce_ops, fminmaxnumloc) ret = _check_fminmax(nodes, rslt, &check); cr_assert(!ret, "compare failed sNAN\n"); ret = _check_rc(nodes, context, FI_CXI_ERRNO_RED_FLT_INVALID); - cr_assert(!ret, "rc failed sNAN\n"); + cr_assert(ret, "rc sNAN succeeded\n"); STDCLEANUP } From 4c2c5a61f76be0ef37653c4017a2435df36d0b14 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 11 Nov 2024 22:06:25 +0000 Subject: [PATCH 223/393] prov/efa: Fix the error handling for unsolicited recv When getting an wc error for rdma with imm + unsolicited recv, there is no wr_id / pkt_entry associated. Libfabric should write the eq error directly. Also added unit tests to cover both solicited and unsolicited recv error path. 
Signed-off-by: Shi Jin --- prov/efa/Makefile.include | 3 +- prov/efa/src/rdm/efa_rdm_cq.c | 6 ++ prov/efa/test/efa_unit_test_cq.c | 93 +++++++++++++++++++++++++++++ prov/efa/test/efa_unit_test_mocks.c | 16 +++++ prov/efa/test/efa_unit_test_mocks.h | 8 +++ prov/efa/test/efa_unit_tests.c | 3 + prov/efa/test/efa_unit_tests.h | 2 + 7 files changed, 130 insertions(+), 1 deletion(-) diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 4963fe404e3..fc065cb42e4 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -161,7 +161,8 @@ prov_efa_test_efa_unit_test_LDFLAGS = $(cmocka_rpath) $(efa_LDFLAGS) $(cmocka_LD -Wl,--wrap=efadv_query_device \ -Wl,--wrap=ofi_cudaMalloc \ -Wl,--wrap=ofi_copy_from_hmem_iov \ - -Wl,--wrap=efa_rdm_pke_read + -Wl,--wrap=efa_rdm_pke_read \ + -Wl,--wrap=efa_device_support_unsolicited_write_recv if HAVE_EFADV_CQ_EX prov_efa_test_efa_unit_test_LDFLAGS += -Wl,--wrap=efadv_create_cq diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 67a02e55f3d..2d8c1d8811f 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -487,6 +487,12 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) break; case IBV_WC_RECV: /* fall through */ case IBV_WC_RECV_RDMA_WITH_IMM: + if (efa_rdm_cq_wc_is_unsolicited(ibv_cq->ibv_cq_ex)) { + EFA_WARN(FI_LOG_CQ, "Receive error %s (%d) for unsolicited write recv", + efa_strerror(prov_errno), prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, to_fi_errno(prov_errno), prov_errno); + break; + } efa_rdm_pke_handle_rx_error(pkt_entry, prov_errno); break; default: diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 76d45368e87..0c823d0f15b 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -345,6 +345,99 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) assert_int_equal(eq_err_entry.prov_errno, 
EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); } +/** + * @brief verify that fi_cq_read/fi_eq_read works properly when rdma-core return bad status for + * recv rdma with imm. + * + * When getting a wc error of op code IBV_WC_RECV_RDMA_WITH_IMM, libfabric cannot find the + * corresponding application operation to write a cq error. + * It will write an EQ error instead. + * + * @param[in] state struct efa_resource that is managed by the framework + * @param[in] use_unsolicited_recv whether to use unsolicited write recv + */ +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource **state, bool use_unsolicited_recv) +{ + struct efa_rdm_ep *efa_rdm_ep; + struct efa_resource *resource = *state; + struct fi_cq_data_entry cq_entry; + struct fi_eq_err_entry eq_err_entry; + int ret; + struct efa_rdm_cq *efa_rdm_cq; + + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + + efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + + will_return(efa_mock_ibv_start_poll_return_mock, 0); + will_return(efa_mock_ibv_end_poll_check_mock, NULL); + /* efa_mock_ibv_read_opcode_return_mock() will be called once in release mode, + * but will be called twice in debug mode. 
because there is an assertion that called ibv_read_opcode(), + * therefore use will_return_always() + */ + will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV_RDMA_WITH_IMM); + will_return_always(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num); + will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_FLUSHED); + + g_efa_unit_test_mocks.efa_device_support_unsolicited_write_recv = &efa_mock_efa_device_support_unsolicited_write_recv; + +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV + if (use_unsolicited_recv) { + efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + will_return(efa_mock_efa_device_support_unsolicited_write_recv, true); + will_return(efa_mock_efadv_wc_is_unsolicited, true); + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = 0; + } else { + /* + * For solicited write recv, it will consume an internal rx pkt + */ + will_return(efa_mock_efa_device_support_unsolicited_write_recv, false); + struct efa_rdm_pke *pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + } +#else + /* + * Always test with solicited recv + */ + will_return(efa_mock_efa_device_support_unsolicited_write_recv, false); + struct efa_rdm_pke *pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; +#endif + /* the recv rdma with imm will not populate to application cq because it's an EFA internal error and + * and not related to any application operations. Currently we can only read the error from eq. 
+ */ + efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + ret = fi_cq_read(resource->cq, &cq_entry, 1); + assert_int_equal(ret, -FI_EAGAIN); + + ret = fi_eq_readerr(resource->eq, &eq_err_entry, 0); + assert_int_equal(ret, sizeof(eq_err_entry)); + assert_int_not_equal(eq_err_entry.err, FI_SUCCESS); + assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_FLUSHED); +} + +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_unsolicited_recv(struct efa_resource **state) +{ + test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(state, true); +} + +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_solicited_recv(struct efa_resource **state) +{ + test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(state, false); +} + /** * @brief verify that fi_cq_read/fi_cq_readerr works properly when ibv_start_poll failed. * diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index ee97098d001..75dd2bad732 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -182,6 +182,11 @@ uint32_t efa_mock_ibv_read_wc_flags_return_mock(struct ibv_cq_ex *current) return mock(); } +bool efa_mock_efadv_wc_is_unsolicited(struct efadv_cq *efadv_cq) +{ + return mock(); +} + int g_ofi_copy_from_hmem_iov_call_counter; ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, @@ -197,6 +202,11 @@ int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope) return mock(); } +bool efa_mock_efa_device_support_unsolicited_write_recv() +{ + return mock(); +} + struct efa_unit_test_mocks g_efa_unit_test_mocks = { .local_host_id = 0, .peer_host_id = 0, @@ -213,6 +223,7 @@ struct efa_unit_test_mocks g_efa_unit_test_mocks = { #endif .ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov, .efa_rdm_pke_read = __real_efa_rdm_pke_read, + .efa_device_support_unsolicited_write_recv = __real_efa_device_support_unsolicited_write_recv, 
.ibv_is_fork_initialized = __real_ibv_is_fork_initialized, #if HAVE_EFADV_QUERY_MR .efadv_query_mr = __real_efadv_query_mr, @@ -347,6 +358,11 @@ int __wrap_efa_rdm_pke_read(struct efa_rdm_ope *ope) return g_efa_unit_test_mocks.efa_rdm_pke_read(ope); } +bool __wrap_efa_device_support_unsolicited_write_recv(void) +{ + return g_efa_unit_test_mocks.efa_device_support_unsolicited_write_recv(); +} + enum ibv_fork_status __wrap_ibv_is_fork_initialized(void) { return g_efa_unit_test_mocks.ibv_is_fork_initialized(); diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index ec9af71b7ec..3e764c91fb1 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -72,6 +72,8 @@ uint32_t efa_mock_ibv_read_qp_num_return_mock(struct ibv_cq_ex *current); uint32_t efa_mock_ibv_read_wc_flags_return_mock(struct ibv_cq_ex *current); +bool efa_mock_efadv_wc_is_unsolicited(struct efadv_cq *efadv_cq); + ssize_t __real_ofi_copy_from_hmem_iov(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, const struct iovec *hmem_iov, @@ -85,8 +87,12 @@ ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size, int __real_efa_rdm_pke_read(struct efa_rdm_ope *ope); +bool __real_efa_device_support_unsolicited_write_recv(); + int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope); +bool efa_mock_efa_device_support_unsolicited_write_recv(void); + struct efa_unit_test_mocks { uint64_t local_host_id; @@ -118,6 +124,8 @@ struct efa_unit_test_mocks int (*efa_rdm_pke_read)(struct efa_rdm_ope *ope); + bool (*efa_device_support_unsolicited_write_recv)(void); + enum ibv_fork_status (*ibv_is_fork_initialized)(void); #if HAVE_EFADV_QUERY_MR diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index cf3bc976884..7c485058132 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -61,6 +61,7 @@ static int efa_unit_test_mocks_teardown(void **state) 
#endif .ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov, .efa_rdm_pke_read = __real_efa_rdm_pke_read, + .efa_device_support_unsolicited_write_recv = __real_efa_device_support_unsolicited_write_recv, .ibv_is_fork_initialized = __real_ibv_is_fork_initialized, }; @@ -122,6 +123,8 @@ int main(void) cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_unsolicited_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_solicited_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_recover_forgotten_peer_ah, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_ignore_removed_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_fallback_to_ibv_create_cq_ex_cq_read_ignore_forgotton_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 5422295f1b3..c4605c8e962 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -137,6 +137,8 @@ void test_rdm_cq_read_bad_send_status_unreachable_receiver(); void test_rdm_cq_read_bad_send_status_invalid_qpn(); void test_rdm_cq_read_bad_send_status_message_too_long(); void test_ibv_cq_ex_read_bad_recv_status(); +void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_unsolicited_recv(); +void 
test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_use_solicited_recv(); void test_ibv_cq_ex_read_recover_forgotten_peer_ah(); void test_rdm_fallback_to_ibv_create_cq_ex_cq_read_ignore_forgotton_peer(); void test_ibv_cq_ex_read_ignore_removed_peer(); From f61cb8ac0251b7a2f58904ac478e0f2962c112a4 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 13 Nov 2024 04:45:27 +0000 Subject: [PATCH 224/393] contrib/aws: Increase ci timeout limit Signed-off-by: Shi Jin --- contrib/aws/Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index a4cb883614c..2edbb0c5945 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -169,8 +169,8 @@ pipeline { script { def stages = [:] // This needs the extra space at the end - // Set 9 hour timeout for all clusters - def addl_args_pr = "--timeout 540 --test-libfabric-pr $env.CHANGE_ID " + // Set 12 hour timeout for all clusters + def addl_args_pr = "--timeout 720 --test-libfabric-pr $env.CHANGE_ID " // Use lockable resources to limit the number of jobs that can get executed in parallel def g4dn8x_lock_label = "g4dn8x" def g4dn12x_lock_label = "g4dn12x" From 15278bb999d3c0a36bad541c309acaba99530426 Mon Sep 17 00:00:00 2001 From: nikhil nanal Date: Mon, 26 Aug 2024 15:01:27 -0700 Subject: [PATCH 225/393] fabtests: New fabtest fi_flood to test over subscription of resources 1. MR cache based registrations tests register and send in batch and sequential modes while flooding the cache beyond the maximum size. 2. 
Test receipt of unexpected messages by overwhelming the receiver Signed-off-by: nikhil nanal --- fabtests/Makefile.am | 10 +- fabtests/Makefile.win | 4 +- fabtests/fabtests.vcxproj | 2 +- fabtests/fabtests.vcxproj.filters | 2 +- fabtests/functional/{bw.c => flood.c} | 138 ++++++++++++++------- fabtests/man/fabtests.7.md | 11 +- fabtests/man/man1/{fi_bw.1 => fi_flood.1} | 0 fabtests/pytest/default/test_rdm.py | 2 +- fabtests/pytest/efa/test_flood_peer.py | 2 +- fabtests/scripts/runfabtests.cmd | 6 +- fabtests/scripts/runfabtests.sh | 6 +- fabtests/test_configs/tcp/io_uring.exclude | 9 +- 12 files changed, 119 insertions(+), 73 deletions(-) rename fabtests/functional/{bw.c => flood.c} (70%) rename fabtests/man/man1/{fi_bw.1 => fi_flood.1} (100%) diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index 6d830668833..a60ec46cb41 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -40,7 +40,7 @@ bin_PROGRAMS = \ functional/fi_rdm_atomic \ functional/fi_rdm_stress \ functional/fi_multi_recv \ - functional/fi_bw \ + functional/fi_flood \ functional/fi_rdm_multi_client \ functional/fi_loopback \ benchmarks/fi_msg_pingpong \ @@ -360,9 +360,9 @@ functional_fi_multi_recv_SOURCES = \ functional/multi_recv.c functional_fi_multi_recv_LDADD = libfabtests.la -functional_fi_bw_SOURCES = \ - functional/bw.c -functional_fi_bw_LDADD = libfabtests.la +functional_fi_flood_SOURCES = \ + functional/flood.c +functional_fi_flood_LDADD = libfabtests.la functional_fi_rdm_multi_client_SOURCES = \ functional/rdm_multi_client.c @@ -651,7 +651,7 @@ dummy_man_pages = \ man/man1/fi_eq_test.1 \ man/man1/fi_getinfo_test.1 \ man/man1/fi_mr_test.1 \ - man/man1/fi_bw.1 \ + man/man1/fi_flood.1 \ man/man1/fi_rdm_multi_client.1 \ man/man1/fi_ubertest.1 \ man/man1/fi_efa_ep_rnr_retry.1 diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index da244c78735..fb7924227c8 100644 --- a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -77,7 +77,7 @@ benchmarks: 
$(outdir)\dgram_pingpong.exe $(outdir)\msg_bw.exe \ $(outdir)\rdm_pingpong.exe $(outdir)\rma_pingpong.exe $(outdir)\rdm_tagged_bw.exe \ $(outdir)\rdm_bw.exe $(outdir)\rdm_tagged_pingpong.exe $(outdir)\rma_bw.exe -functional: $(outdir)\av_xfer.exe $(outdir)\bw.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ +functional: $(outdir)\av_xfer.exe $(outdir)\flood.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ $(outdir)\dgram.exe $(outdir)\msg.exe $(outdir)\msg_epoll.exe \ $(outdir)\inject_test.exe $(outdir)\msg_sockets.exe $(outdir)\multi_mr.exe \ $(outdir)\multi_ep.exe $(outdir)\multi_recv.exe $(outdir)\rdm.exe \ @@ -120,7 +120,7 @@ $(outdir)\rma_bw.exe: {benchmarks}rma_bw.c $(basedeps) {benchmarks}benchmark_sha $(outdir)\av_xfer.exe: {functional}av_xfer.c $(basedeps) -$(outdir)\bw.exe: {functional}bw.c $(basedeps) +$(outdir)\flood.exe: {functional}flood.c $(basedeps) $(outdir)\cm_data.exe: {functional}cm_data.c $(basedeps) diff --git a/fabtests/fabtests.vcxproj b/fabtests/fabtests.vcxproj index cba59f15c1a..65b0af71be0 100644 --- a/fabtests/fabtests.vcxproj +++ b/fabtests/fabtests.vcxproj @@ -239,7 +239,7 @@ - + diff --git a/fabtests/fabtests.vcxproj.filters b/fabtests/fabtests.vcxproj.filters index d3f495b81e2..e113cbf898e 100644 --- a/fabtests/fabtests.vcxproj.filters +++ b/fabtests/fabtests.vcxproj.filters @@ -225,7 +225,7 @@ Source Files\functional - + Source Files\functional diff --git a/fabtests/functional/bw.c b/fabtests/functional/flood.c similarity index 70% rename from fabtests/functional/bw.c rename to fabtests/functional/flood.c index 04745e61374..f85f5274e75 100644 --- a/fabtests/functional/bw.c +++ b/fabtests/functional/flood.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Intel Corporation. All rights reserved. + * Copyright (c) Intel Corporation. All rights reserved. 
* * This software is available to you under the BSD license * below: @@ -34,7 +34,7 @@ #include -int sleep_time = 0; +static int sleep_time = 0; static ssize_t post_one_tx(struct ft_context *msg) { @@ -99,27 +99,96 @@ static int post_rx_sync(void) return ret; } -static int run_loop(void) +static void mr_close_all(struct ft_context *ctx_arr, int window_size) +{ + int i; + + for (i = 0; i < window_size; i++) + FT_CLOSE_FID(ctx_arr[i].mr); +} + +static int run_seq_mr_send(void) { + + int ret; + int i; + + mr_close_all(tx_ctx_arr, opts.window_size); + mr_close_all(rx_ctx_arr, opts.window_size); + + printf("Sequential memory registration:"); + if (opts.dst_addr) { + for (i = 0; i < opts.window_size; i++) { + ret = ft_reg_mr(fi, tx_ctx_arr[i].buf, tx_mr_size, + ft_info_to_mr_access(fi), + FT_TX_MR_KEY + i, opts.iface, opts.device, + &(tx_ctx_arr[i].mr), &(tx_ctx_arr[i].desc)); + if (ret) + goto out; + + ret = post_one_tx(&tx_ctx_arr[i]); + if (ret) + goto out; + + ret = ft_get_tx_comp(tx_seq); + if (ret) + goto out; + + FT_CLOSE_FID(tx_ctx_arr[i].mr); + } + } else { + for (i = 0; i < opts.window_size; i++) { + ret = ft_reg_mr(fi, rx_ctx_arr[i].buf, rx_mr_size, + ft_info_to_mr_access(fi), FT_RX_MR_KEY + i, opts.iface, opts.device, + &(rx_ctx_arr[i].mr), + &(rx_ctx_arr[i].desc)); + if (ret) + goto out; + + ret = ft_post_rx_buf(ep, opts.transfer_size, + &(rx_ctx_arr[i].context), + rx_ctx_arr[i].buf, + rx_ctx_arr[i].desc, ft_tag); + if (ret) + goto out; + + ret = wait_check_rx_bufs(); + if (ret) + goto out; + + FT_CLOSE_FID(rx_ctx_arr[i].mr); + } + } + if (opts.options & FT_OPT_OOB_SYNC) + ret = ft_sync(); + else + ret = post_rx_sync(); +out: + printf("%s\n", ret ? "Fail" : "Pass"); + return ret; +} + +static int run_batch_mr_send(void) { int ret, i; /* Receive side delay is used in order to let the sender - get ahead of the receiver and post multiple sends - before the receiver begins processing them. 
*/ + * get ahead of the receiver and post multiple sends + * before the receiver begins processing them. + */ if (!opts.dst_addr) sleep(sleep_time); - ft_start(); + printf("Batch memory registration:"); if (opts.dst_addr) { for (i = 0; i < opts.window_size; i++) { ret = post_one_tx(&tx_ctx_arr[i]); if (ret) - return ret; + goto out; } ret = ft_get_tx_comp(tx_seq); if (ret) - return ret; + goto out; } else { for (i = 0; i < opts.window_size; i++) { ret = ft_post_rx_buf(ep, opts.transfer_size, @@ -127,66 +196,39 @@ static int run_loop(void) rx_ctx_arr[i].buf, rx_ctx_arr[i].desc, 0); if (ret) - return ret; + goto out; } ret = wait_check_rx_bufs(); if (ret) - return ret; + goto out; } - ft_stop(); if (opts.options & FT_OPT_OOB_SYNC) ret = ft_sync(); else ret = post_rx_sync(); - if (ret) - return ret; - - if (opts.machr) - show_perf_mr(opts.transfer_size, opts.window_size, &start, &end, 1, - opts.argc, opts.argv); - else - show_perf(NULL, opts.transfer_size, opts.window_size, &start, &end, 1); - +out: + printf("%s\n", ret ? "Fail" : "Pass"); return ret; } static int run(void) { - int ret, i; + int ret; ret = hints->ep_attr->type == FI_EP_MSG ? 
ft_init_fabric_cm() : ft_init_fabric(); if (ret) return ret; - - ret = ft_tx(ep, remote_fi_addr, 1, &tx_ctx); - if (ret) - return ret; - ret = ft_get_tx_comp(tx_seq); + ret = run_batch_mr_send(); if (ret) - return ret; + goto out; - ret = ft_get_rx_comp(rx_seq); + ret = run_seq_mr_send(); if (ret) - return ret; - - if (!(opts.options & FT_OPT_SIZE)) { - for (i = 0; i < TEST_CNT; i++) { - if (!ft_use_size(i, opts.sizes_enabled)) - continue; - opts.transfer_size = test_size[i].size; - ret = run_loop(); - if (ret) - goto out; - } - } else { - ret = run_loop(); - if (ret) - goto out; - } + goto out; out: return ret; @@ -197,6 +239,8 @@ int main(int argc, char **argv) int op, ret; opts = INIT_OPTS; + opts.options |= FT_OPT_ALLOC_MULT_MR; + opts.options |= FT_OPT_NO_PRE_POSTED_RX; hints = fi_allocinfo(); if (!hints) @@ -225,7 +269,7 @@ int main(int argc, char **argv) break; case '?': case 'h': - ft_usage(argv[0], "A bandwidth test with data verification."); + ft_usage(argv[0], "test to oversubscribe mr cache and receiver with unexpected msgs."); FT_PRINT_OPTS_USAGE("-T sleep_time", "Receive side delay before starting"); FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); @@ -243,8 +287,6 @@ int main(int argc, char **argv) hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; - opts.options |= FT_OPT_ALLOC_MULT_MR; - if (hints->ep_attr->type == FI_EP_DGRAM) { fprintf(stderr, "This test does not support DGRAM endpoints\n"); return -FI_EINVAL; @@ -260,4 +302,4 @@ int main(int argc, char **argv) ft_free_res(); return ft_exit_code(ret); -} +} \ No newline at end of file diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index 4f31360e1cc..bac6c1b3c4c 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -140,10 +140,13 @@ features of libfabric. buffer tries to remain the same. This test is used to validate the correct behavior of memory registration caches. 
-*fi_bw* -: Performs a one-sided bandwidth test with an option for data verification. - A sleep time on the receiving side can be enabled in order to allow - the sender to get ahead of the receiver. +*fi_flood* +: The test performs a one-sided transfer by utilizing Bulk Memory Region (MR) + registration and flooding the receiver with unexpected messages. This is + followed by sequential MR registration transfers, which force the MR cache + to evict the least recently used MRs before making new transfers. An optional + sleep time can be enabled on the receiving side to allow the sender to get + ahead of the receiver. *fi_rdm_multi_client* : Tests a persistent server communicating with multiple clients, one at a diff --git a/fabtests/man/man1/fi_bw.1 b/fabtests/man/man1/fi_flood.1 similarity index 100% rename from fabtests/man/man1/fi_bw.1 rename to fabtests/man/man1/fi_flood.1 diff --git a/fabtests/pytest/default/test_rdm.py b/fabtests/pytest/default/test_rdm.py index 1978006db21..0236fbd8b79 100644 --- a/fabtests/pytest/default/test_rdm.py +++ b/fabtests/pytest/default/test_rdm.py @@ -39,7 +39,7 @@ def test_rdm_shared_av(cmdline_args): @pytest.mark.functional def test_rdm_bw_functional(cmdline_args, completion_semantic): from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_bw -e rdm -v -T 1", completion_semantic=completion_semantic) + test = ClientServerTest(cmdline_args, "fi_flood -e rdm -v -T 1", completion_semantic=completion_semantic) test.run() @pytest.mark.parametrize("iteration_type", diff --git a/fabtests/pytest/efa/test_flood_peer.py b/fabtests/pytest/efa/test_flood_peer.py index d49cfdd1c63..ee321e007f2 100644 --- a/fabtests/pytest/efa/test_flood_peer.py +++ b/fabtests/pytest/efa/test_flood_peer.py @@ -3,6 +3,6 @@ @pytest.mark.functional def test_flood_peer(cmdline_args): from common import ClientServerTest - test = ClientServerTest(cmdline_args, "fi_bw -e rdm -W 6400 -S 512 -T 5", + test = ClientServerTest(cmdline_args, 
"fi_flood -e rdm -W 6400 -S 512 -T 5", timeout=300) test.run() diff --git a/fabtests/scripts/runfabtests.cmd b/fabtests/scripts/runfabtests.cmd index 07e21d4f2eb..5fb9b3833da 100644 --- a/fabtests/scripts/runfabtests.cmd +++ b/fabtests/scripts/runfabtests.cmd @@ -74,9 +74,9 @@ set functional_tests=^ "inject_test -N -A inject -v"^ "inject_test -A inj_complete -v"^ "inject_test -N -A inj_complete -v"^ - "bw -e rdm -v -T 1"^ - "bw -e rdm -v -T 1 -U"^ - "bw -e msg -v -T 1"^ + "flood -e rdm -v -T 1"^ + "flood -e rdm -v -T 1 -U"^ + "flood -e msg -v -T 1"^ "rdm_multi_client -C 10 -I 5"^ "rdm_multi_client -C 10 -I 5 -U" diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 65cc9958f6f..a6c3b075576 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -150,9 +150,9 @@ functional_tests=( "fi_inject_test -N -A inject -v" "fi_inject_test -A inj_complete -v" "fi_inject_test -N -A inj_complete -v" - "fi_bw -e rdm -v -T 1" - "fi_bw -e rdm -v -T 1 -U" - "fi_bw -e msg -v -T 1" + "fi_flood -e rdm -v -T 1" + "fi_flood -e rdm -v -T 1 -U" + "fi_flood -e msg -v -T 1" "fi_rdm_multi_client -C 10 -I 5" "fi_rdm_multi_client -C 10 -I 5 -U" ) diff --git a/fabtests/test_configs/tcp/io_uring.exclude b/fabtests/test_configs/tcp/io_uring.exclude index d84ee2fde6a..9b7a5c73176 100644 --- a/fabtests/test_configs/tcp/io_uring.exclude +++ b/fabtests/test_configs/tcp/io_uring.exclude @@ -70,13 +70,14 @@ fi_msg_sockets # fi_unexpected_msg -e rdm fails with no message fi_unexpected_msg -e rdm -# fi_bw -e msg fails with +# fi_flood -e msg fails with # fi_eq_sread(): common/shared.c:1165, ret=-4 (Interrupted system call) -fi_bw -e msg +fi_flood -e msg -# fi_bw fails by hanging +# fi_flood fails by hanging +# fi_flood fails by runfabtest timeout only on the CI. 
# This is a suspected race condition -fi_bw +fi_flood # fi_msg_pingpong fails with # fi_eq_sread(): common/shared.c:1127, ret=-4 (Interrupted system call) From c2d2efe69e1def57fdb548a945970c79d9357dcd Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Wed, 13 Nov 2024 23:50:32 +0000 Subject: [PATCH 226/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- fabtests/man/man7/fabtests.7 | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fabtests/man/man7/fabtests.7 b/fabtests/man/man7/fabtests.7 index 567ef27e978..22707d61522 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fabtests" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fabtests" "7" "2024\-11\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -142,10 +142,13 @@ tries to remain the same. This test is used to validate the correct behavior of memory registration caches. .TP -\f[I]fi_bw\f[R] -Performs a one-sided bandwidth test with an option for data -verification. -A sleep time on the receiving side can be enabled in order to allow the +\f[I]fi_flood\f[R] +The test performs a one-sided transfer by utilizing Bulk Memory Region +(MR) registration and flooding the receiver with unexpected messages. +This is followed by sequential MR registration transfers, which force +the MR cache to evict the least recently used MRs before making new +transfers. +An optional sleep time can be enabled on the receiving side to allow the sender to get ahead of the receiver. 
.TP \f[I]fi_rdm_multi_client\f[R] From f077d787de6eb2963738bb58a560afd6f2ac3179 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 31 Oct 2024 14:47:42 -0700 Subject: [PATCH 227/393] prov/util: set srx completion flags and msg_len properly The peer srx should return entries with the FI_MSG/FI_TAGGED and FI_RECV flags set The msg_size field in the peer_rx_entry needs to be set in the expected path to the number of bytes allowed to be copied. This is either the size of the message (from the attr->msg_size paramter) or, if the buffer is not large enough to hold the entire message, the size of the buffer. This also fixes setting the message size and flag fields on the unexpected multi receive path. This case is a bit different because it not only has to account for the message size and buffer size, but also for the owner entry's message size and flags Signed-off-by: Alexia Ingerson --- prov/util/src/util_srx.c | 53 ++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/prov/util/src/util_srx.c b/prov/util/src/util_srx.c index 0035a1a067e..c2c18599b0e 100644 --- a/prov/util/src/util_srx.c +++ b/prov/util/src/util_srx.c @@ -69,6 +69,7 @@ static void util_init_rx_entry(struct util_rx_entry *entry, entry->peer_entry.context = context; entry->peer_entry.tag = tag; entry->peer_entry.flags = flags; + entry->peer_entry.msg_size = ofi_total_iov_len(iov, count); } static struct util_rx_entry *util_get_recv_entry(struct util_srx_ctx *srx, @@ -191,6 +192,8 @@ static int util_match_msg(struct fid_peer_srx *srx, util_entry->peer_entry.srx = srx; srx_ctx->update_func(srx_ctx, util_entry); } + util_entry->peer_entry.msg_size = MIN(util_entry->peer_entry.msg_size, + attr->msg_size); *rx_entry = &util_entry->peer_entry; return ret; } @@ -268,6 +271,8 @@ static int util_match_tag(struct fid_peer_srx *srx, ret = -FI_ENOENT; util_entry->peer_entry.srx = srx; out: + util_entry->peer_entry.msg_size = MIN(util_entry->peer_entry.msg_size, 
+ attr->msg_size); *rx_entry = &util_entry->peer_entry; return ret; } @@ -496,6 +501,33 @@ static struct util_rx_entry *util_search_unexp_msg(struct util_srx_ctx *srx, return util_search_peer_msg(ofi_array_at(&srx->src_unexp_peers, addr)); } +static bool util_unexp_mrecv(struct util_srx_ctx *srx, + struct util_rx_entry *mrecv_entry, + struct util_rx_entry *rx_entry) +{ + mrecv_entry->multi_recv_ref++; + rx_entry->peer_entry.owner_context = mrecv_entry; + + rx_entry->peer_entry.iov[0].iov_base = + mrecv_entry->peer_entry.iov->iov_base; + rx_entry->peer_entry.iov->iov_len = + MIN(mrecv_entry->peer_entry.iov->iov_len, + rx_entry->peer_entry.msg_size); + *rx_entry->peer_entry.desc = mrecv_entry->peer_entry.desc[0]; + + rx_entry->peer_entry.count = 1; + rx_entry->peer_entry.addr = mrecv_entry->peer_entry.addr; + rx_entry->peer_entry.context = mrecv_entry->peer_entry.context; + rx_entry->peer_entry.tag = mrecv_entry->peer_entry.tag; + rx_entry->peer_entry.flags |= mrecv_entry->peer_entry.flags & + ~FI_MULTI_RECV; + rx_entry->peer_entry.msg_size = rx_entry->peer_entry.iov->iov_len; + + return util_adjust_multi_recv(srx, &mrecv_entry->peer_entry, + rx_entry->peer_entry.msg_size); + +} + static ssize_t util_generic_mrecv(struct util_srx_ctx *srx, const struct iovec *iov, void **desc, size_t iov_count, fi_addr_t addr, void *context, uint64_t flags) @@ -510,7 +542,8 @@ static ssize_t util_generic_mrecv(struct util_srx_ctx *srx, ofi_genlock_lock(srx->lock); mrecv_entry = util_get_recv_entry(srx, iov, desc, iov_count, addr, - context, 0, 0, flags); + context, 0, 0, + flags | FI_MSG | FI_RECV); if (!mrecv_entry) { ret = -FI_ENOMEM; goto out; @@ -520,15 +553,7 @@ static ssize_t util_generic_mrecv(struct util_srx_ctx *srx, rx_entry = util_search_unexp_msg(srx, addr); while (rx_entry) { - util_init_rx_entry(rx_entry, mrecv_entry->peer_entry.iov, desc, - iov_count, addr, context, 0, - flags & (~FI_MULTI_RECV)); - mrecv_entry->multi_recv_ref++; - rx_entry->peer_entry.owner_context = 
mrecv_entry; - - if (util_adjust_multi_recv(srx, &mrecv_entry->peer_entry, - rx_entry->peer_entry.msg_size)) - buf_done = true; + buf_done = util_unexp_mrecv(srx, mrecv_entry, rx_entry); srx->update_func(srx, rx_entry); ret = rx_entry->peer_entry.srx->peer_ops->start_msg( @@ -695,7 +720,8 @@ ssize_t util_srx_generic_trecv(struct fid_ep *ep_fid, const struct iovec *iov, assert(queue); rx_entry = util_get_recv_entry(srx, iov, desc, iov_count, addr, context, tag, - ignore, flags); + ignore, + flags | FI_TAGGED | FI_RECV); if (!rx_entry) ret = -FI_ENOMEM; else @@ -741,10 +767,11 @@ ssize_t util_srx_generic_recv(struct fid_ep *ep_fid, const struct iovec *iov, ofi_array_at(&srx->src_recv_queues, addr); assert(queue); rx_entry = util_get_recv_entry(srx, iov, desc, iov_count, addr, - context, 0, 0, flags); + context, 0, 0, + flags | FI_MSG | FI_RECV); if (!rx_entry) ret = -FI_ENOMEM; - else + else slist_insert_tail((struct slist_entry *) (&rx_entry->peer_entry), queue); goto out; From fc24cadcff4ecb5303d46eca522a870bcb50f98a Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 31 Oct 2024 11:15:26 -0700 Subject: [PATCH 228/393] prov/shm: cleanup op flags The util srx/peer srx should return the entry with the op flags (FI_MSG/FI_TAGGED + FI_RECV) already set. 
Remove from the shm code Signed-off-by: Alexia Ingerson --- prov/shm/src/smr.h | 4 +--- prov/shm/src/smr_progress.c | 41 +++++++++++++++++-------------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/prov/shm/src/smr.h b/prov/shm/src/smr.h index 1992ecbca9c..52c534097e1 100644 --- a/prov/shm/src/smr.h +++ b/prov/shm/src/smr.h @@ -290,10 +290,8 @@ int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint64_t flags, size_t len, void *buf, int64_t id, uint64_t tag, uint64_t data); -static inline uint64_t smr_rx_cq_flags(uint32_t op, uint64_t rx_flags, - uint16_t op_flags) +static inline uint64_t smr_rx_cq_flags(uint64_t rx_flags, uint16_t op_flags) { - rx_flags |= ofi_rx_cq_flags(op); if (op_flags & SMR_REMOTE_CQ_DATA) rx_flags |= FI_REMOTE_CQ_DATA; return rx_flags; diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index 5059f576eb2..3932e404c15 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -762,8 +762,8 @@ static int smr_start_common(struct smr_ep *ep, struct smr_cmd *cmd, if (!pend) { comp_buf = rx_entry->iov[0].iov_base; - comp_flags = smr_rx_cq_flags(cmd->msg.hdr.op, rx_entry->flags, - cmd->msg.hdr.op_flags); + comp_flags = smr_rx_cq_flags(rx_entry->flags, + cmd->msg.hdr.op_flags); if (err) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing op\n"); @@ -822,8 +822,8 @@ static int smr_copy_saved(struct smr_cmd_ctx *cmd_ctx, } assert(!cmd_ctx->sar_entry); - comp_flags = smr_rx_cq_flags(cmd_ctx->cmd.msg.hdr.op, - rx_entry->flags, cmd_ctx->cmd.msg.hdr.op_flags); + comp_flags = smr_rx_cq_flags(rx_entry->flags, + cmd_ctx->cmd.msg.hdr.op_flags); ret = smr_complete_rx(cmd_ctx->ep, rx_entry->context, cmd_ctx->cmd.msg.hdr.op, comp_flags, @@ -1106,14 +1106,14 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd, FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing rma op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - smr_rx_cq_flags(cmd->msg.hdr.op, 0, 
- cmd->msg.hdr.op_flags), 0, -err); + smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), + 0, -err); } else { ret = smr_complete_rx(ep, (void *) cmd->msg.hdr.msg_id, - cmd->msg.hdr.op, smr_rx_cq_flags(cmd->msg.hdr.op, - 0, cmd->msg.hdr.op_flags), total_len, - iov_count ? iov[0].iov_base : NULL, - cmd->msg.hdr.id, 0, cmd->msg.hdr.data); + cmd->msg.hdr.op, smr_rx_cq_flags(0, + cmd->msg.hdr.op_flags), total_len, + iov_count ? iov[0].iov_base : NULL, + cmd->msg.hdr.id, 0, cmd->msg.hdr.data); } if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -1191,13 +1191,12 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd, FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing atomic op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - smr_rx_cq_flags(cmd->msg.hdr.op, 0, - cmd->msg.hdr.op_flags), 0, err); + smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), + 0, err); } else { ret = smr_complete_rx(ep, NULL, cmd->msg.hdr.op, - smr_rx_cq_flags(cmd->msg.hdr.op, 0, - cmd->msg.hdr.op_flags), total_len, - ioc_count ? ioc[0].addr : NULL, + smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), + total_len, ioc_count ? 
ioc[0].addr : NULL, cmd->msg.hdr.id, 0, cmd->msg.hdr.data); } if (ret) { @@ -1304,13 +1303,11 @@ void smr_progress_ipc_list(struct smr_ep *ep) if (ipc_entry->rx_entry) { context = ipc_entry->rx_entry->context; - flags = smr_rx_cq_flags(ipc_entry->cmd.msg.hdr.op, - ipc_entry->rx_entry->flags, + flags = smr_rx_cq_flags(ipc_entry->rx_entry->flags, ipc_entry->cmd.msg.hdr.op_flags); } else { context = NULL; - flags = smr_rx_cq_flags(ipc_entry->cmd.msg.hdr.op, - 0, ipc_entry->cmd.msg.hdr.op_flags); + flags = smr_rx_cq_flags(0, ipc_entry->cmd.msg.hdr.op_flags); } ret = smr_complete_rx(ep, context, ipc_entry->cmd.msg.hdr.op, @@ -1422,13 +1419,13 @@ static void smr_progress_sar_list(struct smr_ep *ep) if (sar_entry->rx_entry) { comp_ctx = sar_entry->rx_entry->context; - comp_flags = smr_rx_cq_flags(sar_entry->cmd.msg.hdr.op, + comp_flags = smr_rx_cq_flags( sar_entry->rx_entry->flags, sar_entry->cmd.msg.hdr.op_flags); } else { comp_ctx = NULL; - comp_flags = smr_rx_cq_flags(sar_entry->cmd.msg.hdr.op, - 0, sar_entry->cmd.msg.hdr.op_flags); + comp_flags = smr_rx_cq_flags(0, + sar_entry->cmd.msg.hdr.op_flags); } ret = smr_complete_rx(ep, comp_ctx, sar_entry->cmd.msg.hdr.op, comp_flags, From f0c858a9457db428c401a44f40e052662968a3aa Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Wed, 13 Nov 2024 14:14:56 -0800 Subject: [PATCH 229/393] util/pingpong: Fix coverity issue about integer overflow The calculation of `power_of_two` would overflow when `i` reaches 32. Based on the size of allocated array `sizes`, `i` should be less than 32. Add explicit loop limit to suppress the warning. 
Signed-off-by: Jianxin Xiong --- util/pingpong.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/pingpong.c b/util/pingpong.c index f8af6943320..68a554c8752 100644 --- a/util/pingpong.c +++ b/util/pingpong.c @@ -982,7 +982,7 @@ static int generate_test_sizes(struct pp_opts *opts, size_t tx_size, int **sizes n++; } } else { - for (i = 0;; i++) { + for (i = 0; i < 32; i++) { power_of_two = (i == 0) ? 0 : (1 << i); half_up = (i == 0) ? 1 : power_of_two + (power_of_two / 2); From 8345021cd2ed51d07ef2e16924f04063d80da6d9 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Wed, 13 Nov 2024 14:46:02 -0800 Subject: [PATCH 230/393] prov/verbs: Fix coverity issue about overflowed return value The function `vrb_eq_xrc_recip_conn_event` was defined as size_t but can return a nagative value on error which would overflow when casted to int. Change the type to ssize_t. Also change the return type of `vrb_eq_xrc_connected_event` to ssize_t to be consistent with its caller and the function it calls. 
Signed-off-by: Jianxin Xiong --- prov/verbs/src/verbs_eq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c index f9bc78a828f..f5f37fe9c17 100644 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -620,7 +620,7 @@ vrb_eq_xrc_conn_event(struct vrb_xrc_ep *ep, return -FI_EAGAIN; } -static size_t +static ssize_t vrb_eq_xrc_recip_conn_event(struct vrb_eq *eq, struct vrb_xrc_ep *ep, struct rdma_cm_event *cma_event, @@ -787,7 +787,7 @@ vrb_eq_xrc_cm_err_event(struct vrb_eq *eq, return FI_SUCCESS; } -static int +static ssize_t vrb_eq_xrc_connected_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event, int *acked, struct fi_eq_cm_entry *entry, size_t len, @@ -795,7 +795,7 @@ vrb_eq_xrc_connected_event(struct vrb_eq *eq, { struct vrb_xrc_ep *ep; fid_t fid = cma_event->id->context; - int ret; + ssize_t ret; ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); From d678ba32e0ebf5b8bbfc2cc127befc535f42c958 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 13 Nov 2024 15:21:33 -0600 Subject: [PATCH 231/393] prov/cxi: Correct checking of MR test rc For invalid_client_rkey, -FI_EKEYREJECTED will be returned if FI_MR_PROV_KEY is not being used. 
Signed-off-by: Ian Ziemba --- prov/cxi/test/mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prov/cxi/test/mr.c b/prov/cxi/test/mr.c index 3699506c20c..fd35fe5fbc4 100644 --- a/prov/cxi/test/mr.c +++ b/prov/cxi/test/mr.c @@ -67,7 +67,8 @@ Test(mr, invalid_client_rkey) attr.requested_key = ~1; ret = fi_mr_regattr(cxit_domain, &attr, 0, &mr); - cr_assert_eq(ret, -FI_EKEYREJECTED, "fi_mr_regattr failed: %d", ret); + if ((cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) != FI_MR_PROV_KEY) + cr_assert_eq(ret, -FI_EKEYREJECTED, "fi_mr_regattr failed: %d", ret); } Test(mr, std_mrs, .timeout = 600, .disabled = true) From 693edbf17fb26928bd84fccd75fb83507b51fe13 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Wed, 6 Nov 2024 13:56:45 -0800 Subject: [PATCH 232/393] contrib/intel/jenkins: Migrate shmem tests to use new CI Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index ccd1390ec03..f848648f160 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -451,11 +451,7 @@ pipeline { """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=water""" ) - slurm_batch("totodile", "1", - "${env.LOG_DIR}/build_shmem_water_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=shmem --build_hw=water""" - ) + run_ci("CI_shmem_grass", "pr_shmem_grass.json") } } } @@ -468,11 +464,7 @@ pipeline { """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=grass""" ) - slurm_batch("grass", "1", - "${env.LOG_DIR}/build_shmem_grass_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=shmem --build_hw=grass""" - ) + run_ci("CI_shmem_water", "pr_shmem_water.json") } } } @@ -728,9 +720,9 @@ pipeline { stage('SHMEM_grass') { steps { script { - dir (RUN_LOCATION) { - 
run_middleware([["tcp", null]], "SHMEM", "shmem", - "grass", "bulbasaur,chikorita", "2") + dir (CI_LOCATION) { + run_ci("CI_shmem_grass", "pr_shmem_1n2ppn_grass.json") + run_ci("CI_shmem_water", "pr_shmem_2n1ppn_water.json") } } } @@ -738,9 +730,9 @@ pipeline { stage('SHMEM_water') { steps { script { - dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"], ["sockets", null]], "SHMEM", - "shmem", "water", "totodile", "2") + dir (CI_LOCATION) { + run_ci("CI_shmem_water", "pr_shmem_1n2ppn_water.json") + run_ci("CI_shmem_water", "pr_shmem_2n1ppn_water.json") } } } From 9b0b75294ca47e12b05aef3c64152b1ac7d81e9b Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 7 Nov 2024 11:40:52 -0800 Subject: [PATCH 233/393] contrib/intel/jenkins: Rename build_ci to bootstrap_ci Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index f848648f160..a5d052f34f2 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -276,7 +276,7 @@ def build(item, mode=null, hw=null, additional_args=null) { run_python(PYTHON_VERSION, cmd) } -def build_ci() { +def bootstrap_ci() { sh "${CI_LOCATION}/${env.CI_MODULE}/bootstrap.sh" } @@ -435,10 +435,10 @@ pipeline { stage ('parallel-builds') { when { equals expected: true, actual: DO_RUN } parallel { - stage ('build-ci') { + stage ('bootstrap-ci') { steps { script { - build_ci() + bootstrap_ci() } } } @@ -536,7 +536,7 @@ pipeline { dir (CUSTOM_WORKSPACE) { build("logdir") build("builddir") - build_ci() + bootstrap_ci() slurm_build(BUILD_MODES, "fabrics-ci", "source", "ze", "gpu", "--gpu") } From 8eb0b40b1e8a8de4273ccc93a09761e7b85af486 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 12 Nov 2024 15:31:30 -0800 Subject: [PATCH 234/393] contrib/intel/jenkins: Add stage to check node health Add health check stage to reboot nodes as needed if they are stuck 
in a bad state from previous jobs. Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index a5d052f34f2..63baa65e07d 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -421,6 +421,16 @@ pipeline { } } } + stage ('health check') { + when { equals expected: true, actual: DO_RUN } + steps { + script { + dir (CI_LOCATION) { + sh "./temperature.sh" + } + } + } + } stage ('prepare build') { when { equals expected: true, actual: DO_RUN } steps { From 85e25353b296df1a8b2456e4c26f29705ff64e72 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 7 Nov 2024 12:10:17 -0800 Subject: [PATCH 235/393] contrib/intel/jenkins: Separate tcp and tcp;ofi_rxm testing Separate tcp and tcp;ofi_rxm testing so that both providers are covered. Previously tcp was testing everything and picking up rxm as needed. Instead it is better practice to separate them and just test both usecases entirely. Update both tcp ubertest lists to correctly run only supported tests. 
Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 63baa65e07d..490850688a1 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -605,6 +605,19 @@ pipeline { } } } + stage('CI_fabtests_tcp-rxm') { + steps { + script { + dir (CI_LOCATION) { + run_ci("CI_fabtests_tcp-rxm_reg", + "pr_fabtests_tcp-rxm_reg.json") + run_ci("CI_fabtests_tcp-rxm_dbg", + "pr_fabtests_tcp-rxm_dbg.json") + run_ci("CI_fabtests_tcp-rxm_dl", "pr_fabtests_tcp-rxm_dl.json") + } + } + } + } stage('CI_fabtests_sockets') { steps { script { From 6b0fe118fd51676bf458fbb450bb60c86c93fd61 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 12 Nov 2024 15:34:34 -0800 Subject: [PATCH 236/393] contrib/intel/jenkins: Use new CI to build New CI will be in charge of building everything that needs to be built that it is in charge of running. Libfabric, Fabtests, SHMEM. New CI prefers a tarball to build from so we will checkout scm from the plugin, tar it, and then feed it to new CI to use. All builds will eventually move from middlewares to pre-build. 
Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 126 ++++++++++++++++-------------- 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 490850688a1..faaff15f28c 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -280,6 +280,13 @@ def bootstrap_ci() { sh "${CI_LOCATION}/${env.CI_MODULE}/bootstrap.sh" } +def checkout_tar(name) { + dir ("${env.CUSTOM_WORKSPACE}/${name}/libfabric") { checkout scm } + dir ("${env.CUSTOM_WORKSPACE}/${name}/") { + sh "tar -cvf libfabric.tar.gz libfabric/*" + } +} + def check_target() { echo "CHANGE_TARGET = ${env.CHANGE_TARGET}" if (changeRequest()) { @@ -369,27 +376,7 @@ pipeline { stage ('checkout') { steps { script { - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/grass/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/water/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/electric/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/ucx/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/cuda/libfabric") { - checkout scm - } - dir ("${CUSTOM_WORKSPACE}/iouring/libfabric") { - checkout scm - } + checkout_tar("source") dir (CUSTOM_WORKSPACE) { checkout_external_resources() } @@ -442,72 +429,88 @@ pipeline { } } } + stage ('bootstrap-ci') { + steps { + script { + bootstrap_ci() + } + } + } stage ('parallel-builds') { when { equals expected: true, actual: DO_RUN } parallel { - stage ('bootstrap-ci') { - steps { - script { - bootstrap_ci() - } - } - } stage ('build-water') { steps { script { - slurm_build(BUILD_MODES, "totodile", "water", "water", "water") - slurm_batch("totodile", "1", + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_water.json") + run_ci("pre-build", "pr_build_shmem_water.json") + slurm_batch("totodile", "1", "${env.LOG_DIR}/build_mpich_water_log", """python$PYTHON_VERSION 
${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=water""" - ) - run_ci("CI_shmem_grass", "pr_shmem_grass.json") + ) + } } } } stage ('build-grass') { steps { script { - slurm_build(BUILD_MODES, "grass", "grass", "grass", "grass") - slurm_batch("grass", "1", + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_grass.json") + run_ci("pre-build", "pr_build_shmem_grass.json") + slurm_batch("grass", "1", "${env.LOG_DIR}/build_mpich_grass_log", """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=grass""" - ) - run_ci("CI_shmem_water", "pr_shmem_water.json") + ) + } } } } stage ('build-electric') { steps { script { - slurm_build(BUILD_MODES, "electric", "electric", "electric", - "electric") + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_electric.json") + } } } } stage ('build-ucx') { steps { script { - slurm_build(BUILD_MODES, "totodile", "ucx", "ucx", "ucx") + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_ucx.json") + } } } } - stage ('build-cuda') { + stage ('build-cyndaquil') { steps { script { - slurm_build(["reg"], "cyndaquil", "cuda", "cyndaquil", - "cyndaquil", "--cuda") - slurm_build(["reg"], "quilava", "cuda", "quilava", - "quilava", "--cuda") + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_cyndaquil.json") + } } } } - stage ('build-iouring') { + stage ('build-quilava') { steps { script { - slurm_build(BUILD_MODES, "ivysaur", "iouring", "ivysaur", - "ivysaur") + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_quilava.json") + } + } + } + } + stage ('build-ivysaur') { + steps { + script { + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_ivysaur.json") + } } } } @@ -521,17 +524,19 @@ pipeline { options { skipDefaultCheckout() } steps { script { - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { checkout scm } + checkout_tar("source") checkout_external_resources() dir (CUSTOM_WORKSPACE) { build("logdir") - build("libfabric", "reg", "daos") - build("fabtests", "reg", "daos") + } + bootstrap_ci() + dir 
(CI_LOCATION) { + run_ci("pre-build", "pr_build_daos.json") } } } } - stage ('build-gpu') { + stage ('build-fire') { agent { node { label 'ze' @@ -541,14 +546,15 @@ pipeline { options { skipDefaultCheckout() } steps { script { - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { checkout scm } + checkout_tar("source") checkout_external_resources() dir (CUSTOM_WORKSPACE) { build("logdir") build("builddir") - bootstrap_ci() - slurm_build(BUILD_MODES, "fabrics-ci", "source", "ze", "gpu", - "--gpu") + } + bootstrap_ci() + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_fire.json") } } } @@ -799,16 +805,16 @@ pipeline { script { dir (RUN_LOCATION) { run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, + "fire", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, + "fire", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, + "fire", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") run_middleware([["shm", null]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "torchic", "1", null, null, + "fire", "torchic", "1", null, null, "FI_HMEM_DISABLE_P2P=1") } } @@ -850,7 +856,7 @@ pipeline { dir (RUN_LOCATION) { dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf" cmd = """ python3.9 runtests.py --test=dmabuf \ - --prov=verbs --util=rxm --build_hw=gpu""" + --prov=verbs --util=rxm --build_hw=fire""" slurm_batch("torchic", "1", "${dmabuf_output}_reg", "${cmd}") } From 075834a6a7c496a9eef768f20e984dd4681df0ee Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 14 Nov 2024 11:01:43 -0800 Subject: [PATCH 237/393] contrib/intel/jenkins: Rebase before running Make sure code is rebased on target branch before running. 
Re-order git commands to not need adding remote several times Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 46 +++++++++++++------------------ 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index faaff15f28c..a28d420af65 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -200,27 +200,17 @@ def checkout_external_resources() { checkout_ci() } -def generate_diff(def branch_name, def output_loc) { - sh """ - git remote add mainRepo ${env.UPSTREAM} - git fetch mainRepo - git diff --name-only HEAD..mainRepo/${branch_name} > ${output_loc}/commit_id - git remote remove mainRepo - """ -} +def git_diffs() { + dir ("${CUSTOM_WORKSPACE}/source/libfabric") { + sh """ + git diff --name-only HEAD..upstream/${TARGET} > ./commit_id + git diff upstream/${TARGET}:Makefile.am Makefile.am > ./Makefile.am.diff + git diff upstream/${TARGET}:configure.ac configure.ac > ./configure.ac.diff -def generate_release_num(def branch_name, def output_loc) { - sh """ - git remote add mainRepo ${env.UPSTREAM} - git fetch mainRepo - git diff mainRepo/${branch_name}:Makefile.am Makefile.am > \ - ${output_loc}/Makefile.am.diff - git diff mainRepo/${branch_name}:configure.ac configure.ac > \ - ${output_loc}/configure.ac.diff - cat configure.ac | grep AC_INIT | cut -d ' ' -f 2 | \ - cut -d '[' -f 2 | cut -d ']' -f 1 > ${output_loc}/release_num.txt - git remote remove mainRepo - """ + cat configure.ac | grep AC_INIT | cut -d ' ' -f 2 | \ + cut -d '[' -f 2 | cut -d ']' -f 1 > ./release_num.txt + """ + } } def slurm_build(modes, partition, location, tag, hw=null, additional_args=null) { @@ -281,7 +271,14 @@ def bootstrap_ci() { } def checkout_tar(name) { - dir ("${env.CUSTOM_WORKSPACE}/${name}/libfabric") { checkout scm } + dir ("${env.CUSTOM_WORKSPACE}/${name}/libfabric") { + checkout scm + TARGET=check_target() + sh """ + git remote add upstream ${env.UPSTREAM} 
+ git pull --rebase upstream ${TARGET} + """ + } dir ("${env.CUSTOM_WORKSPACE}/${name}/") { sh "tar -cvf libfabric.tar.gz libfabric/*" } @@ -386,12 +383,7 @@ pipeline { stage ('opt-out') { steps { script { - TARGET=check_target() - dir ("${CUSTOM_WORKSPACE}/source/libfabric") { - generate_diff("${TARGET}", "${env.WORKSPACE}/source/libfabric") - generate_release_num("${TARGET}", "${env.WORKSPACE}/source/libfabric") - } - + git_diffs() if (env.WEEKLY == null) { weekly = false } else { From c6085d101f5d60a694bac934da61f029f0fa02c5 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 14 Nov 2024 22:34:59 +0000 Subject: [PATCH 238/393] prov/efa: Fix the ep list scan in cq/cntr read We cannot only iterate eps and post initial batch of internal rx pkt once, as there can be more eps joining later after the cq read call. This patch fixes by introducing a bit in cq/ctnr that indicates whether a ep list scan is needed. This bit is set as true when a new ep is bind to the cq, and will be set as false every time when a scan is done. Signed-off-by: Shi Jin --- prov/efa/src/efa_cntr.c | 6 +++--- prov/efa/src/efa_cntr.h | 2 +- prov/efa/src/rdm/efa_rdm_cq.c | 6 +++--- prov/efa/src/rdm/efa_rdm_cq.h | 2 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 9 +++++++++ prov/efa/test/efa_unit_test_cntr.c | 6 ++++-- prov/efa/test/efa_unit_test_cq.c | 6 ++++-- 7 files changed, 25 insertions(+), 12 deletions(-) diff --git a/prov/efa/src/efa_cntr.c b/prov/efa/src/efa_cntr.c index 3a014c1c614..fa1f548c525 100644 --- a/prov/efa/src/efa_cntr.c +++ b/prov/efa/src/efa_cntr.c @@ -161,13 +161,13 @@ static void efa_rdm_cntr_progress(struct util_cntr *cntr) * some idle endpoints and never poll completions for them. Move these initial posts to * the first polling before having a long term fix. 
*/ - if (!efa_cntr->initial_rx_to_all_eps_posted) { + if (efa_cntr->need_to_scan_ep_list) { dlist_foreach(&cntr->ep_list, item) { fid_entry = container_of(item, struct fid_list_entry, entry); efa_rdm_ep = container_of(fid_entry->fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep); } - efa_cntr->initial_rx_to_all_eps_posted = true; + efa_cntr->need_to_scan_ep_list = false; } dlist_foreach(&efa_cntr->ibv_cq_poll_list, item) { @@ -193,7 +193,7 @@ int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, return -FI_ENOMEM; dlist_init(&cntr->ibv_cq_poll_list); - cntr->initial_rx_to_all_eps_posted = false; + cntr->need_to_scan_ep_list = false; efa_domain = container_of(domain, struct efa_domain, util_domain.domain_fid); diff --git a/prov/efa/src/efa_cntr.h b/prov/efa/src/efa_cntr.h index 05227159d49..bcfde8784a2 100644 --- a/prov/efa/src/efa_cntr.h +++ b/prov/efa/src/efa_cntr.h @@ -13,7 +13,7 @@ struct efa_cntr { struct fid_cntr *shm_cntr; struct dlist_entry ibv_cq_poll_list; /* Only used by RDM EP type */ - bool initial_rx_to_all_eps_posted; + bool need_to_scan_ep_list; }; int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 2d8c1d8811f..4b3bcd74d1d 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -634,13 +634,13 @@ static void efa_rdm_cq_progress(struct util_cq *cq) * some idle endpoints and never poll completions for them. Move these initial posts to * the first cq read call before having a long term fix. 
*/ - if (!efa_rdm_cq->initial_rx_to_all_eps_posted) { + if (efa_rdm_cq->need_to_scan_ep_list) { dlist_foreach(&cq->ep_list, item) { fid_entry = container_of(item, struct fid_list_entry, entry); efa_rdm_ep = container_of(fid_entry->fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep); } - efa_rdm_cq->initial_rx_to_all_eps_posted = true; + efa_rdm_cq->need_to_scan_ep_list = false; } dlist_foreach(&efa_rdm_cq->ibv_cq_poll_list, item) { @@ -686,7 +686,7 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, attr->size = MAX(efa_domain->rdm_cq_size, attr->size); dlist_init(&cq->ibv_cq_poll_list); - cq->initial_rx_to_all_eps_posted = false; + cq->need_to_scan_ep_list = false; ret = ofi_cq_init(&efa_prov, domain, attr, &cq->util_cq, &efa_rdm_cq_progress, context); diff --git a/prov/efa/src/rdm/efa_rdm_cq.h b/prov/efa/src/rdm/efa_rdm_cq.h index 5bb7b2b80c0..4e88a8b7f63 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.h +++ b/prov/efa/src/rdm/efa_rdm_cq.h @@ -12,7 +12,7 @@ struct efa_rdm_cq { struct fid_cq *shm_cq; struct efa_ibv_cq ibv_cq; struct dlist_entry ibv_cq_poll_list; - bool initial_rx_to_all_eps_posted; + bool need_to_scan_ep_list; }; /* diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 56e80bc146d..d7b8a18620f 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1208,6 +1208,9 @@ int efa_rdm_ep_insert_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) if (ret) return ret; } + ofi_genlock_lock(&efa_cntr->util_cntr.ep_list_lock); + efa_cntr->need_to_scan_ep_list = true; + ofi_genlock_unlock(&efa_cntr->util_cntr.ep_list_lock); } } @@ -1233,6 +1236,9 @@ int efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) if (ret) return ret; } + ofi_genlock_lock(&tx_cq->util_cq.ep_list_lock); + tx_cq->need_to_scan_ep_list = true; + ofi_genlock_unlock(&tx_cq->util_cq.ep_list_lock); } if (rx_cq) { @@ -1245,6 +1251,9 @@ int 
efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) if (ret) return ret; } + ofi_genlock_lock(&rx_cq->util_cq.ep_list_lock); + rx_cq->need_to_scan_ep_list = true; + ofi_genlock_unlock(&rx_cq->util_cq.ep_list_lock); } return FI_SUCCESS; diff --git a/prov/efa/test/efa_unit_test_cntr.c b/prov/efa/test/efa_unit_test_cntr.c index aeb44d51195..2aa2ea60927 100644 --- a/prov/efa/test/efa_unit_test_cntr.c +++ b/prov/efa/test/efa_unit_test_cntr.c @@ -121,7 +121,8 @@ void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) efa_cntr = container_of(cntr, struct efa_cntr, util_cntr.cntr_fid); - assert_false(efa_cntr->initial_rx_to_all_eps_posted); + /* cntr read need to scan the ep list since a ep is bind */ + assert_true(efa_cntr->need_to_scan_ep_list); cnt = fi_cntr_read(cntr); /* No completion should be read */ @@ -132,7 +133,8 @@ void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); assert_int_equal(efa_rdm_ep->efa_rx_pkts_held, 0); - assert_true(efa_cntr->initial_rx_to_all_eps_posted); + /* scan is done */ + assert_false(efa_cntr->need_to_scan_ep_list); /* ep must be closed before cq/av/eq... 
*/ fi_close(&resource->ep->fid); resource->ep = NULL; diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 0c823d0f15b..7cb8c47dc4c 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -597,7 +597,8 @@ void test_efa_rdm_cq_post_initial_rx_pkts(struct efa_resource **state) assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, 0); assert_int_equal(efa_rdm_ep->efa_rx_pkts_held, 0); - assert_false(efa_rdm_cq->initial_rx_to_all_eps_posted); + /* cq read need to scan the ep list since a ep is bind */ + assert_true(efa_rdm_cq->need_to_scan_ep_list); fi_cq_read(resource->cq, NULL, 0); /* At this time, rx pool size number of rx pkts are posted */ @@ -605,7 +606,8 @@ void test_efa_rdm_cq_post_initial_rx_pkts(struct efa_resource **state) assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); assert_int_equal(efa_rdm_ep->efa_rx_pkts_held, 0); - assert_true(efa_rdm_cq->initial_rx_to_all_eps_posted); + /* scan is done */ + assert_false(efa_rdm_cq->need_to_scan_ep_list); } #if HAVE_EFADV_CQ_EX /** From cfa95c9cab37d5379feb56e040e77801a07f676a Mon Sep 17 00:00:00 2001 From: Olga Weiss Date: Sun, 17 Nov 2024 18:30:02 -0500 Subject: [PATCH 239/393] prov/cxi: testing FI_RM_ENABLED Adding an option for CQ/cntr to tests that verify FI_RM_ENABLED in CXI Libfabric provider. 
Now the tests are: - read_mr_overrun_cntr - read_mr_overrun_cq - write_mr_overrun_cntr - write_mr_overrun_cq NETCASSINI-3288 Signed-off-by: Olga Weiss --- prov/cxi/test/rma.c | 50 +++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/prov/cxi/test/rma.c b/prov/cxi/test/rma.c index 5642e2389cf..3527c100bfa 100644 --- a/prov/cxi/test/rma.c +++ b/prov/cxi/test/rma.c @@ -1989,7 +1989,7 @@ Test(rma, invalid_read_target_opt_mr_key) /* Tests to verify FI_RM_ENABLED */ -static void mr_overrun(bool write) +static void mr_overrun(bool write, bool use_cq) { int ret; uint8_t *local; @@ -2018,11 +2018,16 @@ static void mr_overrun(bool write) cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); } - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &cqe); - cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + if (use_cq) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); - validate_tx_event(&cqe, FI_RMA | (write ? FI_WRITE : FI_READ), NULL); + validate_tx_event(&cqe, FI_RMA | (write ? FI_WRITE : FI_READ), NULL); + } else { + while (fi_cntr_read(write ? 
cxit_write_cntr : cxit_read_cntr) != 1) + ; + } /* Validate read data */ for (int i = 0; i < good_len; i++) @@ -2042,25 +2047,40 @@ static void mr_overrun(bool write) cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); } - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &cqe); - cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); - ret = fi_cq_readerr(cxit_tx_cq, &err, 1); - cr_assert(ret == 1); - cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + if (use_cq) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + } else { + while (fi_cntr_readerr(write ? cxit_write_cntr : cxit_read_cntr) != 1) + ; + } mr_destroy(&remote); free(local); } -Test(rma, read_mr_overrun) +Test(rma, read_mr_overrun_cq) +{ + mr_overrun(false, true); +} + +Test(rma, write_mr_overrun_cq) +{ + mr_overrun(true, true); +} + +Test(rma, read_mr_overrun_cntr) { - mr_overrun(false); + mr_overrun(false, false); } -Test(rma, write_mr_overrun) +Test(rma, write_mr_overrun_cntr) { - mr_overrun(true); + mr_overrun(true, false); } static void rma_hybrid_mr_desc_test_runner(bool write, bool cq_events) From d7bfafeb9bbf8915ce3e0773f5ead494cb5556da Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 14 Nov 2024 13:25:28 -0800 Subject: [PATCH 240/393] prov/efa: Move inject sizes from rdm ep to base ep So these fields can be used by efa-raw. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_base_ep.c | 5 ++++ prov/efa/src/efa_base_ep.h | 5 ++++ prov/efa/src/rdm/efa_rdm_ep.h | 4 ---- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 37 ++++++++++++++++------------- prov/efa/src/rdm/efa_rdm_msg.c | 12 +++++----- prov/efa/src/rdm/efa_rdm_ope.c | 2 +- prov/efa/src/rdm/efa_rdm_rma.c | 10 ++++---- prov/efa/test/efa_unit_test_ep.c | 6 ++--- 8 files changed, 45 insertions(+), 36 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index db75afa01e0..2b17088d1b7 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -348,6 +348,11 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, base_ep->efa_qp_enabled = false; base_ep->qp = NULL; base_ep->user_recv_qp = NULL; + + base_ep->max_msg_size = info->ep_attr->max_msg_size; + base_ep->max_rma_size = info->ep_attr->max_msg_size; + base_ep->inject_msg_size = info->tx_attr->inject_size; + base_ep->inject_rma_size = info->tx_attr->inject_size; return 0; } diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 6a761ce4dc0..263e0ca902b 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -57,6 +57,11 @@ struct efa_base_ep { struct ibv_recv_wr *recv_more_wr_tail; struct efa_recv_wr *efa_recv_wr_vec; + size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ + size_t max_rma_size; /**< #FI_OPT_MAX_RMA_SIZE */ + size_t inject_msg_size; /**< #FI_OPT_INJECT_MSG_SIZE */ + size_t inject_rma_size; /**< #FI_OPT_INJECT_RMA_SIZE */ + /* Only used by RDM ep type */ struct efa_qp *user_recv_qp; /* Separate qp to receive pkts posted by users */ }; diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index cebf968439c..d7a8fc5ddc2 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -60,13 +60,9 @@ struct efa_rdm_ep { struct fid_peer_srx *shm_peer_srx; size_t mtu_size; - size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ size_t max_tagged_size; /**< 
#FI_OPT_MAX_TAGGED_SIZE */ - size_t max_rma_size; /**< #FI_OPT_MAX_RMA_SIZE */ size_t max_atomic_size; /**< #FI_OPT_MAX_ATOMIC_SIZE */ - size_t inject_msg_size; /**< #FI_OPT_INJECT_MSG_SIZE */ size_t inject_tagged_size; /**< #FI_OPT_INJECT_TAGGED_SIZE */ - size_t inject_rma_size; /**< #FI_OPT_INJECT_RMA_SIZE */ size_t inject_atomic_size; /**< #FI_OPT_INJECT_ATOMIC_SIZE */ /* Endpoint's capability to support zero-copy rx */ diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index d7b8a18620f..24360786e1f 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -453,9 +453,12 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep) } /* Max msg size is too large, turn off zcpy recv */ - if (ep->max_msg_size > ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size) { - EFA_INFO(FI_LOG_EP_CTRL, "max_msg_size (%zu) is greater than the mtu size limit: %zu. Zero-copy receive protocol will be disabled.\n", - ep->max_msg_size, ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size); + if (ep->base_ep.max_msg_size > ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size) { + EFA_INFO(FI_LOG_EP_CTRL, + "max_msg_size (%zu) is greater than the mtu size limit: %zu. 
" + "Zero-copy receive protocol will be disabled.\n", + ep->base_ep.max_msg_size, + ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size); ep->use_zcpy_rx = false; goto out; } @@ -557,13 +560,9 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id); } - efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; efa_rdm_ep->max_tagged_size = info->ep_attr->max_msg_size; - efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size; efa_rdm_ep->max_atomic_size = info->ep_attr->max_msg_size; - efa_rdm_ep->inject_msg_size = info->tx_attr->inject_size; efa_rdm_ep->inject_tagged_size = info->tx_attr->inject_size; - efa_rdm_ep->inject_rma_size = info->tx_attr->inject_size; efa_rdm_ep->inject_atomic_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; @@ -1298,8 +1297,12 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) * when supported */ if (ep->use_zcpy_rx) { - ep->inject_msg_size = MIN(ep->inject_msg_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); - ep->inject_rma_size = MIN(ep->inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + ep->base_ep.inject_msg_size = + MIN(ep->base_ep.inject_msg_size, + efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + ep->base_ep.inject_rma_size = + MIN(ep->base_ep.inject_rma_size, + efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); } ret = efa_rdm_ep_create_base_ep_ibv_qp(ep); @@ -1720,25 +1723,25 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return ret; break; case FI_OPT_MAX_MSG_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->max_msg_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + EFA_RDM_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->base_ep.max_msg_size, 
efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_TAGGED_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(MAX_TAGGED_SIZE, efa_rdm_ep->max_tagged_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_RMA_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->max_rma_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + EFA_RDM_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->base_ep.max_rma_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_ATOMIC_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(MAX_ATOMIC_SIZE, efa_rdm_ep->max_atomic_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_INJECT_MSG_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->inject_msg_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->base_ep.inject_msg_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_TAGGED_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_TAGGED_SIZE, efa_rdm_ep->inject_tagged_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_RMA_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->base_ep.inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_ATOMIC_SIZE: EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_ATOMIC_SIZE, efa_rdm_ep->inject_atomic_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) @@ -1822,7 +1825,7 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, case FI_OPT_MAX_MSG_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->max_msg_size; + *(size_t *) optval = efa_rdm_ep->base_ep.max_msg_size; *optlen = sizeof (size_t); break; case FI_OPT_MAX_TAGGED_SIZE: @@ -1834,7 +1837,7 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, 
void *optval, case FI_OPT_MAX_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->max_rma_size; + *(size_t *) optval = efa_rdm_ep->base_ep.max_rma_size; *optlen = sizeof (size_t); break; case FI_OPT_MAX_ATOMIC_SIZE: @@ -1846,7 +1849,7 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, case FI_OPT_INJECT_MSG_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->inject_msg_size; + *(size_t *) optval = efa_rdm_ep->base_ep.inject_msg_size; *optlen = sizeof (size_t); break; case FI_OPT_INJECT_TAGGED_SIZE: @@ -1858,7 +1861,7 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, case FI_OPT_INJECT_RMA_SIZE: if (*optlen < sizeof (size_t)) return -FI_ETOOSMALL; - *(size_t *) optval = efa_rdm_ep->inject_rma_size; + *(size_t *) optval = efa_rdm_ep->base_ep.inject_rma_size; *optlen = sizeof (size_t); break; case FI_OPT_INJECT_ATOMIC_SIZE: diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index cdbabe128c1..c808cd34768 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -290,7 +290,7 @@ ssize_t efa_rdm_msg_send(struct fid_ep *ep, const void *buf, size_t len, int ret; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -322,7 +322,7 @@ ssize_t efa_rdm_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, int ret; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -354,7 +354,7 @@ ssize_t efa_rdm_msg_inject(struct fid_ep *ep, const void *buf, size_t len, struct efa_rdm_peer 
*peer; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_msg_size); + assert(len <= efa_rdm_ep->base_ep.inject_msg_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -382,7 +382,7 @@ ssize_t efa_rdm_msg_injectdata(struct fid_ep *ep, const void *buf, struct efa_rdm_peer *peer; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_msg_size); + assert(len <= efa_rdm_ep->base_ep.inject_msg_size); peer = efa_rdm_ep_get_peer(efa_rdm_ep, dest_addr); assert(peer); @@ -492,7 +492,7 @@ ssize_t efa_rdm_msg_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, int ret; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) @@ -525,7 +525,7 @@ ssize_t efa_rdm_msg_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len int ret; efa_rdm_ep = container_of(ep_fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_msg_size); + assert(len <= efa_rdm_ep->base_ep.max_msg_size); ret = efa_rdm_attempt_to_sync_memops(efa_rdm_ep, (void *)buf, desc); if (ret) diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index bdab59510b4..f24d9c0150e 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -1509,7 +1509,7 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) if (ope->fi_flags & FI_INJECT) { assert(ope->iov_count == 1); - assert(ope->total_len <= ep->inject_rma_size); + assert(ope->total_len <= ep->base_ep.inject_rma_size); copied = efa_rdm_pke_copy_from_hmem_iov( ope->desc[iov_idx], pkt_entry, ope, sizeof(struct efa_rdm_rma_context_pkt), 0, diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 720788c8757..fdb0d629a8a 
100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -292,7 +292,7 @@ ssize_t efa_rdm_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_rma_size); + assert(len <= efa_rdm_ep->base_ep.max_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -560,7 +560,7 @@ ssize_t efa_rdm_rma_write(struct fid_ep *ep, const void *buf, size_t len, void * int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_rma_size); + assert(len <= efa_rdm_ep->base_ep.max_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -595,7 +595,7 @@ ssize_t efa_rdm_rma_writedata(struct fid_ep *ep, const void *buf, size_t len, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->max_rma_size); + assert(len <= efa_rdm_ep->base_ep.max_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -642,7 +642,7 @@ ssize_t efa_rdm_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_rma_size); + assert(len <= efa_rdm_ep->base_ep.inject_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; @@ -679,7 +679,7 @@ ssize_t efa_rdm_rma_inject_writedata(struct fid_ep *ep, const void *buf, size_t int err; efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - assert(len <= efa_rdm_ep->inject_rma_size); + assert(len <= efa_rdm_ep->base_ep.inject_rma_size); err = efa_rdm_ep_cap_check_rma(efa_rdm_ep); if (err) return err; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index f8dd2073df4..392a1c0c3b6 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ 
b/prov/efa/test/efa_unit_test_ep.c @@ -959,7 +959,7 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_SHARED_MEMORY_PERMITTED, &shm_permitted, sizeof shm_permitted), 0); - assert_true(ep->max_msg_size == max_msg_size); + assert_true(ep->base_ep.max_msg_size == max_msg_size); /* Enable EP */ assert_int_equal(fi_enable(resource->ep), 0); @@ -968,11 +968,11 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, &inject_msg_size, &(size_t){sizeof inject_msg_size}), 0); - assert_int_equal(ep->inject_msg_size, inject_msg_size); + assert_int_equal(ep->base_ep.inject_msg_size, inject_msg_size); assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, &inject_rma_size, &(size_t){sizeof inject_rma_size}), 0); - assert_int_equal(ep->inject_rma_size, inject_rma_size); + assert_int_equal(ep->base_ep.inject_rma_size, inject_rma_size); if (expected_use_zcpy_rx) { assert_int_equal(inject_msg_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); From 928a2daf5e46a9da6d7b7993a7fee874de98328f Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 13 Nov 2024 22:59:52 +0000 Subject: [PATCH 241/393] prov/efa: Fix the error path of zero copy recv Fix the resource leak of rxe when the user rx pkt post fails. Make the user rx pkt pool fixed size (rx size) and doesn't allow further grow (for better traffic control). 
Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 ++- prov/efa/src/rdm/efa_rdm_ep_utils.c | 16 ++---------- prov/efa/src/rdm/efa_rdm_msg.c | 3 +++ prov/efa/test/efa_unit_test_ep.c | 40 +++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 6 files changed, 50 insertions(+), 15 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 24360786e1f..149ba48ce2a 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -236,7 +236,9 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) ret = ofi_bufpool_create(&ep->user_rx_pkt_pool, sizeof(struct efa_rdm_pke), EFA_RDM_BUFPOOL_ALIGNMENT, - 0, ep->base_ep.info->rx_attr->size, 0); + ep->base_ep.info->rx_attr->size, + ep->base_ep.info->rx_attr->size, /* max count==chunk_cnt means pool is not allowed to grow */ + 0); if (ret) goto err_free; diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 69812d0f90c..574b2bdfceb 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -214,10 +214,8 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe assert(rxe->iov_count > 0 && rxe->iov_count <= ep->base_ep.info->rx_attr->iov_limit); assert(rxe->iov[0].iov_len >= ep->msg_prefix_size); pkt_entry = efa_rdm_pke_alloc(ep, ep->user_rx_pkt_pool, EFA_RDM_PKE_FROM_USER_RX_POOL); - if (OFI_UNLIKELY(!pkt_entry)) { - EFA_WARN(FI_LOG_EP_DATA, "Failed to allocate pkt_entry for user rx\n"); - return -FI_ENOMEM; - } + if (OFI_UNLIKELY(!pkt_entry)) + return -FI_EAGAIN; pkt_entry->ope = rxe; rxe->state = EFA_RDM_RXE_MATCHED; @@ -843,16 +841,6 @@ int efa_rdm_ep_grow_rx_pools(struct efa_rdm_ep *ep) } } - if (ep->use_zcpy_rx) { - err = ofi_bufpool_grow(ep->user_rx_pkt_pool); - if (OFI_UNLIKELY(err)) { - EFA_WARN(FI_LOG_CQ, - "cannot allocate memory for user recv pkt pool. 
error: %s\n", - strerror(-err)); - return err; - } - } - return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index c808cd34768..615b3bb47bc 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -917,6 +917,9 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg } ret = efa_rdm_ep_post_user_recv_buf(ep, rxe, flags); + if (OFI_UNLIKELY(ret)) + efa_rdm_rxe_release(rxe); + ofi_genlock_unlock(srx_ctx->lock); } else if (op == ofi_op_tagged) { ret = util_srx_generic_trecv(ep->peer_srx_ep, msg->msg_iov, msg->desc, diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 392a1c0c3b6..375ada94683 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1119,6 +1119,46 @@ void test_efa_rdm_ep_zcpy_recv_cancel(struct efa_resource **state) free(recv_buff.buff); } +/** + * @brief When user posts more than rx size fi_recv, we should return eagain and make sure + * there is no rx entry leaked + */ +void test_efa_rdm_ep_zcpy_recv_eagain(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff recv_buff; + int i; + struct efa_rdm_ep *efa_rdm_ep; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->caps = FI_MSG; + + /* enable zero-copy recv mode in ep */ + test_efa_rdm_ep_use_zcpy_rx_impl(resource, false, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Construct a recv buffer with mr */ + efa_unit_test_buff_construct(&recv_buff, resource, 16); + + for (i = 0; i < efa_rdm_ep->base_ep.info->rx_attr->size; i++) + assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, NULL), 0); + + /* we should have rx number of rx entry before and after the extra recv post */ + 
assert_true(efa_unit_test_get_dlist_length(&efa_rdm_ep->rxe_list) == efa_rdm_ep->base_ep.info->rx_attr->size); + assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, NULL), -FI_EAGAIN); + assert_true(efa_unit_test_get_dlist_length(&efa_rdm_ep->rxe_list) == efa_rdm_ep->base_ep.info->rx_attr->size); + + /** + * the buf is still posted to rdma-core, so unregistering mr can + * return non-zero. Currently ignore this failure. + */ + (void) fi_close(&recv_buff.mr->fid); + free(recv_buff.buff); +} + /** * @brief when efa_rdm_ep_post_handshake_error failed due to pkt pool exhaustion, * make sure both txe is cleaned diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 7c485058132..2232ea36059 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -112,6 +112,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_close_discard_posted_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_eagain, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index c4605c8e962..d44368bc81f 100644 --- a/prov/efa/test/efa_unit_tests.h +++ 
b/prov/efa/test/efa_unit_tests.h @@ -126,6 +126,7 @@ void test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local(); void test_efa_rdm_ep_close_discard_posted_recv(); void test_efa_rdm_ep_zcpy_recv_cancel(); +void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); From a50accb6b2b75ea488f47faf484179a3643a456b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:17:22 +0000 Subject: [PATCH 242/393] build(deps): bump github/codeql-action from 3.27.1 to 3.27.4 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.27.1 to 3.27.4. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/4f3212b61783c3c68e8309a0f18a699764811cda...ea9e4e37992a54ee68a9622e985e60c8e8f12d9f) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index c3f0ceba04b..5763660afe6 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 + uses: github/codeql-action/init@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. 
@@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 + uses: github/codeql-action/autobuild@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 + uses: github/codeql-action/analyze@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 40dcc2a5da8..b1d8ea77f21 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@4f3212b61783c3c68e8309a0f18a699764811cda # v3.27.1 + uses: github/codeql-action/upload-sarif@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 with: sarif_file: results.sarif From 199094569b0e9361ea8c85edd515b1cff9dd9989 Mon Sep 17 00:00:00 2001 From: James Swaro Date: Tue, 19 Nov 2024 13:15:54 +0100 Subject: [PATCH 243/393] contrib/cray: Add local and jenkins build scripts Intended for use with Jenkins and manual build processes Signed-off-by: James Swaro --- contrib/cray/build.sh | 21 +++++++++++++++++++++ contrib/cray/run.cxi.jenkins | 7 +++++++ contrib/cray/run.cxi.local | 9 +++++++++ contrib/cray/run.verbs.jenkins | 6 ++++++ contrib/cray/run.verbs.local | 8 ++++++++ 5 files changed, 51 insertions(+) create mode 100644 contrib/cray/build.sh create mode 100755 contrib/cray/run.cxi.jenkins create mode 100755 contrib/cray/run.cxi.local create mode 100755 contrib/cray/run.verbs.jenkins create mode 100755 contrib/cray/run.verbs.local diff --git a/contrib/cray/build.sh b/contrib/cray/build.sh new file mode 100644 index 00000000000..71faaeb289c --- /dev/null +++ b/contrib/cray/build.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +# Copyright 2024 Hewlett Packard Enterprise Development LP. All rights reserved. +# + +set -Exeuo pipefail + +CE_BUILD_SCRIPT_REPO=hpc-shs-ce-devops +CE_CONFIG_BRANCH=${CE_CONFIG_BRANCH:-main} +if [ -d ${CE_BUILD_SCRIPT_REPO} ]; then + git -C ${CE_BUILD_SCRIPT_REPO} checkout ${CE_CONFIG_BRANCH} + git -C ${CE_BUILD_SCRIPT_REPO} pull +else + git clone --branch "${CE_CONFIG_BRANCH}" https://$HPE_GITHUB_TOKEN@github.hpe.com/hpe/${CE_BUILD_SCRIPT_REPO}.git +fi + +. 
${CE_BUILD_SCRIPT_REPO}/build/sh/rpmbuild/load.sh + +setup_dst_env +dst_build_rpm -c ${CE_BUILD_SCRIPT_REPO}/build/configs/${CE_CONFIG_FILE} $@ + diff --git a/contrib/cray/run.cxi.jenkins b/contrib/cray/run.cxi.jenkins new file mode 100755 index 00000000000..5f4a60fbe41 --- /dev/null +++ b/contrib/cray/run.cxi.jenkins @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +PRODUCT=slingshot-host-software-internal \ +CE_CONFIG_FILE=libfabric-cxi.yaml \ + bash $DIR/build.sh -n $@ diff --git a/contrib/cray/run.cxi.local b/contrib/cray/run.cxi.local new file mode 100755 index 00000000000..12dd4bbbccc --- /dev/null +++ b/contrib/cray/run.cxi.local @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +SHS_LOCAL_BUILD=y \ +BRANCH_NAME=${BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD)} \ +PRODUCT=slingshot-host-software-internal \ +CE_CONFIG_FILE=libfabric-cxi.yaml \ + bash $DIR/build.sh $@ diff --git a/contrib/cray/run.verbs.jenkins b/contrib/cray/run.verbs.jenkins new file mode 100755 index 00000000000..9c18551d27d --- /dev/null +++ b/contrib/cray/run.verbs.jenkins @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +CE_CONFIG_FILE=libfabric-verbs.yaml \ + bash $DIR/build.sh -n $@ diff --git a/contrib/cray/run.verbs.local b/contrib/cray/run.verbs.local new file mode 100755 index 00000000000..44aaf8a12df --- /dev/null +++ b/contrib/cray/run.verbs.local @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +DIR=$(cd $(dirname $0) && pwd) + +SHS_LOCAL_BUILD=y \ +BRANCH_NAME=${BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD)} \ +CE_CONFIG_FILE=libfabric-verbs.yaml \ + bash $DIR/build.sh $@ From 3f02073f1c4886aec5813dc62915909e28751a76 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 12 Nov 2024 10:32:15 -0800 Subject: [PATCH 244/393] prov/efa: Implement FI_MORE for fi_recv in zero copy recv mode Track the last index of wr with recv_wr_index. Append wr when FI_MORE is set. Only post recv when FI_MORE is not set. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_base_ep.c | 9 +++ prov/efa/src/efa_base_ep.h | 2 + prov/efa/src/rdm/efa_rdm_ep_utils.c | 4 +- prov/efa/src/rdm/efa_rdm_pke.c | 92 +++++++++++++++++++++-------- prov/efa/src/rdm/efa_rdm_pke.h | 3 + 5 files changed, 85 insertions(+), 25 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 2b17088d1b7..1043775fa83 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -86,6 +86,9 @@ int efa_base_ep_destruct(struct efa_base_ep *base_ep) if (base_ep->efa_recv_wr_vec) free(base_ep->efa_recv_wr_vec); + + if (base_ep->user_recv_wr_vec) + free(base_ep->user_recv_wr_vec); return err; } @@ -345,6 +348,12 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for base_ep->efa_recv_wr_vec!\n"); return -FI_ENOMEM; } + base_ep->user_recv_wr_vec = calloc(sizeof(struct efa_recv_wr), EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV); + if (!base_ep->user_recv_wr_vec) { + EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for base_ep->user_recv_wr_vec!\n"); + return -FI_ENOMEM; + } + base_ep->recv_wr_index = 0; base_ep->efa_qp_enabled = false; base_ep->qp = NULL; base_ep->user_recv_qp = NULL; diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 263e0ca902b..ccce5a06da1 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -56,6 +56,7 @@ struct efa_base_ep { struct ibv_recv_wr recv_more_wr_head; struct ibv_recv_wr *recv_more_wr_tail; struct efa_recv_wr *efa_recv_wr_vec; + size_t recv_wr_index; size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */ size_t max_rma_size; /**< #FI_OPT_MAX_RMA_SIZE */ @@ -64,6 +65,7 @@ struct efa_base_ep { /* Only used by RDM ep type */ struct efa_qp *user_recv_qp; /* Separate qp to receive pkts posted by users */ + struct efa_recv_wr *user_recv_wr_vec; }; int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av); diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c 
b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 574b2bdfceb..83d66a23991 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -205,7 +205,7 @@ struct efa_rdm_ope *efa_rdm_ep_alloc_rxe(struct efa_rdm_ep *ep, fi_addr_t addr, * @param[in] rxe rxe that contain user buffer information * @param[in] flags user supplied flags passed to fi_recv */ -int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe, size_t flags) +int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe, uint64_t flags) { struct efa_rdm_pke *pkt_entry = NULL; size_t rx_iov_offset = 0; @@ -242,7 +242,7 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe pkt_entry->payload_mr = rxe->desc[rx_iov_index]; pkt_entry->payload_size = ofi_total_iov_len(&rxe->iov[rx_iov_index], rxe->iov_count - rx_iov_index) - rx_iov_offset; - err = efa_rdm_pke_recvv(&pkt_entry, 1); + err = efa_rdm_pke_user_recvv(&pkt_entry, 1, flags); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_EP_CTRL, "failed to post user supplied buffer %d (%s)\n", -err, diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 8255931b8d9..73cb58c82b9 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -622,7 +622,7 @@ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, { struct efa_rdm_ep *ep; struct ibv_recv_wr *bad_wr; - struct ibv_qp *qp; + struct efa_recv_wr *recv_wr; int i, err; assert(pke_cnt); @@ -631,37 +631,83 @@ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, assert(ep); for (i = 0; i < pke_cnt; ++i) { - ep->base_ep.efa_recv_wr_vec[i].wr.wr_id = (uintptr_t)pke_vec[i]; - ep->base_ep.efa_recv_wr_vec[i].wr.num_sge = 1; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list = ep->base_ep.efa_recv_wr_vec[i].sge; - if (pke_vec[i]->alloc_type == EFA_RDM_PKE_FROM_USER_RX_POOL) { - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].addr = (uintptr_t) pke_vec[i]->payload; - 
ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].length = pke_vec[i]->payload_size; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->payload_mr)->ibv_mr->lkey; - } else { - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].length = pke_vec[i]->pkt_size; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->mr)->ibv_mr->lkey; - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].addr = (uintptr_t)pke_vec[i]->wiredata; - } - ep->base_ep.efa_recv_wr_vec[i].wr.next = NULL; + recv_wr = &ep->base_ep.efa_recv_wr_vec[i]; + recv_wr->wr.wr_id = (uintptr_t)pke_vec[i]; + recv_wr->wr.num_sge = 1; + recv_wr->wr.sg_list = recv_wr->sge; + recv_wr->wr.sg_list[0].length = pke_vec[i]->pkt_size; + recv_wr->wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->mr)->ibv_mr->lkey; + recv_wr->wr.sg_list[0].addr = (uintptr_t)pke_vec[i]->wiredata; + recv_wr->wr.next = NULL; if (i > 0) - ep->base_ep.efa_recv_wr_vec[i-1].wr.next = &ep->base_ep.efa_recv_wr_vec[i].wr; + ep->base_ep.efa_recv_wr_vec[i-1].wr.next = &recv_wr->wr; #if HAVE_LTTNG efa_tracepoint_wr_id_post_recv(pke_vec[i]); #endif } - if (pke_vec[0]->alloc_type == EFA_RDM_PKE_FROM_USER_RX_POOL) { - assert(ep->base_ep.user_recv_qp); - qp = ep->base_ep.user_recv_qp->ibv_qp; - } else { - qp = ep->base_ep.qp->ibv_qp; + err = ibv_post_recv(ep->base_ep.qp->ibv_qp, &ep->base_ep.efa_recv_wr_vec[0].wr, &bad_wr); + if (OFI_UNLIKELY(err)) + err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; + + return err; +} + +/** + * @brief Post user receive requests to EFA device through user_recv_qp + * + * @param[in] pke_vec packet entries that contains information of receive buffer + * @param[in] pke_cnt Number of packet entries to post receive requests for + * @param[in] flags user supplied flags passed to fi_recv, support FI_MORE + * @return 0 on success + * On error, a negative value corresponding to fabric errno + */ +ssize_t efa_rdm_pke_user_recvv(struct efa_rdm_pke **pke_vec, + int pke_cnt, uint64_t flags) +{ + struct efa_rdm_ep *ep; + struct ibv_recv_wr *bad_wr; + struct efa_recv_wr *recv_wr; + int i, err; + size_t wr_index; + + assert(pke_cnt); + + ep = pke_vec[0]->ep; + assert(ep); + + wr_index = ep->base_ep.recv_wr_index; + assert(wr_index < ep->base_ep.info->rx_attr->size); + + for (i = 0; i < pke_cnt; ++i) { + recv_wr = &ep->base_ep.user_recv_wr_vec[wr_index]; + recv_wr->wr.wr_id = (uintptr_t) pke_vec[i]; + recv_wr->wr.num_sge = 1; + recv_wr->wr.sg_list = recv_wr->sge; + recv_wr->wr.sg_list[0].addr = (uintptr_t) pke_vec[i]->payload; + recv_wr->wr.sg_list[0].length = pke_vec[i]->payload_size; + recv_wr->wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->payload_mr)->ibv_mr->lkey; + recv_wr->wr.next = NULL; + if (wr_index > 0) + ep->base_ep.user_recv_wr_vec[wr_index - 1].wr.next = &recv_wr->wr; +#if HAVE_LTTNG + efa_tracepoint_wr_id_post_recv(pke_vec[i]); +#endif + wr_index++; } - err = ibv_post_recv(qp, &ep->base_ep.efa_recv_wr_vec[0].wr, &bad_wr); - if (OFI_UNLIKELY(err)) { + ep->base_ep.recv_wr_index = wr_index; + + if (flags & FI_MORE) + return 0; + + assert(ep->base_ep.user_recv_qp); + err = ibv_post_recv(ep->base_ep.user_recv_qp->ibv_qp, &ep->base_ep.user_recv_wr_vec[0].wr, &bad_wr); + + if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; - } + + ep->base_ep.recv_wr_index = 0; return err; } diff --git a/prov/efa/src/rdm/efa_rdm_pke.h b/prov/efa/src/rdm/efa_rdm_pke.h index 7291a36c466..223822ce595 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.h +++ b/prov/efa/src/rdm/efa_rdm_pke.h @@ -237,5 +237,8 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, int pke_cnt); +ssize_t efa_rdm_pke_user_recvv(struct efa_rdm_pke **pke_vec, + int pke_cnt, uint64_t flags); + int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry); #endif From 5dbc8fd3976664bb5e141308a44a4b26264c10fd Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 12 Nov 2024 11:49:04 -0800 Subject: [PATCH 245/393] fabtests: Allow tests with FI_MORE flag by using fi_recvmsg Also fix a bug in ft_recvmsg that uses tx_seq tag. Signed-off-by: Jessie Yang --- fabtests/benchmarks/benchmark_shared.c | 18 ++++++++++++++---- fabtests/common/shared.c | 6 +++--- fabtests/include/shared.h | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index 6c863bbcf3a..ad0d6dca035 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -511,10 +511,20 @@ int bandwidth(void) if (i == opts.warmup_iterations) ft_start(); - ret = ft_post_rx_buf(ep, opts.transfer_size, - &rx_ctx_arr[j].context, - rx_ctx_arr[j].buf, mr_desc, - ft_tag); + if (opts.use_fi_more) { + flags = set_fi_more_flag(i, j, flags); + ret = ft_recvmsg(ep, remote_fi_addr, + rx_ctx_arr[j].buf, + MAX(opts.transfer_size, + FT_MAX_CTRL_MSG) + + ft_rx_prefix_size(), + &rx_ctx_arr[j].context, flags); + } else { + ret = ft_post_rx_buf(ep, opts.transfer_size, + &rx_ctx_arr[j].context, + rx_ctx_arr[j].buf, mr_desc, + ft_tag); + } if (ret) return ret; diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 646a2ea1eab..a7d548da097 100644 --- a/fabtests/common/shared.c +++ 
b/fabtests/common/shared.c @@ -2952,14 +2952,14 @@ int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, } -int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, +int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, void *ctx, int flags) { struct fi_msg msg; struct fi_msg_tagged tagged_msg; struct iovec msg_iov; - msg_iov.iov_base = rx_buf; + msg_iov.iov_base = (char *) buf; msg_iov.iov_len = size; if (hints->caps & FI_TAGGED) { @@ -2969,7 +2969,7 @@ int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, tagged_msg.addr = fi_addr; tagged_msg.data = NO_CQ_DATA; tagged_msg.context = ctx; - tagged_msg.tag = ft_tag ? ft_tag : tx_seq; + tagged_msg.tag = ft_tag ? ft_tag : rx_seq; tagged_msg.ignore = 0; FT_POST(fi_trecvmsg, ft_progress, rxcq, rx_seq, diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index e15c61daea7..2a798ecbaa7 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -629,7 +629,7 @@ int ft_get_cq_comp(struct fid_cq *cq, uint64_t *cur, uint64_t total, int timeout int ft_get_cntr_comp(struct fid_cntr *cntr, uint64_t total, int timeout); int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, - size_t size, void *ctx, int flags); + void *buf, size_t size, void *ctx, int flags); int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, void *ctx, int flags); int ft_writemsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, From d8f367bc0448de125870004028d58453bca870e5 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 12 Nov 2024 15:58:46 -0800 Subject: [PATCH 246/393] fabtests: Add FI_MORE pytest for fi_recv in zcpy recv mode Signed-off-by: Jessie Yang --- fabtests/pytest/efa/test_rdm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fabtests/pytest/efa/test_rdm.py b/fabtests/pytest/efa/test_rdm.py index 112893c8ce6..d42dc6dea56 100644 --- a/fabtests/pytest/efa/test_rdm.py +++ b/fabtests/pytest/efa/test_rdm.py @@ -132,3 +132,12 @@ def 
test_rdm_bw_zcpy_recv(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcp cmdline_args_copy.append_environ("FI_EFA_ENABLE_SHM_TRANSFER=0") efa_run_client_server_test(cmdline_args_copy, f"fi_rdm_bw --max-msg-size {zcpy_recv_max_msg_size}", "short", "transmit_complete", memory_type, zcpy_recv_message_size) + +@pytest.mark.functional +def test_rdm_bw_zcpy_recv_use_fi_more(cmdline_args, memory_type, zcpy_recv_max_msg_size, zcpy_recv_message_size): + if cmdline_args.server_id == cmdline_args.client_id: + pytest.skip("no zero copy recv for intra-node communication") + cmdline_args_copy = copy.copy(cmdline_args) + cmdline_args_copy.append_environ("FI_EFA_ENABLE_SHM_TRANSFER=0") + efa_run_client_server_test(cmdline_args_copy, f"fi_rdm_bw --use-fi-more --max-msg-size {zcpy_recv_max_msg_size}", + "short", "transmit_complete", memory_type, zcpy_recv_message_size) From b30ce18adc48e6dc14cb9156fa573dbaed01f81c Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 6 Nov 2024 15:16:45 -0800 Subject: [PATCH 247/393] prov/efa: Implement efa_msg interface Rename efa_dgram_msg.c to efa_msg.c as a common MSG interface for both rdm and dgram ep type. Add support for FI_INJECT and FI_REMOTE_CQ_DATA. Replace ibv_post_send with ibv_wr* calls and remove xmit_more_wr_head and xmit_more_wr_tail. Replace recv_more_wr_head and recv_more_wr_tail with efa_recv_wr_vec. 
Signed-off-by: Jessie Yang --- libfabric.vcxproj | 2 +- prov/efa/Makefile.include | 2 +- prov/efa/src/dgram/efa_dgram_msg.c | 445 ---------------------------- prov/efa/src/efa_base_ep.c | 5 +- prov/efa/src/efa_base_ep.h | 4 - prov/efa/src/efa_cq.h | 5 + prov/efa/src/efa_msg.c | 378 +++++++++++++++++++++++ prov/efa/src/efa_prov_info.c | 9 +- prov/efa/src/rdm/efa_rdm_cq.h | 5 - prov/efa/src/rdm/efa_rdm_ep_fiops.c | 2 - 10 files changed, 392 insertions(+), 465 deletions(-) delete mode 100644 prov/efa/src/dgram/efa_dgram_msg.c create mode 100644 prov/efa/src/efa_msg.c diff --git a/libfabric.vcxproj b/libfabric.vcxproj index b4e8dc9cbd3..e85229c9f1a 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -884,9 +884,9 @@ + - diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index fc065cb42e4..e5961cb13d5 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -47,9 +47,9 @@ _efa_files = \ prov/efa/src/efa_prov.c \ prov/efa/src/efa_env.c \ prov/efa/src/efa_cntr.c \ + prov/efa/src/efa_msg.c \ prov/efa/src/dgram/efa_dgram_ep.c \ prov/efa/src/dgram/efa_dgram_cq.c \ - prov/efa/src/dgram/efa_dgram_msg.c \ prov/efa/src/dgram/efa_dgram_rma.c \ prov/efa/src/rdm/efa_rdm_peer.c \ prov/efa/src/rdm/efa_rdm_cq.c \ diff --git a/prov/efa/src/dgram/efa_dgram_msg.c b/prov/efa/src/dgram/efa_dgram_msg.c deleted file mode 100644 index f8a5010daf9..00000000000 --- a/prov/efa/src/dgram/efa_dgram_msg.c +++ /dev/null @@ -1,445 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ - -#include "config.h" - - -#include "ofi.h" -#include "ofi_enosys.h" -#include "ofi_iov.h" - -#include "efa_dgram_ep.h" -#include "efa.h" -#include "efa_av.h" - -#include "efa_tp.h" - -#define EFA_SETUP_IOV(iov, buf, len) \ - do { \ - iov.iov_base = (void *)buf; \ - iov.iov_len = (size_t)len; \ - } while (0) - -#define EFA_SETUP_MSG(msg, iov, _desc, count, _addr, _context, _data) \ - do { \ - msg.msg_iov = (const struct iovec *)iov; \ - msg.desc = (void **)_desc; \ - msg.iov_count = (size_t)count; \ - msg.addr = (fi_addr_t)_addr; \ - msg.context = (void *)_context; \ - msg.data = (uint32_t)_data; \ - } while (0) - -#ifndef EFA_MSG_DUMP -static inline void dump_msg(const struct fi_msg *msg, const char *context) {} -#else -#define DUMP_IOV(i, iov, desc) \ - EFA_DBG(FI_LOG_EP_DATA, \ - "\t{ iov[%d] = { base = %p, buff = \"%s\", len = %zu }, desc = %p },\n", \ - i, iov.iov_base, (char *)iov.iov_base, iov.iov_len, (desc ? desc[i] : NULL)) - -static inline void dump_msg(const struct fi_msg *msg, const char *context) -{ - int i; - - EFA_DBG(FI_LOG_EP_DATA, "%s: { data = %u, addr = %" PRIu64 ", iov_count = %zu, [\n", - context, (unsigned)msg->data, msg->addr, msg->iov_count); - for (i = 0; i < msg->iov_count; ++i) - DUMP_IOV(i, msg->msg_iov[i], msg->desc); - EFA_DBG(FI_LOG_EP_DATA, " ] }\n"); -} -#endif /* EFA_MSG_DUMP */ - -static void free_send_wr_list(struct ibv_send_wr *head) -{ - struct ibv_send_wr *wr = head; - struct ibv_send_wr *tmp; - - while (wr) { - tmp = wr->next; - ofi_buf_free(container_of(wr, struct efa_send_wr, wr)); - wr = tmp; - } -} - -static void free_recv_wr_list(struct ibv_recv_wr *head) -{ - struct ibv_recv_wr *wr = head; - struct ibv_recv_wr *tmp; - - while (wr) { - tmp = wr->next; - ofi_buf_free(container_of(wr, struct efa_recv_wr, wr)); - wr = tmp; - } -} - -static ssize_t efa_dgram_post_recv_validate(struct efa_dgram_ep *ep, const struct fi_msg *msg) -{ - if (OFI_UNLIKELY(!ep->rcq)) { - EFA_WARN(FI_LOG_EP_DATA, "No receive cq was bound to 
ep.\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->iov_count > ep->base_ep.info->rx_attr->iov_limit)) { - EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%zu]!\n", - msg->iov_count, ep->base_ep.info->tx_attr->iov_limit); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->msg_iov[0].iov_len < - ep->base_ep.info->ep_attr->msg_prefix_size)) { - EFA_WARN(FI_LOG_EP_DATA, "prefix not present on first iov, iov_len[%zu]\n", - msg->msg_iov[0].iov_len); - return -EINVAL; - } - - return 0; -} - -/** - * @brief post receive buffer to EFA device via ibv_post_recv - * - * @param[in] ep endpoint - * @param[in] msg libfabric message - * @param[in] flags libfabric flags, currently only FI_MORE is supported. - * @reutrn On Success, return 0 - * On failure, return negative libfabric error code - */ -static ssize_t efa_dgram_post_recv(struct efa_dgram_ep *ep, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_mr *efa_mr; - struct efa_qp *qp = ep->base_ep.qp; - struct ibv_recv_wr *bad_wr; - struct efa_recv_wr *ewr; - struct ibv_recv_wr *wr; - uintptr_t addr; - ssize_t err, post_recv_err; - size_t i; - - ewr = ofi_buf_alloc(ep->recv_wr_pool); - if (OFI_UNLIKELY(!ewr)) - return -FI_ENOMEM; - - memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count); - wr = &ewr->wr; - dump_msg(msg, "recv"); - - err = efa_dgram_post_recv_validate(ep, msg); - if (OFI_UNLIKELY(err)) { - ofi_buf_free(ewr); - goto out_err; - } - - wr->wr_id = (uintptr_t)msg->context; - wr->num_sge = msg->iov_count; - wr->sg_list = ewr->sge; - - for (i = 0; i < msg->iov_count; i++) { - addr = (uintptr_t)msg->msg_iov[i].iov_base; - - /* Set RX buffer desc from SGE */ - wr->sg_list[i].length = msg->msg_iov[i].iov_len; - assert(msg->desc[i]); - efa_mr = (struct efa_mr *)msg->desc[i]; - wr->sg_list[i].lkey = efa_mr->ibv_mr->lkey; - wr->sg_list[i].addr = addr; - } - - ep->base_ep.recv_more_wr_tail->next = wr; - ep->base_ep.recv_more_wr_tail = wr; - - if (flags & FI_MORE) - return 
0; - -#if HAVE_LTTNG - struct ibv_recv_wr *head = ep->base_ep.recv_more_wr_head.next; - while (head) { - efa_tracepoint_wr_id_post_recv((void *) head->wr_id); - head = head->next; - } -#endif - - err = ibv_post_recv(qp->ibv_qp, ep->base_ep.recv_more_wr_head.next, &bad_wr); - if (OFI_UNLIKELY(err)) { - /* On failure, ibv_post_recv() return positive errno. - * Meanwhile, this function return a negative errno. - * So, we do the conversion here. - */ - err = (err == ENOMEM) ? -FI_EAGAIN : -err; - } - - free_recv_wr_list(ep->base_ep.recv_more_wr_head.next); - ep->base_ep.recv_more_wr_tail = &ep->base_ep.recv_more_wr_head; - - return err; - -out_err: - if (ep->base_ep.recv_more_wr_head.next) { - post_recv_err = ibv_post_recv(qp->ibv_qp, ep->base_ep.recv_more_wr_head.next, &bad_wr); - if (post_recv_err) { - EFA_WARN(FI_LOG_EP_DATA, - "Encountered error %ld when ibv_post_recv on error handling path\n", - post_recv_err); - } - } - - free_recv_wr_list(ep->base_ep.recv_more_wr_head.next); - ep->base_ep.recv_more_wr_tail = &ep->base_ep.recv_more_wr_head; - - return err; -} - -static ssize_t efa_dgram_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - return efa_dgram_post_recv(ep, msg, flags); -} - -static ssize_t efa_dgram_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct iovec iov; - struct fi_msg msg; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_MSG(msg, &iov, &desc, 1, src_addr, context, 0); - - return efa_dgram_post_recv(ep, &msg, 0); -} - -static ssize_t efa_dgram_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct 
fi_msg msg; - - EFA_SETUP_MSG(msg, iov, desc, count, src_addr, context, 0); - - return efa_dgram_post_recv(ep, &msg, 0); -} - -static ssize_t efa_dgram_post_send_validate(struct efa_dgram_ep *ep, const struct fi_msg *msg, - struct efa_conn *conn, uint64_t flags, size_t *len) -{ - if (OFI_UNLIKELY(!ep->scq)) { - EFA_WARN(FI_LOG_EP_DATA, "No send cq was bound to ep.\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->iov_count > ep->base_ep.info->tx_attr->iov_limit)) { - EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%zu]!\n", - msg->iov_count, ep->base_ep.info->tx_attr->iov_limit); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->msg_iov[0].iov_len < - ep->base_ep.info->ep_attr->msg_prefix_size)) { - EFA_WARN(FI_LOG_EP_DATA, "prefix not present on first iov, iov_len[%zu]\n", - msg->msg_iov[0].iov_len); - return -EINVAL; - } - - *len = ofi_total_iov_len(msg->msg_iov, msg->iov_count) - ep->base_ep.info->ep_attr->msg_prefix_size; - if (OFI_UNLIKELY(*len > ep->base_ep.info->ep_attr->max_msg_size)) { - EFA_WARN(FI_LOG_EP_DATA, "requested size[%zu] is greater than max[%zu]!\n", - *len, ep->base_ep.info->ep_attr->max_msg_size); - return -FI_EINVAL; - } - - return 0; -} - -static void efa_dgram_post_send_sgl(struct efa_dgram_ep *ep, const struct fi_msg *msg, - struct efa_send_wr *ewr) -{ - struct efa_mr *efa_mr; - struct ibv_send_wr *wr = &ewr->wr; - struct ibv_sge *sge; - uint32_t length; - uintptr_t addr; - size_t i; - - wr->num_sge = msg->iov_count; - wr->sg_list = ewr->sge; - - for (i = 0; i < msg->iov_count; i++) { - sge = &wr->sg_list[i]; - addr = (uintptr_t)msg->msg_iov[i].iov_base; - length = msg->msg_iov[i].iov_len; - - /* Whole prefix must be on the first sgl */ - if (!i) { - /* Check if payload exists */ - if (length <= ep->base_ep.info->ep_attr->msg_prefix_size) - continue; - - addr += ep->base_ep.info->ep_attr->msg_prefix_size; - length -= ep->base_ep.info->ep_attr->msg_prefix_size; - } - - /* Set TX buffer desc from SGE */ - 
sge->length = length; - assert (msg->desc && msg->desc[i]); - efa_mr = (struct efa_mr *)msg->desc[i]; - sge->lkey = efa_mr->ibv_mr->lkey; - sge->addr = addr; - } -} - -ssize_t efa_dgram_post_flush(struct efa_dgram_ep *ep, struct ibv_send_wr **bad_wr, bool free) -{ - ssize_t ret; - -#if HAVE_LTTNG - struct ibv_send_wr *head = ep->base_ep.xmit_more_wr_head.next; - while (head) { - efa_tracepoint_wr_id_post_send((void *) head->wr_id); - head = head->next; - } -#endif - - ret = ibv_post_send(ep->base_ep.qp->ibv_qp, ep->base_ep.xmit_more_wr_head.next, bad_wr); - if (free) - free_send_wr_list(ep->base_ep.xmit_more_wr_head.next); - else - ep->base_ep.xmit_more_wr_head.next = NULL; - ep->base_ep.xmit_more_wr_tail = &ep->base_ep.xmit_more_wr_head; - return ret; -} - -static bool efa_msg_has_hmem_mr(const struct fi_msg *msg) -{ - /* the device only support send up 2 iov, so iov_count cannot be > 2 */ - assert(msg->iov_count == 1 || msg->iov_count == 2); - /* first iov is always on host memory, because it must contain packet header */ - assert(!efa_mr_is_hmem(msg->desc[0])); - return (msg->iov_count == 2) && efa_mr_is_hmem(msg->desc[1]); -} - -static ssize_t efa_dgram_post_send(struct efa_dgram_ep *ep, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_qp *qp = ep->base_ep.qp; - struct ibv_send_wr *bad_wr; - struct efa_send_wr *ewr; - struct ibv_send_wr *wr; - struct efa_conn *conn; - size_t len; - int ret; - - dump_msg(msg, "send"); - - ewr = ofi_buf_alloc(ep->send_wr_pool); - if (OFI_UNLIKELY(!ewr)) - return -FI_ENOMEM; - - memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count); - wr = &ewr->wr; - conn = efa_av_addr_to_conn(ep->base_ep.av, msg->addr); - assert(conn && conn->ep_addr); - - ret = efa_dgram_post_send_validate(ep, msg, conn, flags, &len); - if (OFI_UNLIKELY(ret)) { - ofi_buf_free(ewr); - goto out_err; - } - - efa_dgram_post_send_sgl(ep, msg, ewr); - - if (len <= ep->base_ep.domain->device->efa_attr.inline_buf_size && - 
!efa_msg_has_hmem_mr(msg)) - wr->send_flags |= IBV_SEND_INLINE; - - wr->opcode = IBV_WR_SEND; - wr->wr_id = (uintptr_t)msg->context; - wr->wr.ud.ah = conn->ah->ibv_ah; - wr->wr.ud.remote_qpn = conn->ep_addr->qpn; - wr->wr.ud.remote_qkey = conn->ep_addr->qkey; - - ep->base_ep.xmit_more_wr_tail->next = wr; - ep->base_ep.xmit_more_wr_tail = wr; - - if (flags & FI_MORE) - return 0; - - ret = efa_dgram_post_flush(ep, &bad_wr, true /* free ibv_send_wr */); - - return ret; - -out_err: - if (ep->base_ep.xmit_more_wr_head.next) - ibv_post_send(qp->ibv_qp, ep->base_ep.xmit_more_wr_head.next, &bad_wr); - - free_send_wr_list(ep->base_ep.xmit_more_wr_head.next); - ep->base_ep.xmit_more_wr_tail = &ep->base_ep.xmit_more_wr_head; - - return ret; -} - -static ssize_t efa_dgram_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - return efa_dgram_post_send(ep, msg, flags); -} - -static ssize_t efa_dgram_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - struct iovec iov; - uint64_t flags; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, 0); - flags = ep->base_ep.info->tx_attr->op_flags; - - return efa_dgram_post_send(ep, &msg, flags); -} - -static ssize_t efa_dgram_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - struct iovec iov; - uint64_t flags; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, data); - - flags = ep->base_ep.info->tx_attr->op_flags | FI_REMOTE_CQ_DATA; - - return 
efa_dgram_post_send(ep, &msg, flags); -} - -static ssize_t efa_dgram_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, - size_t count, fi_addr_t dest_addr, void *context) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_msg msg; - uint64_t flags; - - EFA_SETUP_MSG(msg, iov, desc, count, dest_addr, context, 0); - - flags = ep->base_ep.info->tx_attr->op_flags; - - return efa_dgram_post_send(ep, &msg, flags); -} - -struct fi_ops_msg efa_dgram_ep_msg_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = efa_dgram_ep_recv, - .recvv = efa_dgram_ep_recvv, - .recvmsg = efa_dgram_ep_recvmsg, - .send = efa_dgram_ep_send, - .sendv = efa_dgram_ep_sendv, - .sendmsg = efa_dgram_ep_sendmsg, - .inject = fi_no_msg_inject, - .senddata = efa_dgram_ep_senddata, - .injectdata = fi_no_msg_injectdata, -}; diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 1043775fa83..55997a3cfe6 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -178,6 +178,9 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, if (!*qp) return -FI_ENOMEM; + init_attr_ex->comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM; + if (init_attr_ex->qp_type == IBV_QPT_UD) { (*qp)->ibv_qp = ibv_create_qp_ex(init_attr_ex->pd->context, init_attr_ex); @@ -341,8 +344,6 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, base_ep->rnr_retry = efa_env.rnr_retry; - base_ep->xmit_more_wr_tail = &base_ep->xmit_more_wr_head; - base_ep->recv_more_wr_tail = &base_ep->recv_more_wr_head; base_ep->efa_recv_wr_vec = calloc(sizeof(struct efa_recv_wr), EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV); if (!base_ep->efa_recv_wr_vec) { EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for base_ep->efa_recv_wr_vec!\n"); diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 
ccce5a06da1..f970fe3aba9 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -51,10 +51,6 @@ struct efa_base_ep { bool efa_qp_enabled; bool is_wr_started; - struct ibv_send_wr xmit_more_wr_head; - struct ibv_send_wr *xmit_more_wr_tail; - struct ibv_recv_wr recv_more_wr_head; - struct ibv_recv_wr *recv_more_wr_tail; struct efa_recv_wr *efa_recv_wr_vec; size_t recv_wr_index; diff --git a/prov/efa/src/efa_cq.h b/prov/efa/src/efa_cq.h index 18cf435023b..238e769cc93 100644 --- a/prov/efa/src/efa_cq.h +++ b/prov/efa/src/efa_cq.h @@ -18,6 +18,11 @@ struct efa_ibv_cq_poll_list_entry { struct efa_ibv_cq *cq; }; +/* + * Control header with completion data. CQ data length is static. + */ +#define EFA_CQ_DATA_SIZE (4) + static inline int efa_ibv_cq_poll_list_match(struct dlist_entry *entry, const void *cq) { diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c new file mode 100644 index 00000000000..e03ca5aeebe --- /dev/null +++ b/prov/efa/src/efa_msg.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include "config.h" + + +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_iov.h" + +#include "efa.h" +#include "efa_av.h" + +#include "efa_tp.h" + +#define EFA_SETUP_IOV(iov, buf, len) \ + do { \ + iov.iov_base = (void *)buf; \ + iov.iov_len = (size_t)len; \ + } while (0) + +#define EFA_SETUP_MSG(msg, iov, _desc, count, _addr, _context, _data) \ + do { \ + msg.msg_iov = (const struct iovec *)iov; \ + msg.desc = (void **)_desc; \ + msg.iov_count = (size_t)count; \ + msg.addr = (fi_addr_t)_addr; \ + msg.context = (void *)_context; \ + msg.data = (uint32_t)_data; \ + } while (0) + +#ifndef EFA_MSG_DUMP +static inline void dump_msg(const struct fi_msg *msg, const char *context) {} +#else +#define DUMP_IOV(i, iov, desc) \ + EFA_DBG(FI_LOG_EP_DATA, \ + "\t{ iov[%d] = { base = %p, buff = \"%s\", len = %zu }, desc = %p },\n", \ + i, iov.iov_base, (char *)iov.iov_base, iov.iov_len, (desc ? desc[i] : NULL)) + +static inline void dump_msg(const struct fi_msg *msg, const char *context) +{ + int i; + + EFA_DBG(FI_LOG_EP_DATA, "%s: { data = %u, addr = %" PRIu64 ", iov_count = %zu, [\n", + context, (unsigned)msg->data, msg->addr, msg->iov_count); + for (i = 0; i < msg->iov_count; ++i) + DUMP_IOV(i, msg->msg_iov[i], msg->desc); + EFA_DBG(FI_LOG_EP_DATA, " ] }\n"); +} +#endif /* EFA_MSG_DUMP */ + +/** + * @brief post receive buffer to EFA device via ibv_post_recv + * + * @param[in] base_ep endpoint + * @param[in] msg libfabric message + * @param[in] flags libfabric flags, currently only FI_MORE is supported. 
+ * @reutrn On Success, return 0 + * On failure, return negative libfabric error code + */ +static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_mr *efa_mr; + struct efa_qp *qp = base_ep->qp; + struct ibv_recv_wr *bad_wr; + struct ibv_recv_wr *wr; + uintptr_t addr; + ssize_t err, post_recv_err; + size_t i, wr_index = base_ep->recv_wr_index; + + if (wr_index >= base_ep->info->rx_attr->size) { + EFA_INFO(FI_LOG_EP_DATA, + "recv_wr_index exceeds the rx limit, " + "recv_wr_index = %zu, rx size = %zu\n", + wr_index, base_ep->info->rx_attr->size); + err = -FI_EAGAIN; + goto out_err; + } + + memset(&base_ep->efa_recv_wr_vec[wr_index], 0, sizeof(base_ep->efa_recv_wr_vec[wr_index])); + dump_msg(msg, "recv"); + + assert(msg->iov_count <= base_ep->info->rx_attr->iov_limit); + + if (qp->ibv_qp->qp_type == IBV_QPT_UD && + OFI_UNLIKELY(msg->msg_iov[0].iov_len < + base_ep->info->ep_attr->msg_prefix_size)) { + EFA_WARN(FI_LOG_EP_DATA, + "prefix not present on first iov, " + "iov_len[%zu]\n", + msg->msg_iov[0].iov_len); + err = -EINVAL; + goto out_err; + } + + wr = &base_ep->efa_recv_wr_vec[wr_index].wr; + wr->wr_id = (uintptr_t)msg->context; + wr->num_sge = msg->iov_count; + wr->sg_list = base_ep->efa_recv_wr_vec[wr_index].sge; + + for (i = 0; i < msg->iov_count; i++) { + addr = (uintptr_t)msg->msg_iov[i].iov_base; + + /* Set RX buffer desc from SGE */ + wr->sg_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + efa_mr = (struct efa_mr *)msg->desc[i]; + wr->sg_list[i].lkey = efa_mr->ibv_mr->lkey; + wr->sg_list[i].addr = addr; + } + + base_ep->efa_recv_wr_vec[wr_index].wr.next = NULL; + if (wr_index > 0) + base_ep->efa_recv_wr_vec[wr_index - 1].wr.next = &base_ep->efa_recv_wr_vec[wr_index].wr; + + base_ep->recv_wr_index++; + + if (flags & FI_MORE) + return 0; + + err = ibv_post_recv(qp->ibv_qp, &base_ep->efa_recv_wr_vec[0].wr, &bad_wr); + if (OFI_UNLIKELY(err)) { + /* On failure, 
ibv_post_recv() return positive errno. + * Meanwhile, this function return a negative errno. + * So, we do the conversion here. + */ + err = (err == ENOMEM) ? -FI_EAGAIN : -err; + } + + base_ep->recv_wr_index = 0; + + return err; + +out_err: + if (base_ep->recv_wr_index > 0) { + post_recv_err = ibv_post_recv(qp->ibv_qp, &base_ep->efa_recv_wr_vec[0].wr, &bad_wr); + if (OFI_UNLIKELY(post_recv_err)) { + EFA_WARN(FI_LOG_EP_DATA, + "Encountered error %ld when ibv_post_recv on error handling path\n", + post_recv_err); + } + } + + base_ep->recv_wr_index = 0; + + return err; +} + +static ssize_t efa_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + + return efa_post_recv(base_ep, msg, flags); +} + +static ssize_t efa_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct iovec iov; + struct fi_msg msg; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, &desc, 1, src_addr, context, 0); + + return efa_post_recv(base_ep, &msg, 0); +} + +static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + + EFA_SETUP_MSG(msg, iov, desc, count, src_addr, context, 0); + + return efa_post_recv(base_ep, &msg, 0); +} + +static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_qp *qp = base_ep->qp; + struct efa_conn *conn; + struct ibv_sge sg_list[2]; /* efa device support up to 2 iov */ + struct ibv_data_buf inline_data_list[2]; + size_t len, i; + int ret = 0; + + dump_msg(msg, "send"); + + conn = efa_av_addr_to_conn(base_ep->av, msg->addr); + assert(conn && 
conn->ep_addr); + + assert(msg->iov_count <= base_ep->info->tx_attr->iov_limit); + + len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + + if (qp->ibv_qp->qp_type == IBV_QPT_UD) { + assert(msg->msg_iov[0].iov_len >= base_ep->info->ep_attr->msg_prefix_size); + len -= base_ep->info->ep_attr->msg_prefix_size; + } + + assert(len <= base_ep->info->ep_attr->max_msg_size); + + if (!base_ep->is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + base_ep->is_wr_started = true; + } + + qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; + if (flags & FI_REMOTE_CQ_DATA) { + ibv_wr_send_imm(qp->ibv_qp_ex, msg->data); + } else { + ibv_wr_send(qp->ibv_qp_ex); + } + + if (len <= base_ep->domain->device->efa_attr.inline_buf_size && + (!msg->desc || !efa_mr_is_hmem(msg->desc[0]))) { + for (i = 0; i < msg->iov_count; i++) { + inline_data_list[i].addr = msg->msg_iov[i].iov_base; + inline_data_list[i].length = msg->msg_iov[i].iov_len; + + /* Whole prefix must be on the first sgl for dgram */ + if (!i && qp->ibv_qp->qp_type == IBV_QPT_UD) { + inline_data_list[i].addr = (char*)inline_data_list[i].addr + base_ep->info->ep_attr->msg_prefix_size; + inline_data_list[i].length -= base_ep->info->ep_attr->msg_prefix_size; + } + } + ibv_wr_set_inline_data_list(qp->ibv_qp_ex, msg->iov_count, inline_data_list); + } else { + for (i = 0; i < msg->iov_count; i++) { + /* Set TX buffer desc from SGE */ + assert (msg->desc && msg->desc[i]); + sg_list[i].lkey = ((struct efa_mr *)msg->desc[i])->ibv_mr->lkey; + sg_list[i].addr = (uintptr_t)msg->msg_iov[i].iov_base; + sg_list[i].length = msg->msg_iov[i].iov_len; + + /* Whole prefix must be on the first sgl for dgram */ + if (!i && qp->ibv_qp->qp_type == IBV_QPT_UD) { + sg_list[i].addr += base_ep->info->ep_attr->msg_prefix_size; + sg_list[i].length -= base_ep->info->ep_attr->msg_prefix_size; + } + } + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sg_list); + } + + ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, + 
conn->ep_addr->qkey); + +#if HAVE_LTTNG + efa_tracepoint_wr_id_post_send((void *)msg->context); +#endif + + if (!(flags & FI_MORE)) { + ret = ibv_wr_complete(qp->ibv_qp_ex); + base_ep->is_wr_started = false; + } + if (OFI_UNLIKELY(ret)) + return ret; + + return 0; +} + +static ssize_t efa_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + + return efa_post_send(base_ep, msg, flags); +} + +static ssize_t efa_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + uint64_t flags; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, 0); + flags = base_ep->info->tx_attr->op_flags; + + return efa_post_send(base_ep, &msg, flags); +} + +static ssize_t efa_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + uint64_t flags; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, data); + + flags = base_ep->info->tx_attr->op_flags | FI_REMOTE_CQ_DATA; + + return efa_post_send(base_ep, &msg, flags); +} + +static ssize_t efa_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, void *context) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + uint64_t flags; + + EFA_SETUP_MSG(msg, iov, desc, count, dest_addr, context, 0); + + flags = base_ep->info->tx_attr->op_flags; + + return efa_post_send(base_ep, &msg, flags); +} + +static ssize_t efa_ep_msg_inject(struct fid_ep *ep_fid, 
const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + uint64_t flags; + + assert(len <= base_ep->domain->device->efa_attr.inline_buf_size); + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, NULL, 1, dest_addr, NULL, 0); + + flags = base_ep->info->tx_attr->op_flags | FI_INJECT; + + return efa_post_send(base_ep, &msg, flags); +} + +static ssize_t efa_ep_msg_injectdata(struct fid_ep *ep_fid, const void *buf, + size_t len, uint64_t data, + fi_addr_t dest_addr) +{ + struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_msg msg; + struct iovec iov; + uint64_t flags; + + assert(len <= base_ep->domain->device->efa_attr.inline_buf_size); + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_MSG(msg, &iov, NULL, 1, dest_addr, NULL, data); + + flags = base_ep->info->tx_attr->op_flags | FI_REMOTE_CQ_DATA | FI_INJECT; + + return efa_post_send(base_ep, &msg, flags); +} + +struct fi_ops_msg efa_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = efa_ep_recv, + .recvv = efa_ep_recvv, + .recvmsg = efa_ep_recvmsg, + .send = efa_ep_send, + .sendv = efa_ep_sendv, + .sendmsg = efa_ep_sendmsg, + .senddata = efa_ep_senddata, + .inject = efa_ep_msg_inject, + .injectdata = efa_ep_msg_injectdata, +}; + +struct fi_ops_msg efa_dgram_ep_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = efa_ep_recv, + .recvv = efa_ep_recvv, + .recvmsg = efa_ep_recvmsg, + .send = efa_ep_send, + .sendv = efa_ep_sendv, + .sendmsg = efa_ep_sendmsg, + .senddata = efa_ep_senddata, + .inject = fi_no_msg_inject, + .injectdata = fi_no_msg_injectdata, +}; diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index be3221cb791..2f16f23816f 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -75,7 +75,7 @@ const struct fi_domain_attr efa_domain_attr = { .resource_mgmt = 
FI_RM_DISABLED, .mr_mode = OFI_MR_BASIC_MAP | FI_MR_LOCAL | OFI_MR_BASIC, .mr_key_size = sizeof_field(struct ibv_sge, lkey), - .cq_data_size = 0, + .cq_data_size = EFA_CQ_DATA_SIZE, .tx_ctx_cnt = 1024, .rx_ctx_cnt = 1024, .max_ep_tx_ctx = 1, @@ -184,6 +184,9 @@ void efa_prov_info_set_ep_attr(struct fi_info *prov_info, * a completion, therefore there is no way for dgram endpoint * to implement FI_INJECT. Because FI_INJECT is not an optional * feature, we had to set inject_size to 0. + * + * TODO: + * Remove this after implementing cq read for efa-raw */ prov_info->tx_attr->inject_size = 0; } @@ -553,10 +556,6 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, * buffer. EFA RDM endpoint does not have this requirement, hence unset the flag */ prov_info_rdm->domain_attr->mr_mode &= ~FI_MR_LOCAL; - - /* EFA RDM endpoint support writing CQ data by put it in packet header - */ - prov_info_rdm->domain_attr->cq_data_size = EFA_RDM_CQ_DATA_SIZE; } /* update ep_attr */ diff --git a/prov/efa/src/rdm/efa_rdm_cq.h b/prov/efa/src/rdm/efa_rdm_cq.h index 4e88a8b7f63..932c57109d7 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.h +++ b/prov/efa/src/rdm/efa_rdm_cq.h @@ -15,11 +15,6 @@ struct efa_rdm_cq { bool need_to_scan_ep_list; }; -/* - * Control header with completion data. CQ data length is static. 
- */ -#define EFA_RDM_CQ_DATA_SIZE (4) - int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 149ba48ce2a..d8a1a3fc5e9 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -26,8 +26,6 @@ void efa_rdm_ep_construct_ibv_qp_init_attr_ex(struct efa_rdm_ep *ep, attr_ex->cap.max_recv_sge = ep->base_ep.domain->device->rdm_info->rx_attr->iov_limit; attr_ex->cap.max_inline_data = ep->base_ep.domain->device->efa_attr.inline_buf_size; attr_ex->qp_type = IBV_QPT_DRIVER; - attr_ex->comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; - attr_ex->send_ops_flags = IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM; if (efa_device_support_rdma_read()) attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; if (efa_device_support_rdma_write()) { From 628dfa78866fe162695eff37624912d05fdbda9f Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 19 Nov 2024 01:57:53 +0000 Subject: [PATCH 248/393] prov/efa: Fix the read_bad_recv_status unit test When device supports unsolicited write, efadv_wc_is_unsolicited is called in the error path of IBV_WC_RECV as well, but we failed to setup its mock in the unit test, it will cause a segfault inside the rdma-core function when we don't really have a cqe, which is what the current unit test does. This patch fixes this bug. 
Signed-off-by: Shi Jin --- prov/efa/test/efa_unit_test_cq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 7cb8c47dc4c..75e32b39773 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -336,6 +336,14 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) */ efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV + if (efa_rdm_use_unsolicited_write_recv()) { + efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + will_return(efa_mock_efadv_wc_is_unsolicited, false); + } +#endif + ret = fi_cq_read(resource->cq, &cq_entry, 1); assert_int_equal(ret, -FI_EAGAIN); From 33721225533b8c4091b4aee38711fae2966e2249 Mon Sep 17 00:00:00 2001 From: Ryan Hankins Date: Thu, 14 Nov 2024 19:48:00 +0000 Subject: [PATCH 249/393] prov/cxi: Support FI_OPT_CUDA_API_PERMITTED If GDRCopy is required by the application (ie. it has set FI_OPT_CUDA_API_PERMITTED), and is not available, return not supported, eliminating deadlocks due to calls to cudaMemcpy interacting with CUDA applications. 
Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 50 ++++++++++++++++++++++------ prov/cxi/src/cxip_atomic.c | 15 +++++---- prov/cxi/src/cxip_coll.c | 4 +-- prov/cxi/src/cxip_ep.c | 58 +++++++++++++++++++++++++++++++++ prov/cxi/src/cxip_mr.c | 13 ++++++++ prov/cxi/src/cxip_msg.c | 12 +++---- prov/cxi/src/cxip_msg_hpc.c | 5 +-- prov/cxi/src/cxip_msg_rnr.c | 6 ++-- prov/cxi/src/cxip_ptelist_buf.c | 4 +-- prov/cxi/src/cxip_rma.c | 3 +- prov/cxi/src/cxip_txc.c | 4 +-- 11 files changed, 140 insertions(+), 34 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 61fd43be6b2..4494b9b703a 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -1963,13 +1963,16 @@ struct cxip_rxc_rnr { }; static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, - const void *src, size_t size) + const void *src, size_t size, + bool require_dev_reg_copy) { ssize_t ret __attribute__((unused)); struct iovec iov; + bool dev_reg_copy = require_dev_reg_copy || + (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold); - /* Favor CPU store access instead of relying on HMEM copy functions. */ - if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + /* Favor dev reg access instead of relying on HMEM copy functions. */ + if (dev_reg_copy) { ret = ofi_hmem_dev_reg_copy_to_hmem(md->info.iface, md->handle, dest, src, size); assert(ret == FI_SUCCESS); @@ -1985,13 +1988,16 @@ static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, } static inline void cxip_copy_from_md(struct cxip_md *md, void *dest, - const void *src, size_t size) + const void *src, size_t size, + bool require_dev_reg_copy) { ssize_t ret __attribute__((unused)); struct iovec iov; + bool dev_reg_copy = require_dev_reg_copy || + (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold); - /* Favor CPU store access instead of relying on HMEM copy functions. 
*/ - if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + /* Favor dev reg access instead of relying on HMEM copy functions. */ + if (dev_reg_copy) { ret = ofi_hmem_dev_reg_copy_from_hmem(md->info.iface, md->handle, dest, src, size); @@ -2438,6 +2444,9 @@ struct cxip_ep_obj { struct fi_tx_attr tx_attr; struct fi_rx_attr rx_attr; + /* Require memcpy's via the dev reg APIs. */ + bool require_dev_reg_copy[OFI_HMEM_MAX]; + /* Collectives support */ struct cxip_ep_coll_obj coll; struct cxip_ep_zbcoll_obj zbcoll; @@ -2448,6 +2457,25 @@ struct cxip_ep_obj { struct cxip_portals_table *ptable; }; +int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md); + +static inline void +cxip_ep_obj_copy_to_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + cxip_copy_to_md(md, dest, src, size, + ep->require_dev_reg_copy[md->info.iface]); +} + +static inline void +cxip_ep_obj_copy_from_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + cxip_copy_from_md(md, dest, src, size, + ep->require_dev_reg_copy[md->info.iface]); +} + static inline void cxip_txc_otx_reqs_inc(struct cxip_txc *txc) { assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); @@ -3641,17 +3669,19 @@ cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md, */ if (!cxip_env.fork_safe_requested) { if (!hmem_md) { - ret = cxip_map(domain, hmem_src, size, 0, &hmem_md); + ret = cxip_ep_obj_map(txc->ep_obj, hmem_src, size, 0, + &hmem_md); if (ret) { - TXC_WARN(txc, "cxip_map failed: %d:%s\n", ret, - fi_strerror(-ret)); + TXC_WARN(txc, "cxip_ep_obj_map failed: %d:%s\n", + ret, fi_strerror(-ret)); return ret; } unmap_hmem_md = true; } - cxip_copy_from_md(hmem_md, dest, hmem_src, size); + cxip_ep_obj_copy_from_md(txc->ep_obj, hmem_md, dest, hmem_src, + size); if (unmap_hmem_md) cxip_unmap(hmem_md); diff --git a/prov/cxi/src/cxip_atomic.c 
b/prov/cxi/src/cxip_atomic.c index 49218a324a3..0b8f0d4867b 100644 --- a/prov/cxi/src/cxip_atomic.c +++ b/prov/cxi/src/cxip_atomic.c @@ -612,8 +612,9 @@ static int cxip_amo_emit_idc(struct cxip_txc *txc, if (result_mr) { result_md = result_mr->md; } else { - ret = cxip_map(dom, result, atomic_type_len, 0, - &req->amo.result_md); + ret = cxip_ep_obj_map(txc->ep_obj, result, + atomic_type_len, 0, + &req->amo.result_md); if (ret) { TXC_WARN_RET(txc, ret, "Failed to map result buffer\n"); @@ -930,8 +931,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc, /* Optionally register result MR. */ if (result) { if (!result_mr) { - ret = cxip_map(dom, result, atomic_type_len, 0, - &req->amo.result_md); + ret = cxip_ep_obj_map(txc->ep_obj, result, + atomic_type_len, 0, + &req->amo.result_md); if (ret) { TXC_WARN(txc, "Failed to map result buffer: %d:%s\n", @@ -1017,8 +1019,9 @@ static int cxip_amo_emit_dma(struct cxip_txc *txc, buf_md = buf_mr->md; } else { /* Map user operand buffer for DMA command. 
*/ - ret = cxip_map(dom, buf, atomic_type_len, 0, - &req->amo.oper1_md); + ret = cxip_ep_obj_map(txc->ep_obj, buf, + atomic_type_len, 0, + &req->amo.oper1_md); if (ret) { TXC_WARN(txc, "Failed to map operand buffer: %d:%s\n", diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index 9d9c6d73316..8d503c1c7b0 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -1246,8 +1246,8 @@ static int _coll_add_buffers(struct cxip_coll_pte *coll_pte, size_t size, ret = -FI_ENOMEM; goto out; } - ret = cxip_map(coll_pte->ep_obj->domain, (void *)buf->buffer, - size, 0, &buf->cxi_md); + ret = cxip_ep_obj_map(coll_pte->ep_obj, (void *)buf->buffer, + size, 0, &buf->cxi_md); if (ret) goto del_msg; buf->bufsiz = size; diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index 7be36c0d56d..bc5cc9ead2e 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -1118,6 +1118,15 @@ int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname, *optlen = sizeof(size_t); break; + case FI_OPT_CUDA_API_PERMITTED: + if (!optval || !optlen) + return -FI_EINVAL; + if (*optlen < sizeof(bool)) + return -FI_ETOOSMALL; + + *(bool *)optval = + !ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA]; + break; default: return -FI_ENOPROTOOPT; } @@ -1140,6 +1149,7 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, const void *optval, size_t optlen) { size_t min_multi_recv; + bool cuda_api_permitted; if (level != FI_OPT_ENDPOINT) return -FI_ENOPROTOOPT; @@ -1158,6 +1168,28 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, } ep->ep_obj->rxc->min_multi_recv = min_multi_recv; break; + /* + * If GDRCopy is required by the application (ie. it has set + * FI_OPT_CUDA_API_PERMITTED), and is not available, return not + * supported. 
+ */ + case FI_OPT_CUDA_API_PERMITTED: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + + if (!hmem_ops[FI_HMEM_CUDA].initialized) { + CXIP_WARN("FI_OPT_CUDA_API_PERMITTED cannot be set when CUDA library or CUDA device is not available\n"); + return -FI_EOPNOTSUPP; + } + + cuda_api_permitted = *(bool *)optval; + + if (!cuda_api_permitted && !cuda_is_gdrcopy_enabled()) + return -FI_EOPNOTSUPP; + + ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA] = + !cuda_api_permitted; + break; default: return -FI_ENOPROTOOPT; @@ -1260,6 +1292,12 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, ep_obj->src_addr.pid = pid; ep_obj->fi_addr = FI_ADDR_NOTAVAIL; + /* Default to allowing non-dev reg copy APIs unless the caller + * disables it. + */ + for (i = 0; i < OFI_HMEM_MAX; i++) + ep_obj->require_dev_reg_copy[i] = false; + ofi_atomic_initialize32(&ep_obj->txq_ref, 0); ofi_atomic_initialize32(&ep_obj->tgq_ref, 0); @@ -1332,6 +1370,26 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, return ret; } +int cxip_ep_obj_map(struct cxip_ep_obj *ep, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md) +{ + struct cxip_domain *dom = ep->domain; + int ret; + + ret = cxip_map(dom, buf, len, flags, md); + if (ret != FI_SUCCESS) + return ret; + + if (ep->require_dev_reg_copy[(*md)->info.iface] && + !((*md)->handle_valid)) { + CXIP_WARN("Required dev registration copy failed\n"); + cxip_unmap(*md); + return -FI_EOPNOTSUPP; + } + + return FI_SUCCESS; +} + /* * cxip_endpoint() - Provider fi_endpoint() implementation. */ diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index b52e6d22d1a..34d8ead3576 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -1283,6 +1283,15 @@ static int cxip_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) break; } + /* Zero length MRs do not have MD. 
*/ + if (mr->md && + ep->ep_obj->require_dev_reg_copy[mr->md->info.iface] && + !mr->md->handle_valid) { + CXIP_WARN("Cannot bind to endpoint without required dev reg support\n"); + ret = -FI_EOPNOTSUPP; + break; + } + mr->ep = ep; ofi_atomic_inc32(&ep->ep_obj->ref); break; @@ -1439,6 +1448,10 @@ static int cxip_regattr(struct fid *fid, const struct fi_mr_attr *attr, _mr->mr_fid.key = _mr->key; if (_mr->len) { + /* Do not check whether cuda_api_permitted is set at this point, + * because the mr is not bound to an endpoint. Check instead in + * cxip_mr_bind(). + */ ret = cxip_map(_mr->domain, (void *)_mr->buf, _mr->len, 0, &_mr->md); if (ret) { diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index ef8356943c2..a8309847802 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -60,7 +60,6 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, int (*recv_cb)(struct cxip_req *req, const union c_event *event)) { - struct cxip_domain *dom = rxc->domain; struct cxip_req *req; struct cxip_md *recv_md = NULL; int ret; @@ -79,7 +78,8 @@ int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, if (len) { /* If hybrid descriptor not passed, map for dma */ if (!md) { - ret = cxip_map(dom, (void *)buf, len, 0, &recv_md); + ret = cxip_ep_obj_map(rxc->ep_obj, (void *)buf, len, 0, + &recv_md); if (ret) { RXC_WARN(rxc, "Map of recv buffer failed: %d, %s\n", @@ -718,8 +718,8 @@ int cxip_send_buf_init(struct cxip_req *req) /* Triggered operation always requires memory registration. */ if (req->triggered) - return cxip_map(txc->domain, req->send.buf, req->send.len, 0, - &req->send.send_md); + return cxip_ep_obj_map(txc->ep_obj, req->send.buf, + req->send.len, 0, &req->send.send_md); /* FI_INJECT operations always require an internal bounce buffer. 
This * is needed to replay FI_INJECT operations which may experience flow @@ -777,8 +777,8 @@ int cxip_send_buf_init(struct cxip_req *req) } /* Everything else requires memory registeration. */ - return cxip_map(txc->domain, req->send.buf, req->send.len, 0, - &req->send.send_md); + return cxip_ep_obj_map(txc->ep_obj, req->send.buf, req->send.len, 0, + &req->send.send_md); err_buf_fini: cxip_send_buf_fini(req); diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index c6e0bcc35fd..4980a3fd3b0 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -629,8 +629,9 @@ static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req, /* Copy data out of overflow buffer. */ oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len); - cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf, - oflow_va, oflow_bytes); + cxip_ep_obj_copy_to_md(match_req->recv.rxc->ep_obj, + match_req->recv.recv_md, + match_req->recv.recv_buf, oflow_va, oflow_bytes); if (oflow_req->type == CXIP_REQ_OFLOW) oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength); diff --git a/prov/cxi/src/cxip_msg_rnr.c b/prov/cxi/src/cxip_msg_rnr.c index ec5064a4fe5..7b4415ea1e8 100644 --- a/prov/cxi/src/cxip_msg_rnr.c +++ b/prov/cxi/src/cxip_msg_rnr.c @@ -1174,9 +1174,9 @@ cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, if (send_req->send.len && !idc) { if (!mr) { - ret = cxip_map(txc->domain, send_req->send.buf, - send_req->send.len, 0, - &send_req->send.send_md); + ret = cxip_ep_obj_map(txc->ep_obj, send_req->send.buf, + send_req->send.len, 0, + &send_req->send.send_md); if (ret) { TXC_WARN(txc, "Local buffer map failed: %d %s\n", diff --git a/prov/cxi/src/cxip_ptelist_buf.c b/prov/cxi/src/cxip_ptelist_buf.c index b8ee08a3733..a313ccf0be4 100644 --- a/prov/cxi/src/cxip_ptelist_buf.c +++ b/prov/cxi/src/cxip_ptelist_buf.c @@ -132,8 +132,8 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) } } - ret 
= cxip_map(rxc->base.domain, buf->data, pool->attr.buf_size, - OFI_MR_NOCACHE, &buf->md); + ret = cxip_ep_obj_map(rxc->base.ep_obj, buf->data, pool->attr.buf_size, + OFI_MR_NOCACHE, &buf->md); if (ret) goto err_unreg_buf; diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c index 9aa1ace679f..660c29862de 100644 --- a/prov/cxi/src/cxip_rma.c +++ b/prov/cxi/src/cxip_rma.c @@ -269,7 +269,8 @@ static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, } else { assert(req != NULL); - ret = cxip_map(dom, buf, len, 0, &req->rma.local_md); + ret = cxip_ep_obj_map(txc->ep_obj, buf, len, 0, + &req->rma.local_md); if (ret) { TXC_WARN(txc, "Failed to map buffer: %d:%s\n", ret, fi_strerror(-ret)); diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c index 94bc470ba68..fdbd64af604 100644 --- a/prov/cxi/src/cxip_txc.c +++ b/prov/cxi/src/cxip_txc.c @@ -63,8 +63,8 @@ int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region) struct cxip_md *md; int ret; - ret = cxip_map(txc->domain, region->mem_region, - region->pool->region_size, OFI_MR_NOCACHE, &md); + ret = cxip_ep_obj_map(txc->ep_obj, region->mem_region, + region->pool->region_size, OFI_MR_NOCACHE, &md); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to map inject buffer chunk\n"); return ret; From 7943a1b895fe9aefa852a71ac77ead83bb58543c Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Tue, 19 Nov 2024 12:04:26 -0600 Subject: [PATCH 250/393] prov/cxi: Define FI_CXI_FORCE_DEV_REG_COPY FI_CXI_FORCE_DEV_REG_COPY will force the CXI provider to use the HMEM device register copy routines. If not supported, RDMA operations or memory registration will fail. 
Signed-off-by: Ian Ziemba --- man/fi_cxi.7.md | 4 ++++ prov/cxi/include/cxip.h | 1 + prov/cxi/src/cxip_ep.c | 8 +++++--- prov/cxi/src/cxip_info.c | 7 +++++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 384026f0192..4c6e46d14cc 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -1301,6 +1301,10 @@ The CXI provider checks for the following environment variables: : Max amount of time to poll when LE invalidate disabling an MR configured with MR match events. +*FI_CXI_FORCE_DEV_REG_COPY* +: Force the CXI provider to use the HMEM device register copy routines. If not + supported, RDMA operations or memory registration will fail. + Note: Use the fi_info utility to query provider environment variables: fi_info -p cxi -e diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 4494b9b703a..68ea4c0ce7e 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -324,6 +324,7 @@ struct cxip_environment { int hybrid_unexpected_msg_preemptive; size_t mr_cache_events_disable_poll_nsecs; size_t mr_cache_events_disable_le_poll_nsecs; + int force_dev_reg_copy; }; extern struct cxip_environment cxip_env; diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index bc5cc9ead2e..aebec245ef7 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -1187,8 +1187,10 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, if (!cuda_api_permitted && !cuda_is_gdrcopy_enabled()) return -FI_EOPNOTSUPP; - ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA] = - !cuda_api_permitted; + if (!cxip_env.force_dev_reg_copy) { + ep->ep_obj->require_dev_reg_copy[FI_HMEM_CUDA] = + !cuda_api_permitted; + } break; default: @@ -1296,7 +1298,7 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, * disables it. 
*/ for (i = 0; i < OFI_HMEM_MAX; i++) - ep_obj->require_dev_reg_copy[i] = false; + ep_obj->require_dev_reg_copy[i] = cxip_env.force_dev_reg_copy; ofi_atomic_initialize32(&ep_obj->txq_ref, 0); ofi_atomic_initialize32(&ep_obj->tgq_ref, 0); diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index af94964ab5f..f0da25e315e 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -670,6 +670,7 @@ struct cxip_environment cxip_env = { CXIP_MR_CACHE_EVENTS_DISABLE_POLL_NSECS, .mr_cache_events_disable_le_poll_nsecs = CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS, + .force_dev_reg_copy = false, }; static void cxip_env_init(void) @@ -1288,6 +1289,12 @@ static void cxip_env_init(void) fi_param_get_size_t(&cxip_prov, "mr_cache_events_disable_le_poll_nsecs", &cxip_env.mr_cache_events_disable_le_poll_nsecs); + fi_param_define(&cxip_prov, "force_dev_reg_copy", FI_PARAM_BOOL, + "Force device register copy operations. Default: %d", + cxip_env.force_dev_reg_copy); + fi_param_get_bool(&cxip_prov, "force_dev_reg_copy", + &cxip_env.force_dev_reg_copy); + set_system_page_size(); } From 98529aa0583fff486b24ac8f89b9d81893f778f1 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Tue, 19 Nov 2024 17:30:00 -0600 Subject: [PATCH 251/393] prov/cxi: Add FI_OPT_CUDA_API_PERMITTED tests Signed-off-by: Ian Ziemba --- prov/cxi/test/cuda.c | 149 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/prov/cxi/test/cuda.c b/prov/cxi/test/cuda.c index 53338f60fd6..4776556635a 100644 --- a/prov/cxi/test/cuda.c +++ b/prov/cxi/test/cuda.c @@ -431,3 +431,152 @@ Test(cuda, large_transfer) { cuda_dev_memory_test(LARGE_XFER, 2, false, true); } + +static void verify_dev_reg_eopnotsupp_local_op(void) +{ + void *buf; + cudaError_t cuda_ret; + size_t buf_size = 1024; + int ret; + + cuda_ret = cudaMalloc(&buf, buf_size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + ret = fi_recv(cxit_ep, buf, buf_size, NULL, cxit_ep_fi_addr, 
NULL); + cr_assert_eq(ret, -FI_EOPNOTSUPP, "fi_recv failed: %d", ret); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); +} + +static void verify_dev_reg_eopnotsupp_remote_mr(void) +{ + int ret; + void *buf; + cudaError_t cuda_ret; + size_t buf_size = 1024; + struct fid_mr *fid_mr; + + cuda_ret = cudaMalloc(&buf, buf_size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ, 0, 0x123, 0, + &fid_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(fid_mr, &(cxit_ep->fid), 0); + cr_assert_eq(ret, -FI_EOPNOTSUPP, "fi_mr_bind failed: %d", ret); + + ret = fi_close(&fid_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); +} + +Test(cuda, verify_fi_opt_cuda_api_permitted_local_operation) +{ + int ret; + bool optval = false; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + ret = fi_setopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); + assert(ret == FI_SUCCESS); + + verify_dev_reg_eopnotsupp_local_op(); + + cxit_teardown_msg(); +} + +Test(cuda, verify_fi_opt_cuda_api_permitted_remote_mr) +{ + int ret; + bool optval = false; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + ret = fi_setopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); + assert(ret == FI_SUCCESS); + + verify_dev_reg_eopnotsupp_remote_mr(); + + cxit_teardown_msg(); +} + +Test(cuda, verify_get_fi_opt_cuda_api_permitted) +{ + int ret; + bool optval = false; + size_t size = sizeof(optval); + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + 
cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + ret = fi_setopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, sizeof(optval)); + assert(ret == FI_SUCCESS); + + optval = true; + + ret = fi_getopt(&(cxit_ep->fid), FI_OPT_ENDPOINT, + FI_OPT_CUDA_API_PERMITTED, &optval, &size); + assert(ret == FI_SUCCESS); + + assert(optval == false); + + cxit_teardown_msg(); +} + +Test(cuda, verify_force_dev_reg_local) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_CXI_FORCE_DEV_REG_COPY", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_fi_hints->tx_attr->size = 512; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret != FI_SUCCESS, "ret is: %d\n", ret); + + /* Tear down RMA objects */ + cxit_destroy_ep(); /* EP must be destroyed before bound objects */ + + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_teardown_ep(); +} From afbff48571fb2666be6b46d5170bf5b96b957c1c Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Wed, 20 Nov 2024 19:04:00 +0000 Subject: [PATCH 252/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_cxi.7 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index adf11179bde..528787a6e1e 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,7 +1,7 @@ .\"t .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_cxi" "7" "2024\-10\-31" "Libfabric 
Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cxi" "7" "2024\-11\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -1559,6 +1559,10 @@ events. \f[I]FI_CXI_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS\f[R] Max amount of time to poll when LE invalidate disabling an MR configured with MR match events. +.TP +\f[I]FI_CXI_FORCE_DEV_REG_COPY\f[R] +Force the CXI provider to use the HMEM device register copy routines. +If not supported, RDMA operations or memory registration will fail. .PP Note: Use the fi_info utility to query provider environment variables: fi_info -p cxi -e From 52e3437b96a62489cb603469a29be7bd14d54efd Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Sun, 17 Nov 2024 22:17:39 -0500 Subject: [PATCH 253/393] prov/lnx: Fix various issues with initial commit 1. must pass an ep fid for fi_srx_context to comply with API symantics 2. Don't overwrite attribute flags passed in fi_av_open() by application 3. Enforce the use of FI_AV_TABLE type as the other types have been deprecated 4. 
Check for NULL entry before calling ofi_mr_cache_delete() Signed-off-by: Amir Shehata --- prov/lnx/include/lnx.h | 1 + prov/lnx/src/lnx_av.c | 6 +----- prov/lnx/src/lnx_cq.c | 5 +++-- prov/lnx/src/lnx_domain.c | 8 +++++++- prov/lnx/src/lnx_ep.c | 2 +- prov/lnx/src/lnx_init.c | 5 +++-- prov/lnx/src/lnx_ops.c | 33 ++++++++++++++++++++++----------- 7 files changed, 38 insertions(+), 22 deletions(-) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h index b40c9ea3eca..450324d5d92 100644 --- a/prov/lnx/include/lnx.h +++ b/prov/lnx/include/lnx.h @@ -81,6 +81,7 @@ struct local_prov_ep { struct fid_ep **lpe_txc; struct fid_ep **lpe_rxc; struct fid_av *lpe_av; + struct fid_ep *lpe_srx_ep; struct lnx_peer_cq lpe_cq; struct fi_info *lpe_fi_info; struct fid_peer_srx lpe_srx; diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c index 4e6ac0bebaf..3fbc5b59464 100644 --- a/prov/lnx/src/lnx_av.c +++ b/prov/lnx/src/lnx_av.c @@ -630,11 +630,7 @@ int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, if (attr->name) return -FI_ENOSYS; - if (attr->type != FI_AV_UNSPEC && - attr->type != FI_AV_TABLE) - return -FI_ENOSYS; - - if (attr->type == FI_AV_UNSPEC) + if (attr->type != FI_AV_TABLE) attr->type = FI_AV_TABLE; peer_tbl = calloc(sizeof(*peer_tbl), 1); diff --git a/prov/lnx/src/lnx_cq.c b/prov/lnx/src/lnx_cq.c index 6aebc8f4c5a..03b43a593e9 100644 --- a/prov/lnx/src/lnx_cq.c +++ b/prov/lnx/src/lnx_cq.c @@ -160,11 +160,12 @@ static int lnx_cq_open_core_prov(struct lnx_cq *cq, struct fi_cq_attr *attr) int rc; struct local_prov_ep *ep; struct local_prov *entry; + struct fi_cq_attr peer_attr = {0}; struct dlist_entry *prov_table = &cq->lnx_domain->ld_fabric->local_prov_table; /* tell the core providers to import my CQ */ - attr->flags |= FI_PEER; + peer_attr.flags |= FI_PEER; /* create all the core provider completion queues */ dlist_foreach_container(prov_table, struct local_prov, @@ -181,7 +182,7 @@ static int lnx_cq_open_core_prov(struct lnx_cq *cq, struct 
fi_cq_attr *attr) cq_ctxt.cq = &ep->lpe_cq.lpc_cq; /* pass my CQ into the open and get back the core's cq */ - rc = fi_cq_open(ep->lpe_domain, attr, &core_cq, &cq_ctxt); + rc = fi_cq_open(ep->lpe_domain, &peer_attr, &core_cq, &cq_ctxt); if (rc) return rc; diff --git a/prov/lnx/src/lnx_domain.c b/prov/lnx/src/lnx_domain.c index 1d898319225..f1b055f4a88 100644 --- a/prov/lnx/src/lnx_domain.c +++ b/prov/lnx/src/lnx_domain.c @@ -72,6 +72,11 @@ static int lnx_cleanup_domains(struct local_prov *prov) struct local_prov_ep, ep, entry) { if (!ep->lpe_domain) continue; + + rc = fi_close(&ep->lpe_srx_ep->fid); + if (rc) + frc = rc; + rc = fi_close(&ep->lpe_domain->fid); if (rc) frc = rc; @@ -463,7 +468,8 @@ static int lnx_open_core_domains(struct local_prov *prov, if (!rc && srq_support) { ep->lpe_srx.owner_ops = &lnx_srx_ops; peer_srx.srx = &ep->lpe_srx; - rc = fi_srx_context(ep->lpe_domain, &attr, NULL, &peer_srx); + rc = fi_srx_context(ep->lpe_domain, &attr, + &ep->lpe_srx_ep, &peer_srx); } /* if one of the constituent endpoints doesn't support shared diff --git a/prov/lnx/src/lnx_ep.c b/prov/lnx/src/lnx_ep.c index cd4b83d099f..6590a6056d9 100644 --- a/prov/lnx/src/lnx_ep.c +++ b/prov/lnx/src/lnx_ep.c @@ -119,7 +119,7 @@ static int lnx_enable_core_eps(struct lnx_ep *lep) struct local_prov_ep, ep, entry) { if (srq_support) { rc = fi_ep_bind(ep->lpe_ep, - &ep->lpe_srx.ep_fid.fid, 0); + &ep->lpe_srx_ep->fid, 0); if (rc) { FI_INFO(&lnx_prov, FI_LOG_CORE, "%s doesn't support SRX (%d)\n", diff --git a/prov/lnx/src/lnx_init.c b/prov/lnx/src/lnx_init.c index 94c7a7e14cd..d1377a0dd9d 100644 --- a/prov/lnx/src/lnx_init.c +++ b/prov/lnx/src/lnx_init.c @@ -105,7 +105,7 @@ struct fi_domain_attr lnx_domain_attr = { .control_progress = FI_PROGRESS_AUTO, .data_progress = FI_PROGRESS_AUTO, .resource_mgmt = FI_RM_ENABLED, - .av_type = FI_AV_UNSPEC, + .av_type = FI_AV_TABLE, .mr_mode = FI_MR_RAW, .mr_key_size = SIZE_MAX, .cq_data_size = SIZE_MAX, @@ -410,6 +410,7 @@ static int 
lnx_form_info(struct fi_info *fi, struct fi_info **out) rc = -FI_ENOMEM; goto fail; } + r->domain_attr->av_type = FI_AV_TABLE; meta->lnx_rep = r; meta->lnx_link = fi; if (r->tx_attr) @@ -531,7 +532,7 @@ int lnx_getinfo_helper(uint32_t version, char *prov, struct fi_info *lnx_hints) lnx_hints->domain_attr->mr_mode |= (FI_MR_VIRT_ADDR | FI_MR_HMEM | FI_MR_PROV_KEY); } - rc = fi_getinfo(version, NULL, NULL, OFI_GETINFO_INTERNAL, + rc = fi_getinfo(version, NULL, NULL, OFI_GETINFO_HIDDEN, lnx_hints, &core_info); lnx_hints->fabric_attr->prov_name = orig_prov_name; diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c index 3750e27f2a6..7d94b7c9352 100644 --- a/prov/lnx/src/lnx_ops.c +++ b/prov/lnx/src/lnx_ops.c @@ -574,7 +574,8 @@ ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, rc = fi_tsend(cep->lpe_ep, buf, len, mem_desc, core_addr, tag, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); return rc; } @@ -607,7 +608,8 @@ ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, rc = fi_tsendv(cep->lpe_ep, iov, &mem_desc, count, core_addr, tag, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); return rc; } @@ -648,7 +650,8 @@ ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, rc = fi_tsendmsg(cep->lpe_ep, &core_msg, flags); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); return rc; } @@ -681,7 +684,8 @@ ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, rc = fi_tinject(cep->lpe_ep, buf, len, core_addr, tag); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); return rc; } @@ -717,7 +721,8 @@ ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t 
len, void *desc rc = fi_tsenddata(cep->lpe_ep, buf, len, mem_desc, data, core_addr, tag, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); return rc; } @@ -750,7 +755,8 @@ ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, rc = fi_tinjectdata(cep->lpe_ep, buf, len, data, core_addr, tag); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); return rc; } @@ -792,7 +798,8 @@ lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, rc = fi_read(core_ep, buf, len, mem_desc, core_addr, addr, key, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); out: return rc; } @@ -834,7 +841,8 @@ lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, rc = fi_write(core_ep, buf, len, mem_desc, core_addr, addr, key, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); out: return rc; } @@ -878,7 +886,8 @@ lnx_atomic_write(struct fid_ep *ep, rc = fi_atomic(core_ep, buf, count, mem_desc, core_addr, addr, key, datatype, op, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); out: return rc; } @@ -924,7 +933,8 @@ lnx_atomic_readwrite(struct fid_ep *ep, result, mem_desc, core_addr, addr, key, datatype, op, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); out: return rc; } @@ -971,7 +981,8 @@ lnx_atomic_compwrite(struct fid_ep *ep, compare, compare_desc, result, mem_desc, core_addr, addr, key, datatype, op, context); - ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); + if (mre) + ofi_mr_cache_delete(&lep->le_domain->ld_mr_cache, mre); out: 
return rc; From b987c16724a066ac6d7d6a880c80c50170c18085 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Sun, 17 Nov 2024 23:21:48 -0500 Subject: [PATCH 254/393] xpmem: Cleanup xpmem before monitors Since xpmem installs memory monitors, these need to be cleaned up before cleaning up the monitors to avoid an assert in debug mode. Signed-off-by: Amir Shehata --- src/fabric.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fabric.c b/src/fabric.c index 13b529ea95c..c9eca76ae7e 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -1030,9 +1030,9 @@ FI_DESTRUCTOR(fi_fini(void)) } ofi_free_filter(&prov_filter); + ofi_shm_p2p_cleanup(); ofi_monitors_cleanup(); ofi_hmem_cleanup(); - ofi_shm_p2p_cleanup(); ofi_hook_fini(); ofi_mem_fini(); fi_log_fini(); From 8cf1e53ad7e6ca7444d63cf086a02aec82f8e6da Mon Sep 17 00:00:00 2001 From: Nikhil Nanal Date: Mon, 18 Nov 2024 16:10:18 -0800 Subject: [PATCH 255/393] prov/sockets: Fixed coverity issue for unchecked return value. Added a error logging print in sock_av_report_success to report failures if the sock_eq_report_event fails. 
Signed-off-by: Nikhil Nanal --- prov/sockets/src/sock_av.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/prov/sockets/src/sock_av.c b/prov/sockets/src/sock_av.c index 6e799a2a1d8..71003e2d8d6 100644 --- a/prov/sockets/src/sock_av.c +++ b/prov/sockets/src/sock_av.c @@ -123,8 +123,11 @@ static inline void sock_av_report_success(struct sock_av *av, void *context, eq_entry.fid = &av->av_fid.fid; eq_entry.context = context; eq_entry.data = num_done; - sock_eq_report_event(av->eq, FI_AV_COMPLETE, - &eq_entry, sizeof(eq_entry), flags); + if (sock_eq_report_event(av->eq, FI_AV_COMPLETE, + &eq_entry, sizeof(eq_entry), flags)) + SOCK_LOG_ERROR("Error in writing to EQ\n"); + + } static void sock_av_report_error(struct sock_av *av, fi_addr_t *fi_addr, From d4b7477d8091d49b4c1e91954060111106ee6809 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 21 Nov 2024 11:57:37 -0800 Subject: [PATCH 256/393] prov/lnx: fix av strncpy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix compiler error with strncpy: prov/lnx/src/lnx_av.c:272:4: warning: ‘strncpy’ specified bound 64 equals destination size [-Wstringop-truncation] strncpy(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX); Both dest and srouce are FI_NAME_MAX but strncpy is complaining that it doesn't have space for the NULL terminator if it needs to be truncated. The terminator should already be in the prov_name passed in. 
Just turn this into a memcpy to make the compiler happy Signed-off-by: Alexia Ingerson --- prov/lnx/src/lnx_av.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c index 3fbc5b59464..f0b8d09fb86 100644 --- a/prov/lnx/src/lnx_av.c +++ b/prov/lnx/src/lnx_av.c @@ -269,7 +269,8 @@ static int lnx_get_or_create_peer_prov(struct dlist_entry *prov_table, dlist_init(&peer_prov->entry); dlist_init(&peer_prov->lpp_map); - strncpy(peer_prov->lpp_prov_name, prov_name, FI_NAME_MAX); + memcpy(peer_prov->lpp_prov_name, prov_name, + FI_NAME_MAX); peer_prov->lpp_prov = entry; From ad80329e5b11c953b388487246be3a54f611a48f Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 19 Nov 2024 16:27:06 -0800 Subject: [PATCH 257/393] prov/efa: Remove efa_send_wr, send_wr_pool and recv_wr_pool from dgram_ep These fields are not being used by dgram_ep any more. Signed-off-by: Jessie Yang --- prov/efa/src/dgram/efa_dgram_ep.c | 14 -------------- prov/efa/src/dgram/efa_dgram_ep.h | 16 ---------------- 2 files changed, 30 deletions(-) diff --git a/prov/efa/src/dgram/efa_dgram_ep.c b/prov/efa/src/dgram/efa_dgram_ep.c index 4f43807035a..635d5e7a9b6 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.c +++ b/prov/efa/src/dgram/efa_dgram_ep.c @@ -63,8 +63,6 @@ static int efa_dgram_ep_close(fid_t fid) ep = container_of(fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid.fid); - ofi_bufpool_destroy(ep->recv_wr_pool); - ofi_bufpool_destroy(ep->send_wr_pool); efa_dgram_ep_destroy(ep); return 0; @@ -444,16 +442,6 @@ int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, */ assert(user_info->tx_attr->iov_limit <= 2); - ret = ofi_bufpool_create(&ep->send_wr_pool, - sizeof(struct efa_send_wr), 16, 0, 1024, 0); - if (ret) - goto err_ep_destroy; - - ret = ofi_bufpool_create(&ep->recv_wr_pool, - sizeof(struct efa_recv_wr), 16, 0, 1024, 0); - if (ret) - goto err_send_wr_destroy; - ep->base_ep.domain = domain; *ep_fid = 
&ep->base_ep.util_ep.ep_fid; @@ -468,8 +456,6 @@ int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, return 0; -err_send_wr_destroy: - ofi_bufpool_destroy(ep->send_wr_pool); err_ep_destroy: efa_dgram_ep_destroy(ep); return ret; diff --git a/prov/efa/src/dgram/efa_dgram_ep.h b/prov/efa/src/dgram/efa_dgram_ep.h index ecc8f1772dd..b01db81f57e 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.h +++ b/prov/efa/src/dgram/efa_dgram_ep.h @@ -11,24 +11,8 @@ struct efa_dgram_ep { struct efa_dgram_cq *rcq; struct efa_dgram_cq *scq; - - struct ofi_bufpool *send_wr_pool; - struct ofi_bufpool *recv_wr_pool; }; -struct efa_send_wr { - /** @brief Work request struct used by rdma-core */ - struct ibv_send_wr wr; - - /** @brief Scatter gather element array - * - * @details - * EFA device supports a maximum of 2 iov/SGE - */ - struct ibv_sge sge[2]; -}; - - int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *info, struct fid_ep **ep_fid, void *context); From d5d8694a205947d87bd9775cb835a34d5b1b8bc2 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 20 Nov 2024 16:23:42 -0800 Subject: [PATCH 258/393] prov/efa: Fix efa_msg flags Need to use the flags of util_ep for CQ, which includes FI_COMPLETION flag for FI_TRANSMIT and FI_RECV unless FI_SELECTIVE_COMPLETION is set. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_base_ep.h | 3 +++ prov/efa/src/efa_msg.c | 32 +++++++++----------------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index f970fe3aba9..820ced150c2 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -15,6 +15,9 @@ #define EFA_QP_DEFAULT_SERVICE_LEVEL 0 #define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8 +#define efa_rx_flags(efa_base_ep) ((efa_base_ep)->util_ep.rx_op_flags) +#define efa_tx_flags(efa_base_ep) ((efa_base_ep)->util_ep.tx_op_flags) + struct efa_qp { struct ibv_qp *ibv_qp; struct ibv_qp_ex *ibv_qp_ex; diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index e03ca5aeebe..bbef0eb0569 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -151,7 +151,7 @@ static ssize_t efa_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, u { struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - return efa_post_recv(base_ep, msg, flags); + return efa_post_recv(base_ep, msg, flags | base_ep->util_ep.rx_msg_flags); } static ssize_t efa_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, @@ -164,7 +164,7 @@ static ssize_t efa_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, &desc, 1, src_addr, context, 0); - return efa_post_recv(base_ep, &msg, 0); + return efa_post_recv(base_ep, &msg, efa_rx_flags(base_ep)); } static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, @@ -175,7 +175,7 @@ static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void EFA_SETUP_MSG(msg, iov, desc, count, src_addr, context, 0); - return efa_post_recv(base_ep, &msg, 0); + return efa_post_recv(base_ep, &msg, efa_rx_flags(base_ep)); } static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) @@ -266,7 +266,7 @@ static ssize_t 
efa_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, u { struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - return efa_post_send(base_ep, msg, flags); + return efa_post_send(base_ep, msg, flags | base_ep->util_ep.tx_msg_flags); } static ssize_t efa_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, @@ -275,13 +275,11 @@ static ssize_t efa_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); struct fi_msg msg; struct iovec iov; - uint64_t flags; EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, 0); - flags = base_ep->info->tx_attr->op_flags; - return efa_post_send(base_ep, &msg, flags); + return efa_post_send(base_ep, &msg, efa_tx_flags(base_ep)); } static ssize_t efa_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, @@ -290,14 +288,11 @@ static ssize_t efa_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t le struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); struct fi_msg msg; struct iovec iov; - uint64_t flags; EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, data); - flags = base_ep->info->tx_attr->op_flags | FI_REMOTE_CQ_DATA; - - return efa_post_send(base_ep, &msg, flags); + return efa_post_send(base_ep, &msg, efa_tx_flags(base_ep) | FI_REMOTE_CQ_DATA); } static ssize_t efa_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, @@ -305,13 +300,10 @@ static ssize_t efa_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void { struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); struct fi_msg msg; - uint64_t flags; EFA_SETUP_MSG(msg, iov, desc, count, dest_addr, context, 0); - flags = base_ep->info->tx_attr->op_flags; - - return efa_post_send(base_ep, &msg, flags); + return efa_post_send(base_ep, &msg, efa_tx_flags(base_ep)); } 
static ssize_t efa_ep_msg_inject(struct fid_ep *ep_fid, const void *buf, size_t len, @@ -320,16 +312,13 @@ static ssize_t efa_ep_msg_inject(struct fid_ep *ep_fid, const void *buf, size_t struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); struct fi_msg msg; struct iovec iov; - uint64_t flags; assert(len <= base_ep->domain->device->efa_attr.inline_buf_size); EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, NULL, 1, dest_addr, NULL, 0); - flags = base_ep->info->tx_attr->op_flags | FI_INJECT; - - return efa_post_send(base_ep, &msg, flags); + return efa_post_send(base_ep, &msg, FI_INJECT); } static ssize_t efa_ep_msg_injectdata(struct fid_ep *ep_fid, const void *buf, @@ -339,16 +328,13 @@ static ssize_t efa_ep_msg_injectdata(struct fid_ep *ep_fid, const void *buf, struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); struct fi_msg msg; struct iovec iov; - uint64_t flags; assert(len <= base_ep->domain->device->efa_attr.inline_buf_size); EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, NULL, 1, dest_addr, NULL, data); - flags = base_ep->info->tx_attr->op_flags | FI_REMOTE_CQ_DATA | FI_INJECT; - - return efa_post_send(base_ep, &msg, flags); + return efa_post_send(base_ep, &msg, FI_REMOTE_CQ_DATA | FI_INJECT); } struct fi_ops_msg efa_msg_ops = { From f12f5ea57d8c95621fe8ed92e34164b71d430a70 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 14 Nov 2024 12:10:22 -0800 Subject: [PATCH 259/393] prov/efa: Implement the rma interface Rename efa_dgram_rma.c to efa_rma.c and move it to prov/efa/src as a common RMA interface for both rdm and dgram ep type. Update that dgram does not support rma. Implement rdma write and inject. Support inline rdma write. 
Signed-off-by: Jessie Yang --- libfabric.vcxproj | 2 +- prov/efa/Makefile.include | 2 +- prov/efa/src/dgram/efa_dgram_rma.c | 148 ---------- prov/efa/src/efa_base_ep.c | 10 +- prov/efa/src/efa_rma.c | 409 ++++++++++++++++++++++++++++ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 7 +- 6 files changed, 421 insertions(+), 157 deletions(-) delete mode 100644 prov/efa/src/dgram/efa_dgram_rma.c create mode 100644 prov/efa/src/efa_rma.c diff --git a/libfabric.vcxproj b/libfabric.vcxproj index e85229c9f1a..3eef3ef0521 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -885,9 +885,9 @@ + - diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index e5961cb13d5..81e0fab0aed 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -48,9 +48,9 @@ _efa_files = \ prov/efa/src/efa_env.c \ prov/efa/src/efa_cntr.c \ prov/efa/src/efa_msg.c \ + prov/efa/src/efa_rma.c \ prov/efa/src/dgram/efa_dgram_ep.c \ prov/efa/src/dgram/efa_dgram_cq.c \ - prov/efa/src/dgram/efa_dgram_rma.c \ prov/efa/src/rdm/efa_rdm_peer.c \ prov/efa/src/rdm/efa_rdm_cq.c \ prov/efa/src/rdm/efa_rdm_ep_utils.c \ diff --git a/prov/efa/src/dgram/efa_dgram_rma.c b/prov/efa/src/dgram/efa_dgram_rma.c deleted file mode 100644 index 99f4c1a2929..00000000000 --- a/prov/efa/src/dgram/efa_dgram_rma.c +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include -#include -#include -#include -#include "efa_dgram_ep.h" -#include "efa.h" -#include "efa_av.h" - - -/* - * efa_dgram_rma_post_read() will post a read request. - * - * Input: - * ep: endpoint - * msg: read operation information - * flags: currently no flags is taken - * self_comm: indicate whether the read is toward - * the end point itself. If self_comm is true, - * caller must set msg->addr to FI_ADDR_NOTAVAIL. 
- * - * On success return 0, - * If read iov and rma_iov count out of device limit, return -FI_EINVAL - * If read failed, return the error of read operation - */ -ssize_t efa_dgram_rma_post_read(struct efa_dgram_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags, bool self_comm) -{ - struct efa_qp *qp; - struct efa_mr *efa_mr; - struct efa_conn *conn; -#ifndef _WIN32 - struct ibv_sge sge_list[msg->iov_count]; -#else - /* MSVC compiler does not support array declarations with runtime size, so hardcode - * the expected iov_limit/max_sq_sge from the lower-level efa provider. - */ - struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; -#endif - int i; - - if (OFI_UNLIKELY(msg->iov_count > ep->base_ep.domain->device->ibv_attr.max_sge_rd)) { - EFA_WARN(FI_LOG_CQ, "invalid iov_count!\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(msg->rma_iov_count > ep->base_ep.domain->info->tx_attr->rma_iov_limit)) { - EFA_WARN(FI_LOG_CQ, "invalid rma_iov_count!\n"); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(ofi_total_iov_len(msg->msg_iov, msg->iov_count) - > ep->base_ep.domain->device->max_rdma_size)) { - EFA_WARN(FI_LOG_CQ, "maximum rdma_size exceeded!\n"); - return -FI_EINVAL; - } - - /* caller must provide desc because EFA require FI_MR_LOCAL */ - assert(msg->desc); - - /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ - qp = ep->base_ep.qp; - ibv_wr_start(qp->ibv_qp_ex); - qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; - ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); - - for (i = 0; i < msg->iov_count; ++i) { - sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; - sge_list[i].length = msg->msg_iov[i].iov_len; - assert(msg->desc[i]); - efa_mr = (struct efa_mr *)msg->desc[i]; - sge_list[i].lkey = efa_mr->ibv_mr->lkey; - } - - ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); - if (self_comm) { - assert(msg->addr == FI_ADDR_NOTAVAIL); - ibv_wr_set_ud_addr(qp->ibv_qp_ex, ep->base_ep.self_ah, - qp->qp_num, qp->qkey); - } 
else { - conn = efa_av_addr_to_conn(ep->base_ep.av, msg->addr); - assert(conn && conn->ep_addr); - ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, - conn->ep_addr->qpn, conn->ep_addr->qkey); - } - - return ibv_wr_complete(qp->ibv_qp_ex); -} - -static -ssize_t efa_dgram_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) -{ - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - - return efa_dgram_rma_post_read(ep, msg, flags, false); -} - -static -ssize_t efa_dgram_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t iov_count, fi_addr_t src_addr, uint64_t addr, - uint64_t key, void *context) -{ - struct fi_rma_iov rma_iov; - struct fi_msg_rma msg; - - rma_iov.addr = addr; - rma_iov.len = ofi_total_iov_len(iov, iov_count); - rma_iov.key = key; - - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = iov_count; - msg.addr = src_addr; - msg.context = context; - msg.rma_iov = &rma_iov; - msg.rma_iov_count = 1; - - return efa_dgram_rma_readmsg(ep, &msg, 0); -} - -static -ssize_t efa_dgram_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t addr, uint64_t key, - void *context) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - return efa_dgram_rma_readv(ep, &iov, &desc, 1, src_addr, addr, key, context); -} - -struct fi_ops_rma efa_dgram_ep_rma_ops = { - .size = sizeof(struct fi_ops_rma), - .read = efa_dgram_rma_read, - .readv = efa_dgram_rma_readv, - .readmsg = efa_dgram_rma_readmsg, - .write = fi_no_rma_write, - .writev = fi_no_rma_writev, - .writemsg = fi_no_rma_writemsg, - .inject = fi_no_rma_inject, - .writedata = fi_no_rma_writedata, - .injectdata = fi_no_rma_injectdata, -}; - diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 55997a3cfe6..7e7b6b4a910 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -186,6 +186,12 @@ 
int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, init_attr_ex); } else { assert(init_attr_ex->qp_type == IBV_QPT_DRIVER); + if (efa_device_support_rdma_read()) + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; + if (efa_device_support_rdma_write()) { + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; + init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; + } #if HAVE_CAPS_UNSOLICITED_WRITE_RECV if (efa_rdm_use_unsolicited_write_recv()) efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; @@ -362,7 +368,9 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, base_ep->max_msg_size = info->ep_attr->max_msg_size; base_ep->max_rma_size = info->ep_attr->max_msg_size; base_ep->inject_msg_size = info->tx_attr->inject_size; - base_ep->inject_rma_size = info->tx_attr->inject_size; + /* TODO: update inject_rma_size to inline size after firmware + * supports inline rdma write */ + base_ep->inject_rma_size = 0; return 0; } diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c new file mode 100644 index 00000000000..468ea2e1f76 --- /dev/null +++ b/prov/efa/src/efa_rma.c @@ -0,0 +1,409 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include +#include +#include +#include +#include "efa.h" +#include "efa_av.h" + +#define EFA_SETUP_IOV(iov, buf, len) \ + do { \ + iov.iov_base = (void *) buf; \ + iov.iov_len = (size_t) len; \ + } while (0) + +#define EFA_SETUP_RMA_IOV(rma_iov, _addr, _len, _key) \ + do { \ + rma_iov.addr = (uint64_t) _addr; \ + rma_iov.len = (size_t) _len; \ + rma_iov.key = (uint64_t) _key; \ + } while (0) + +#define EFA_SETUP_MSG_RMA(msg, iov, _desc, count, _addr, _rma_iov, \ + _rma_iov_count, _context, _data) \ + do { \ + msg.msg_iov = (const struct iovec *) iov; \ + msg.desc = (void **) _desc; \ + msg.iov_count = (size_t) count; \ + msg.addr = (fi_addr_t) _addr; \ + msg.rma_iov = (const struct fi_rma_iov *) _rma_iov; \ + msg.rma_iov_count = (size_t) _rma_iov_count; \ + msg.context = (void *) _context; \ + msg.data = (uint32_t) _data; \ + } while (0) + +/** + * @brief check whether endpoint was configured with FI_RMA capability + * @return -FI_EOPNOTSUPP if FI_RMA wasn't requested, 0 if it was. + */ +static inline int efa_rma_check_cap(struct efa_base_ep *base_ep) { + if ((base_ep->info->caps & FI_RMA) == FI_RMA) + return 0; + EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_RMA capability, which was not requested.\n"); + return -FI_EOPNOTSUPP; +} + +/* + * efa_rma_post_read() will post a read request. + * + * Input: + * base_ep: endpoint + * msg: read operation information + * flags: currently no flags is taken + * + * On success return 0, + * If read failed, return the error of read operation + */ +static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + struct efa_qp *qp; + struct efa_mr *efa_mr; + struct efa_conn *conn; +#ifndef _WIN32 + struct ibv_sge sge_list[msg->iov_count]; +#else + /* MSVC compiler does not support array declarations with runtime size, so hardcode + * the expected iov_limit/max_sq_sge from the lower-level efa provider. 
+ */ + struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; +#endif + int i, err = 0; + + assert(msg->iov_count > 0 && + msg->iov_count <= base_ep->domain->info->tx_attr->iov_limit); + assert(msg->rma_iov_count > 0 && + msg->rma_iov_count <= base_ep->domain->info->tx_attr->rma_iov_limit); + assert(ofi_total_iov_len(msg->msg_iov, msg->iov_count) <= + base_ep->domain->device->max_rdma_size); + + qp = base_ep->qp; + if (!base_ep->is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + base_ep->is_wr_started = true; + } + qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; + + /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ + ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); + + for (i = 0; i < msg->iov_count; ++i) { + sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; + sge_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + efa_mr = (struct efa_mr *)msg->desc[i]; + sge_list[i].lkey = efa_mr->ibv_mr->lkey; + } + + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); + + conn = efa_av_addr_to_conn(base_ep->av, msg->addr); + assert(conn && conn->ep_addr); + ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, + conn->ep_addr->qkey); + + if (!(flags & FI_MORE)) { + err = ibv_wr_complete(qp->ibv_qp_ex); + base_ep->is_wr_started = false; + } + if (OFI_UNLIKELY(err)) + return err; + + return 0; +} + +static +ssize_t efa_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) +{ + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + return efa_rma_post_read(base_ep, msg, flags | base_ep->util_ep.tx_msg_flags); +} + +static +ssize_t efa_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t iov_count, fi_addr_t src_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; 
+ struct efa_base_ep *base_ep; + size_t len; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + len = ofi_total_iov_len(iov, iov_count); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, iov, desc, iov_count, src_addr, &rma_iov, 1, + context, 0); + + return efa_rma_post_read(base_ep, &msg, efa_tx_flags(base_ep)); +} + +static +ssize_t efa_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, + void *context) +{ + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->max_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, &desc, 1, src_addr, &rma_iov, 1, context, 0); + + return efa_rma_post_read(base_ep, &msg, efa_tx_flags(base_ep)); +} + +/** + * @brief Post a WRITE request + * + * Input: + * base_ep: endpoint + * msg: read operation information + * flags: flags passed + * @return On success return 0, otherwise return a negative libfabric error code. + */ +static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + struct efa_qp *qp; + struct efa_conn *conn; +#ifndef _WIN32 + struct ibv_sge sge_list[msg->iov_count]; + struct ibv_data_buf inline_data_list[msg->iov_count]; +#else + /* MSVC compiler does not support array declarations with runtime size, so hardcode + * the expected iov_limit/max_sq_sge from the lower-level efa provider. 
+ */ + struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; + struct ibv_data_buf inline_data_list[EFA_DEV_ATTR_MAX_WR_SGE]; +#endif + size_t len; + int i, err = 0; + + qp = base_ep->qp; + if (!base_ep->is_wr_started) { + ibv_wr_start(qp->ibv_qp_ex); + base_ep->is_wr_started = true; + } + qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; + + if (flags & FI_REMOTE_CQ_DATA) { + ibv_wr_rdma_write_imm(qp->ibv_qp_ex, msg->rma_iov[0].key, + msg->rma_iov[0].addr, msg->data); + } else { + ibv_wr_rdma_write(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); + } + + len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + if (len <= base_ep->domain->device->efa_attr.inline_buf_size && + len <= base_ep->inject_rma_size && + (!msg->desc || !efa_mr_is_hmem(msg->desc[0]))) { + for (i = 0; i < msg->iov_count; i++) { + inline_data_list[i].addr = msg->msg_iov[i].iov_base; + inline_data_list[i].length = msg->msg_iov[i].iov_len; + } + ibv_wr_set_inline_data_list(qp->ibv_qp_ex, msg->iov_count, inline_data_list); + } else { + for (i = 0; i < msg->iov_count; ++i) { + sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; + sge_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + sge_list[i].lkey = ((struct efa_mr *)msg->desc[i])->ibv_mr->lkey; + } + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); + } + + conn = efa_av_addr_to_conn(base_ep->av, msg->addr); + assert(conn && conn->ep_addr); + ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, + conn->ep_addr->qkey); + + if (!(flags & FI_MORE)) { + err = ibv_wr_complete(qp->ibv_qp_ex); + base_ep->is_wr_started = false; + } + + if (OFI_UNLIKELY(err)) + return err; + + return 0; +} + +ssize_t efa_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, + uint64_t flags) +{ + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + return 
efa_rma_post_write(base_ep, msg, flags | base_ep->util_ep.tx_msg_flags); +} + +ssize_t efa_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, + void **desc, size_t iov_count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + size_t len; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + len = ofi_total_iov_len(iov, iov_count); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, iov, desc, iov_count, dest_addr, &rma_iov, 1, + context, 0); + + return efa_rma_post_write(base_ep, &msg, efa_tx_flags(base_ep)); +} + +ssize_t efa_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->max_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, &desc, 1, dest_addr, &rma_iov, 1, context, 0); + + return efa_rma_post_write(base_ep, &msg, efa_tx_flags(base_ep)); +} + +ssize_t efa_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->max_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, 
&iov, &desc, 1, dest_addr, &rma_iov, 1, context, data); + + return efa_rma_post_write(base_ep, &msg, FI_REMOTE_CQ_DATA | efa_tx_flags(base_ep)); +} + +ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t addr, uint64_t key) +{ + struct fi_msg_rma msg; + struct iovec iov; + struct fi_rma_iov rma_iov; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->inject_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, 0); + + return efa_rma_post_write(base_ep, &msg, FI_INJECT); +} + +ssize_t efa_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key) +{ + struct fi_msg_rma msg; + struct iovec iov; + struct fi_rma_iov rma_iov; + struct efa_base_ep *base_ep; + int err; + + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + assert(len <= base_ep->inject_rma_size); + err = efa_rma_check_cap(base_ep); + if (err) + return err; + + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, data); + + return efa_rma_post_write(base_ep, &msg, FI_INJECT | FI_REMOTE_CQ_DATA); +} + +struct fi_ops_rma efa_dgram_ep_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = fi_no_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = fi_no_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .writedata = fi_no_rma_writedata, + .injectdata = fi_no_rma_injectdata, +}; + +struct fi_ops_rma efa_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = efa_rma_read, + .readv = efa_rma_readv, + .readmsg = 
efa_rma_readmsg, + .write = efa_rma_write, + .writev = efa_rma_writev, + .writemsg = efa_rma_writemsg, + .inject = efa_rma_inject_write, + .writedata = efa_rma_writedata, + .injectdata = efa_rma_inject_writedata, +}; diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index d8a1a3fc5e9..98e1d0b4375 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -26,12 +26,6 @@ void efa_rdm_ep_construct_ibv_qp_init_attr_ex(struct efa_rdm_ep *ep, attr_ex->cap.max_recv_sge = ep->base_ep.domain->device->rdm_info->rx_attr->iov_limit; attr_ex->cap.max_inline_data = ep->base_ep.domain->device->efa_attr.inline_buf_size; attr_ex->qp_type = IBV_QPT_DRIVER; - if (efa_device_support_rdma_read()) - attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; - if (efa_device_support_rdma_write()) { - attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; - attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; - } attr_ex->pd = efa_rdm_ep_domain(ep)->ibv_pd; attr_ex->qp_context = ep; attr_ex->sq_sig_all = 1; @@ -564,6 +558,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->max_atomic_size = info->ep_attr->max_msg_size; efa_rdm_ep->inject_tagged_size = info->tx_attr->inject_size; efa_rdm_ep->inject_atomic_size = info->tx_attr->inject_size; + efa_rdm_ep->base_ep.inject_rma_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); From 5713d82e3f33b8066ff9d210862c6b008d28bf22 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 21 Nov 2024 23:05:02 +0000 Subject: [PATCH 260/393] prov/efa: Adjust the location of tracepoint send_begin_msg_context should be moved to the beginning of generic_send as msg is already constructed at that time, before the txe 
construction. Such moving allows us to measure the delay of the txe allocation and construction. Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_msg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 615b3bb47bc..ad65781142e 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -163,6 +163,9 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee struct efa_rdm_ope *txe; struct util_srx_ctx *srx_ctx; + efa_rdm_tracepoint(send_begin_msg_context, + (size_t) msg->context, (size_t) msg->addr); + srx_ctx = efa_rdm_ep_get_peer_srx_ctx(ep); assert(msg->iov_count <= ep->base_ep.info->tx_attr->iov_limit); @@ -192,8 +195,6 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee efa_rdm_tracepoint(send_begin, txe->msg_id, (size_t) txe->cq_entry.op_context, txe->total_len); - efa_rdm_tracepoint(send_begin_msg_context, - (size_t) msg->context, (size_t) msg->addr); err = efa_rdm_msg_post_rtm(ep, txe); if (OFI_UNLIKELY(err)) { From e468f3168267365b20e33888c33fd5d0f67dd9be Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 22 Nov 2024 03:42:28 +0000 Subject: [PATCH 261/393] prov/efa: Add tracepoints for rma operations Add tracepoints for read and write, including: - efa_rdm:read/write_msg_context_begin: the tp when rma operations begins - efa:post_read/write: the tp when rma requests are posted to efa device Signed-off-by: Shi Jin --- prov/efa/src/efa_tp.h | 18 ++++++++++++++++++ prov/efa/src/efa_tp_def.h | 10 ++++++++++ prov/efa/src/rdm/efa_rdm_pke.c | 8 ++++++++ prov/efa/src/rdm/efa_rdm_rma.c | 7 +++++++ prov/efa/src/rdm/efa_rdm_tracepoint_def.h | 10 ++++++++++ 5 files changed, 53 insertions(+) diff --git a/prov/efa/src/efa_tp.h b/prov/efa/src/efa_tp.h index ec3ce8ebc47..dd2f32f79fb 100644 --- a/prov/efa/src/efa_tp.h +++ b/prov/efa/src/efa_tp.h @@ -43,6 +43,24 @@ static inline void 
efa_tracepoint_wr_id_post_recv(const void *wr_id) efa_tracepoint(post_recv, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); } +static inline void efa_tracepoint_wr_id_post_read(const void *wr_id) +{ + struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; + struct efa_rdm_ope *ope = pkt_entry->ope; + if (!ope) + return; + efa_tracepoint(post_read, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); +} + +static inline void efa_tracepoint_wr_id_post_write(const void *wr_id) +{ + struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; + struct efa_rdm_ope *ope = pkt_entry->ope; + if (!ope) + return; + efa_tracepoint(post_write, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); +} + #else #define efa_tracepoint(...) do {} while(0) diff --git a/prov/efa/src/efa_tp_def.h b/prov/efa/src/efa_tp_def.h index 72e03988a56..46617d2d2a7 100644 --- a/prov/efa/src/efa_tp_def.h +++ b/prov/efa/src/efa_tp_def.h @@ -40,6 +40,16 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, post_wr_id, EFA_TP_PROV, LTTNG_UST_TP_ARGS(X_PKT_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, post_recv, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, post_wr_id, EFA_TP_PROV, + post_read, + LTTNG_UST_TP_ARGS(X_PKT_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, post_read, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, post_wr_id, EFA_TP_PROV, + post_write, + LTTNG_UST_TP_ARGS(X_PKT_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, post_write, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + #endif /* _EFA_TP_DEF_H */ #include diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 73cb58c82b9..6b97eccda1c 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -509,6 +509,10 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, conn->ep_addr->qpn, conn->ep_addr->qkey); } +#if HAVE_LTTNG + 
efa_tracepoint_wr_id_post_read((void *)pkt_entry); +#endif + err = ibv_wr_complete(qp->ibv_qp_ex); if (OFI_UNLIKELY(err)) @@ -597,6 +601,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) conn->ep_addr->qpn, conn->ep_addr->qkey); } +#if HAVE_LTTNG + efa_tracepoint_wr_id_post_write((void *)pkt_entry); +#endif + if (!(txe->fi_flags & FI_MORE)) { err = ibv_wr_complete(qp->ibv_qp_ex); ep->base_ep.is_wr_started = false; diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index fdb0d629a8a..36b2d5171da 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -11,6 +11,7 @@ #include "efa_rdm_rma.h" #include "efa_rdm_pke_cmd.h" #include "efa_cntr.h" +#include "efa_rdm_tracepoint.h" int efa_rdm_rma_verified_copy_iov(struct efa_rdm_ep *ep, struct efa_rma_iov *rma, size_t count, uint32_t flags, @@ -174,6 +175,9 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin void **tmp_desc; struct util_srx_ctx *srx_ctx; + efa_rdm_tracepoint(read_begin_msg_context, + (size_t) msg->context, (size_t) msg->addr); + EFA_DBG(FI_LOG_EP_DATA, "read iov_len: %lu flags: %lx\n", ofi_total_iov_len(msg->msg_iov, msg->iov_count), @@ -430,6 +434,9 @@ static inline ssize_t efa_rdm_generic_writemsg(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *txe; struct util_srx_ctx *srx_ctx; + efa_rdm_tracepoint(write_begin_msg_context, + (size_t) msg->context, (size_t) msg->addr); + efa_perfset_start(efa_rdm_ep, perf_efa_tx); EFA_DBG(FI_LOG_EP_DATA, diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h index a11e8c3889c..6e2fab54b7f 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h +++ b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h @@ -87,6 +87,16 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PRO LTTNG_UST_TP_ARGS(MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, recv_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) 
+LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, + read_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, read_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, + write_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, write_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + #define CQ_ENTRY_ARGS \ int, tag, \ size_t, addr From 1af9ddccaf12dda368a9a54b837e16a14b899038 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sat, 23 Nov 2024 13:33:51 -0800 Subject: [PATCH 262/393] v2.0.0rc1 Signed-off-by: Jianxin Xiong --- AUTHORS | 6 +++ NEWS.md | 89 ++++++++++++++++++++++++++++++++++++++++ configure.ac | 2 +- fabtests/configure.ac | 2 +- include/windows/config.h | 2 +- 5 files changed, 98 insertions(+), 3 deletions(-) diff --git a/AUTHORS b/AUTHORS index 89f9c5bfa91..c8068cc5361 100644 --- a/AUTHORS +++ b/AUTHORS @@ -58,6 +58,7 @@ Chenwei Zhang Chien Tin Tung Chris Dolan Chris Taylor +Chuck Fossen Chuck Fossen Chuck Fossen Cody Mann @@ -140,6 +141,7 @@ Joe Nemeth Johannes Ziegenbalg John Biddiscombe John Byrne +John Heemstra Jonathan Behrens Jorge Cabrera jose @@ -190,6 +192,7 @@ Neil Spruit Nicholas Sielicki Nicolas Morey-Chaisemartin Nikhil Nanal +nikhil nanal nikhilnanal nikhilnanal nikhilnanal @@ -199,6 +202,7 @@ Noam Beer Oblomov, Sergey Oblomov, Sergey OFIWG Bot +Olga Weiss Olivier Serres orbea Paolo Inaudi @@ -229,6 +233,7 @@ Robert Wespetal Rohit Zambre Ryan Hankins Ryan Hankins +Ryan Hankins RÊmi Dehenne Sai Sunku Sannikov, Alexander @@ -285,6 +290,7 @@ Wenduo Wang wenduwan wenduwan Wesley Bland +wey William Zhang Xuezhao Liu Xuyang Wang diff --git a/NEWS.md b/NEWS.md index f8f46be8e35..0d766534d3d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,95 @@ bug fixes (and other actions) for each version of Libfabric since version 1.0. 
New major releases include all fixes from minor releases with earlier release dates. +v2.0.0, Fri Dec 13, 2024 +======================== + +## Core + +- xpmem: Cleanup xpmem before monitors +- Remove redundant windows.h +- hmem/cuda: Add env variable to enable/disable CUDA DMABUF +- Update ofi_vrb_speed + +## CXI + +- Add FI_OPT_CUDA_API_PERMITTED tests +- Define FI_CXI_FORCE_DEV_REG_COPY +- Support FI_OPT_CUDA_API_PERMITTED +- Testing FI_RM_ENABLED +- Correct checking of MR test rc +- Update unit test for collectives +- Add test for invalid client RKEY +- Fix broken client key check +- Ignore FLT_OVERFLOW and FLT_INVALID errors +- Update CXI man page. +- Enable dmabuf for ROCR by default. +- Remove disable_dmabuf_cuda and disable_dmabuf_rocr +- Disable use of dmabuf by default for cuda +- Remove use of deprecated FI_ORDER_NONE +- Report RMA order used in debug output +- Remove srx unittests +- Add FI_PEER capability bit +- Support shared receive queues +- Implement shared Completion Queues + +## EFA + +- Add tracepoints for rma operations +- Adjust the location of tracepoint +- Implement the rma interface +- Fix efa_msg flags +- Remove efa_send_wr, send_wr_pool and recv_wr_pool from dgram_ep +- Fix the read_bad_recv_status unit test +- Implement efa_msg interface +- Implement FI_MORE for fi_recv in zero copy recv mode +- Fix the error path of zero copy recv +- Move inject sizes from rdm ep to base ep +- Fix the ep list scan in cq/cntr read +- Fix the error handling for unsolicited recv +- Fall back to zero sl when non-zero sl qp creation failed +- Disable zero copy receive if p2p is not available +- Initialize efa fork support in EFA_INI +- Update efa_hmem and efa_fork_support log to FI_LOG_CORE +- Make efa_hmem_info a global variable +- Set max rma order size correctly + +## Hook + +Fix the preprocessor + +## LNX + +- Fix av strncpy +- Fix various issues with initial commit + +## SHM + +- Cleanup op flags + +## Sockets + +- Fixed coverity issue for unchecked 
return value. + +## Util + +- Set srx completion flags and msg_len properly +- fi_pingpong: Fix coverity issue about integer overflow + +## Verbs + +- Fix coverity issue about overflowed return value +- Enable implicit dmabuf mr reg for more HMEM ifaces + +## Fabtests + +- Add FI_MORE pytest for fi_recv in zcpy recv mode +- Allow tests with FI_MORE flag by using fi_recvmsg +- New fabtest fi_flood to test over subscription of resources +- test_configs/ofi_rxm/tcp.test: remove cntr RMA testing +- Fix compiler warning about unitialized variable + + v2.0.0 beta, Fri Oct 25, 2024 ============================== diff --git a/configure.ac b/configure.ac index ef368ef9377..c024d2d222b 100644 --- a/configure.ac +++ b/configure.ac @@ -9,7 +9,7 @@ dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ([2.60]) -AC_INIT([libfabric], [2.0.0beta], [ofiwg@lists.openfabrics.org]) +AC_INIT([libfabric], [2.0.0rc1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_SRCDIR([src/fabric.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) diff --git a/fabtests/configure.ac b/fabtests/configure.ac index b5ab3117376..9c864c1dd10 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -5,7 +5,7 @@ dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.57) -AC_INIT([fabtests], [2.0.0beta], [ofiwg@lists.openfabrics.org]) +AC_INIT([fabtests], [2.0.0rc1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) AC_CONFIG_HEADERS(config.h) diff --git a/include/windows/config.h b/include/windows/config.h index 380bea50323..3fbef1b09a4 100644 --- a/include/windows/config.h +++ b/include/windows/config.h @@ -256,7 +256,7 @@ #define PACKAGE_TARNAME PACKAGE /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0.0beta" +#define PACKAGE_VERSION "2.0.0rc1" /* Define to the full name and version of this package. 
*/ #define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION From f60033a49196482acff8051e655c7459c2ec38ee Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 18 Nov 2024 08:51:07 -0600 Subject: [PATCH 263/393] prov/cxi: Update CXI provider max order size Target ordering is not supported for any raw or war operations. Waw can be supported. Signed-off-by: Ian Ziemba --- prov/cxi/src/cxip_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index f0da25e315e..6a0921993c0 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -249,8 +249,8 @@ struct fi_ep_attr cxip_ep_attr = { .protocol = FI_PROTO_CXI, .protocol_version = CXIP_WIRE_PROTO_VERSION, .max_msg_size = CXIP_EP_MAX_MSG_SZ, - .max_order_raw_size = -1, - .max_order_war_size = -1, + .max_order_raw_size = 0, + .max_order_war_size = 0, .max_order_waw_size = -1, .mem_tag_format = FI_TAG_GENERIC >> (64 - CXIP_TAG_WIDTH), .auth_key_size = sizeof(struct cxi_auth_key), From e5b8ad7f7b88229ae7644753c6e04a2ebd0f7be6 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 18 Nov 2024 08:56:44 -0600 Subject: [PATCH 264/393] prov/cxi: Fix RMA/AMO network ordering WAR and RAW are not supported. WAW and RAR within RMA and AMO, respectively, are supported. WAW is supported across RMA and AMO. 
Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 68ea4c0ce7e..0c8d43e88b8 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -143,9 +143,8 @@ #define CXIP_MSG_ORDER (FI_ORDER_SAS | \ FI_ORDER_WAW | \ FI_ORDER_RMA_WAW | \ + FI_ORDER_RMA_RAR | \ FI_ORDER_ATOMIC_WAW | \ - FI_ORDER_ATOMIC_WAR | \ - FI_ORDER_ATOMIC_RAW | \ FI_ORDER_ATOMIC_RAR) #define CXIP_EP_CQ_FLAGS \ From 331228e2645dc8cf9a2b1bebdeeab844f7174603 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 18 Nov 2024 09:18:07 -0600 Subject: [PATCH 265/393] prov/cxi: Set MR relax order on EP order size max_order_waw_size controls target ordering settings. Use this information, in conjunction with RX msg_order, to control relaxed ordering. Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 13 +++++++++++++ prov/cxi/src/cxip_mr.c | 40 +++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 0c8d43e88b8..29107b58e9a 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -2476,6 +2476,19 @@ cxip_ep_obj_copy_from_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest, ep->require_dev_reg_copy[md->info.iface]); } +static inline bool cxip_ep_obj_mr_relaxed_order(struct cxip_ep_obj *ep) +{ + if ((ep->rx_attr.msg_order & FI_ORDER_RMA_WAW) && + ep->ep_attr.max_order_waw_size != 0) + return false; + + if ((ep->rx_attr.msg_order & FI_ORDER_WAW) && + ep->ep_attr.max_order_waw_size != 0) + return false; + + return true; +} + static inline void cxip_txc_otx_reqs_inc(struct cxip_txc *txc) { assert(ofi_genlock_held(&txc->ep_obj->lock) == 1); diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index 34d8ead3576..a1d1c5e2829 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -314,7 +314,9 @@ static int cxip_mr_enable_opt(struct 
cxip_mr *mr) uint32_t le_flags; uint64_t ib = 0; int pid_idx; + bool target_relaxed_order; + target_relaxed_order = cxip_ep_obj_mr_relaxed_order(ep_obj); mr->req.cb = cxip_mr_cb; ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl.tgt_evtq, @@ -347,8 +349,7 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) goto err_pte_free; } - le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | - C_LE_UNRESTRICTED_BODY_RO; + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE; if (mr->attr.access & FI_REMOTE_WRITE) le_flags |= C_LE_OP_PUT; if (mr->attr.access & FI_REMOTE_READ) @@ -356,15 +357,10 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) if (mr->cntr) le_flags |= C_LE_EVENT_CT_COMM; - /* When FI_FENCE is not requested, restricted operations can used PCIe - * relaxed ordering. Unrestricted operations PCIe relaxed ordering is - * controlled by an env for now. - */ - if (!(ep_obj->caps & FI_FENCE)) { + if (target_relaxed_order) { ib = 1; - - if (cxip_env.enable_unrestricted_end_ro) - le_flags |= C_LE_UNRESTRICTED_END_RO; + le_flags |= C_LE_UNRESTRICTED_END_RO | + C_LE_UNRESTRICTED_BODY_RO; } ret = cxip_pte_append(mr->pte, @@ -475,7 +471,9 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) struct cxip_mr *_mr; uint32_t le_flags; uint64_t ib = 0; + bool target_relaxed_order; + target_relaxed_order = cxip_ep_obj_mr_relaxed_order(ep_obj); mr_cache = &ep_obj->ctrl.opt_mr_cache[lac]; ofi_atomic_inc32(&mr_cache->ref); @@ -542,17 +540,12 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) } le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | - C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | C_LE_OP_GET; + C_LE_OP_PUT | C_LE_OP_GET; - /* When FI_FENCE is not requested, restricted operations can used PCIe - * relaxed ordering. Unrestricted operations PCIe relaxed ordering is - * controlled by an env for now. 
- */ - if (!(ep_obj->caps & FI_FENCE)) { + if (target_relaxed_order) { ib = 1; - - if (cxip_env.enable_unrestricted_end_ro) - le_flags |= C_LE_UNRESTRICTED_END_RO; + le_flags |= C_LE_UNRESTRICTED_END_RO | + C_LE_UNRESTRICTED_BODY_RO; } ret = cxip_pte_append(_mr->pte, 0, -1ULL, lac, @@ -634,6 +627,9 @@ static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) union cxip_match_bits mb; union cxip_match_bits ib; uint32_t le_flags; + bool target_relaxed_order; + + target_relaxed_order = cxip_ep_obj_mr_relaxed_order(ep_obj); /* TODO: Handle enabling for each bound endpoint */ mr_cache = &ep_obj->ctrl.std_mr_cache[lac]; @@ -676,8 +672,10 @@ static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) ib.mr_lac = 0; ib.mr_cached = 0; - le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_UNRESTRICTED_BODY_RO | - C_LE_OP_PUT | C_LE_OP_GET; + le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_OP_PUT | C_LE_OP_GET; + if (target_relaxed_order) + le_flags |= C_LE_UNRESTRICTED_END_RO | + C_LE_UNRESTRICTED_BODY_RO; ret = cxip_pte_append(ep_obj->ctrl.pte, 0, -1ULL, mb.mr_lac, C_PTL_LIST_PRIORITY, From 0f2e5da9a602f4efc30ca3312253c0ec65d8e574 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 18 Nov 2024 09:29:32 -0600 Subject: [PATCH 266/393] prov/cxi: Remove FI_CXI_ENABLE_UNRESTRICTED_RO This env var is not needed. 
Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 1 - prov/cxi/src/cxip_info.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 29107b58e9a..b332c6dc333 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -297,7 +297,6 @@ struct cxip_environment { size_t ctrl_rx_eq_max_size; char *device_name; size_t cq_fill_percent; - int enable_unrestricted_end_ro; int rget_tc; int cacheline_size; diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 6a0921993c0..f59d07f6df5 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -648,7 +648,6 @@ struct cxip_environment cxip_env = { .disable_eq_hugetlb = false, .zbcoll_radix = 2, .cq_fill_percent = 50, - .enable_unrestricted_end_ro = true, .rget_tc = FI_TC_UNSPEC, .cacheline_size = CXIP_DEFAULT_CACHE_LINE_SIZE, .coll_job_id = NULL, @@ -743,11 +742,6 @@ static void cxip_env_init(void) fi_param_get_bool(&cxip_prov, "disable_host_register", &cxip_env.disable_host_register); - fi_param_define(&cxip_prov, "enable_unrestricted_end_ro", FI_PARAM_BOOL, - "Default: %d", cxip_env.enable_unrestricted_end_ro); - fi_param_get_bool(&cxip_prov, "enable_unrestricted_end_ro", - &cxip_env.enable_unrestricted_end_ro); - fi_param_define(&cxip_prov, "odp", FI_PARAM_BOOL, "Enables on-demand paging (default %d).", cxip_env.odp); fi_param_get_bool(&cxip_prov, "odp", &cxip_env.odp); From ca59dba1f3086b44d0ff4526d1a71a07cd317b71 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 18 Nov 2024 09:37:44 -0600 Subject: [PATCH 267/393] prov/cxi: Define FI_CXI_MR_TARGET_ORDERING This env var can be used to force MR target ordering to either be strict or relaxed. The default behavior is to set MR target ordering based on user defined endpoint attributes. 
Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 20 ++++++++++++++++++++ prov/cxi/src/cxip_info.c | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index b332c6dc333..65b48da6db3 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -244,6 +244,19 @@ enum cxip_rdzv_proto { const char *cxip_rdzv_proto_to_str(enum cxip_rdzv_proto proto); +enum cxip_mr_target_ordering { + /* Sets MR target ordering based on message and target RMA ordering + * options. + */ + MR_ORDER_DEFAULT, + + /* Force ordering to always be strict. */ + MR_ORDER_STRICT, + + /* Force ordering to always be relaxed. */ + MR_ORDER_RELAXED, +}; + struct cxip_environment { /* Translation */ int odp; @@ -323,6 +336,7 @@ struct cxip_environment { size_t mr_cache_events_disable_poll_nsecs; size_t mr_cache_events_disable_le_poll_nsecs; int force_dev_reg_copy; + enum cxip_mr_target_ordering mr_target_ordering; }; extern struct cxip_environment cxip_env; @@ -2477,6 +2491,12 @@ cxip_ep_obj_copy_from_md(struct cxip_ep_obj *ep, struct cxip_md *md, void *dest, static inline bool cxip_ep_obj_mr_relaxed_order(struct cxip_ep_obj *ep) { + if (cxip_env.mr_target_ordering == MR_ORDER_STRICT) + return false; + + if (cxip_env.mr_target_ordering == MR_ORDER_RELAXED) + return true; + if ((ep->rx_attr.msg_order & FI_ORDER_RMA_WAW) && ep->ep_attr.max_order_waw_size != 0) return false; diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index f59d07f6df5..76d1fa204e5 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -670,6 +670,7 @@ struct cxip_environment cxip_env = { .mr_cache_events_disable_le_poll_nsecs = CXIP_MR_CACHE_EVENTS_DISABLE_LE_POLL_NSECS, .force_dev_reg_copy = false, + .mr_target_ordering = MR_ORDER_DEFAULT, }; static void cxip_env_init(void) @@ -1289,6 +1290,24 @@ static void cxip_env_init(void) fi_param_get_bool(&cxip_prov, "force_dev_reg_copy", &cxip_env.force_dev_reg_copy); + 
fi_param_define(&cxip_prov, "mr_target_ordering", FI_PARAM_STRING, + "MR target ordering (i.e. PCI ordering). Options: default, strict, or relaxed. Recommendation is to leave at default behavior."); + fi_param_get_str(&cxip_prov, "mr_target_ordering", ¶m_str); + + if (param_str) { + if (!strcmp(param_str, "default")) + cxip_env.mr_target_ordering = MR_ORDER_DEFAULT; + else if (!strcmp(param_str, "strict")) + cxip_env.mr_target_ordering = MR_ORDER_STRICT; + else if (!strcmp(param_str, "relaxed")) + cxip_env.mr_target_ordering = MR_ORDER_RELAXED; + else + CXIP_WARN("Unrecognized mr_target_ordering: %s\n", + param_str); + + param_str = NULL; + } + set_system_page_size(); } From 7eacfc278f5afd4eabc16be0f423191f5a856aae Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 18 Nov 2024 10:23:35 -0600 Subject: [PATCH 268/393] man/fi_cxi: Update message and target ordering doc Signed-off-by: Ian Ziemba --- man/fi_cxi.7.md | 47 ++++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 4c6e46d14cc..f141d3a5c08 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -380,39 +380,24 @@ increase Request buffer space using the variables *FI_CXI_REQ_\**. ## Message Ordering -The CXI provider supports the following ordering rules: +Supported message ordering: FI_ORDER_SAS, FI_ORDER_WAW, FI_ORDER_RMA_WAW, +FI_ORDER_RMA_RAR, FI_ORDER_ATOMIC_WAW, and FI_ORDER_ATOMIC_RAR. -* All message Send operations are always ordered. -* RMA Writes may be ordered by specifying *FI_ORDER_RMA_WAW*. -* AMOs may be ordered by specifying *FI_ORDER_AMO_{WAW|WAR|RAW|RAR}*. -* RMA Writes may be ordered with respect to AMOs by specifying *FI_ORDER_WAW*. - Fetching AMOs may be used to perform short reads that are ordered with - respect to RMA Writes. +Note: Any FI_ORDER_*_{WAR,RAW} are not supported. + +Note: Relaxing the message ordering may result in improved performance. 
+ +## Target Ordering Ordered RMA size limits are set as follows: -* *max_order_waw_size* is -1. RMA Writes and non-fetching AMOs of any size are - ordered with respect to each other. -* *max_order_raw_size* is -1. Fetching AMOs of any size are ordered with - respect to RMA Writes and non-fetching AMOs. -* *max_order_war_size* is -1. RMA Writes and non-fetching AMOs of any size are - ordered with respect to fetching AMOs. - -## PCIe Ordering - -Generally, PCIe writes are strictly ordered. As an optimization, PCIe TLPs may -have the Relaxed Order (RO) bit set to allow writes to be reordered. Cassini -sets the RO bit in PCIe TLPs when possible. Cassini sets PCIe RO as follows: - -* Ordering of messaging operations is established using completion events. - Therefore, all PCIe TLPs related to two-sided message payloads will have RO - set. -* Every PCIe TLP associated with an unordered RMA or AMO operation will have RO - cleared. -* PCIe TLPs associated with the last packet of an ordered RMA or AMO operation - will have RO cleared. -* PCIe TLPs associated with the body packets (all except the last packet of an - operation) of an ordered RMA operation will have RO set. +* *max_order_waw_size* is -1. RMA Writes and AMO writes of any size are ordered with + respect to each other. + +Note: Due to FI_ORDER_\*\_{WAR,RAW} not being supported, max_order_{raw,war}_size +are forced to zero. + +Note: Relaxing the target ordering may result in improved performance. ## Translation @@ -974,6 +959,10 @@ offloading are met. The CXI provider checks for the following environment variables: +*FI_CXI_MR_TARGET_ORDERING* +: MR target ordering (i.e. PCI ordering). Options: default, strict, or relaxed. + Recommendation is to leave at default behavior. + *FI_CXI_ODP* : Enables on-demand paging. If disabled, all DMA buffers are pinned. 
If enabled and mr_mode bits in the hints exclude FI_MR_ALLOCATED, From cf257c1aa4b386c8687bb495360fbefd85358974 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Sun, 24 Nov 2024 19:09:21 -0600 Subject: [PATCH 269/393] prov/cxi: Depreciate FI_CXI_WEAK_FENCE With CXI provider ordering updated, FI_CXI_WEAK_FENCE is no longer needed. Signed-off-by: Ian Ziemba --- man/fi_cxi.7.md | 5 ----- prov/cxi/include/fi_cxi_ext.h | 9 ++++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index f141d3a5c08..c2cbffe2b52 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -1522,11 +1522,6 @@ if (ret) error; ``` -When an endpoint does not support FI_FENCE (e.g. optimized MR), a provider -specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on an alias EP -to issue a FENCE operation to create a data ordering point for the alias. -This is supported for one-sided operations only. - Alias EP must be closed prior to closing the original EP. ## PCIe Atomics diff --git a/prov/cxi/include/fi_cxi_ext.h b/prov/cxi/include/fi_cxi_ext.h index e8205fc6d2a..c4629d9ef7f 100644 --- a/prov/cxi/include/fi_cxi_ext.h +++ b/prov/cxi/include/fi_cxi_ext.h @@ -118,11 +118,10 @@ enum { */ #define FI_CXI_UNRELIABLE (1ULL << 61) -/* - * Request a provider specific weak FENCE operation to facilitate an - * EP alias ordering point, when the original EP utilizes PCIe RO=1. - */ -#define FI_CXI_WEAK_FENCE (1ULL << 63) +/* Depreciated. */ +#define FI_CXI_WEAK_FENCE \ + _Pragma ("GCC warning \"'FI_CXI_WEAK_FENCE' macro is deprecated\"") \ + (1ULL << 63) /* * Used in conjunction with the deferred work queue API. 
If a deferred work From bb17d1893a1a88f2c44a7ec4a72ccce7b23ada75 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Mon, 25 Nov 2024 21:11:24 +0000 Subject: [PATCH 270/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_cxi.7 | 71 ++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 50 deletions(-) diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index 528787a6e1e..336b716e982 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,7 +1,7 @@ .\"t .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_cxi" "7" "2024\-11\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cxi" "7" "2024\-11\-25" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -442,53 +442,25 @@ hybrid RX match modes increase Request buffer space using the variables \f[I]FI_CXI_REQ_*\f[R]. .SS Message Ordering .PP -The CXI provider supports the following ordering rules: -.IP \[bu] 2 -All message Send operations are always ordered. -.IP \[bu] 2 -RMA Writes may be ordered by specifying \f[I]FI_ORDER_RMA_WAW\f[R]. -.IP \[bu] 2 -AMOs may be ordered by specifying -\f[I]FI_ORDER_AMO_{WAW|WAR|RAW|RAR}\f[R]. -.IP \[bu] 2 -RMA Writes may be ordered with respect to AMOs by specifying -\f[I]FI_ORDER_WAW\f[R]. -Fetching AMOs may be used to perform short reads that are ordered with -respect to RMA Writes. +Supported message ordering: FI_ORDER_SAS, FI_ORDER_WAW, +FI_ORDER_RMA_WAW, FI_ORDER_RMA_RAR, FI_ORDER_ATOMIC_WAW, and +FI_ORDER_ATOMIC_RAR. +.PP +Note: Any FI_ORDER_*_{WAR,RAW} are not supported. +.PP +Note: Relaxing the message ordering may result in improved performance. +.SS Target Ordering .PP Ordered RMA size limits are set as follows: .IP \[bu] 2 \f[I]max_order_waw_size\f[R] is -1. -RMA Writes and non-fetching AMOs of any size are ordered with respect to -each other. -.IP \[bu] 2 -\f[I]max_order_raw_size\f[R] is -1. -Fetching AMOs of any size are ordered with respect to RMA Writes and -non-fetching AMOs. 
-.IP \[bu] 2 -\f[I]max_order_war_size\f[R] is -1. -RMA Writes and non-fetching AMOs of any size are ordered with respect to -fetching AMOs. -.SS PCIe Ordering -.PP -Generally, PCIe writes are strictly ordered. -As an optimization, PCIe TLPs may have the Relaxed Order (RO) bit set to -allow writes to be reordered. -Cassini sets the RO bit in PCIe TLPs when possible. -Cassini sets PCIe RO as follows: -.IP \[bu] 2 -Ordering of messaging operations is established using completion events. -Therefore, all PCIe TLPs related to two-sided message payloads will have -RO set. -.IP \[bu] 2 -Every PCIe TLP associated with an unordered RMA or AMO operation will -have RO cleared. -.IP \[bu] 2 -PCIe TLPs associated with the last packet of an ordered RMA or AMO -operation will have RO cleared. -.IP \[bu] 2 -PCIe TLPs associated with the body packets (all except the last packet -of an operation) of an ordered RMA operation will have RO set. +RMA Writes and AMO writes of any size are ordered with respect to each +other. +.PP +Note: Due to FI_ORDER_*_{WAR,RAW} not being supported, +max_order_{raw,war}_size are forced to zero. +.PP +Note: Relaxing the target ordering may result in improved performance. .SS Translation .PP The CXI provider supports two translation mechanisms: Address @@ -1172,6 +1144,11 @@ offloading are met. .PP The CXI provider checks for the following environment variables: .TP +\f[I]FI_CXI_MR_TARGET_ORDERING\f[R] +MR target ordering (i.e.\ PCI ordering). +Options: default, strict, or relaxed. +Recommendation is to leave at default behavior. +.TP \f[I]FI_CXI_ODP\f[R] Enables on-demand paging. If disabled, all DMA buffers are pinned. @@ -1836,12 +1813,6 @@ if (ret) \f[R] .fi .PP -When an endpoint does not support FI_FENCE (e.g.\ optimized MR), a -provider specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on -an alias EP to issue a FENCE operation to create a data ordering point -for the alias. -This is supported for one-sided operations only. 
-.PP Alias EP must be closed prior to closing the original EP. .SS PCIe Atomics .PP From 364f208c98ad07119b4dff52d40e659e14488747 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 25 Nov 2024 19:02:49 +0000 Subject: [PATCH 271/393] fabtests/pytest/efa: merge memory_type and check_rma_bw_memory_type Merge these two fixtures as rma_bw_memory_type. Signed-off-by: Shi Jin --- fabtests/pytest/efa/conftest.py | 3 ++- fabtests/pytest/efa/test_rma_bw.py | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fabtests/pytest/efa/conftest.py b/fabtests/pytest/efa/conftest.py index 5c8928bdef5..6192e83fa32 100644 --- a/fabtests/pytest/efa/conftest.py +++ b/fabtests/pytest/efa/conftest.py @@ -28,10 +28,11 @@ def rma_operation_type(request): return request.param @pytest.fixture(scope="module") -def check_rma_bw_memory_type(memory_type, rma_operation_type): +def rma_bw_memory_type(memory_type, rma_operation_type): is_test_bi_dir = False if rma_operation_type == "writedata" else True if is_test_bi_dir and (memory_type not in [_.values[0] for _ in memory_type_list_bi_dir]): pytest.skip("Duplicated memory type for bi-directional test") + return memory_type @pytest.fixture(scope="module", params=["r:0,4,64", diff --git a/fabtests/pytest/efa/test_rma_bw.py b/fabtests/pytest/efa/test_rma_bw.py index 3710db0075b..0c1e6d39916 100644 --- a/fabtests/pytest/efa/test_rma_bw.py +++ b/fabtests/pytest/efa/test_rma_bw.py @@ -7,15 +7,15 @@ @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_bw(cmdline_args, iteration_type, rma_operation_type, completion_semantic, memory_type, check_rma_bw_memory_type): +def test_rma_bw(cmdline_args, iteration_type, rma_operation_type, completion_semantic, rma_bw_memory_type): command = "fi_rma_bw -e rdm" command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer 
to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) + efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, rma_bw_memory_type, "all", timeout=timeout) @pytest.mark.parametrize("env_vars", [["FI_EFA_TX_SIZE=64"], ["FI_EFA_RX_SIZE=64"], ["FI_EFA_TX_SIZE=64", "FI_EFA_RX_SIZE=64"]]) -def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, completion_semantic, memory_type, env_vars, check_rma_bw_memory_type): +def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, completion_semantic, rma_bw_memory_type, env_vars): cmdline_args_copy = copy.copy(cmdline_args) for env_var in env_vars: cmdline_args_copy.append_environ(env_var) @@ -24,15 +24,15 @@ def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, completion_semanti command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args_copy.timeout) - efa_run_client_server_test(cmdline_args_copy, command, "short", completion_semantic, memory_type, "all", timeout=timeout) + efa_run_client_server_test(cmdline_args_copy, command, "short", completion_semantic, rma_bw_memory_type, "all", timeout=timeout) @pytest.mark.functional -def test_rma_bw_range(cmdline_args, rma_operation_type, completion_semantic, message_size, memory_type, check_rma_bw_memory_type): +def test_rma_bw_range(cmdline_args, rma_operation_type, completion_semantic, message_size, rma_bw_memory_type): command = "fi_rma_bw -e rdm" command = command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, message_size, timeout=timeout) + efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, 
rma_bw_memory_type, message_size, timeout=timeout) @pytest.mark.functional From c74eac27f336db534bc78d80066df73d5fafb6a3 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 25 Nov 2024 19:12:50 +0000 Subject: [PATCH 272/393] fabtests/pytest/efa: Avoid duiplicate completion semantic for RMA test For fi_read, there is no difference between DC and non-DC as it's not a transmission. For write and writedata, if device support rdma-write, all the transmissions are DC already. Signed-off-by: Shi Jin --- fabtests/pytest/efa/conftest.py | 14 ++++++++++++++ fabtests/pytest/efa/test_rma_bw.py | 24 ++++++++++++------------ fabtests/pytest/efa/test_rma_pingpong.py | 12 ++++++------ 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/fabtests/pytest/efa/conftest.py b/fabtests/pytest/efa/conftest.py index 6192e83fa32..2871a9b8ca9 100644 --- a/fabtests/pytest/efa/conftest.py +++ b/fabtests/pytest/efa/conftest.py @@ -1,4 +1,5 @@ import pytest +from efa_common import has_rdma # The memory types for bi-directional tests. 
memory_type_list_bi_dir = [ @@ -34,6 +35,19 @@ def rma_bw_memory_type(memory_type, rma_operation_type): pytest.skip("Duplicated memory type for bi-directional test") return memory_type +@pytest.fixture(scope="function") +def rma_bw_completion_semantic(cmdline_args, completion_semantic, rma_operation_type): + if completion_semantic != 'delivery_complete': + # There is no difference between DC and non-DC for read as it's + # not a transmission + if rma_operation_type == 'read': + pytest.skip("Duplicate completion semantic for fi_read test") + assert rma_operation_type in ['write', 'writedata'] + # If device support rdma write, all the transmissions are DC + if has_rdma(cmdline_args, 'write'): + pytest.skip("Duplicate completion semantic for fi_write* test") + return completion_semantic + @pytest.fixture(scope="module", params=["r:0,4,64", "r:4048,4,4148", diff --git a/fabtests/pytest/efa/test_rma_bw.py b/fabtests/pytest/efa/test_rma_bw.py index 0c1e6d39916..98ff0a3b0d2 100644 --- a/fabtests/pytest/efa/test_rma_bw.py +++ b/fabtests/pytest/efa/test_rma_bw.py @@ -7,15 +7,15 @@ @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_bw(cmdline_args, iteration_type, rma_operation_type, completion_semantic, rma_bw_memory_type): +def test_rma_bw(cmdline_args, iteration_type, rma_operation_type, rma_bw_completion_semantic, rma_bw_memory_type): command = "fi_rma_bw -e rdm" command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, rma_bw_memory_type, "all", timeout=timeout) + efa_run_client_server_test(cmdline_args, command, iteration_type, rma_bw_completion_semantic, rma_bw_memory_type, "all", timeout=timeout) @pytest.mark.parametrize("env_vars", 
[["FI_EFA_TX_SIZE=64"], ["FI_EFA_RX_SIZE=64"], ["FI_EFA_TX_SIZE=64", "FI_EFA_RX_SIZE=64"]]) -def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, completion_semantic, rma_bw_memory_type, env_vars): +def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, rma_bw_completion_semantic, rma_bw_memory_type, env_vars): cmdline_args_copy = copy.copy(cmdline_args) for env_var in env_vars: cmdline_args_copy.append_environ(env_var) @@ -24,24 +24,24 @@ def test_rma_bw_small_tx_rx(cmdline_args, rma_operation_type, completion_semanti command = command + " -o " + rma_operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args_copy.timeout) - efa_run_client_server_test(cmdline_args_copy, command, "short", completion_semantic, rma_bw_memory_type, "all", timeout=timeout) + efa_run_client_server_test(cmdline_args_copy, command, "short", rma_bw_completion_semantic, rma_bw_memory_type, "all", timeout=timeout) @pytest.mark.functional -def test_rma_bw_range(cmdline_args, rma_operation_type, completion_semantic, message_size, rma_bw_memory_type): +def test_rma_bw_range(cmdline_args, rma_operation_type, rma_bw_completion_semantic, message_size, rma_bw_memory_type): command = "fi_rma_bw -e rdm" command = command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, rma_bw_memory_type, message_size, timeout=timeout) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, rma_bw_memory_type, message_size, timeout=timeout) @pytest.mark.functional -def test_rma_bw_range_no_inject(cmdline_args, rma_operation_type, completion_semantic, inject_message_size): +def test_rma_bw_range_no_inject(cmdline_args, rma_operation_type, rma_bw_completion_semantic, inject_message_size): command = "fi_rma_bw -e rdm -j 0" command = 
command + " -o " + rma_operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, "host_to_host", inject_message_size, timeout=timeout) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, "host_to_host", inject_message_size, timeout=timeout) # This test is run in serial mode because it takes a lot of memory @@ -49,22 +49,22 @@ def test_rma_bw_range_no_inject(cmdline_args, rma_operation_type, completion_sem @pytest.mark.functional # TODO Add "writedata", "write" back in when EFA firmware bug is fixed @pytest.mark.parametrize("operation_type", ["read"]) -def test_rma_bw_1G(cmdline_args, operation_type, completion_semantic): +def test_rma_bw_1G(cmdline_args, operation_type, rma_bw_completion_semantic): # Default window size is 64 resulting in 128GB being registered, which # exceeds max number of registered host pages timeout = max(540, cmdline_args.timeout) command = "fi_rma_bw -e rdm -W 1" command = command + " -o " + operation_type efa_run_client_server_test(cmdline_args, command, 2, - completion_semantic=completion_semantic, message_size=1073741824, + completion_semantic=rma_bw_completion_semantic, message_size=1073741824, memory_type="host_to_host", warmup_iteration_type=0, timeout=timeout) @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata", "write"]) -def test_rma_bw_use_fi_more(cmdline_args, operation_type, completion_semantic, inject_message_size): +def test_rma_bw_use_fi_more(cmdline_args, operation_type, rma_bw_completion_semantic, inject_message_size): command = "fi_rma_bw -e rdm -j 0 --use-fi-more" command = command + " -o " + operation_type # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, + 
efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, "host_to_host", inject_message_size, timeout=timeout) diff --git a/fabtests/pytest/efa/test_rma_pingpong.py b/fabtests/pytest/efa/test_rma_pingpong.py index b3fdf9c1408..7d028f9a09a 100644 --- a/fabtests/pytest/efa/test_rma_pingpong.py +++ b/fabtests/pytest/efa/test_rma_pingpong.py @@ -14,23 +14,23 @@ def rma_pingpong_message_size(request): @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) -def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type_bi_dir): +def test_rma_pingpong(cmdline_args, iteration_type, operation_type, rma_bw_completion_semantic, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm" command = command + " -o " + operation_type + " " + perf_progress_model_cli - efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type_bi_dir, "all") + efa_run_client_server_test(cmdline_args, command, iteration_type, rma_bw_completion_semantic, memory_type_bi_dir, "all") @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata"]) -def test_rma_pingpong_range(cmdline_args, operation_type, completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): +def test_rma_pingpong_range(cmdline_args, operation_type, rma_bw_completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm" command = command + " -o " + operation_type - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) @pytest.mark.functional @pytest.mark.parametrize("operation_type", ["writedata"]) -def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, 
completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): +def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, rma_bw_completion_semantic, rma_pingpong_message_size, memory_type_bi_dir): command = "fi_rma_pingpong -e rdm -j 0" command = command + " -o " + operation_type - efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) + efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, memory_type_bi_dir, rma_pingpong_message_size) From ae9480b761fbd4ec2bc7144f72f35aadcdcad5cc Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 25 Nov 2024 18:11:07 +0000 Subject: [PATCH 273/393] prov/efa: Adjust the error code for flushed receive Currently, efa provider propagate flushed qp error as FI_EHOSTDOWN, which is wrong as flushed QP error means the completions are canceled due to QP was aborted earlier. Using FI_ECANCELED is more reasonable here. Signed-off-by: Shi Jin --- prov/efa/src/efa_errno.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 4a68fe2488e..029c35d4a07 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -151,7 +151,7 @@ static inline int to_fi_errno(enum efa_errno err) { case EFA_IO_COMP_STATUS_OK: return FI_SUCCESS; case EFA_IO_COMP_STATUS_FLUSHED: - return FI_EHOSTDOWN; + return FI_ECANCELED; case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR: case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH: case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY: From 2ad2e3757b9a299e4e9e005484012b2c89a46f9d Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Mon, 25 Nov 2024 10:14:05 -0600 Subject: [PATCH 274/393] cxi/prov: Fix deferred work test Rgroups implementation requires resource_limits true to use the service. Only verify extra queue work for limited test. 
NETCASSINI-6912 Signed-off-by: Chuck Fossen --- prov/cxi/test/deferred_work.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/prov/cxi/test/deferred_work.c b/prov/cxi/test/deferred_work.c index 369e276ffad..7531f49b439 100644 --- a/prov/cxi/test/deferred_work.c +++ b/prov/cxi/test/deferred_work.c @@ -960,6 +960,7 @@ static int alloc_service(struct cxil_dev *dev, unsigned int tle_count) struct cxi_svc_fail_info fail_info = {}; struct cxi_svc_desc svc_desc = { .enable = 1, + .resource_limits = 1, .limits = { .type[CXI_RSRC_TYPE_PTE] = { .max = 100, @@ -1195,11 +1196,10 @@ Test(deferred_work_trig_op_limit, enforce_limit_single_thread) cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK iter %d failed %d", i, ret); } - ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); - if (limited) + if (limited) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); cr_assert_eq(ret, -FI_ENOSPC, "FI_QUEUE_WORK failed %d", ret); - else - cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + } cr_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS)); From ae133ccc39bd27a1cf974f8bb302953013a88cca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:20:16 +0000 Subject: [PATCH 275/393] build(deps): bump github/codeql-action from 3.27.4 to 3.27.5 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.27.4 to 3.27.5. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/ea9e4e37992a54ee68a9622e985e60c8e8f12d9f...f09c1c0a94de965c15400f5634aa42fac8fb8f88) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 5763660afe6..ce6ccf224b0 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 + uses: github/codeql-action/init@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 + uses: github/codeql-action/autobuild@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 + uses: github/codeql-action/analyze@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index b1d8ea77f21..c0fad16a230 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@ea9e4e37992a54ee68a9622e985e60c8e8f12d9f # v3.27.4 + uses: github/codeql-action/upload-sarif@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 with: sarif_file: results.sarif From f26695b3ad55283f58bc76ae8890f0e1a25d182e Mon Sep 17 00:00:00 2001 From: Md Bulbul Sharif Date: Wed, 27 Nov 2024 23:25:54 +0000 Subject: [PATCH 276/393] prov/cxi: synchronous fi_close on collective multicast The fi_close() operation manages its internal state and return FI_SUCCESS, or a fatal error code on error. Signed-off-by: Md Bulbul Sharif --- prov/cxi/include/cxip.h | 3 ++ prov/cxi/src/cxip_coll.c | 81 ++++++++++++++++++++++++++++++++++------ 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 65b48da6db3..b8e42ca0a69 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -2957,6 +2957,9 @@ struct cxip_coll_mc { int next_red_id; // next available red_id int max_red_id; // limit total concurrency int seqno; // rolling seqno for packets + int close_state; // the state of the close operation + bool has_closed; // true after a mc close call + bool has_error; // true if any error bool is_multicast; // true if multicast address bool arm_disable; // arm-disable for testing bool retry_disable; // retry-disable for testing diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index 8d503c1c7b0..ae804e3779f 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -2704,13 +2704,17 @@ static void _curl_delete_mc_obj(struct cxip_coll_mc *mc_obj); static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle); /* Close multicast collective object */ -static void _close_mc(struct cxip_coll_mc *mc_obj, bool delete) +static void _close_mc(struct cxip_coll_mc *mc_obj, bool delete, bool has_error) { int count; if (!mc_obj) return; TRACE_JOIN("%s starting MC cleanup\n", __func__); + + mc_obj->has_closed = true; + 
mc_obj->has_error = has_error; + /* clear the mcast_addr -> mc_obj reference*/ ofi_idm_clear(&mc_obj->ep_obj->coll.mcast_map, mc_obj->mcast_addr); mc_obj->ep_obj->coll.is_hwroot = false; @@ -2739,10 +2743,18 @@ static void _close_mc(struct cxip_coll_mc *mc_obj, bool delete) cxip_env.coll_fm_timeout_msec/1000, (cxip_env.coll_fm_timeout_msec%1000)*1000000}; + if (!mc_obj->has_error) + mc_obj->close_state = -FI_EAGAIN; + _tsset(&mc_obj->curlexpires, &expires); _curl_delete_mc_obj(mc_obj); - } else - free(mc_obj); + } else { + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_SUCCESS; + } + } } /* The user can close an individual collective MC address. It must do so on @@ -2752,11 +2764,37 @@ static void _close_mc(struct cxip_coll_mc *mc_obj, bool delete) static int _fi_close_mc(struct fid *fid) { struct cxip_coll_mc *mc_obj; + int ret = FI_SUCCESS; TRACE_JOIN("%s: closing MC\n", __func__); mc_obj = container_of(fid, struct cxip_coll_mc, mc_fid.fid); - _close_mc(mc_obj, true); - return FI_SUCCESS; + if (!mc_obj) { + TRACE_JOIN("%s: MC object is null\n", __func__); + return ret; + } else if (mc_obj->has_closed) { + TRACE_JOIN("%s: close already called before\n", __func__); + return ret; + } else if (mc_obj->has_error) { + TRACE_JOIN("%s: encounted an error earlier\n", __func__); + return ret; + } + + _close_mc(mc_obj, true, false); + while (mc_obj && (ret = mc_obj->close_state) == -FI_EAGAIN) { + ret = cxip_curl_progress(NULL); + if (ret == -FI_EAGAIN) { + usleep(10); + continue; + } + if (ret < 0 && ret != -FI_ENODATA) { + TRACE_JOIN("%s: Curl progress failed, error=%d\n", __func__, ret); + break; + } + usleep(10); + } + free(mc_obj); + + return ret; } /* multicast object libfabric functions */ @@ -2986,6 +3024,11 @@ static int _initialize_mc(void *ptr) _coll_metrics.ep_data.isroot = mc_obj->hwroot_idx == mc_obj->mynode_idx; + /* Initially set close states to success */ + mc_obj->close_state = FI_SUCCESS; + mc_obj->has_closed = false; + 
mc_obj->has_error = false; + /* Return information to the caller */ jstate->mc_obj = mc_obj; *jstate->mc = &mc_obj->mc_fid; @@ -2996,7 +3039,7 @@ static int _initialize_mc(void *ptr) fail: jstate->prov_errno = FI_CXI_ERRNO_JOIN_FAIL_PTE; - _close_mc(mc_obj, true); + _close_mc(mc_obj, true, true); return ret; } @@ -3076,7 +3119,11 @@ static void _curl_delete_mc_obj(struct cxip_coll_mc *mc_obj) TRACE_JOIN("CURL delete mcast %d failed\n", mc_obj->mcast_addr); free(curl_usrptr); - free(mc_obj); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = ret; + } } } @@ -3102,7 +3149,11 @@ static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle) case 201: TRACE_JOIN("callback: %ld SUCCESS MCAST DELETED\n", handle->status); - free(mc_obj); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_SUCCESS; + } break; case 409: TRACE_JOIN("callback: delete mcast failed: %ld '%s'\n", @@ -3110,7 +3161,11 @@ static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle) if (_tsexp(&mc_obj->curlexpires)) { TRACE_JOIN("callback: FM expired\n"); - free(mc_obj); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_CXI_ERRNO_JOIN_CURL_TIMEOUT; + } break; } /* try again */ @@ -3118,7 +3173,11 @@ static void _cxip_delete_mcast_cb(struct cxip_curl_handle *handle) break; default: TRACE_JOIN("callback: %ld unknown status\n", handle->status); - free(mc_obj); + if (mc_obj->has_error) { + free(mc_obj); + } else { + mc_obj->close_state = FI_CXI_ERRNO_JOIN_CURL_FAILED; + } break; } /* free json memory */ @@ -4209,7 +4268,7 @@ void cxip_coll_close(struct cxip_ep_obj *ep_obj) while (!dlist_empty(&ep_obj->coll.mc_list)) { dlist_pop_front(&ep_obj->coll.mc_list, struct cxip_coll_mc, mc_obj, entry); - _close_mc(mc_obj, false); + _close_mc(mc_obj, false, true); } } From 074e98f22eeef5d00ab11c5cd1f02587feaadf1b Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 25 Nov 2024 21:54:56 +0000 Subject: [PATCH 277/393] 
prov/efa: Add tracepoint for poll cq ope This tracepoint records the event when a polled cqe has an associated op entry already, which is expected for send, RMA, and zero-copy receive operations. Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_cq.c | 7 +++++++ prov/efa/src/rdm/efa_rdm_tracepoint_def.h | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 4b3bcd74d1d..294bef21dec 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -475,7 +475,14 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) pkt_entry = (void *)(uintptr_t)ibv_cq->ibv_cq_ex->wr_id; qp = efa_domain->qp_table[ibv_wc_read_qp_num(ibv_cq->ibv_cq_ex) & efa_domain->qp_table_sz_m1]; ep = container_of(qp->base_ep, struct efa_rdm_ep, base_ep); +#if HAVE_LTTNG efa_rdm_tracepoint(poll_cq, (size_t) ibv_cq->ibv_cq_ex->wr_id); + if (pkt_entry && pkt_entry->ope) + efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id, + (size_t) pkt_entry->ope->cq_entry.op_context, + pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag, + pkt_entry->ope->addr); +#endif opcode = ibv_wc_read_opcode(ibv_cq->ibv_cq_ex); if (ibv_cq->ibv_cq_ex->status) { prov_errno = efa_rdm_cq_get_prov_errno(ibv_cq->ibv_cq_ex); diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h index 6e2fab54b7f..b814e957372 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h +++ b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h @@ -109,6 +109,11 @@ LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_RDM_TP_PROV, x_entry_cq_entry, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS, CQ_ENTRY_ARGS), LTTNG_UST_TP_FIELDS(X_ENTRY_FIELDS CQ_ENTRY_FIELDS)) +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, x_entry_cq_entry, EFA_RDM_TP_PROV, + poll_cq_ope, + LTTNG_UST_TP_ARGS(X_ENTRY_ARGS, CQ_ENTRY_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, poll_cq_ope, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + 
LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, x_entry_cq_entry, EFA_RDM_TP_PROV, send_end, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS, CQ_ENTRY_ARGS)) From d2f70280433ce62b3129dd3c5e7e71c1e915c5f9 Mon Sep 17 00:00:00 2001 From: Piotr Chmiel Date: Thu, 21 Nov 2024 15:03:35 +0100 Subject: [PATCH 278/393] prov/verbs: Fix data race vrb_open_ep function Resolved a data race in the vrb_open_ep function of the verbs provider caused by concurrent modifications to the global variable vrb_ep_ops. This issue violated the FI_THREAD_SAFE threading model, leading to unpredictable behavior when creating endpoints from multiple threads. Signed-off-by: Piotr Chmiel --- prov/verbs/src/verbs_ep.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index 63aea82778d..470f05efedf 100644 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -191,7 +191,7 @@ ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags) } if (vrb_wr_consumes_recv(wr)) { - if (!ep->peer_rq_credits || + if (!ep->peer_rq_credits || (ep->peer_rq_credits == 1 && !(flags & OFI_PRIORITY))) /* Last credit is reserved for credit update */ goto freectx; @@ -1161,7 +1161,7 @@ static struct fi_ops vrb_ep_ops = { .close = vrb_ep_close, .bind = vrb_ep_bind, .control = vrb_ep_control, - .ops_open = fi_no_ops_open, + .ops_open = vrb_ep_ops_open, }; static struct fi_ops_cm vrb_dgram_cm_ops = { @@ -1394,7 +1394,6 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info, *ep_fid = &ep->util_ep.ep_fid; ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops; ep->util_ep.ep_fid.ops = &vrb_ep_base_ops; - (*ep_fid)->fid.ops->ops_open = vrb_ep_ops_open; vrb_prof_func_end("vrb_open_ep"); From 6a03119504001a7d67625c5b6073f2df26bae9d0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:15:18 +0000 Subject: [PATCH 279/393] build(deps): bump 
jidicula/clang-format-action from 4.13.0 to 4.14.0 Bumps [jidicula/clang-format-action](https://github.com/jidicula/clang-format-action) from 4.13.0 to 4.14.0. - [Release notes](https://github.com/jidicula/clang-format-action/releases) - [Commits](https://github.com/jidicula/clang-format-action/compare/c74383674bf5f7c69f60ce562019c1c94bc1421a...d05cecd4a1a5b7e64c22f5a468456135a43f13f6) --- updated-dependencies: - dependency-name: jidicula/clang-format-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/clang-format-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 704b1c91ceb..c566599ccbe 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -13,7 +13,7 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run clang-format style check for C/C++/Protobuf programs. 
- uses: jidicula/clang-format-action@c74383674bf5f7c69f60ce562019c1c94bc1421a # v4.13.0 + uses: jidicula/clang-format-action@d05cecd4a1a5b7e64c22f5a468456135a43f13f6 # v4.14.0 with: clang-format-version: '15' check-path: ${{ matrix.path }} From 6f245834b156763b8ad3f480cf0708cfde5df5c1 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 2 Dec 2024 15:43:27 -0800 Subject: [PATCH 280/393] include/windows/osd.h: remove duplicate strtok_r definition The duplicate definition was causing hangs due to a loop Signed-off-by: Alexia Ingerson --- include/windows/osd.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/windows/osd.h b/include/windows/osd.h index d9698bd9724..efd3bf0d125 100644 --- a/include/windows/osd.h +++ b/include/windows/osd.h @@ -930,11 +930,6 @@ static inline char *strcasestr(const char *haystack, const char *needle) return pos; } -static inline char *strtok_r(char *str, const char *delimiters, char **saveptr) -{ - return strtok_s(str, delimiters, saveptr); -} - #ifndef _SC_PAGESIZE #define _SC_PAGESIZE 0 #endif From 3815260aabb5f56177f3ef025fad6d32a845977a Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 19 Nov 2024 11:18:11 -0800 Subject: [PATCH 281/393] contrib/intel/jenkins: Make middlewares build in parallel Build middlewares in parallel instead of after libfabrics. This will save time because shmem, and mpich won't have to build sequentially. 
Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 81 ++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index a28d420af65..1b830c3ae4a 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -428,40 +428,28 @@ pipeline { } } } - stage ('parallel-builds') { + stage ('build-libfabric') { when { equals expected: true, actual: DO_RUN } parallel { - stage ('build-water') { + stage ('water') { steps { script { dir (CI_LOCATION) { run_ci("pre-build", "pr_build_water.json") - run_ci("pre-build", "pr_build_shmem_water.json") - slurm_batch("totodile", "1", - "${env.LOG_DIR}/build_mpich_water_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=mpich --build_hw=water""" - ) } } } } - stage ('build-grass') { + stage ('grass') { steps { script { dir (CI_LOCATION) { run_ci("pre-build", "pr_build_grass.json") - run_ci("pre-build", "pr_build_shmem_grass.json") - slurm_batch("grass", "1", - "${env.LOG_DIR}/build_mpich_grass_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=mpich --build_hw=grass""" - ) } } } } - stage ('build-electric') { + stage ('electric') { steps { script { dir (CI_LOCATION) { @@ -470,7 +458,7 @@ pipeline { } } } - stage ('build-ucx') { + stage ('ucx') { steps { script { dir (CI_LOCATION) { @@ -479,7 +467,7 @@ pipeline { } } } - stage ('build-cyndaquil') { + stage ('cyndaquil') { steps { script { dir (CI_LOCATION) { @@ -488,7 +476,7 @@ pipeline { } } } - stage ('build-quilava') { + stage ('quilava') { steps { script { dir (CI_LOCATION) { @@ -497,7 +485,7 @@ pipeline { } } } - stage ('build-ivysaur') { + stage ('ivysaur') { steps { script { dir (CI_LOCATION) { @@ -506,7 +494,7 @@ pipeline { } } } - stage ('build-daos') { + stage ('daos') { agent { node { label 'daos_head' @@ -528,7 +516,7 @@ pipeline { } } } - stage ('build-fire') { + stage ('fire') { 
agent { node { label 'ze' @@ -553,6 +541,55 @@ pipeline { } } } + stage('build-middlewares') { + when { equals expected: true, actual: DO_RUN } + parallel { + stage ('shmem-water') { + steps { + script { + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_shmem_water.json") + } + } + } + } + stage ('shmem-grass') { + steps { + script { + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_shmem_grass.json") + } + } + } + } + stage ('mpich-water') { + steps { + script { + dir (CI_LOCATION) { + slurm_batch("totodile", "1", + "${env.LOG_DIR}/build_mpich_water_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=mpich --build_hw=water""" + ) + } + } + } + } + stage ('mpich-grass') { + steps { + script { + dir (CI_LOCATION) { + slurm_batch("grass", "1", + "${env.LOG_DIR}/build_mpich_grass_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=mpich --build_hw=grass""" + ) + } + } + } + } + } + } stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { From 6e7c67e777a85449ee2c4794785ff1afc2b7ea99 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 19 Nov 2024 18:23:32 -0800 Subject: [PATCH 282/393] contrib/intel/jenkins: Make all slurm jobs have the same name Cancel all slurm jobs on abort path Only jobs not run through new CI will use this feature. New CI is capable of cancelling already. 
Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 1b830c3ae4a..230dbc647cf 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -20,6 +20,7 @@ def slurm_batch(partition, node_num, output, command) { try { sh """sbatch --partition=${partition} -N ${node_num} \ --wait -o ${output} --open-mode=append \ + -J ${env.SLURM_JOB_NAME} \ --wrap=\'env; timeout $TIMEOUT ${command}\' """ } catch (Exception e) { @@ -365,6 +366,7 @@ pipeline { WITH_ENV="'PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH'" CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" DELETE_LOCATION="${env.CUSTOM_WORKSPACE}/middlewares" + SLURM_JOB_NAME="${env.JOB_NAME}_${env.BUILD_NUMBER}" RUN_LOCATION="${env.CUSTOM_WORKSPACE}/ci_resources/legacy_pipeline_scripts/" CI_LOCATION="${env.CUSTOM_WORKSPACE}/ci" LOG_DIR = "${env.CUSTOM_WORKSPACE}/log_dir" @@ -983,6 +985,7 @@ pipeline { node ('ze') { dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } } + sh "scancel --jobname=\"${SLURM_JOB_NAME}\"" dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } } success { From a99f2e8ab1b948539d625bf3f55d53d4bd6ab475 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 21 Nov 2024 06:57:58 -0800 Subject: [PATCH 283/393] contrib/intel/jenkins: Properly name shmem grass Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 230dbc647cf..0fbb470d9f0 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -782,7 +782,7 @@ pipeline { script { dir (CI_LOCATION) { run_ci("CI_shmem_grass", "pr_shmem_1n2ppn_grass.json") - run_ci("CI_shmem_water", "pr_shmem_2n1ppn_water.json") + run_ci("CI_shmem_grass", 
"pr_shmem_2n1ppn_grass.json") } } } From 406a44c3e66495a4cb6c05c07c68cfdd819396db Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 21 Nov 2024 06:41:05 -0800 Subject: [PATCH 284/393] contrib/intel/jenkins: Add ompi testing Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 0fbb470d9f0..e083814b9bd 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -564,6 +564,24 @@ pipeline { } } } + stage ('ompi-water') { + steps { + script { + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_ompi_water.json") + } + } + } + } + stage ('ompi-grass') { + steps { + script { + dir (CI_LOCATION) { + run_ci("pre-build", "pr_build_ompi_grass.json") + } + } + } + } stage ('mpich-water') { steps { script { From 573fb07c893952cf96f140b5c17b3a8abe96b00d Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 26 Nov 2024 09:51:10 -0800 Subject: [PATCH 285/393] contrib/intel/jenkins: Add lpp & cxi to opt-out Opt-out of running if all changes are in prov/cxi, or prov/lpp Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index e083814b9bd..ed887325dec 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -336,7 +336,7 @@ def skip() { } echo "Changeset is: ${changeStrings.toArray()}" - if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|contrib\/aws).*$/ }) { + if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|prov\/cxi|prov\/lpp|contrib\/aws).*$/ }) { echo "DONT RUN!" 
return true } From 96eeb8eea18030bca47828fe2d1deb922e8630b3 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Wed, 20 Nov 2024 16:33:34 -0800 Subject: [PATCH 286/393] contrib/intel/jenkins: Rename ivysaur to io_uring Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index ed887325dec..f43e9634b4e 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -487,15 +487,6 @@ pipeline { } } } - stage ('ivysaur') { - steps { - script { - dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_ivysaur.json") - } - } - } - } stage ('daos') { agent { node { @@ -706,13 +697,16 @@ pipeline { } } } - stage('CI_fabtests_ivysaur') { + stage('CI_fabtests_tcp_io_uring') { steps { script { dir (CI_LOCATION) { - run_ci("CI_fabtests_ivysaur_reg", "pr_fabtests_ivysaur_reg.json") - run_ci("CI_fabtests_ivysaur_dbg", "pr_fabtests_ivysaur_dbg.json") - run_ci("CI_fabtests_ivysaur_dl", "pr_fabtests_ivysaur_dl.json") + run_ci("CI_fabtests_tcp_io_uring_reg", + "pr_fabtests_tcp_io_uring_reg.json") + run_ci("CI_fabtests_tcp_io_uring_dbg", + "pr_fabtests_tcp_io_uring_dbg.json") + run_ci("CI_fabtests_tcp_io_uring_dl", + "pr_fabtests_tcp_io_uring_dl.json") } } } From 86494eaa3a73c7954b8535acf8f1476a06fe5956 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 26 Nov 2024 15:27:11 -0800 Subject: [PATCH 287/393] contrib/intel/jenkins: Migrate ucx to water Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index f43e9634b4e..95fdcc9ba71 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -460,15 +460,6 @@ pipeline { } } } - stage ('ucx') { - steps { - script { - dir (CI_LOCATION) { - run_ci("pre-build", 
"pr_build_ucx.json") - } - } - } - } stage ('cyndaquil') { steps { script { From 8274ece2dfbd7f914153b0528053b49b14574088 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Wed, 27 Nov 2024 14:22:30 -0800 Subject: [PATCH 288/393] contrib/intel/jenkins: Update target on weekly Since weekly jobs aren't PR's the target needs to be manually set. Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 95fdcc9ba71..b4a1d141560 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -7,6 +7,7 @@ properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def BUILD_MODES=["reg", "dbg", "dl"] @Field def PYTHON_VERSION="3.9" @Field def TIMEOUT="7200" +@Field def weekly=false def run_python(version, command, output=null) { if (output != null) @@ -272,9 +273,17 @@ def bootstrap_ci() { } def checkout_tar(name) { + if (env.WEEKLY == null) { + weekly = false + } else { + weekly = env.WEEKLY.toBoolean() + } dir ("${env.CUSTOM_WORKSPACE}/${name}/libfabric") { checkout scm TARGET=check_target() + if (weekly) { + TARGET=env.WEEKLY_TARGET + } sh """ git remote add upstream ${env.UPSTREAM} git pull --rebase upstream ${TARGET} @@ -386,11 +395,6 @@ pipeline { steps { script { git_diffs() - if (env.WEEKLY == null) { - weekly = false - } else { - weekly = env.WEEKLY.toBoolean() - } if (weekly) { TIMEOUT="21600" } From 0931d0636261d12f920be56f59b99bafda736626 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Mon, 2 Dec 2024 14:29:26 -0800 Subject: [PATCH 289/393] contrib/intel/jenkins: Migrate to new head node Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index b4a1d141560..f5ce711e2f3 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ 
b/contrib/intel/jenkins/Jenkinsfile @@ -361,7 +361,7 @@ def skip() { pipeline { agent { node { - label 'main' + label 'cbj-main' customWorkspace "workspace/${JOB_NAME}/${env.BUILD_NUMBER}" } } From 335829fe7818d4eb165bc1630ac1867689eb899b Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 24 Oct 2024 11:19:43 -0700 Subject: [PATCH 290/393] prov/rxm: fix definition of the rxm SAR segment enum The rxm SAR segment type enum was defined inside another struct. While techincally ok, this made it difficult for editors to find the type and reported compiler errors. This cleans it up to make it more readible and easier for editors to find the type Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 16074babeac..e2759d6d077 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -417,13 +417,15 @@ struct rxm_pkt { char data[]; }; +enum rxm_sar_seg_type { + RXM_SAR_SEG_FIRST = 1, + RXM_SAR_SEG_MIDDLE = 2, + RXM_SAR_SEG_LAST = 3, +}; + union rxm_sar_ctrl_data { struct { - enum rxm_sar_seg_type { - RXM_SAR_SEG_FIRST = 1, - RXM_SAR_SEG_MIDDLE = 2, - RXM_SAR_SEG_LAST = 3, - } seg_type : 2; + enum rxm_sar_seg_type seg_type : 2; uint32_t offset; }; uint64_t align; From 975428c077c71325c7d595e7d4ebca22561beebc Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 23 Oct 2024 15:40:06 -0700 Subject: [PATCH 291/393] prov/rxm: add FI_AV_USER_ID support Add application side support for FI_AV_USER_ID which requires saving the fi_addr input as the internal fi_addr (for both the peer API srx use case and for reporting unique source address information). 
When supporting the capability for the application, remove it form the core provider information as it is only required on the top layer Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm_attr.c | 2 +- prov/rxm/src/rxm_init.c | 3 +++ prov/util/src/rxm_av.c | 28 ++++++++++++++++++++++------ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/prov/rxm/src/rxm_attr.c b/prov/rxm/src/rxm_attr.c index defa7771188..632543585e4 100644 --- a/prov/rxm/src/rxm_attr.c +++ b/prov/rxm/src/rxm_attr.c @@ -40,7 +40,7 @@ OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ FI_MULTI_RECV) -#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) +#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) /* Since we are a layering provider, the attributes for which we rely on the diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index 78610bc5f04..1a76796d4e0 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -262,6 +262,9 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->rx_attr->op_flags &= ~FI_MULTI_RECV; + core_info->domain_attr->caps &= ~(FI_AV_USER_ID); + core_info->caps &= ~(FI_AV_USER_ID); + return 0; } diff --git a/prov/util/src/rxm_av.c b/prov/util/src/rxm_av.c index 69a68a884db..a5e30c95026 100644 --- a/prov/util/src/rxm_av.c +++ b/prov/util/src/rxm_av.c @@ -165,7 +165,7 @@ rxm_put_peer_addr(struct rxm_av *av, fi_addr_t fi_addr) static int rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count, - fi_addr_t *fi_addr) + fi_addr_t *fi_addr, fi_addr_t *user_ids) { struct util_peer_addr *peer; const void *cur_addr; @@ -178,8 +178,12 @@ rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count, if (!peer) goto err; - peer->fi_addr = fi_addr ? fi_addr[i] : + if (user_ids) { + peer->fi_addr = user_ids[i]; + } else { + peer->fi_addr = fi_addr ? 
fi_addr[i] : ofi_av_lookup_fi_addr(&av->util_av, cur_addr); + } /* lookup can fail if prior AV insertion failed */ if (peer->fi_addr != FI_ADDR_NOTAVAIL) @@ -276,21 +280,33 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { struct rxm_av *av; + fi_addr_t *user_ids = NULL; int ret; + if (flags & FI_AV_USER_ID) { + assert(fi_addr); + user_ids = calloc(count, sizeof(*user_ids)); + assert(user_ids); + memcpy(user_ids, fi_addr, sizeof(*fi_addr) * count); + } + av = container_of(av_fid, struct rxm_av, util_av.av_fid.fid); ret = ofi_ip_av_insert(av_fid, addr, count, fi_addr, flags, context); if (ret < 0) - return ret; + goto out; count = ret; - ret = rxm_av_add_peers(av, addr, count, fi_addr); + ret = rxm_av_add_peers(av, addr, count, fi_addr, user_ids); if (ret) { rxm_av_remove(av_fid, fi_addr, count, flags); - return ret; + goto out; } +out: + free(user_ids); + if (ret) + return ret; return (int) count; } @@ -319,7 +335,7 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node, if (ret > 0 && ret < count) count = ret; - ret = rxm_av_add_peers(av, addr, count, fi_addr); + ret = rxm_av_add_peers(av, addr, count, fi_addr, NULL); if (ret) { rxm_av_remove(av_fid, fi_addr, count, flags); return ret; From de410e4d93145eb455dbaeb043147846bf351804 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 21 Oct 2024 15:40:44 -0700 Subject: [PATCH 292/393] prov/rxm: add rxm support for using a peer CQs and counters Support using the peer APIs by default using the util peer helper functions. Instead of going through the rxm-specific functions to write to CQs and counters, use the ofi_peer_cq/cntr APIs which use the owner ops. In the default case where rxm is not being used as a peer these will go to the regular ofi_cq_write functions. 
Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm.h | 77 +-------------- prov/rxm/src/rxm_cq.c | 200 +++++++++++++++++++++----------------- prov/rxm/src/rxm_ep.c | 44 +++++---- prov/rxm/src/rxm_msg.c | 4 +- prov/rxm/src/rxm_tagged.c | 15 +-- 5 files changed, 146 insertions(+), 194 deletions(-) diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index e2759d6d077..5d18f16e157 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -759,9 +759,10 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf); int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); - -void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, - void *op_context, int err); +void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err); +void rxm_cq_write_rx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err); void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err); void rxm_handle_comp_error(struct rxm_ep *rxm_ep); ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp); @@ -880,50 +881,6 @@ int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf, size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr); -static inline void rxm_cntr_incerr(struct util_cntr *cntr) -{ - if (cntr) - cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1); -} - -static inline void -rxm_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag) -{ - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); - - ret = ofi_cq_write(cq, context, flags, len, buf, data, tag); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - assert(0); - } - if (cq->wait) - cq->wait->signal(cq->wait); -} - -static inline void -rxm_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, 
uint64_t tag, fi_addr_t addr) -{ - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); - - ret = ofi_cq_write_src(cq, context, flags, len, buf, data, tag, addr); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - assert(0); - } - if (cq->wait) - cq->wait->signal(cq->wait); -} - ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr, struct rxm_conn **rxm_conn); @@ -998,32 +955,6 @@ rxm_recv_entry_release(struct rxm_recv_entry *entry) ofi_buf_free(entry); } -static inline void -rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, uint64_t flags, - size_t len, char *buf) -{ - if (rx_buf->ep->util_coll_peer_xfer_ops && - rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { - struct fi_cq_tagged_entry cqe = { - .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, - }; - rx_buf->ep->util_coll_peer_xfer_ops-> - complete(rx_buf->ep->util_coll_ep, &cqe, 0); - return; - } - - if (rx_buf->ep->rxm_info->caps & FI_SOURCE) - rxm_cq_write_src(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, - rx_buf->conn->peer->fi_addr); - else - rxm_cq_write(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag); -} - struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key); struct rxm_recv_entry * diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c index 27c8cc6f1c0..b04b36444d3 100644 --- a/prov/rxm/src/rxm_cq.c +++ b/prov/rxm/src/rxm_cq.c @@ -101,6 +101,35 @@ static void rxm_replace_rx_buf(struct rxm_rx_buf *rx_buf) ofi_buf_free(new_rx_buf); } +static void rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, + uint64_t flags, size_t len, char *buf) +{ + int ret; + + if (rx_buf->ep->util_coll_peer_xfer_ops && + rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { + struct fi_cq_tagged_entry cqe = { + .tag = rx_buf->pkt.hdr.tag, + 
.op_context = rx_buf->recv_entry->context, + }; + rx_buf->ep->util_coll_peer_xfer_ops-> + complete(rx_buf->ep->util_coll_ep, &cqe, 0); + return; + } + if (rx_buf->ep->rxm_info->caps & FI_SOURCE) + ret = ofi_peer_cq_write(rx_buf->ep->util_ep.rx_cq, context, + flags, len, buf, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, + rx_buf->conn->peer->fi_addr); + else + ret = ofi_peer_cq_write(rx_buf->ep->util_ep.rx_cq, context, + flags, len, buf, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); + if (ret) + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to write rx completion\n"); +} + static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) { uint64_t flags; @@ -136,19 +165,19 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) int ret; if (rx_buf->ep->util_ep.flags & OFI_CNTR_ENABLED) - rxm_cntr_incerr(rx_buf->ep->util_ep.cntrs[CNTR_RX]); + ofi_ep_peer_rx_cntr_incerr(&rx_buf->ep->util_ep, ofi_op_msg); FI_WARN(&rxm_prov, FI_LOG_CQ, "Message truncated: " "recv buf length: %zu message length: %" PRIu64 "\n", done_len, rx_buf->pkt.hdr.size); - ret = ofi_cq_write_error_trunc(rx_buf->ep->util_ep.rx_cq, - rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags, - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov.iov[0].iov_base, - rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag, - rx_buf->pkt.hdr.size - done_len); + ret = ofi_peer_cq_write_error_trunc( + rx_buf->ep->util_ep.rx_cq, + rx_buf->recv_entry->context, + rx_buf->recv_entry->comp_flags | + rx_buf->pkt.hdr.flags, rx_buf->pkt.hdr.size, + rx_buf->recv_entry->rxm_iov.iov[0].iov_base, + rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag, + rx_buf->pkt.hdr.size - done_len); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write recv error CQ\n"); assert(0); @@ -166,16 +195,16 @@ static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) if (rx_buf->recv_entry->flags & FI_COMPLETION || rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { - 
rxm_cq_write_recv_comp(rx_buf, rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags | - (rx_buf->recv_entry->flags & FI_MULTI_RECV), - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov. - iov[0].iov_base); - } - ofi_ep_cntr_inc(&rx_buf->ep->util_ep, CNTR_RX); - + rxm_cq_write_recv_comp( + rx_buf, rx_buf->recv_entry->context, + rx_buf->recv_entry->comp_flags | + rx_buf->pkt.hdr.flags | + (rx_buf->recv_entry->flags & FI_MULTI_RECV), + rx_buf->pkt.hdr.size, + rx_buf->recv_entry->rxm_iov. + iov[0].iov_base); + } + ofi_ep_peer_rx_cntr_inc(&rx_buf->ep->util_ep, ofi_op_msg); release: rxm_recv_entry_release(recv_entry); rxm_free_rx_buf(rx_buf); @@ -186,8 +215,9 @@ rxm_cq_write_tx_comp(struct rxm_ep *rxm_ep, uint64_t comp_flags, void *app_context, uint64_t flags) { if (flags & FI_COMPLETION) { - rxm_cq_write(rxm_ep->util_ep.tx_cq, app_context, - comp_flags, 0, NULL, 0, 0); + (void) ofi_peer_cq_write(rxm_ep->util_ep.tx_cq, app_context, + comp_flags, 0, NULL, 0, 0, + FI_ADDR_NOTAVAIL); } } @@ -201,9 +231,9 @@ static void rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_tx_buf *rma_buf, rma_buf->flags); if (comp_flags & FI_WRITE) - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_WR); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_write); else - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_RD); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_read_req); if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local && rxm_ep->msg_mr_local) { @@ -219,7 +249,7 @@ void rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_buf) rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), tx_buf->app_context, tx_buf->flags); - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg); } static bool rxm_complete_sar(struct rxm_ep *rxm_ep, @@ -259,7 +289,7 @@ static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep, return; rxm_cq_write_tx_comp(rxm_ep, comp_flags, app_context, tx_flags); - 
ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg); } static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf) @@ -295,7 +325,7 @@ static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, ofi_buf_free(tx_buf->write_rndv.done_buf); tx_buf->write_rndv.done_buf = NULL; } - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg); rxm_free_tx_buf(rxm_ep, tx_buf); } @@ -518,8 +548,8 @@ ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf) rx_buf->recv_entry->rxm_iov.count, total_len, rx_buf); if (ret) { - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.cntrs[CNTR_RX], rx_buf, (int) ret); + rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, rx_buf, + (int) ret); } return ret; } @@ -561,9 +591,8 @@ static ssize_t rxm_rndv_handle_wr_data(struct rxm_rx_buf *rx_buf) tx_buf->rma.count, total_len, tx_buf); if (ret) - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.cntrs[CNTR_RX], - tx_buf, (int) ret); + rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, tx_buf, (int) ret); + rxm_free_rx_buf(rx_buf); return ret; } @@ -986,9 +1015,9 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) static void rxm_handle_remote_write(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) { - rxm_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, comp->len, NULL, - comp->data, 0); - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_WR); + ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, comp->len, + NULL, comp->data, 0, FI_ADDR_NOTAVAIL); + ofi_ep_peer_rx_cntr_inc(&rxm_ep->util_ep, ofi_op_write); if (comp->op_context) rxm_free_rx_buf(comp->op_context); } @@ -1222,10 +1251,7 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, } result_len = op == ofi_op_atomic ? 
0 : offset; - if (op == ofi_op_atomic) - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_WR); - else - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_RD); + ofi_ep_peer_rx_cntr_inc(&rxm_ep->util_ep, op); return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, result_len, FI_SUCCESS); @@ -1236,7 +1262,6 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, { struct rxm_tx_buf *tx_buf; struct rxm_atomic_resp_hdr *resp_hdr; - struct util_cntr *cntr = NULL; uint64_t len; ssize_t copy_len; ssize_t ret = 0; @@ -1286,33 +1311,15 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), tx_buf->app_context, tx_buf->flags); - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_WR); - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_RD); - } else { - ret = -FI_EOPNOTSUPP; - goto write_err; - } + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, tx_buf->pkt.hdr.op); free: rxm_free_rx_buf(rx_buf); rxm_free_tx_buf(rxm_ep, tx_buf); return ret; write_err: - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - cntr = rxm_ep->util_ep.cntrs[CNTR_WR]; - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - cntr = rxm_ep->util_ep.cntrs[CNTR_RD]; - } else { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "unknown atomic request op!\n"); - assert(0); - } - rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr, - tx_buf->app_context, (int) ret); + rxm_cq_write_tx_error(rxm_ep, tx_buf->pkt.hdr.op, tx_buf->app_context, + (int) ret); goto free; } @@ -1480,23 +1487,38 @@ ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) } } -void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, - void *op_context, int err) +void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err) { struct fi_cq_err_entry err_entry = {0}; 
err_entry.op_context = op_context; err_entry.prov_errno = err; err_entry.err = -err; - if (cntr) - rxm_cntr_incerr(cntr); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, op); - if (ofi_cq_write_error(cq, &err_entry)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + if (ofi_peer_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry)) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } +void rxm_cq_write_rx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err) +{ + struct fi_cq_err_entry err_entry = {0}; + err_entry.op_context = op_context; + err_entry.prov_errno = err; + err_entry.err = -err; + + ofi_ep_peer_rx_cntr_incerr(&rxm_ep->util_ep, op); + + if (ofi_peer_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry)) + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); +} + void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) { struct fi_cq_err_entry err_entry = {0}; @@ -1505,32 +1527,26 @@ void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) err_entry.prov_errno = err; err_entry.err = -err; if (rxm_ep->util_ep.tx_cq) { - ret = ofi_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry); + ret = ofi_peer_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to ofi_cq_write_error\n"); + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } if (rxm_ep->util_ep.rx_cq) { - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); + ret = ofi_peer_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to ofi_cq_write_error\n"); + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } - if (rxm_ep->util_ep.cntrs[CNTR_TX]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_TX]); - - if (rxm_ep->util_ep.cntrs[CNTR_RX]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_RX]); - - if (rxm_ep->util_ep.cntrs[CNTR_WR]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_WR]); - if 
(rxm_ep->util_ep.cntrs[CNTR_RD]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_RD]); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_msg); + ofi_ep_peer_rx_cntr_incerr(&rxm_ep->util_ep, ofi_op_msg); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_write); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_read_req); } void rxm_handle_comp_error(struct rxm_ep *rxm_ep) @@ -1583,7 +1599,7 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) case RXM_INJECT_TX: rxm_free_tx_buf(rxm_ep, err_entry.op_context); if (cntr) - rxm_cntr_incerr(cntr); + cntr->peer_cntr->owner_ops->incerr(cntr->peer_cntr); return; case RXM_CREDIT_TX: case RXM_ATOMIC_RESP_SENT: /* BUG: should have consumed tx credit */ @@ -1647,12 +1663,13 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) } if (cntr) - rxm_cntr_incerr(cntr); + cntr->peer_cntr->owner_ops->incerr(cntr->peer_cntr); assert(cq); - ret = ofi_cq_write_error(cq, &err_entry); + ret = ofi_peer_cq_write_error(cq, &err_entry); if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } @@ -1665,8 +1682,8 @@ ssize_t rxm_thru_comp(struct rxm_ep *ep, struct fi_cq_data_entry *comp) cq = (comp->flags & (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ)) ? ep->util_ep.rx_cq : ep->util_ep.tx_cq; - ret = ofi_cq_write(cq, comp->op_context, comp->flags, comp->len, - comp->buf, comp->data, 0); + ret = ofi_peer_cq_write(cq, comp->op_context, comp->flags, comp->len, + comp->buf, comp->data, 0, FI_ADDR_NOTAVAIL); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to report completion\n"); assert(0); @@ -1692,9 +1709,10 @@ void rxm_thru_comp_error(struct rxm_ep *ep) } cq = (err_entry.flags & FI_RECV) ? 
ep->util_ep.rx_cq : ep->util_ep.tx_cq; - ret = ofi_cq_write_error(cq, &err_entry); + ret = ofi_peer_cq_write_error(cq, &err_entry); if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } @@ -1730,8 +1748,8 @@ ssize_t rxm_cq_owner_write(struct fid_peer_cq *peer_cq, void *context, } rxm_cq = container_of(peer_cq, struct rxm_cq, peer_cq); - return ofi_cq_write(&rxm_cq->util_cq, req->app_context, req->flags, len, - buf, data, tag); + return ofi_peer_cq_write(&rxm_cq->util_cq, req->app_context, req->flags, + len, buf, data, tag, FI_ADDR_NOTAVAIL); } ssize_t rxm_cq_owner_writeerr(struct fid_peer_cq *peer_cq, @@ -1751,7 +1769,7 @@ ssize_t rxm_cq_owner_writeerr(struct fid_peer_cq *peer_cq, } rxm_cq = container_of(peer_cq, struct rxm_cq, peer_cq); - return ofi_cq_write_error(&rxm_cq->util_cq, &cqe_err); + return ofi_peer_cq_write_error(&rxm_cq->util_cq, &cqe_err); } int rxm_post_recv(struct rxm_rx_buf *rx_buf) diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index ba6a949122e..69a88e2caaf 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -746,9 +746,8 @@ rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry, { rxm_ep_sar_tx_cleanup(def_tx_entry->rxm_ep, def_tx_entry->rxm_conn, def_tx_entry->sar_seg.cur_seg_tx_buf); - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_TX], - def_tx_entry->sar_seg.app_context, (int) ret); + rxm_cq_write_tx_error(def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->sar_seg.app_context, (int) ret); } /* Returns FI_SUCCESS if the SAR deferred TX queue is empty, @@ -843,10 +842,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_ack.rx_buf-> - recv_entry->context, 
(int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_ack.rx_buf-> + recv_entry->context, (int) ret); } if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv .tx_buf->pkt.ctrl_hdr @@ -868,9 +867,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_TX], - def_tx_entry->rndv_done.tx_buf, (int) ret); + rxm_cq_write_tx_error(def_tx_entry->rxm_ep, + ofi_op_msg, + def_tx_entry->rndv_done.tx_buf, + (int) ret); } RXM_UPDATE_STATE(FI_LOG_EP_DATA, def_tx_entry->rndv_done.tx_buf, @@ -888,10 +888,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_read.rx_buf-> + recv_entry->context, (int) ret); } break; case RXM_DEFERRED_TX_RNDV_WRITE: @@ -906,9 +906,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_write.tx_buf, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_write.tx_buf, + (int) ret); } break; case RXM_DEFERRED_TX_SAR_SEG: @@ -939,11 +940,12 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, OFI_PRIORITY); if (ret) { if (ret != -FI_EAGAIN) { - rxm_cq_write_error( - def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, + ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + recv_entry->context, + (int) ret); } return; } diff --git 
a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c index 46cd1cfe285..3b9088a2858 100644 --- a/prov/rxm/src/rxm_msg.c +++ b/prov/rxm/src/rxm_msg.c @@ -140,8 +140,8 @@ rxm_post_mrecv(struct rxm_ep *ep, const struct iovec *iov, if ((cur_iov.iov_len < ep->min_multi_recv_size) || (ret && cur_iov.iov_len != iov->iov_len)) { - rxm_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, - 0, NULL, 0, 0); + ofi_peer_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, + 0, NULL, 0, 0, FI_ADDR_NOTAVAIL); } return ret; diff --git a/prov/rxm/src/rxm_tagged.c b/prov/rxm/src/rxm_tagged.c index 78e3d3ff0e9..8f18f34b3eb 100644 --- a/prov/rxm/src/rxm_tagged.c +++ b/prov/rxm/src/rxm_tagged.c @@ -50,8 +50,9 @@ rxm_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message", rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag); - rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - 0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, + 0, NULL, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); rxm_free_rx_buf(rx_buf); } @@ -73,8 +74,8 @@ rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore); if (!rx_buf) { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n"); - ret = ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag, - context); + ret = ofi_peer_cq_write_error_peek( + rxm_ep->util_ep.rx_cq, tag, context); if (ret) FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); return; @@ -94,9 +95,9 @@ rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, dlist_remove(&rx_buf->unexp_msg.entry); } - rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - rx_buf->pkt.hdr.size, NULL, - rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, + rx_buf->pkt.hdr.size, NULL, rx_buf->pkt.hdr.data, 
+ rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); } static ssize_t From 622a773151faacf1126d3792fbd8500689aeae64 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 31 Oct 2024 08:48:51 -0700 Subject: [PATCH 293/393] prov/rxm: replace rxm managed srx with util srx, support FI_PEER Remove rxm implementation of receive queues and leverage the util srx implementation which supports the peer srx API. This allows rxm to use the peer API calls to match receives. To do this, move the rxm protocol information from the receive entry into the rx_buf and allocate it dynamically as needed to track protocol information. This allows rxm to use the default peer_rx_entry instead of its own custom receive entry. With this last piece of the peer API implemented, rxm can also now advertise full support of the FI_PEER capability. Just like the FI_AV_USER_ID capability, rxm removes the bit from the core provider info as it is only a requirement from the application side and not from the message provider Signed-off-by: Alexia Ingerson --- include/ofi_util.h | 5 +- prov/rxm/src/rxm.h | 119 +++----- prov/rxm/src/rxm_attr.c | 3 +- prov/rxm/src/rxm_conn.c | 13 +- prov/rxm/src/rxm_cq.c | 390 ++++++++++++++------------ prov/rxm/src/rxm_domain.c | 24 +- prov/rxm/src/rxm_ep.c | 566 +++++++++++++------------------------- prov/rxm/src/rxm_init.c | 4 +- prov/rxm/src/rxm_msg.c | 235 ++-------------- prov/rxm/src/rxm_tagged.c | 210 ++------------ prov/tcp/src/xnet_av.c | 2 +- prov/util/src/rxm_av.c | 15 +- 12 files changed, 548 insertions(+), 1038 deletions(-) diff --git a/include/ofi_util.h b/include/ofi_util.h index dda5c903e6e..bc590bb4d1a 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -955,12 +955,15 @@ struct rxm_av { struct fid_peer_av peer_av; struct fid_av *util_coll_av; struct fid_av *offload_coll_av; + void (*foreach_ep)(struct util_av *av, struct util_ep *util_ep); }; int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void 
*context, size_t conn_size, void (*remove_handler)(struct util_ep *util_ep, - struct util_peer_addr *peer)); + struct util_peer_addr *peer), + void (*foreach_ep)(struct util_av *av, + struct util_ep *ep)); size_t rxm_av_max_peers(struct rxm_av *av); void rxm_ref_peer(struct util_peer_addr *peer); void *rxm_av_alloc_conn(struct rxm_av *av); diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 5d18f16e157..93e08624fc1 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -183,9 +183,9 @@ do { \ extern struct fi_provider rxm_prov; extern struct util_prov rxm_util_prov; -extern struct fi_ops_msg rxm_msg_ops; +extern struct fi_ops_msg rxm_msg_ops, rxm_no_recv_msg_ops; extern struct fi_ops_msg rxm_msg_thru_ops; -extern struct fi_ops_tagged rxm_tagged_ops; +extern struct fi_ops_tagged rxm_tagged_ops, rxm_no_recv_tagged_ops; extern struct fi_ops_tagged rxm_tagged_thru_ops; extern struct fi_ops_rma rxm_rma_ops; extern struct fi_ops_rma rxm_rma_thru_ops; @@ -265,6 +265,8 @@ struct rxm_fabric { struct rxm_domain { struct util_domain util_domain; struct fid_domain *msg_domain; + struct fid_ep rx_ep; + struct fid_peer_srx *srx; size_t max_atomic_size; size_t rx_post_size; uint64_t mr_key; @@ -443,24 +445,29 @@ rxm_sar_set_seg_type(struct ofi_ctrl_hdr *ctrl_hdr, enum rxm_sar_seg_type seg_ty ((union rxm_sar_ctrl_data *)&(ctrl_hdr->ctrl_data))->seg_type = seg_type; } -struct rxm_recv_match_attr { - fi_addr_t addr; - uint64_t tag; - uint64_t ignore; -}; - -struct rxm_unexp_msg { - struct dlist_entry entry; - fi_addr_t addr; - uint64_t tag; -}; - struct rxm_iov { struct iovec iov[RXM_IOV_LIMIT]; void *desc[RXM_IOV_LIMIT]; uint8_t count; }; +struct rxm_proto_info { + /* Used for SAR protocol */ + struct { + struct dlist_entry entry; + struct dlist_entry pkt_list; + struct fi_peer_rx_entry *rx_entry; + size_t total_recv_len; + struct rxm_conn *conn; + uint64_t msg_id; + } sar; + /* Used for Rendezvous protocol */ + struct { + /* This is used to send RNDV ACK */ + struct 
rxm_tx_buf *tx_buf; + } rndv; +}; + struct rxm_buf { /* Must stay at top */ struct fi_context fi_context; @@ -478,9 +485,10 @@ struct rxm_rx_buf { /* MSG EP / shared context to which bufs would be posted to */ struct fid_ep *rx_ep; struct dlist_entry repost_entry; + struct dlist_entry unexp_entry; struct rxm_conn *conn; /* msg ep data was received on */ - struct rxm_recv_entry *recv_entry; - struct rxm_unexp_msg unexp_msg; + struct fi_peer_rx_entry *peer_entry; + struct rxm_proto_info *proto_info; uint64_t comp_flags; struct fi_recv_context recv_context; bool repost; @@ -608,49 +616,6 @@ struct rxm_deferred_tx_entry { }; }; -struct rxm_recv_entry { - struct dlist_entry entry; - struct rxm_iov rxm_iov; - fi_addr_t addr; - void *context; - uint64_t flags; - uint64_t tag; - uint64_t ignore; - uint64_t comp_flags; - size_t total_len; - struct rxm_recv_queue *recv_queue; - - /* Used for SAR protocol */ - struct { - struct dlist_entry entry; - size_t total_recv_len; - struct rxm_conn *conn; - uint64_t msg_id; - } sar; - /* Used for Rendezvous protocol */ - struct { - /* This is used to send RNDV ACK */ - struct rxm_tx_buf *tx_buf; - } rndv; -}; -OFI_DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs); - -enum rxm_recv_queue_type { - RXM_RECV_QUEUE_UNSPEC, - RXM_RECV_QUEUE_MSG, - RXM_RECV_QUEUE_TAGGED, -}; - -struct rxm_recv_queue { - struct rxm_ep *rxm_ep; - enum rxm_recv_queue_type type; - struct rxm_recv_fs *fs; - struct dlist_entry recv_list; - struct dlist_entry unexp_msg_list; - dlist_func_t *match_recv; - dlist_func_t *match_unexp; -}; - struct rxm_eager_ops { void (*comp_tx)(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_eager_buf); @@ -690,6 +655,8 @@ struct rxm_ep { struct fi_ops_transfer_peer *offload_coll_peer_xfer_ops; uint64_t offload_coll_mask; + struct fid_peer_srx *srx; + struct fid_cq *msg_cq; uint64_t msg_cq_last_poll; size_t comp_per_progress; @@ -703,7 +670,6 @@ struct rxm_ep { bool do_progress; bool enable_direct_send; - size_t min_multi_recv_size; 
size_t buffered_min; size_t buffered_limit; size_t inject_limit; @@ -715,15 +681,13 @@ struct rxm_ep { struct ofi_bufpool *rx_pool; struct ofi_bufpool *tx_pool; struct ofi_bufpool *coll_pool; + struct ofi_bufpool *proto_info_pool; + struct rxm_pkt *inject_pkt; struct dlist_entry deferred_queue; struct dlist_entry rndv_wait_list; - struct rxm_recv_queue recv_queue; - struct rxm_recv_queue trecv_queue; - struct ofi_bufpool *multi_recv_pool; - struct rxm_eager_ops *eager_ops; struct rxm_rndv_ops *rndv_ops; }; @@ -757,6 +721,9 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf); +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context); + int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, @@ -915,17 +882,10 @@ ssize_t rxm_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len); -struct rxm_recv_entry * -rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue); -struct rxm_rx_buf * -rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore); -ssize_t rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry, - struct rxm_rx_buf *rx_buf); +ssize_t rxm_handle_unexp_sar(struct fi_peer_rx_entry *peer_entry); +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context); + int rxm_post_recv(struct rxm_rx_buf *rx_buf); void rxm_av_remove_handler(struct util_ep *util_ep, struct util_peer_addr *peer); @@ -946,15 +906,6 @@ rxm_free_rx_buf(struct rxm_rx_buf *rx_buf) } } 
-static inline void -rxm_recv_entry_release(struct rxm_recv_entry *entry) -{ - if (entry->recv_queue) - ofi_freestack_push(entry->recv_queue->fs, entry); - else - ofi_buf_free(entry); -} - struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key); struct rxm_recv_entry * diff --git a/prov/rxm/src/rxm_attr.c b/prov/rxm/src/rxm_attr.c index 632543585e4..6dc1241329e 100644 --- a/prov/rxm/src/rxm_attr.c +++ b/prov/rxm/src/rxm_attr.c @@ -40,7 +40,8 @@ OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ FI_MULTI_RECV) -#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) +#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID | \ + FI_PEER) /* Since we are a layering provider, the attributes for which we rely on the diff --git a/prov/rxm/src/rxm_conn.c b/prov/rxm/src/rxm_conn.c index afe603234ec..73b26f2a9f3 100644 --- a/prov/rxm/src/rxm_conn.c +++ b/prov/rxm/src/rxm_conn.c @@ -58,7 +58,7 @@ struct rxm_eq_cm_entry { static void rxm_close_conn(struct rxm_conn *conn) { struct rxm_deferred_tx_entry *tx_entry; - struct rxm_recv_entry *rx_entry; + struct fi_peer_rx_entry *rx_entry; struct rxm_rx_buf *buf; FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing conn %p\n", conn); @@ -74,16 +74,13 @@ static void rxm_close_conn(struct rxm_conn *conn) while (!dlist_empty(&conn->deferred_sar_segments)) { buf = container_of(conn->deferred_sar_segments.next, - struct rxm_rx_buf, unexp_msg.entry); - dlist_remove(&buf->unexp_msg.entry); - rxm_free_rx_buf(buf); + struct rxm_rx_buf, unexp_entry); + dlist_remove(&buf->unexp_entry); } while (!dlist_empty(&conn->deferred_sar_msgs)) { - rx_entry = container_of(conn->deferred_sar_msgs.next, - struct rxm_recv_entry, sar.entry); - dlist_remove(&rx_entry->entry); - rxm_recv_entry_release(rx_entry); + rx_entry = (struct fi_peer_rx_entry*)conn->deferred_sar_msgs.next; + rx_entry->srx->owner_ops->free_entry(rx_entry); } fi_close(&conn->msg_ep->fid); rxm_flush_msg_cq(conn->ep); diff --git 
a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c index b04b36444d3..51206ddde04 100644 --- a/prov/rxm/src/rxm_cq.c +++ b/prov/rxm/src/rxm_cq.c @@ -106,11 +106,12 @@ static void rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, { int ret; + flags &= ~FI_COMPLETION; if (rx_buf->ep->util_coll_peer_xfer_ops && rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { struct fi_cq_tagged_entry cqe = { .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, + .op_context = rx_buf->peer_entry->context, }; rx_buf->ep->util_coll_peer_xfer_ops-> complete(rx_buf->ep->util_coll_ep, &cqe, 0); @@ -137,7 +138,7 @@ static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) if ((rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) && rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) { - dlist_insert_tail(&rx_buf->unexp_msg.entry, + dlist_insert_tail(&rx_buf->unexp_entry, &rx_buf->conn->deferred_sar_segments); rxm_replace_rx_buf(rx_buf); } @@ -172,10 +173,11 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) done_len, rx_buf->pkt.hdr.size); ret = ofi_peer_cq_write_error_trunc( rx_buf->ep->util_ep.rx_cq, - rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags, rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov.iov[0].iov_base, + rx_buf->peer_entry->context, + rx_buf->peer_entry->flags | + rx_buf->pkt.hdr.flags, + rx_buf->pkt.hdr.size, + rx_buf->peer_entry->iov[0].iov_base, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag, rx_buf->pkt.hdr.size - done_len); if (ret) { @@ -186,27 +188,22 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) { - struct rxm_recv_entry *recv_entry = rx_buf->recv_entry; - if (done_len < rx_buf->pkt.hdr.size) { rxm_cq_write_error_trunc(rx_buf, done_len); goto release; } - if (rx_buf->recv_entry->flags & FI_COMPLETION || + if (rx_buf->peer_entry->flags & FI_COMPLETION || 
rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { - rxm_cq_write_recv_comp( - rx_buf, rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags | - (rx_buf->recv_entry->flags & FI_MULTI_RECV), - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov. - iov[0].iov_base); + rxm_cq_write_recv_comp(rx_buf, rx_buf->peer_entry->context, + rx_buf->peer_entry->flags | + rx_buf->pkt.hdr.flags, + rx_buf->pkt.hdr.size, + rx_buf->peer_entry->iov[0].iov_base); } ofi_ep_peer_rx_cntr_inc(&rx_buf->ep->util_ep, ofi_op_msg); release: - rxm_recv_entry_release(recv_entry); + rx_buf->ep->srx->owner_ops->free_entry(rx_buf->peer_entry); rxm_free_rx_buf(rx_buf); } @@ -294,18 +291,20 @@ static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep, static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf) { + struct rxm_proto_info *proto_info; + RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_FINISH); - if (rx_buf->recv_entry->rndv.tx_buf) { - ofi_buf_free(rx_buf->recv_entry->rndv.tx_buf); - rx_buf->recv_entry->rndv.tx_buf = NULL; + proto_info = rx_buf->proto_info; + if (proto_info->rndv.tx_buf) { + ofi_buf_free(proto_info); + ofi_buf_free(proto_info->rndv.tx_buf); } if (!rx_buf->ep->rdm_mr_local) - rxm_msg_mr_closev(rx_buf->mr, - rx_buf->recv_entry->rxm_iov.count); + rxm_msg_mr_closev(rx_buf->mr, rx_buf->peer_entry->count); - rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len); + rxm_finish_recv(rx_buf, rx_buf->peer_entry->msg_size); } static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, @@ -398,96 +397,135 @@ static int rxm_rx_buf_match_msg_id(struct dlist_entry *item, const void *arg) uint64_t msg_id = *((uint64_t *) arg); struct rxm_rx_buf *rx_buf; - rx_buf = container_of(item, struct rxm_rx_buf, unexp_msg.entry); + rx_buf = container_of(item, struct rxm_rx_buf, unexp_entry); return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id); } -static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done) +static void rxm_init_sar_proto(struct rxm_rx_buf *rx_buf) +{ + struct 
rxm_proto_info *proto_info; + + proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool); + if (!proto_info) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to allocate proto info buffer\n"); + return; + } + if (!rx_buf->conn) { + rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, + (int) rx_buf->pkt.ctrl_hdr.conn_id); + } + + proto_info->sar.conn = rx_buf->conn; + proto_info->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; + proto_info->sar.total_recv_len = 0; + proto_info->sar.rx_entry = rx_buf->peer_entry; + + dlist_insert_tail(&proto_info->sar.entry, + &rx_buf->conn->deferred_sar_msgs); + + dlist_init(&proto_info->sar.pkt_list); + if (rx_buf->peer_entry->peer_context) + dlist_insert_tail(&rx_buf->unexp_entry, + &proto_info->sar.pkt_list); + + + rx_buf->proto_info = proto_info; +} + +int rxm_process_seg_data(struct rxm_rx_buf *rx_buf) { enum fi_hmem_iface iface; + struct rxm_proto_info *proto_info; uint64_t device; ssize_t done_len; + int done = 0; - iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, + proto_info = rx_buf->proto_info; + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, &device); done_len = ofi_copy_to_hmem_iov(iface, device, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, - rx_buf->recv_entry->sar.total_recv_len, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, + proto_info->sar.total_recv_len, rx_buf->pkt.data, rx_buf->pkt.ctrl_hdr.seg_size); assert(done_len == rx_buf->pkt.ctrl_hdr.seg_size); - rx_buf->recv_entry->sar.total_recv_len += done_len; + proto_info->sar.total_recv_len += done_len; if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST) || (done_len != rx_buf->pkt.ctrl_hdr.seg_size)) { - - dlist_remove(&rx_buf->recv_entry->sar.entry); - - /* Mark rxm_recv_entry::msg_id as unknown for futher re-use */ - rx_buf->recv_entry->sar.msg_id = 
RXM_SAR_RX_INIT; - - done_len = rx_buf->recv_entry->sar.total_recv_len; - rx_buf->recv_entry->sar.total_recv_len = 0; - - *done = 1; + if (!rx_buf->peer_entry->peer_context) + dlist_remove(&proto_info->sar.entry); + done_len = proto_info->sar.total_recv_len; + done = 1; + ofi_buf_free(rx_buf->proto_info); rxm_finish_recv(rx_buf, done_len); } else { - if (rx_buf->recv_entry->sar.msg_id == RXM_SAR_RX_INIT) { - if (!rx_buf->conn) { - rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, - (int) rx_buf->pkt.ctrl_hdr.conn_id); - } - - rx_buf->recv_entry->sar.conn = rx_buf->conn; - rx_buf->recv_entry->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; - - dlist_insert_tail(&rx_buf->recv_entry->sar.entry, - &rx_buf->conn->deferred_sar_msgs); - } - /* The RX buffer can be reposted for further re-use */ - rx_buf->recv_entry = NULL; + rx_buf->peer_entry = NULL; rxm_free_rx_buf(rx_buf); - - *done = 0; } + return done; } static void rxm_handle_seg_data(struct rxm_rx_buf *rx_buf) { - struct rxm_recv_entry *recv_entry; + struct rxm_proto_info *proto_info; + struct fi_peer_rx_entry *rx_entry; struct rxm_conn *conn; uint64_t msg_id; struct dlist_entry *entry; - int done; - rxm_process_seg_data(rx_buf, &done); - if (done || !(rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV)) + if (dlist_empty(&rx_buf->proto_info->sar.pkt_list)) { + rxm_process_seg_data(rx_buf); return; + } - recv_entry = rx_buf->recv_entry; + proto_info = rx_buf->proto_info; + dlist_insert_tail(&rx_buf->unexp_entry, &proto_info->sar.pkt_list); + + if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST)) + dlist_remove(&proto_info->sar.entry); + + rx_entry = rx_buf->peer_entry; conn = rx_buf->conn; msg_id = rx_buf->pkt.ctrl_hdr.msg_id; dlist_foreach_container_safe(&conn->deferred_sar_segments, struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id)) + unexp_entry, entry) { + if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_entry, &msg_id)) continue; - 
dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - rxm_process_seg_data(rx_buf, &done); - if (done) + dlist_remove(&rx_buf->unexp_entry); + rx_buf->peer_entry = rx_entry; + if (rxm_process_seg_data(rx_buf)) break; } } +ssize_t rxm_handle_unexp_sar(struct fi_peer_rx_entry *peer_entry) +{ + struct rxm_proto_info *proto_info; + struct rxm_rx_buf *rx_buf; + + rx_buf = (struct rxm_rx_buf *) peer_entry->peer_context; + proto_info = rx_buf->proto_info; + + while (!dlist_empty(&proto_info->sar.pkt_list)) { + dlist_pop_front(&proto_info->sar.pkt_list, + struct rxm_rx_buf, rx_buf, unexp_entry); + rxm_process_seg_data(rx_buf); + } + peer_entry->peer_context = NULL; + return FI_SUCCESS; +} + static ssize_t rxm_rndv_xfer(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, struct rxm_rndv_hdr *remote_hdr, struct iovec *local_iov, void **local_desc, size_t local_count, size_t total_len, @@ -538,14 +576,15 @@ ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf) ssize_t ret; size_t total_len; - total_len = MIN(rx_buf->recv_entry->total_len, rx_buf->pkt.hdr.size); + total_len = MIN(rx_buf->peer_entry->msg_size, rx_buf->pkt.hdr.size); + rx_buf->peer_entry->msg_size = total_len; RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ); ret = rxm_rndv_xfer(rx_buf->ep, rx_buf->conn->msg_ep, rx_buf->remote_rndv_hdr, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, total_len, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, total_len, rx_buf); if (ret) { rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, rx_buf, @@ -621,28 +660,26 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) rx_buf->rndv_rma_index = 0; if (!rx_buf->ep->rdm_mr_local) { - total_recv_len = MIN(rx_buf->recv_entry->total_len, + total_recv_len = MIN(rx_buf->peer_entry->msg_size, rx_buf->pkt.hdr.size); - ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, + ret = 
rxm_msg_mr_regv(rx_buf->ep, rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, total_recv_len, rx_buf->ep->rndv_ops->rx_mr_access, rx_buf->mr); if (ret) return ret; - for (i = 0; (i < rx_buf->recv_entry->rxm_iov.count && + for (i = 0; (i < rx_buf->peer_entry->count && rx_buf->mr[i]); i++) { - rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(rx_buf->mr[i]); + rx_buf->peer_entry->desc[i] = fi_mr_desc(rx_buf->mr[i]); } } else { struct rxm_mr *mr; - for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) { - mr = rx_buf->recv_entry->rxm_iov.desc[i]; - rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(mr->msg_mr); + for (i = 0; i < rx_buf->peer_entry->count; i++) { + mr = rx_buf->peer_entry->desc[i]; + rx_buf->peer_entry->desc[i] = fi_mr_desc(mr->msg_mr); rx_buf->mr[i] = mr->msg_mr; } } @@ -656,9 +693,9 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) void rxm_handle_eager(struct rxm_rx_buf *rx_buf) { ssize_t done_len = rxm_copy_to_hmem_iov( - rx_buf->recv_entry->rxm_iov.desc, rx_buf->data, - rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, 0); + rx_buf->peer_entry->desc, rx_buf->data, + rx_buf->pkt.hdr.size, rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, 0); assert((size_t) done_len == rx_buf->pkt.hdr.size); @@ -671,14 +708,14 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) uint64_t device; ssize_t done_len; - iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, &device); done_len = ofi_copy_to_hmem_iov(iface, device, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, 0, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, 0, rx_buf->data, rx_buf->pkt.hdr.size); assert((size_t) done_len == rx_buf->pkt.hdr.size); @@ -686,11 +723,11 @@ void 
rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { struct fi_cq_tagged_entry cqe = { .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, + .op_context = rx_buf->peer_entry->context, }; rx_buf->ep->util_coll_peer_xfer_ops-> complete(rx_buf->ep->util_coll_ep, &cqe, 0); - rxm_recv_entry_release(rx_buf->recv_entry); + rx_buf->ep->srx->owner_ops->free_entry(rx_buf->peer_entry); rxm_free_rx_buf(rx_buf); } else { rxm_finish_recv(rx_buf, done_len); @@ -715,73 +752,26 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf) } } -static void rxm_adjust_multi_recv(struct rxm_rx_buf *rx_buf) +static inline void rxm_entry_prep_for_queue(struct fi_peer_rx_entry *rx_entry, + struct rxm_rx_buf *rx_buf) { - struct rxm_recv_entry *recv_entry; - struct iovec new_iov; - size_t recv_size; - - recv_size = rx_buf->pkt.hdr.size; - - if (rx_buf->recv_entry->rxm_iov.iov[0].iov_len < recv_size || - rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size < - rx_buf->ep->min_multi_recv_size) - return; - - new_iov.iov_base = (uint8_t *) - rx_buf->recv_entry->rxm_iov.iov[0].iov_base + recv_size; - new_iov.iov_len = rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size;; - - rx_buf->recv_entry->rxm_iov.iov[0].iov_len = recv_size; - - recv_entry = rxm_multi_recv_entry_get(rx_buf->ep, &new_iov, - rx_buf->recv_entry->rxm_iov.desc, 1, - rx_buf->recv_entry->addr, - rx_buf->recv_entry->tag, - rx_buf->recv_entry->ignore, - rx_buf->recv_entry->context, - rx_buf->recv_entry->flags); - - rx_buf->recv_entry->flags &= ~FI_MULTI_RECV; - - dlist_insert_head(&recv_entry->entry, &rx_buf->ep->recv_queue.recv_list); -} - -static ssize_t -rxm_match_rx_buf(struct rxm_rx_buf *rx_buf, - struct rxm_recv_queue *recv_queue, - struct rxm_recv_match_attr *match_attr) -{ - struct dlist_entry *entry; - - entry = dlist_remove_first_match(&recv_queue->recv_list, - recv_queue->match_recv, match_attr); - if (entry) { - rx_buf->recv_entry = container_of(entry, 
struct rxm_recv_entry, entry); - - if (rx_buf->recv_entry->flags & FI_MULTI_RECV) - rxm_adjust_multi_recv(rx_buf); - - return rxm_handle_rx_buf(rx_buf); + rx_entry->peer_context = rx_buf; + rx_buf->peer_entry = rx_entry; + if (rx_buf->pkt.hdr.flags & FI_REMOTE_CQ_DATA) { + rx_entry->flags |= FI_REMOTE_CQ_DATA; + rx_entry->cq_data = rx_buf->pkt.hdr.data; } - - RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for incoming msg", - match_attr->addr, match_attr->tag); - FI_DBG(&rxm_prov, FI_LOG_CQ, "Enqueueing msg to unexpected msg queue\n"); - rx_buf->unexp_msg.addr = match_attr->addr; - rx_buf->unexp_msg.tag = match_attr->tag; - - dlist_insert_tail(&rx_buf->unexp_msg.entry, - &recv_queue->unexp_msg_list); + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) + rxm_init_sar_proto(rx_buf); rxm_replace_rx_buf(rx_buf); - return 0; } static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) { - struct rxm_recv_match_attr match_attr = { - .addr = FI_ADDR_UNSPEC, - }; + struct fid_peer_srx *srx = rx_buf->ep->srx; + struct fi_peer_rx_entry *rx_entry; + struct fi_peer_match_attr match = {0}; + int ret; if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) { if (rx_buf->ep->msg_srx) @@ -789,7 +779,9 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) (int) rx_buf->pkt.ctrl_hdr.conn_id); if (!rx_buf->conn) return -FI_EOTHER; - match_attr.addr = rx_buf->conn->peer->fi_addr; + match.addr = rx_buf->conn->peer->fi_addr; + } else { + match.addr = FI_ADDR_UNSPEC; } if (rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { @@ -799,33 +791,52 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) switch(rx_buf->pkt.hdr.op) { case ofi_op_msg: + match.msg_size = rx_buf->pkt.hdr.size; FI_DBG(&rxm_prov, FI_LOG_CQ, "Got MSG op\n"); - return rxm_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue, - &match_attr); + ret = srx->owner_ops->get_msg(srx, &match, &rx_entry); + if (ret == -FI_ENOENT) { + rxm_entry_prep_for_queue(rx_entry, rx_buf); + return 
srx->owner_ops->queue_msg(rx_entry); + } + rx_entry->peer_context = NULL; + break; case ofi_op_tagged: + match.tag = rx_buf->pkt.hdr.tag; + match.msg_size = rx_buf->pkt.hdr.size; FI_DBG(&rxm_prov, FI_LOG_CQ, "Got TAGGED op\n"); - match_attr.tag = rx_buf->pkt.hdr.tag; - return rxm_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue, - &match_attr); + ret = srx->owner_ops->get_tag(srx, &match, &rx_entry); + if (ret == -FI_ENOENT) { + rxm_entry_prep_for_queue(rx_entry, rx_buf); + return srx->owner_ops->queue_tag(rx_entry); + } + rx_entry->peer_context = NULL; + break; default: FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown op!\n"); assert(0); return -FI_EINVAL; } + rx_buf->peer_entry = rx_entry; + + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) + rxm_init_sar_proto(rx_buf); + + return rxm_handle_rx_buf(rx_buf); } static int rxm_sar_match_msg_id(struct dlist_entry *item, const void *arg) { uint64_t msg_id = *((uint64_t *) arg); - struct rxm_recv_entry *recv_entry; + struct rxm_proto_info *proto_info; - recv_entry = container_of(item, struct rxm_recv_entry, sar.entry); - return (msg_id == recv_entry->sar.msg_id); + proto_info = container_of(item, struct rxm_proto_info, sar.entry); + return (msg_id == proto_info->sar.msg_id); } static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) { struct dlist_entry *sar_entry; + struct rxm_proto_info *proto_info; rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, (int) rx_buf->pkt.ctrl_hdr.conn_id); @@ -841,8 +852,9 @@ static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) if (!sar_entry) return rxm_handle_recv_comp(rx_buf); - rx_buf->recv_entry = container_of(sar_entry, struct rxm_recv_entry, - sar.entry); + proto_info = container_of(sar_entry, struct rxm_proto_info, sar.entry); + rx_buf->peer_entry = proto_info->sar.rx_entry; + rx_buf->proto_info = proto_info; rxm_handle_seg_data(rx_buf); return 0; } @@ -860,8 +872,15 @@ static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf) ret = -FI_ENOMEM; goto err; } + 
rx_buf->proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool); + if (!rx_buf->proto_info) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to allocated proto info buf\n"); + assert(0); + return; + } - rx_buf->recv_entry->rndv.tx_buf = buf; + rx_buf->proto_info->rndv.tx_buf = buf; buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_rd_done; buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index; @@ -888,8 +907,9 @@ static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf) return; free: + rx_buf->proto_info->rndv.tx_buf = NULL; + ofi_buf_free(rx_buf->proto_info); ofi_buf_free(buf); - rx_buf->recv_entry->rndv.tx_buf = NULL; err: FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to allocate/send rd rndv ack: %s\n", @@ -968,14 +988,22 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) goto err; } - rx_buf->recv_entry->rndv.tx_buf = buf; + rx_buf->proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool); + if (!rx_buf->proto_info) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to allocated proto info buf\n"); + return -FI_ENOMEM; + } + + rx_buf->proto_info->rndv.tx_buf = buf; + buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_wr_data; buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index; buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; rxm_rndv_hdr_init(rx_buf->ep, buf->pkt.data, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, rx_buf->mr); + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, rx_buf->mr); ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt) + sizeof(struct rxm_rndv_hdr), buf->hdr.desc, 0, rx_buf); @@ -999,8 +1027,9 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) return 0; free: + rx_buf->proto_info->rndv.tx_buf = NULL; + ofi_buf_free(rx_buf->proto_info); ofi_buf_free(buf); - rx_buf->recv_entry->rndv.tx_buf = NULL; err: FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to allocate/send wr rndv ready: %s\n", @@ -1638,7 +1667,7 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) * the event yet. 
*/ rx_buf = (struct rxm_rx_buf *) err_entry.op_context; - if (!rx_buf->recv_entry) { + if (!rx_buf->peer_entry) { ofi_buf_free((struct rxm_rx_buf *)err_entry.op_context); return; } @@ -1647,9 +1676,9 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) case RXM_RNDV_WRITE_DATA_SENT: /* BUG: should fail initial send */ case RXM_RNDV_READ: rx_buf = (struct rxm_rx_buf *) err_entry.op_context; - assert(rx_buf->recv_entry); - err_entry.op_context = rx_buf->recv_entry->context; - err_entry.flags = rx_buf->recv_entry->comp_flags; + assert(rx_buf->peer_entry); + err_entry.op_context = rx_buf->peer_entry->context; + err_entry.flags = rx_buf->peer_entry->flags; cq = rx_buf->ep->util_ep.rx_cq; cntr = rx_buf->ep->util_ep.cntrs[CNTR_RX]; @@ -1780,7 +1809,8 @@ int rxm_post_recv(struct rxm_rx_buf *rx_buf) if (rx_buf->ep->msg_srx) rx_buf->conn = NULL; rx_buf->hdr.state = RXM_RX; - rx_buf->recv_entry = NULL; + rx_buf->peer_entry = NULL; + rx_buf->proto_info = NULL; domain = container_of(rx_buf->ep->util_ep.domain, struct rxm_domain, util_domain); @@ -1858,7 +1888,7 @@ void rxm_ep_do_progress(struct util_ep *util_ep) rxm_conn_progress(rxm_ep); } } else { - rxm_conn_progress(rxm_ep); + rxm_conn_progress(rxm_ep); } } } while ((ret > 0) && (comp_read < rxm_ep->comp_per_progress)); @@ -1975,6 +2005,9 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err1; + if (attr->flags & FI_PEER) + goto out; + rxm_domain = container_of(domain, struct rxm_domain, util_domain.domain_fid); @@ -1996,11 +2029,12 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err2; } + rxm_cq->util_cq.cq_fid.ops = &rxm_cq_ops; +out: *cq_fid = &rxm_cq->util_cq.cq_fid; /* Override util_cq_fi_ops */ (*cq_fid)->fid.ops = &rxm_cq_fi_ops; - (*cq_fid)->ops = &rxm_cq_ops; return 0; err2: diff --git a/prov/rxm/src/rxm_domain.c b/prov/rxm/src/rxm_domain.c index 055fca16bea..9fcadf56763 100644 --- a/prov/rxm/src/rxm_domain.c +++ b/prov/rxm/src/rxm_domain.c @@ 
-221,6 +221,25 @@ static struct fi_ops_av_owner rxm_av_owner_ops = { .ep_addr = rxm_peer_av_ep_addr, }; +static fi_addr_t rxm_get_addr(struct fi_peer_rx_entry *rx_entry) +{ + struct rxm_rx_buf *rx_buf = rx_entry->peer_context; + + return rx_buf->conn->peer->fi_addr; +} + +static void rxm_foreach_ep(struct util_av *av, struct util_ep *ep) +{ + struct rxm_ep *rxm_ep; + struct fid_peer_srx *peer_srx; + + rxm_ep = container_of(ep, struct rxm_ep, util_ep); + peer_srx = container_of(rxm_ep->srx, struct fid_peer_srx, ep_fid); + if (peer_srx) + peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &rxm_get_addr); +} + + static int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context) @@ -236,7 +255,8 @@ rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, ret = rxm_util_av_open(domain_fid, attr, &fid_av_new, context, sizeof(struct rxm_conn), - ofi_av_remove_cleanup ? rxm_av_remove_handler : NULL); + ofi_av_remove_cleanup ? rxm_av_remove_handler : NULL, + &rxm_foreach_ep); if (ret) return ret; @@ -346,7 +366,7 @@ static struct fi_ops_domain rxm_domain_ops = { .cntr_open = rxm_cntr_open, .poll_open = fi_poll_create, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_no_srx_context, + .srx_ctx = rxm_srx_context, .query_atomic = rxm_ep_query_atomic, .query_collective = rxm_query_collective, }; diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index 69a88e2caaf..b967643c0c5 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -42,79 +42,6 @@ #include "rxm.h" -static int rxm_match_noop(struct dlist_entry *item, const void *arg) -{ - OFI_UNUSED(item); - OFI_UNUSED(arg); - return 1; -} - -static int rxm_match_recv_entry(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_addr(recv_entry->addr, attr->addr); -} - -static int 
rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} - -static int rxm_match_recv_entry_tag_addr(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_addr(recv_entry->addr, attr->addr) && - ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} - -static int rxm_match_recv_entry_context(struct dlist_entry *item, const void *context) -{ - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return recv_entry->context == context; -} - -static fi_addr_t rxm_get_unexp_addr(struct rxm_unexp_msg *unexp_msg) -{ - struct rxm_rx_buf *rx_buf; - - rx_buf = container_of(unexp_msg, struct rxm_rx_buf, unexp_msg); - return (unexp_msg->addr != FI_ADDR_UNSPEC) ? 
- unexp_msg->addr : rx_buf->conn->peer->fi_addr; -} - -static int rxm_match_unexp_msg(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_addr(attr->addr, rxm_get_unexp_addr(unexp_msg)); -} - -static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); -} - -static int rxm_match_unexp_msg_tag_addr(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_addr(attr->addr, rxm_get_unexp_addr(unexp_msg)) && - ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); -} - static int rxm_buf_reg(struct ofi_bufpool_region *region) { struct rxm_ep *rxm_ep = region->pool->attr.context; @@ -158,6 +85,7 @@ static void rxm_init_rx_buf(struct ofi_bufpool_region *region, void *buf) fi_mr_desc((struct fid_mr *) region->context) : NULL; rx_buf->ep = ep; rx_buf->data = &rx_buf->pkt.data; + dlist_init(&rx_buf->unexp_entry); } static void rxm_init_tx_buf(struct ofi_bufpool_region *region, void *buf) @@ -186,69 +114,6 @@ static void rxm_buf_close(struct ofi_bufpool_region *region) } } -static void rxm_recv_entry_init(struct rxm_recv_entry *entry, void *arg) -{ - struct rxm_recv_queue *recv_queue = arg; - - assert(recv_queue->type != RXM_RECV_QUEUE_UNSPEC); - - entry->recv_queue = recv_queue; - entry->sar.msg_id = RXM_SAR_RX_INIT; - entry->sar.total_recv_len = 0; - /* set it to NULL to differentiate between regular ACKs and those - * sent with FI_INJECT */ - entry->rndv.tx_buf = NULL; - entry->comp_flags = 
FI_RECV; - - if (recv_queue->type == RXM_RECV_QUEUE_MSG) - entry->comp_flags |= FI_MSG; - else - entry->comp_flags |= FI_TAGGED; -} - -static int rxm_recv_queue_init(struct rxm_ep *rxm_ep, struct rxm_recv_queue *recv_queue, - size_t size, enum rxm_recv_queue_type type) -{ - recv_queue->rxm_ep = rxm_ep; - recv_queue->type = type; - recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init, - recv_queue); - if (!recv_queue->fs) - return -FI_ENOMEM; - - dlist_init(&recv_queue->recv_list); - dlist_init(&recv_queue->unexp_msg_list); - if (type == RXM_RECV_QUEUE_MSG) { - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - recv_queue->match_recv = rxm_match_recv_entry; - recv_queue->match_unexp = rxm_match_unexp_msg; - } else { - recv_queue->match_recv = rxm_match_noop; - recv_queue->match_unexp = rxm_match_noop; - } - } else { - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - recv_queue->match_recv = rxm_match_recv_entry_tag_addr; - recv_queue->match_unexp = rxm_match_unexp_msg_tag_addr; - } else { - recv_queue->match_recv = rxm_match_recv_entry_tag; - recv_queue->match_unexp = rxm_match_unexp_msg_tag; - } - } - - return 0; -} - -static void rxm_recv_queue_close(struct rxm_recv_queue *recv_queue) -{ - /* It indicates that the recv_queue were allocated */ - if (recv_queue->fs) { - rxm_recv_fs_free(recv_queue->fs); - recv_queue->fs = NULL; - } - // TODO cleanup recv_list and unexp msg list -} - static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) { struct ofi_bufpool_attr attr = {0}; @@ -287,8 +152,18 @@ static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) "Unable to create peer xfer context pool\n"); goto free_tx_pool; } - return 0; + attr.size = sizeof(struct rxm_proto_info); + attr.alloc_fn = NULL; + attr.free_fn = NULL; + attr.init_fn = NULL; + ret = ofi_bufpool_create_attr(&attr, &rxm_ep->proto_info_pool); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "Unable to create proto info pool\n"); + goto free_tx_pool; + } + return 0; free_tx_pool: 
ofi_bufpool_destroy(rxm_ep->tx_pool); @@ -298,62 +173,13 @@ static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) return ret; } -static int rxm_multi_recv_pool_init(struct rxm_ep *rxm_ep) -{ - struct ofi_bufpool_attr attr = { - .size = sizeof(struct rxm_recv_entry), - .alignment = 16, - .max_cnt = 0, - .chunk_cnt = 16, - .alloc_fn = NULL, - .init_fn = NULL, - .context = rxm_ep, - .flags = OFI_BUFPOOL_NO_TRACK, - }; - - return ofi_bufpool_create_attr(&attr, &rxm_ep->multi_recv_pool); -} - -static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep) -{ - int ret; - - ret = rxm_recv_queue_init(rxm_ep, &rxm_ep->recv_queue, - rxm_ep->rxm_info->rx_attr->size, - RXM_RECV_QUEUE_MSG); - if (ret) - return ret; - - ret = rxm_recv_queue_init(rxm_ep, &rxm_ep->trecv_queue, - rxm_ep->rxm_info->rx_attr->size, - RXM_RECV_QUEUE_TAGGED); - if (ret) - goto err_recv_tag; - - ret = rxm_multi_recv_pool_init(rxm_ep); - if (ret) - goto err_multi; - - return FI_SUCCESS; - -err_multi: - rxm_recv_queue_close(&rxm_ep->trecv_queue); -err_recv_tag: - rxm_recv_queue_close(&rxm_ep->recv_queue); - return ret; -} - /* It is safe to call this function, even if `rxm_ep_txrx_res_open` * has not yet been called */ static void rxm_ep_txrx_res_close(struct rxm_ep *ep) { - rxm_recv_queue_close(&ep->trecv_queue); - rxm_recv_queue_close(&ep->recv_queue); + if (ep->srx && ep->util_ep.ep_fid.msg != &rxm_no_recv_msg_ops) + (void) util_srx_close(&ep->srx->ep_fid.fid); - if (ep->multi_recv_pool) { - ofi_bufpool_destroy(ep->multi_recv_pool); - ep->multi_recv_pool = NULL; - } if (ep->rx_pool) { ofi_bufpool_destroy(ep->rx_pool); ep->rx_pool = NULL; @@ -362,6 +188,10 @@ static void rxm_ep_txrx_res_close(struct rxm_ep *ep) ofi_bufpool_destroy(ep->tx_pool); ep->tx_pool = NULL; } + if (ep->proto_info_pool) { + ofi_bufpool_destroy(ep->proto_info_pool); + ep->proto_info_pool = NULL; + } if (ep->coll_pool) { ofi_bufpool_destroy(ep->coll_pool); ep->coll_pool = NULL; @@ -420,53 +250,13 @@ static struct rxm_eager_ops 
coll_eager_ops = { .handle_rx = rxm_handle_coll_eager, }; -static bool rxm_ep_cancel_recv(struct rxm_ep *rxm_ep, - struct rxm_recv_queue *recv_queue, void *context) -{ - struct fi_cq_err_entry err_entry; - struct rxm_recv_entry *recv_entry; - struct dlist_entry *entry; - int ret; - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - entry = dlist_remove_first_match(&recv_queue->recv_list, - rxm_match_recv_entry_context, - context); - if (!entry) - goto unlock; - - recv_entry = container_of(entry, struct rxm_recv_entry, entry); - memset(&err_entry, 0, sizeof(err_entry)); - err_entry.op_context = recv_entry->context; - err_entry.flags |= recv_entry->comp_flags; - err_entry.tag = recv_entry->tag; - err_entry.err = FI_ECANCELED; - err_entry.prov_errno = -FI_ECANCELED; - rxm_recv_entry_release(recv_entry); - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); - assert(0); - } - -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return entry != NULL; -} - static ssize_t rxm_ep_cancel(fid_t fid_ep, void *context) { struct rxm_ep *ep; ep = container_of(fid_ep, struct rxm_ep, util_ep.ep_fid); - if (rxm_passthru_info(ep->rxm_info)) - return fi_cancel(&ep->msg_srx->fid, context); - - if (!rxm_ep_cancel_recv(ep, &ep->trecv_queue, context)) - rxm_ep_cancel_recv(ep, &ep->recv_queue, context); - - return 0; + return ep->srx->ep_fid.ops->cancel(&ep->srx->ep_fid.fid, context); } static int rxm_ep_getopt(fid_t fid, int level, int optname, void *optval, @@ -480,10 +270,8 @@ static int rxm_ep_getopt(fid_t fid, int level, int optname, void *optval, switch (optname) { case FI_OPT_MIN_MULTI_RECV: - assert(sizeof(rxm_ep->min_multi_recv_size) == sizeof(size_t)); - *(size_t *)optval = rxm_ep->min_multi_recv_size; - *optlen = sizeof(size_t); - break; + return rxm_ep->srx->ep_fid.ops->getopt(&rxm_ep->srx->ep_fid.fid, + level, optname, optval, optlen); case FI_OPT_BUFFERED_MIN: assert(sizeof(rxm_ep->buffered_min) == 
sizeof(size_t)); *(size_t *)optval = rxm_ep->buffered_min; @@ -507,11 +295,8 @@ static int rxm_ep_setopt(fid_t fid, int level, int optname, switch (optname) { case FI_OPT_MIN_MULTI_RECV: - rxm_ep->min_multi_recv_size = *(size_t *)optval; - FI_INFO(&rxm_prov, FI_LOG_CORE, - "FI_OPT_MIN_MULTI_RECV set to %zu\n", - rxm_ep->min_multi_recv_size); - break; + return rxm_ep->srx->ep_fid.ops->setopt(&rxm_ep->srx->ep_fid.fid, + level, optname, optval, optlen); case FI_OPT_BUFFERED_MIN: if (rxm_ep->rx_pool) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, @@ -564,99 +349,6 @@ static struct fi_ops_ep rxm_ops_ep = { .tx_size_left = fi_no_tx_size_left, }; - -/* Caller must hold recv_queue->lock -- TODO which lock? */ -struct rxm_rx_buf * -rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore) -{ - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - - if (dlist_empty(&recv_queue->unexp_msg_list)) - return NULL; - - match_attr.addr = addr; - match_attr.tag = tag; - match_attr.ignore = ignore; - - entry = dlist_find_first_match(&recv_queue->unexp_msg_list, - recv_queue->match_unexp, &match_attr); - if (!entry) - return NULL; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Match for posted recv found in unexp" - " msg list\n", match_attr.addr, match_attr.tag); - - return container_of(entry, struct rxm_rx_buf, unexp_msg.entry); -} - -static void rxm_recv_entry_init_common(struct rxm_recv_entry *recv_entry, - const struct iovec *iov, void **desc, size_t count, - fi_addr_t src_addr, uint64_t tag, uint64_t ignore, - void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) -{ - size_t i; - - assert(!recv_entry->rndv.tx_buf); - recv_entry->rxm_iov.count = (uint8_t) count; - recv_entry->addr = src_addr; - recv_entry->context = context; - recv_entry->flags = flags; - recv_entry->ignore = ignore; - recv_entry->tag = tag; - - recv_entry->sar.msg_id = RXM_SAR_RX_INIT; - recv_entry->sar.total_recv_len = 0; - recv_entry->total_len = 0; - - 
for (i = 0; i < count; i++) { - recv_entry->rxm_iov.iov[i] = iov[i]; - recv_entry->total_len += iov[i].iov_len; - if (desc && desc[i]) - recv_entry->rxm_iov.desc[i] = desc[i]; - else - recv_entry->rxm_iov.desc[i] = NULL; - } -} - -struct rxm_recv_entry * -rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue) -{ - struct rxm_recv_entry *recv_entry; - - if (ofi_freestack_isempty(recv_queue->fs)) - return NULL; - - recv_entry = ofi_freestack_pop(recv_queue->fs); - - rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, - ignore, context, flags, recv_queue); - - return recv_entry; -} - -struct rxm_recv_entry * -rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags) -{ - struct rxm_recv_entry *recv_entry; - - recv_entry = ofi_buf_alloc(rxm_ep->multi_recv_pool); - - rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, - ignore, context, flags, NULL); - - recv_entry->comp_flags = FI_MSG | FI_RECV; - return recv_entry; -} - struct rxm_tx_buf *rxm_get_tx_buf(struct rxm_ep *ep) { struct rxm_tx_buf *buf; @@ -820,6 +512,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn) { struct rxm_deferred_tx_entry *def_tx_entry; + struct rxm_proto_info *proto_info; struct iovec iov; struct fi_msg msg; ssize_t ret = 0; @@ -832,12 +525,11 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_deferred_tx_entry, entry); switch (def_tx_entry->type) { case RXM_DEFERRED_TX_RNDV_ACK: + proto_info = def_tx_entry->rndv_ack.rx_buf->proto_info; ret = fi_send(def_tx_entry->rxm_conn->msg_ep, - &def_tx_entry->rndv_ack.rx_buf-> - recv_entry->rndv.tx_buf->pkt, + &proto_info->rndv.tx_buf->pkt, 
def_tx_entry->rndv_ack.pkt_size, - def_tx_entry->rndv_ack.rx_buf->recv_entry-> - rndv.tx_buf->hdr.desc, + proto_info->rndv.tx_buf->hdr.desc, 0, def_tx_entry->rndv_ack.rx_buf); if (ret) { if (ret == -FI_EAGAIN) @@ -845,11 +537,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, rxm_cq_write_rx_error( def_tx_entry->rxm_ep, ofi_op_msg, def_tx_entry->rndv_ack.rx_buf-> - recv_entry->context, (int) ret); + peer_entry->context, (int) ret); } - if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv - .tx_buf->pkt.ctrl_hdr - .type == rxm_ctrl_rndv_rd_done) + if (proto_info->rndv.tx_buf->pkt.ctrl_hdr.type == + rxm_ctrl_rndv_rd_done) RXM_UPDATE_STATE(FI_LOG_EP_DATA, def_tx_entry->rndv_ack.rx_buf, RXM_RNDV_READ_DONE_SENT); @@ -891,7 +582,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, rxm_cq_write_rx_error( def_tx_entry->rxm_ep, ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + peer_entry->context, (int) ret); } break; case RXM_DEFERRED_TX_RNDV_WRITE: @@ -944,7 +635,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, def_tx_entry->rxm_ep, ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, + peer_entry->context, (int) ret); } return; @@ -1451,9 +1142,6 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) sizeof(struct rxm_rndv_hdr))), rxm_buffer_size); - assert(!rxm_ep->min_multi_recv_size); - rxm_ep->min_multi_recv_size = rxm_buffer_size; - assert(!rxm_ep->buffered_limit); rxm_ep->buffered_limit = rxm_buffer_size; @@ -1465,13 +1153,11 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) "\t\t MR local: MSG - %d, RxM - %d\n" "\t\t Completions per progress: MSG - %zu\n" "\t\t Buffered min: %zu\n" - "\t\t Min multi recv size: %zu\n" "\t\t inject size: %zu\n" "\t\t Protocol limits: Eager: %zu, SAR: %zu\n", rxm_ep->msg_mr_local, rxm_ep->rdm_mr_local, rxm_ep->comp_per_progress, rxm_ep->buffered_min, - rxm_ep->min_multi_recv_size, rxm_ep->inject_limit, - rxm_ep->eager_limit, 
rxm_ep->sar_limit); + rxm_ep->inject_limit, rxm_ep->eager_limit, rxm_ep->sar_limit); } static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) @@ -1484,19 +1170,7 @@ static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) dlist_init(&rxm_ep->deferred_queue); - ret = rxm_ep_rx_queue_init(rxm_ep); - if (ret) - goto err; - return FI_SUCCESS; -err: - ofi_bufpool_destroy(rxm_ep->coll_pool); - ofi_bufpool_destroy(rxm_ep->rx_pool); - ofi_bufpool_destroy(rxm_ep->tx_pool); - rxm_ep->coll_pool = NULL; - rxm_ep->rx_pool = NULL; - rxm_ep->tx_pool = NULL; - return ret; } static int rxm_ep_enable_check(struct rxm_ep *rxm_ep) @@ -1526,9 +1200,129 @@ static int rxm_ep_enable_check(struct rxm_ep *rxm_ep) return 0; } +static int rxm_unexp_start(struct fi_peer_rx_entry *rx_entry) +{ + struct rxm_rx_buf *rx_buf = rx_entry->peer_context; + + return rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg ? + rxm_handle_unexp_sar(rx_entry): + rxm_handle_rx_buf(rx_buf); +} + +static int rxm_discard(struct fi_peer_rx_entry *rx_entry) +{ + struct rxm_rx_buf *rx_buf, *seg_rx; + + rx_buf = rx_entry->peer_context; + + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) { + while (!dlist_empty(&rx_buf->proto_info->sar.pkt_list)) { + dlist_pop_front(&rx_buf->proto_info->sar.pkt_list, + struct rxm_rx_buf, seg_rx, unexp_entry); + rxm_free_rx_buf(seg_rx); + } + ofi_buf_free(rx_buf->proto_info); + } + + rxm_free_rx_buf(rx_buf); + return FI_SUCCESS; +} + +struct fi_ops_srx_peer rxm_srx_peer_ops = { + .size = sizeof(struct fi_ops_srx_peer), + .start_msg = rxm_unexp_start, + .start_tag = rxm_unexp_start, + .discard_msg = rxm_discard, + .discard_tag = rxm_discard, +}; + +static int rxm_srx_close(struct fid *fid) +{ + struct rxm_domain *domain = container_of(fid, struct rxm_domain, + rx_ep.fid); + + ofi_atomic_dec32(&domain->util_domain.ref); + + return FI_SUCCESS; +} + +static struct fi_ops rxm_srx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = rxm_srx_close, + .bind = fi_no_bind, + .control = fi_no_control, + 
.ops_open = fi_no_ops_open, +}; + +static struct fi_ops_msg rxm_srx_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops_tagged rxm_srx_tagged_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + .inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct rxm_domain *rxm_domain; + + if (!(attr->op_flags & FI_PEER)) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "shared srx only supported with FI_PEER flag\n"); + return -FI_EINVAL; + } + + rxm_domain = container_of(domain, struct rxm_domain, + util_domain.domain_fid); + + if (rxm_domain->srx) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "Peer SRX context already imported\n"); + return -FI_EINVAL; + } + + rxm_domain->srx = ((struct fi_peer_srx_context *) + (context))->srx; + rxm_domain->srx->peer_ops = &rxm_srx_peer_ops; + rxm_domain->rx_ep.msg = &rxm_srx_msg_ops; + rxm_domain->rx_ep.tagged = &rxm_srx_tagged_ops; + rxm_domain->rx_ep.fid.ops = &rxm_srx_fi_ops; + rxm_domain->rx_ep.fid.fclass = FI_CLASS_SRX_CTX; + *rx_ep = &rxm_domain->rx_ep; + ofi_atomic_inc32(&rxm_domain->util_domain.ref); + + return FI_SUCCESS; +} + +static void rxm_update(struct util_srx_ctx *srx, struct util_rx_entry *rx_entry) +{ + //no update needed +} + static int rxm_ep_ctrl(struct fid *fid, int command, void *arg) { struct rxm_ep *ep; + struct rxm_domain *domain; + struct fid_ep *srx; int ret; ep = 
container_of(fid, struct rxm_ep, util_ep.ep_fid.fid); @@ -1564,6 +1358,32 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg) if (ret) return ret; + if (!ep->srx) { + domain = container_of(ep->util_ep.domain, + struct rxm_domain, + util_domain.domain_fid); + ret = util_ep_srx_context(&domain->util_domain, + ep->rxm_info->rx_attr->size, + RXM_IOV_LIMIT, rxm_buffer_size, + &rxm_update, &ep->util_ep.lock, + &srx); + if (ret) + return ret; + + ep->srx = container_of(srx, struct fid_peer_srx, + ep_fid.fid); + ep->srx->peer_ops = &rxm_srx_peer_ops; + + ret = util_srx_bind(&ep->srx->ep_fid.fid, + &ep->util_ep.rx_cq->cq_fid.fid, + FI_RECV); + if (ret) + return ret; + } else { + ep->util_ep.ep_fid.msg = &rxm_no_recv_msg_ops; + ep->util_ep.ep_fid.tagged = &rxm_no_recv_tagged_ops; + } + if (ep->msg_srx && !rxm_passthru_info(ep->rxm_info)) { ret = rxm_prepost_recv(ep, ep->msg_srx); if (ret) @@ -1592,10 +1412,21 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) struct rxm_av *rxm_av; struct rxm_cq *rxm_cq; struct rxm_eq *rxm_eq; - int ret, retv = 0; + int ret; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + if (bfid->fclass == FI_CLASS_SRX_CTX) { + if (rxm_ep->srx) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "SRX context already bound to EP\n"); + return -FI_EINVAL; + } + rxm_ep->srx = + (container_of(bfid, struct rxm_domain, rx_ep.fid))->srx; + return FI_SUCCESS; + } + ret = ofi_ep_bind(&rxm_ep->util_ep, bfid, flags); if (ret) return ret; @@ -1608,14 +1439,14 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_av->util_coll_av->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_av->offload_coll_av) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_av->offload_coll_av->fid, flags); if (ret) - retv = ret; + return ret; } break; @@ -1626,14 +1457,14 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) 
&rxm_cq->util_coll_cq->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_cq->offload_coll_cq) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_cq->offload_coll_cq->fid, flags); if (ret) - retv = ret; + return ret; } break; @@ -1644,19 +1475,18 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_eq->util_coll_eq->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_eq->offload_coll_eq) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_eq->offload_coll_eq->fid, flags); if (ret) - retv = ret; + return ret; } - break; } - return retv; + return FI_SUCCESS; } static struct fi_ops rxm_ep_fi_ops = { diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index 1a76796d4e0..10a7ae535d7 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -262,8 +262,8 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->rx_attr->op_flags &= ~FI_MULTI_RECV; - core_info->domain_attr->caps &= ~(FI_AV_USER_ID); - core_info->caps &= ~(FI_AV_USER_ID); + core_info->domain_attr->caps &= ~(FI_AV_USER_ID | FI_PEER); + core_info->caps &= ~(FI_AV_USER_ID | FI_PEER); return 0; } diff --git a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c index 3b9088a2858..5d48e88e53a 100644 --- a/prov/rxm/src/rxm_msg.c +++ b/prov/rxm/src/rxm_msg.c @@ -40,214 +40,16 @@ #include "rxm.h" - -ssize_t rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry, - struct rxm_rx_buf *rx_buf) -{ - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - bool last; - ssize_t ret; - - ret = rxm_handle_rx_buf(rx_buf); - last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST; - if (ret || last) - return ret; - - match_attr.addr = recv_entry->addr; - match_attr.tag = recv_entry->tag; - match_attr.ignore = recv_entry->ignore; - - dlist_foreach_container_safe(&recv_queue->unexp_msg_list, - struct 
rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!recv_queue->match_unexp(&rx_buf->unexp_msg.entry, - &match_attr)) - continue; - /* Handle unordered completions from MSG provider */ - if ((rx_buf->pkt.ctrl_hdr.msg_id != recv_entry->sar.msg_id) || - ((rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg))) - continue; - - if (!rx_buf->conn) { - rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, - (int) rx_buf->pkt.ctrl_hdr.conn_id); - } - if (recv_entry->sar.conn != rx_buf->conn) - continue; - rx_buf->recv_entry = recv_entry; - dlist_remove(&rx_buf->unexp_msg.entry); - last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == - RXM_SAR_SEG_LAST; - ret = rxm_handle_rx_buf(rx_buf); - if (ret || last) - break; - } - return ret; -} - -/* - * We don't expect to have unexpected messages when the app is using - * multi-recv buffers. Optimize for that case. - * - * If there are unexpected messages waiting when we post a mult-recv buffer, - * we trim off the start of the buffer, treat it as a normal buffer, and pair - * it with an unexpected message. We continue doing this until either no - * unexpected messages are left or the multi-recv buffer has been consumed. 
- */ -static ssize_t -rxm_post_mrecv(struct rxm_ep *ep, const struct iovec *iov, - void **desc, void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - struct iovec cur_iov = *iov; - ssize_t ret; - - do { - recv_entry = rxm_recv_entry_get(ep, &cur_iov, desc, 1, - FI_ADDR_UNSPEC, 0, 0, context, - op_flags, &ep->recv_queue); - if (!recv_entry) { - ret = -FI_ENOMEM; - break; - } - - rx_buf = rxm_get_unexp_msg(&ep->recv_queue, recv_entry->addr, 0, 0); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &ep->recv_queue.recv_list); - return 0; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - recv_entry->flags &= ~FI_MULTI_RECV; - recv_entry->total_len = MIN(cur_iov.iov_len, rx_buf->pkt.hdr.size); - recv_entry->rxm_iov.iov[0].iov_len = recv_entry->total_len; - - cur_iov.iov_base = (uint8_t *) cur_iov.iov_base + recv_entry->total_len; - cur_iov.iov_len -= recv_entry->total_len; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) - ret = rxm_handle_rx_buf(rx_buf); - else - ret = rxm_handle_unexp_sar(&ep->recv_queue, recv_entry, - rx_buf); - - } while (!ret && cur_iov.iov_len >= ep->min_multi_recv_size); - - if ((cur_iov.iov_len < ep->min_multi_recv_size) || - (ret && cur_iov.iov_len != iov->iov_len)) { - ofi_peer_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, - 0, NULL, 0, 0, FI_ADDR_NOTAVAIL); - } - - return ret; -} - -static ssize_t -rxm_recv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - ssize_t ret; - - assert(rxm_ep->util_ep.rx_cq); - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (op_flags & FI_MULTI_RECV) { - ret = rxm_post_mrecv(rxm_ep, iov, desc, context, op_flags); - goto release; - } - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, 
src_addr, - 0, 0, context, op_flags, - &rxm_ep->recv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto release; - } - - rx_buf = rxm_get_unexp_msg(&rxm_ep->recv_queue, recv_entry->addr, 0, 0); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &rxm_ep->recv_queue.recv_list); - ret = FI_SUCCESS; - goto release; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - ret = (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) ? - rxm_handle_rx_buf(rx_buf) : - rxm_handle_unexp_sar(&rxm_ep->recv_queue, recv_entry, rx_buf); - -release: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - -static ssize_t -rxm_buf_recv(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t flags) -{ - struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx = context; - struct rxm_rx_buf *rx_buf; - ssize_t ret = 0; - - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, - src_addr, 0, 0, context, - flags, &rxm_ep->recv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto unlock; - } - - recv_entry->comp_flags |= FI_CLAIM; - - rx_buf->recv_entry = recv_entry; - ret = rxm_handle_rx_buf(rx_buf); - } else { - assert(flags & FI_DISCARD); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Discarding buffered receive\n"); - rxm_free_rx_buf(rx_buf); - } -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - static ssize_t rxm_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) - return rxm_buf_recv(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, 
msg->context, - flags | rxm_ep->util_ep.rx_msg_flags); - - return rxm_recv_common(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, msg->msg_iov, + msg->desc, msg->iov_count, msg->addr, + msg->context, + flags | rxm_ep->util_ep.rx_msg_flags); } @@ -262,8 +64,9 @@ rxm_recv(struct fid_ep *ep_fid, void *buf, size_t len, .iov_len = len, }; - return rxm_recv_common(rxm_ep, &iov, &desc, 1, src_addr, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -273,8 +76,9 @@ rxm_recvv(struct fid_ep *ep_fid, const struct iovec *iov, struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_recv_common(rxm_ep, iov, desc, count, src_addr, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, iov, desc, count, + src_addr, context, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -661,15 +465,13 @@ rxm_send_eager(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, eager_buf->app_context = context; eager_buf->flags = flags; + rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, + flags, &eager_buf->pkt); if (rxm_use_direct_send(rxm_ep, count, flags)) { - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &eager_buf->pkt); ret = rxm_direct_send(rxm_ep, rxm_conn, eager_buf, iov, desc, count); } else { - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &eager_buf->pkt); ret = rxm_copy_from_hmem_iov(desc, eager_buf->pkt.data, eager_buf->pkt.hdr.size, iov, count, 0); @@ -891,6 +693,19 @@ struct fi_ops_msg rxm_msg_ops = { .injectdata = rxm_injectdata, }; +struct fi_ops_msg rxm_no_recv_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send 
= rxm_send, + .sendv = rxm_sendv, + .sendmsg = rxm_sendmsg, + .inject = rxm_inject, + .senddata = rxm_senddata, + .injectdata = rxm_injectdata, +}; + static ssize_t rxm_recv_thru(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) diff --git a/prov/rxm/src/rxm_tagged.c b/prov/rxm/src/rxm_tagged.c index 8f18f34b3eb..1276bac0ba3 100644 --- a/prov/rxm/src/rxm_tagged.c +++ b/prov/rxm/src/rxm_tagged.c @@ -43,189 +43,21 @@ #include "rxm.h" -static void -rxm_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, - void *context) -{ - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message", - rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag); - - ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - 0, NULL, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); - rxm_free_rx_buf(rx_buf); -} - -static void -rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, - uint64_t ignore, void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) -{ - struct rxm_rx_buf *rx_buf; - int ret; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Peeking message", addr, tag); - - /* peek doesn't support peer transfer at this moment */ - assert(!(flags & FI_PEER_TRANSFER)); - - rxm_ep_do_progress(&rxm_ep->util_ep); - - rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore); - if (!rx_buf) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n"); - ret = ofi_peer_cq_write_error_peek( - rxm_ep->util_ep.rx_cq, tag, context); - if (ret) - FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); - return; - } - - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message found\n"); - - if (flags & FI_DISCARD) { - dlist_remove(&rx_buf->unexp_msg.entry); - rxm_discard_recv(rxm_ep, rx_buf, context); - return; - } - - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Marking message for Claim\n"); - ((struct fi_context *)context)->internal[0] = rx_buf; - dlist_remove(&rx_buf->unexp_msg.entry); - } - - 
ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - rx_buf->pkt.hdr.size, NULL, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); -} - -static ssize_t -rxm_post_trecv(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags, - &rxm_ep->trecv_queue); - if (!recv_entry) - return -FI_EAGAIN; - - rx_buf = rxm_get_unexp_msg(&rxm_ep->trecv_queue, recv_entry->addr, - recv_entry->tag, recv_entry->ignore); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &rxm_ep->trecv_queue.recv_list); - return FI_SUCCESS; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) - return rxm_handle_rx_buf(rx_buf); - else - return rxm_handle_unexp_sar(&rxm_ep->trecv_queue, recv_entry, - rx_buf); -} - -static ssize_t -rxm_trecv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t op_flags) -{ - ssize_t ret; - - if (op_flags & FI_PEER_TRANSFER) - tag |= RXM_PEER_XFER_TAG_FLAG; - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - ret = rxm_post_trecv(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags); - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - static ssize_t rxm_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, uint64_t flags) { - struct rxm_ep *rxm_ep; - struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx; - struct rxm_rx_buf *rx_buf; - void *context = msg->context; - ssize_t ret = 0; - - rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - flags |= 
rxm_ep->util_ep.rx_msg_flags; - - if (!(flags & (FI_CLAIM | FI_PEEK)) && - !(rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV)) { - return rxm_trecv_common(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, - msg->tag, msg->ignore, context, flags); - } - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) { - recv_ctx = msg->context; - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - goto claim; - } - - assert(flags & FI_DISCARD); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n"); - rxm_free_rx_buf(rx_buf); - goto unlock; - } - - if (flags & FI_PEEK) { - rxm_peek_recv(rxm_ep, msg->addr, msg->tag, msg->ignore, - context, flags, &rxm_ep->trecv_queue); - goto unlock; - } - - rx_buf = ((struct fi_context *) context)->internal[0]; - assert(rx_buf); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Claim message\n"); - - if (flags & FI_DISCARD) { - rxm_discard_recv(rxm_ep, rx_buf, context); - goto unlock; - } - -claim: - assert (flags & FI_CLAIM); - recv_entry = rxm_recv_entry_get(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, - msg->tag, msg->ignore, context, flags, - &rxm_ep->trecv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto unlock; - } - - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) - recv_entry->comp_flags |= FI_CLAIM; + uint64_t tag = msg->tag; + struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, + util_ep.ep_fid.fid); - rx_buf->recv_entry = recv_entry; - ret = rxm_handle_rx_buf(rx_buf); + if (flags & FI_PEER_TRANSFER) + tag |= RXM_PEER_XFER_TAG_FLAG; -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, msg->msg_iov, + msg->desc, msg->iov_count, msg->addr, + msg->context, tag, msg->ignore, + flags | rxm_ep->util_ep.rx_msg_flags); } static ssize_t @@ -240,8 +72,9 
@@ rxm_trecv(struct fid_ep *ep_fid, void *buf, size_t len, }; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_trecv_common(rxm_ep, &iov, &desc, 1, src_addr, tag, ignore, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, tag, ignore, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -252,8 +85,9 @@ rxm_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, struct rxm_ep *rxm_ep; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_trecv_common(rxm_ep, iov, desc, count, src_addr, tag, - ignore, context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, iov, desc, count, + src_addr, context, tag, ignore, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -372,7 +206,7 @@ rxm_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data, rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA, - tag, ofi_op_tagged); + tag, ofi_op_tagged); unlock: ofi_genlock_unlock(&rxm_ep->util_ep.lock); return ret; @@ -416,6 +250,18 @@ struct fi_ops_tagged rxm_tagged_ops = { .injectdata = rxm_tinjectdata, }; +struct fi_ops_tagged rxm_no_recv_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = rxm_tsend, + .sendv = rxm_tsendv, + .sendmsg = rxm_tsendmsg, + .inject = rxm_tinject, + .senddata = rxm_tsenddata, + .injectdata = rxm_tinjectdata, +}; static ssize_t rxm_trecv_thru(struct fid_ep *ep_fid, void *buf, size_t len, diff --git a/prov/tcp/src/xnet_av.c b/prov/tcp/src/xnet_av.c index 14b82ccdafd..80b18f2a568 100644 --- a/prov/tcp/src/xnet_av.c +++ b/prov/tcp/src/xnet_av.c @@ -38,7 +38,7 @@ int xnet_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context) { return rxm_util_av_open(domain_fid, attr, 
fid_av, context, - sizeof(struct xnet_conn), NULL); + sizeof(struct xnet_conn), NULL, NULL); } static int xnet_mplex_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, diff --git a/prov/util/src/rxm_av.c b/prov/util/src/rxm_av.c index a5e30c95026..beb11d0620c 100644 --- a/prov/util/src/rxm_av.c +++ b/prov/util/src/rxm_av.c @@ -281,6 +281,8 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, { struct rxm_av *av; fi_addr_t *user_ids = NULL; + struct dlist_entry *av_entry; + struct util_ep *util_ep; int ret; if (flags & FI_AV_USER_ID) { @@ -303,6 +305,14 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, goto out; } + if (!av->foreach_ep) + goto out; + + dlist_foreach(&av->util_av.ep_list, av_entry) { + util_ep = container_of(av_entry, struct util_ep, av_entry); + av->foreach_ep(&av->util_av, util_ep); + } + out: free(user_ids); if (ret) @@ -420,7 +430,9 @@ static struct fi_ops_av rxm_av_ops = { int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context, size_t conn_size, void (*remove_handler)(struct util_ep *util_ep, - struct util_peer_addr *peer)) + struct util_peer_addr *peer), + void (*foreach_ep)(struct util_av *av, struct util_ep *ep)) + { struct util_domain *domain; struct util_av_attr util_attr; @@ -457,6 +469,7 @@ int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, av->util_av.av_fid.fid.ops = &rxm_av_fi_ops; av->util_av.av_fid.ops = &rxm_av_ops; av->util_av.remove_handler = remove_handler; + av->foreach_ep = foreach_ep; *fid_av = &av->util_av.av_fid; return 0; From 3b85472adf15ce16fb2d43b71e219cc1f5750806 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 3 Dec 2024 10:55:09 -0500 Subject: [PATCH 294/393] prov/lnx: Convert peer table to use buffer pools Convert peer table to use buffer pools in order to utilize the built-in capabilities of expanding the table as more peers are added dynamically. 
The peer table is protected by the domain's genlock. Signed-off-by: Amir Shehata --- prov/lnx/include/lnx.h | 19 ++---- prov/lnx/src/lnx_av.c | 142 +++++++++++++++-------------------------- prov/lnx/src/lnx_ops.c | 69 ++++++++++++-------- 3 files changed, 97 insertions(+), 133 deletions(-) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h index 450324d5d92..e6ed95f2efa 100644 --- a/prov/lnx/include/lnx.h +++ b/prov/lnx/include/lnx.h @@ -33,7 +33,6 @@ #ifndef LNX_H #define LNX_H -#define LNX_DEF_AV_SIZE 1024 #define LNX_MAX_LOCAL_EPS 16 #define LNX_IOV_LIMIT 4 @@ -180,6 +179,7 @@ struct lnx_peer_prov { struct lnx_peer { /* true if peer can be reached over shared memory, false otherwise */ bool lp_local; + fi_addr_t lp_fi_addr; /* Each provider that we can reach the peer on will have an entry * below. Each entry will contain all the local provider endpoints we @@ -200,10 +200,9 @@ struct lnx_peer { struct lnx_peer_table { struct util_av lpt_av; int lpt_max_count; - int lpt_count; struct lnx_domain *lpt_domain; - /* an array of peer entries */ - struct lnx_peer **lpt_entries; + /* an array of peer entries of type struct lnx_peer */ + struct ofi_bufpool *lpt_entries; }; struct lnx_ctx { @@ -293,6 +292,9 @@ int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct fid_av **av, void *context); +struct lnx_peer * +lnx_av_lookup_addr(struct lnx_peer_table *peer_tbl, fi_addr_t addr); + int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq, void *context); @@ -314,15 +316,6 @@ void lnx_free_entry(struct fi_peer_rx_entry *entry); void lnx_foreach_unspec_addr(struct fid_peer_srx *srx, fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)); -static inline struct lnx_peer * -lnx_get_peer(struct lnx_peer **peers, fi_addr_t addr) -{ - if (!peers || addr == FI_ADDR_UNSPEC) - return NULL; - - return peers[addr]; -} - static inline void lnx_get_core_desc(struct 
lnx_mem_desc *desc, void **mem_desc) { diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c index f0b8d09fb86..60a26f1ea28 100644 --- a/prov/lnx/src/lnx_av.c +++ b/prov/lnx/src/lnx_av.c @@ -58,76 +58,25 @@ #include "rdma/fi_ext.h" #include "lnx.h" -static void lnx_free_peer(struct lnx_peer *lp) +struct lnx_peer * +lnx_av_lookup_addr(struct lnx_peer_table *peer_tbl, fi_addr_t addr) { - struct lnx_peer_prov *lpp; - struct dlist_entry *tmp, *tmp2; - struct lnx_local2peer_map *lpm; + struct lnx_peer *entry; - dlist_foreach_container_safe(&lp->lp_provs, - struct lnx_peer_prov, lpp, entry, tmp) { - dlist_foreach_container_safe(&lpp->lpp_map, - struct lnx_local2peer_map, lpm, entry, tmp2) { - dlist_remove(&lpm->entry); - free(lpm); - } - dlist_remove(&lpp->entry); - free(lpp); - } + if (addr == FI_ADDR_UNSPEC) + return NULL; - free(lp); -} - -#if ENABLE_DEBUG -static void lnx_print_peer(int idx, struct lnx_peer *lp) -{ - int k; - struct lnx_peer_prov *lpp; - struct lnx_local2peer_map *lpm; + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); - FI_DBG(&lnx_prov, FI_LOG_CORE, - "%d: lnx_peer[%d] is %s\n", getpid(), idx, - (lp->lp_local) ? 
"local" : "remote"); - dlist_foreach_container(&lp->lp_provs, - struct lnx_peer_prov, lpp, entry) { - FI_DBG(&lnx_prov, FI_LOG_CORE, - "%d: peer[%p] provider %s\n", getpid(), lpp, - lpp->lpp_prov_name); - dlist_foreach_container(&lpp->lpp_map, - struct lnx_local2peer_map, lpm, entry) { - FI_DBG(&lnx_prov, FI_LOG_CORE, - " %d: peer has %d mapped addrs\n", - getpid(), lpm->addr_count); - for (k = 0; k < lpm->addr_count; k++) - FI_DBG(&lnx_prov, FI_LOG_CORE, - " %d: addr = %lu\n", - getpid(), lpm->peer_addrs[k]); - } - } -} -#endif /* ENABLE_DEBUG */ + entry = ofi_bufpool_get_ibuf(peer_tbl->lpt_entries, addr); -static int lnx_peer_insert(struct lnx_peer_table *tbl, - struct lnx_peer *lp) -{ - int i; + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); - if (tbl->lpt_max_count == 0 || - tbl->lpt_count >= tbl->lpt_max_count) - return -FI_ENOENT; - - for (i = 0; i < tbl->lpt_max_count; i++) { - if (!tbl->lpt_entries[i]) { - tbl->lpt_entries[i] = lp; -#if ENABLE_DEBUG - lnx_print_peer(i, lp); -#endif - tbl->lpt_count++; - return i; - } - } + if (!entry) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Invalid fi_addr %#lx\n", addr); - return -FI_ENOENT; + return entry; } static int lnx_peer_av_remove(struct lnx_peer *lp) @@ -160,19 +109,22 @@ static int lnx_peer_av_remove(struct lnx_peer *lp) return frc; } -static int lnx_peer_remove(struct lnx_peer_table *tbl, int idx) +static int lnx_peer_remove(struct lnx_peer_table *tbl, fi_addr_t addr) { - struct lnx_peer *lp = tbl->lpt_entries[idx]; + struct lnx_peer *lp = NULL; int rc = 0; + ofi_genlock_lock(&tbl->lpt_domain->ld_domain.lock); + lp = ofi_bufpool_get_ibuf(tbl->lpt_entries, addr); if (!lp) - return 0; + goto out; rc = lnx_peer_av_remove(lp); - tbl->lpt_entries[idx] = NULL; - tbl->lpt_count--; + ofi_ibuf_free(lp); +out: + ofi_genlock_unlock(&tbl->lpt_domain->ld_domain.lock); return rc; } @@ -193,7 +145,7 @@ static int lnx_cleanup_avs(struct local_prov *prov) static inline void lnx_free_peer_tbl(struct lnx_peer_table 
*peer_tbl) { - free(peer_tbl->lpt_entries); + ofi_bufpool_destroy(peer_tbl->lpt_entries); free(peer_tbl); } @@ -501,10 +453,14 @@ int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, la->la_prov_count <= 0) return -FI_EPROTO; - /* this is a local peer */ - lp = calloc(sizeof(*lp), 1); - if (!lp) + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + lp = ofi_ibuf_alloc(peer_tbl->lpt_entries); + if (!lp) { + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); return -FI_ENOMEM; + } + idx = ofi_buf_index(lp); + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); dlist_init(&lp->lp_provs); @@ -521,20 +477,18 @@ int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, rc = lnx_peer_map_addrs(prov_table, lp, la, flags, context); if (rc) { - free(lp); + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + ofi_ibuf_free(lp); + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); return rc; } - idx = lnx_peer_insert(peer_tbl, lp); - if (idx == -1) { - rc = lnx_peer_av_remove(lp); - lnx_free_peer(lp); - FI_INFO(&lnx_prov, FI_LOG_CORE, - "Peer table size exceeded. 
Removed = %d\n", rc); - return -FI_ENOENT; - } + if (flags & FI_AV_USER_ID) + lp->lp_fi_addr = fi_addr[i]; + else + lp->lp_fi_addr = idx; - fi_addr[i] = (fi_addr_t) idx; + fi_addr[i] = idx; la = next_peer(la); } @@ -622,8 +576,12 @@ int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct lnx_domain *lnx_domain; struct lnx_peer_table *peer_tbl; struct local_prov *entry; - size_t table_sz = LNX_DEF_AV_SIZE; + size_t table_sz; int rc = 0; + struct ofi_bufpool_attr pool_attr = { + .size = sizeof(struct lnx_peer), + .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED, + }; if (!attr) return -FI_EINVAL; @@ -634,24 +592,24 @@ int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, if (attr->type != FI_AV_TABLE) attr->type = FI_AV_TABLE; + lnx_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid.fid); + fabric = lnx_domain->ld_fabric; + peer_tbl = calloc(sizeof(*peer_tbl), 1); if (!peer_tbl) return -FI_ENOMEM; - if (attr->count != 0) - table_sz = attr->count; + table_sz = attr->count ? attr->count : ofi_universe_size; + table_sz = roundup_power_of_two(table_sz); + pool_attr.chunk_cnt = table_sz; - peer_tbl->lpt_entries = - calloc(sizeof(struct lnx_peer *) * table_sz, 1); - if (!peer_tbl->lpt_entries) { + rc = ofi_bufpool_create_attr(&pool_attr, &peer_tbl->lpt_entries); + if (rc) { rc = -FI_ENOMEM; goto failed; } - lnx_domain = container_of(domain, struct lnx_domain, - ld_domain.domain_fid.fid); - fabric = lnx_domain->ld_fabric; - rc = ofi_av_init_lightweight(&lnx_domain->ld_domain, attr, &peer_tbl->lpt_av, context); if (rc) { diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c index 7d94b7c9352..2c6b725c0ac 100644 --- a/prov/lnx/src/lnx_ops.c +++ b/prov/lnx/src/lnx_ops.c @@ -416,7 +416,7 @@ ssize_t lnx_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc, * multiple endpoints. Each endpoint has its own fi_addr_t which is * core provider specific. 
*/ - lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + lp = lnx_av_lookup_addr(peer_tbl, src_addr); if (lp) { rc = lnx_select_recv_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc); @@ -464,7 +464,7 @@ ssize_t lnx_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc, peer_tbl = lep->le_peer_tbl; lnx_get_core_desc(*desc, &mem_desc); - lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + lp = lnx_av_lookup_addr(peer_tbl, src_addr); if (lp) { rc = lnx_select_recv_pathway(lp, lep->le_domain, *desc, &cep, &core_addr, iov, count, &mre, &mem_desc); @@ -509,7 +509,7 @@ ssize_t lnx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, peer_tbl = lep->le_peer_tbl; - lp = lnx_get_peer(peer_tbl->lpt_entries, msg->addr); + lp = lnx_av_lookup_addr(peer_tbl, msg->addr); if (lp) { rc = lnx_select_recv_pathway(lp, lep->le_domain, *msg->desc, &cep, &core_addr, msg->msg_iov, @@ -549,6 +549,7 @@ ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -562,8 +563,8 @@ ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, NULL); if (rc) return rc; @@ -585,6 +586,7 @@ ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -597,8 +599,8 @@ ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, (desc) ? 
*desc : NULL, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, (desc) ? *desc : NULL, &cep, &core_addr, iov, count, &mre, &mem_desc, NULL); if (rc) return rc; @@ -619,6 +621,7 @@ ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -632,8 +635,8 @@ ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[msg->addr], - lep->le_domain, + lp = lnx_av_lookup_addr(peer_tbl, msg->addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, (msg->desc) ? *msg->desc : NULL, &cep, &core_addr, msg->msg_iov, msg->iov_count, &mre, &mem_desc, NULL); @@ -661,6 +664,7 @@ ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -672,8 +676,8 @@ ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, NULL, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep, &core_addr, NULL, 0, &mre, NULL, NULL); if (rc) return rc; @@ -695,6 +699,7 @@ ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -708,8 +713,8 @@ ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, 
lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, NULL); if (rc) return rc; @@ -732,6 +737,7 @@ ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -743,8 +749,8 @@ ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, NULL, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep, &core_addr, NULL, 0, &mre, NULL, NULL); if (rc) return rc; @@ -767,6 +773,7 @@ lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -783,8 +790,8 @@ lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[src_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, src_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, &rkey); if (rc) goto out; @@ -810,6 +817,7 @@ lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -826,9 +834,9 @@ lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, - &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); if (rc) goto out; @@ -856,6 +864,7 @@ 
lnx_atomic_write(struct fid_ep *ep, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -872,8 +881,8 @@ lnx_atomic_write(struct fid_ep *ep, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, &rkey); if (rc) goto out; @@ -902,6 +911,7 @@ lnx_atomic_readwrite(struct fid_ep *ep, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -918,9 +928,10 @@ lnx_atomic_readwrite(struct fid_ep *ep, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, - &mre, &mem_desc, &rkey); + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, result_desc, + &cep, &core_addr, &iov, 1, + &mre, &mem_desc, &rkey); if (rc) goto out; @@ -950,6 +961,7 @@ lnx_atomic_compwrite(struct fid_ep *ep, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -966,9 +978,10 @@ lnx_atomic_compwrite(struct fid_ep *ep, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, result_desc, &cep, &core_addr, &iov, 1, - &mre, &mem_desc, &rkey); + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, result_desc, &cep, + &core_addr, &iov, 1, + &mre, &mem_desc, &rkey); if (rc) goto out; From 65581c10b3eedc98c80566502f5f96f52324565d Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 5 Dec 2024 14:42:01 -0500 Subject: [PATCH 295/393] prov/lnx: Initialize flags to 0 flags is allocated on the stack which might have some random values. 
Ensure it's initialized to 0 because if sent to SHM provider uninitialized it could cause the provider to misbehave, since its value is being checked. Signed-off-by: Amir Shehata --- prov/lnx/include/lnx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h index e6ed95f2efa..3d6506891e4 100644 --- a/prov/lnx/include/lnx.h +++ b/prov/lnx/include/lnx.h @@ -336,7 +336,7 @@ int lnx_create_mr(const struct iovec *iov, fi_addr_t addr, struct fi_mr_attr attr = {}; struct fi_mr_attr cur_abi_attr; struct ofi_mr_info info = {}; - uint64_t flags; + uint64_t flags = 0; int rc; attr.iov_count = 1; From 834197094e81f1afd87c83cd62275c4df309624b Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Wed, 4 Dec 2024 12:16:53 -0800 Subject: [PATCH 296/393] prov/ucx: Fix incorrect return value checking for fi_param_get() When reading the two runtime parameters the condition for the actions was inverted by mistake. Signed-off-by: Jianxin Xiong --- prov/ucx/src/ucx_domain.c | 2 +- prov/ucx/src/ucx_fabric.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prov/ucx/src/ucx_domain.c b/prov/ucx/src/ucx_domain.c index f608ba66574..2e03bd4b8dc 100644 --- a/prov/ucx/src/ucx_domain.c +++ b/prov/ucx/src/ucx_domain.c @@ -368,7 +368,7 @@ int ucx_domain_open(struct fid_fabric *fabric, struct fi_info *info, return -ENOMEM; ofi_status = fi_param_get_size_t(NULL, "universe_size", &univ_size); - if (ofi_status) { + if (ofi_status == FI_SUCCESS) { params.estimated_num_eps = univ_size; params.field_mask |= UCP_PARAM_FIELD_ESTIMATED_NUM_EPS; } diff --git a/prov/ucx/src/ucx_fabric.c b/prov/ucx/src/ucx_fabric.c index 106c7127db1..eba1263a9b7 100644 --- a/prov/ucx/src/ucx_fabric.c +++ b/prov/ucx/src/ucx_fabric.c @@ -87,7 +87,7 @@ static char* ucx_local_host_resolve() char *result = NULL; status = fi_param_get(&ucx_prov, "ns_iface", &iface); - if (!status) + if (status != FI_SUCCESS) iface = NULL; if (getifaddrs(&ifaddr) == -1) { 
From 409c00d953687786ac614117f1302dd0f55b97e4 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Fri, 6 Dec 2024 16:34:17 -0800 Subject: [PATCH 297/393] prov/ucx: Fix segfault in ucx_send_callback In one code path, the request was not initialized before the callback function is called. As a result, NULL cq was dereferenced, leading to segfault. Signed-off-by: Jianxin Xiong --- prov/ucx/src/ucx_core.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/prov/ucx/src/ucx_core.c b/prov/ucx/src/ucx_core.c index d1ffc947296..058770b29b9 100644 --- a/prov/ucx/src/ucx_core.c +++ b/prov/ucx/src/ucx_core.c @@ -94,6 +94,25 @@ ssize_t ucx_do_sendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, return ucx_translate_errcode(*(ucs_status_t*)status); } + if (UCS_PTR_STATUS(status) != UCS_OK) { + struct ucx_request *req = (struct ucx_request *)status; + + /* + * Set up the req fields before the callback function is called + * (in ucp_worker_progress or ucp_worker_flush). + */ + req->ep = u_ep; + if (!no_completion) { + req->completion.op_context = msg->context; + req->completion.flags = FI_SEND | + (mode == UCX_MSG ? FI_MSG : FI_TAGGED); + req->completion.len = msg->msg_iov[0].iov_len; + req->completion.buf = msg->msg_iov[0].iov_base; + req->completion.tag = msg->tag; + req->cq = cq; + } + } + if (flags & FI_INJECT) { if(UCS_PTR_STATUS(status) != UCS_OK) { while ((cstatus = ucp_request_check_status(status)) @@ -110,13 +129,6 @@ ssize_t ucx_do_sendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, goto done; } - if (no_completion) { - if (UCS_PTR_STATUS(status) != UCS_OK) - goto fence; - - goto done; - } - if (msg->context) { struct fi_context *ctx = ((struct fi_context*)(msg->context)); @@ -129,16 +141,6 @@ ssize_t ucx_do_sendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, * Not done yet. completion will be handled by the callback * function. 
 */ - struct ucx_request *req = (struct ucx_request *)status; - - req->completion.op_context = msg->context; - req->completion.flags = FI_SEND | - (mode == UCX_MSG ? FI_MSG : FI_TAGGED); - req->completion.len = msg->msg_iov[0].iov_len; - req->completion.buf = msg->msg_iov[0].iov_base; - req->completion.tag = msg->tag; - req->ep = u_ep; - req->cq = cq; goto fence; } From 7953ad79abb4a95a81591e74bb30f28b2f7208c1 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Fri, 6 Dec 2024 22:51:04 -0800 Subject: [PATCH 298/393] Makefile.am: Keep using libfabric.so.1 as the soname Since the ABI is still backward compatible, there is no need to bump the library major version which unnecessarily introduces incompatibility between RPM packages due to soname change. Signed-off-by: Jianxin Xiong --- Makefile.am | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Makefile.am b/Makefile.am index 204352db93b..359669202dc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -223,7 +223,7 @@ src_libfabric_la_LIBADD = src_libfabric_la_DEPENDENCIES = libfabric.map if !EMBEDDED -src_libfabric_la_LDFLAGS += -version-info 2:0:0 +src_libfabric_la_LDFLAGS += -version-info 27:0:26 endif src_libfabric_la_LDFLAGS += -export-dynamic \ $(libfabric_version_script) @@ -451,9 +451,6 @@ dist-hook: libfabric.spec cp libfabric.spec $(distdir) perl $(top_srcdir)/config/distscript.pl "$(distdir)" "$(PACKAGE_VERSION)" -install-exec-hook: - ln -sf libfabric.so.2 $(DESTDIR)$(libdir)/libfabric.so.1 - TESTS = \ util/fi_info From 6b8f5db6c1bfcff1cd81f16d46a9607f4e8d8ffb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:32:49 +0000 Subject: [PATCH 299/393] build(deps): bump github/codeql-action from 3.27.5 to 3.27.6 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.27.5 to 3.27.6. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/f09c1c0a94de965c15400f5634aa42fac8fb8f88...aa578102511db1f4524ed59b8cc2bae4f6e88195) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index ce6ccf224b0..a42caa6a4d6 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 + uses: github/codeql-action/init@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 + uses: github/codeql-action/autobuild@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 # â„šī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 + uses: github/codeql-action/analyze@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c0fad16a230..409f3fb9bcb 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 + uses: github/codeql-action/upload-sarif@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 with: sarif_file: results.sarif From 5eff15c1e062e86b100198c2806a1b5c0b7be3df Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 9 Dec 2024 11:38:22 -0800 Subject: [PATCH 300/393] prov/shm: fix shm multi recv setopt segfault shm uses the util srx and sets the minimum multi receive size through the srx. However, the srx code doesn't get initialized until the endpoint gets enabled. So if the application calls setopt (before FI_ENABLE), this will segfault because the srx has not been initialized. 
Instead, we need to save the multi recv size in the shm endpoint to be valid during setopt and then pass that into the util_srx creation to set the multi recv size Signed-off-by: Alexia Ingerson --- prov/shm/src/smr.h | 1 + prov/shm/src/smr_ep.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/prov/shm/src/smr.h b/prov/shm/src/smr.h index 52c534097e1..d4287fdba88 100644 --- a/prov/shm/src/smr.h +++ b/prov/shm/src/smr.h @@ -229,6 +229,7 @@ struct smr_ep { struct smr_tx_fs *tx_fs; struct dlist_entry sar_list; struct dlist_entry ipc_cpy_pend_list; + size_t min_multi_recv_size; int ep_idx; enum ofi_shm_p2p_type p2p_type; diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index dd3d7f53f07..82bc95200b1 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -128,14 +128,12 @@ int smr_ep_setopt(fid_t fid, int level, int optname, const void *optval, { struct smr_ep *smr_ep = container_of(fid, struct smr_ep, util_ep.ep_fid); - struct util_srx_ctx *srx; if (level != FI_OPT_ENDPOINT) return -FI_ENOPROTOOPT; if (optname == FI_OPT_MIN_MULTI_RECV) { - srx = smr_ep->srx->ep_fid.fid.context; - srx->min_multi_recv_size = *(size_t *)optval; + smr_ep->min_multi_recv_size = *(size_t *)optval; return FI_SUCCESS; } @@ -1146,7 +1144,7 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) util_domain.domain_fid); ret = util_ep_srx_context(&domain->util_domain, ep->rx_size, SMR_IOV_LIMIT, - SMR_INJECT_SIZE, &smr_update, + ep->min_multi_recv_size, &smr_update, &ep->util_ep.lock, &srx); if (ret) return ret; @@ -1308,6 +1306,8 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, dlist_init(&ep->sar_list); dlist_init(&ep->ipc_cpy_pend_list); + ep->min_multi_recv_size = SMR_INJECT_SIZE; + ep->util_ep.ep_fid.fid.ops = &smr_ep_fi_ops; ep->util_ep.ep_fid.ops = &smr_ep_ops; ep->util_ep.ep_fid.cm = &smr_cm_ops; From 95a7e8489b48707548f3448671b57bfecf62a701 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 9 Dec 2024 
11:41:24 -0800 Subject: [PATCH 301/393] prov/rxm: fix rxm multi recv setopt segfault rxm uses the util srx and sets the minimum multi receive size through the srx. However, the srx code doesn't get initialized until the endpoint gets enabled. So if the application calls setopt (before FI_ENABLE), this will segfault because the srx has not been initialized. Instead, we need to save the multi recv size in the rxm endpoint to be valid during setopt and then pass that into the util_srx creation to set the multi recv size Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm.h | 1 + prov/rxm/src/rxm_ep.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 93e08624fc1..fa570b455a4 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -677,6 +677,7 @@ struct rxm_ep { size_t eager_limit; size_t sar_limit; size_t tx_credit; + size_t min_multi_recv_size; struct ofi_bufpool *rx_pool; struct ofi_bufpool *tx_pool; diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index b967643c0c5..de375cc010d 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -295,8 +295,8 @@ static int rxm_ep_setopt(fid_t fid, int level, int optname, switch (optname) { case FI_OPT_MIN_MULTI_RECV: - return rxm_ep->srx->ep_fid.ops->setopt(&rxm_ep->srx->ep_fid.fid, - level, optname, optval, optlen); + rxm_ep->min_multi_recv_size = *(size_t *)optval; + return ret; case FI_OPT_BUFFERED_MIN: if (rxm_ep->rx_pool) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, @@ -1144,6 +1144,7 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) assert(!rxm_ep->buffered_limit); rxm_ep->buffered_limit = rxm_buffer_size; + rxm_ep->min_multi_recv_size = rxm_buffer_size; rxm_config_direct_send(rxm_ep); rxm_ep_init_proto(rxm_ep); @@ -1364,7 +1365,7 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg) util_domain.domain_fid); ret = util_ep_srx_context(&domain->util_domain, ep->rxm_info->rx_attr->size, - RXM_IOV_LIMIT, 
rxm_buffer_size, + RXM_IOV_LIMIT, ep->min_multi_recv_size, &rxm_update, &ep->util_ep.lock, &srx); if (ret) From 38adf06ccdcc385258ffdb0c1d7d498dee26a901 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 9 Dec 2024 23:48:59 +0000 Subject: [PATCH 302/393] prov/efa: fix efa multi recv setopt segfault efa uses the util srx and sets the minimum multi receive size through the srx. However, the srx code doesn't get initialized until the endpoint gets enabled. So if the application calls setopt (before FI_ENABLE), this will segfault because the srx has not been initialized. Instead, we need to save the multi recv size in the efa endpoint to be valid during setopt and then pass that into the util_srx creation to set the multi recv size Signed-off-by: Alexia Ingerson Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 3 --- prov/efa/test/efa_unit_test_srx.c | 12 +++++------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 98e1d0b4375..47b3f53f9bd 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1650,7 +1650,6 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, { struct efa_rdm_ep *efa_rdm_ep; int intval, ret; - struct util_srx_ctx *srx; efa_rdm_ep = container_of(fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); @@ -1663,8 +1662,6 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return -FI_EINVAL; efa_rdm_ep->min_multi_recv_size = *(size_t *)optval; - srx = util_get_peer_srx(efa_rdm_ep->peer_srx_ep)->ep_fid.fid.context; - srx->min_multi_recv_size = *(size_t *)optval; break; case FI_OPT_EFA_RNR_RETRY: if (optlen != sizeof(size_t)) diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c index 733faa67d57..e0bff95169b 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -18,21 +18,19 @@ void 
test_efa_srx_min_multi_recv_size(struct efa_resource **state) struct util_srx_ctx *srx_ctx; size_t min_multi_recv_size_new; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); - /* - * After ep is enabled, the srx->min_multi_recv_size should be - * exactly the same with ep->min_multi_recv_size - */ - assert_true(efa_rdm_ep->min_multi_recv_size == srx_ctx->min_multi_recv_size); /* Set a new min_multi_recv_size via setopt*/ min_multi_recv_size_new = 1024; assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, &min_multi_recv_size_new, sizeof(min_multi_recv_size_new)), 0); + /* Enable EP */ + assert_int_equal(fi_enable(resource->ep), FI_SUCCESS); + /* Check whether srx->min_multi_recv_size is set correctly */ + srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); assert_true(srx_ctx->min_multi_recv_size == min_multi_recv_size_new); } From 9e711f987f0429cd70e7f52747f1b1bee72ec125 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 9 Dec 2024 17:39:19 -0800 Subject: [PATCH 303/393] functional/multi_recv: add opts.min_multi_recv_size to set opt before enable fi_setopt has to be called before enabling an endpoint. This adds an opt arg to allow setting this option in the common code like the other EP options. 
Signed-off-by: Alexia Ingerson --- fabtests/common/shared.c | 12 +++++++++++- fabtests/functional/multi_recv.c | 6 +----- fabtests/include/shared.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index a7d548da097..f485afb484d 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -1463,7 +1463,8 @@ int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av * } if (opts.max_msg_size) { - ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &opts.max_msg_size, sizeof opts.max_msg_size); + ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, + &opts.max_msg_size, sizeof opts.max_msg_size); if (ret && ret != -FI_EOPNOTSUPP) { FT_PRINTERR("fi_setopt(FI_OPT_MAX_MSG_SIZE)", ret); return ret; @@ -1485,6 +1486,15 @@ int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av * } } + if (opts.min_multi_recv_size) { + ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, + &opts.min_multi_recv_size, sizeof opts.min_multi_recv_size); + if (ret && ret != -FI_EOPNOTSUPP) { + FT_PRINTERR("fi_setopt(FI_OPT_MIN_MULTI_RECV_SIZE)", ret); + return ret; + } + } + ret = fi_enable(bind_ep); if (ret) { FT_PRINTERR("fi_enable", ret); diff --git a/fabtests/functional/multi_recv.c b/fabtests/functional/multi_recv.c index 8b698e90f5d..58eac21b951 100644 --- a/fabtests/functional/multi_recv.c +++ b/fabtests/functional/multi_recv.c @@ -263,11 +263,6 @@ static int run(void) if (ret) return ret; - ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, - &tx_size, sizeof(tx_size)); - if (ret) - return ret; - ret = post_multi_recv_buffer(); if (ret) return ret; @@ -327,6 +322,7 @@ int main(int argc, char **argv) return EIO; } + opts.min_multi_recv_size = opts.transfer_size; hints->caps = FI_MSG | FI_MULTI_RECV; hints->mode = FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; diff --git 
a/fabtests/include/shared.h b/fabtests/include/shared.h index 2a798ecbaa7..679241b9a2d 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -191,6 +191,7 @@ struct ft_opts { size_t transfer_size; size_t max_msg_size; size_t inject_size; + size_t min_multi_recv_size; int window_size; int av_size; int verbose; From cafbae7a68ca73367621146901d784b54a0f0ae2 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Sat, 7 Dec 2024 08:56:00 -0800 Subject: [PATCH 304/393] prov/psm2: Check return value of asprintf This fix a compiler warning of unused return value. Signed-off-by: Jianxin Xiong --- prov/psm2/src/psmx2_attr.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/prov/psm2/src/psmx2_attr.c b/prov/psm2/src/psmx2_attr.c index f7a1f5496d6..920ed918fe8 100644 --- a/prov/psm2/src/psmx2_attr.c +++ b/prov/psm2/src/psmx2_attr.c @@ -335,6 +335,7 @@ void psmx2_update_prov_info(struct fi_info *info, struct psmx2_ep_name *dest_addr) { struct fi_info *p; + int ret; for (p = info; p; p = p->next) { psmx2_dup_addr(p->addr_format, src_addr, @@ -363,10 +364,17 @@ void psmx2_update_prov_info(struct fi_info *info, } free(p->domain_attr->name); - if (unit == PSMX2_DEFAULT_UNIT) + if (unit == PSMX2_DEFAULT_UNIT) { p->domain_attr->name = strdup(psmx2_hfi_info.default_domain_name); - else - asprintf(&p->domain_attr->name, "hfi1_%d", unit); + } else { + ret = asprintf(&p->domain_attr->name, "hfi1_%d", unit); + if (ret < 0) { + p->domain_attr->name = NULL; + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to allocate domain name for HFI unit %d\n", + unit); + } + } p->tx_attr->inject_size = psmx2_env.inject_size; } From a149f51938a7fa65f05e00da3ce46358ccab91c0 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 26 Nov 2024 06:49:49 +0000 Subject: [PATCH 305/393] prov/efa: Skip rx pkt refill under certain threshold Libfabric currently refill the rx pkt pool in every cq read when there are >0 pkts to post, which makes it have chance to post ibv_recv 
1-by-1 if there is only 1 pkt to post per cq read. Such 1-by-1 post is less performant than having a batch post once. This patch improves this strategy by introducing a threshold for the refilling. When the number of internal rx pkts to post is lower than this threshold, the refill will be skipped. Also introduced FI_EFA_INTERNAL_RX_REFILL_THRESHOLD that allows tuning this parameter. Signed-off-by: Shi Jin --- man/fi_efa.7.md | 5 ++ prov/efa/src/efa_env.c | 4 ++ prov/efa/src/efa_env.h | 6 +++ prov/efa/src/rdm/efa_rdm_ep.h | 2 + prov/efa/src/rdm/efa_rdm_ep_utils.c | 6 ++- prov/efa/test/efa_unit_test_ep.c | 80 +++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 2 + prov/efa/test/efa_unit_tests.h | 2 + 8 files changed, 106 insertions(+), 1 deletion(-) diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index b6eefc19dc1..077f93c5515 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -338,6 +338,11 @@ for details. : Use device's unsolicited write recv functionality when it's available. (Default: 1). Setting this environment variable to 0 can disable this feature. +*FI_EFA_INTERNAL_RX_REFILL_THRESHOLD* +: The threshold that EFA provider will refill the internal rx pkt pool. (Default: 8). +When the number of internal rx pkts to post is lower than this threshold, +the refill will be skipped. 
+ # SEE ALSO [`fabric`(7)](fabric.7.html), diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index 79a315c7cbe..ef6eedd57ec 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -39,6 +39,7 @@ struct efa_env efa_env = { .use_sm2 = false, .huge_page_setting = EFA_ENV_HUGE_PAGE_UNSPEC, .use_unsolicited_write_recv = 1, + .internal_rx_refill_threshold = 8, }; /** @@ -132,6 +133,7 @@ void efa_env_param_get(void) &efa_mr_max_cached_size); fi_param_get_size_t(&efa_prov, "tx_size", &efa_env.tx_size); fi_param_get_size_t(&efa_prov, "rx_size", &efa_env.rx_size); + fi_param_get_size_t(&efa_prov, "internal_rx_refill_threshold", &efa_env.internal_rx_refill_threshold); fi_param_get_bool(&efa_prov, "rx_copy_unexp", &efa_env.rx_copy_unexp); fi_param_get_bool(&efa_prov, "rx_copy_ooo", @@ -232,6 +234,8 @@ void efa_env_define() "will use huge page unless FI_EFA_FORK_SAFE is set to 1/on/true."); fi_param_define(&efa_prov, "use_unsolicited_write_recv", FI_PARAM_BOOL, "Use device's unsolicited write recv functionality when it's available. (Default: true)"); + fi_param_define(&efa_prov, "internal_rx_refill_threshold", FI_PARAM_SIZE_T, + "The threshold that EFA provider will refill the internal rx pkt pool. (Default: %zu)", efa_env.internal_rx_refill_threshold); } diff --git a/prov/efa/src/efa_env.h b/prov/efa/src/efa_env.h index 6fdd83a4a21..dbff4182292 100644 --- a/prov/efa/src/efa_env.h +++ b/prov/efa/src/efa_env.h @@ -79,6 +79,12 @@ struct efa_env { int use_sm2; enum efa_env_huge_page_setting huge_page_setting; int use_unsolicited_write_recv; + /** + * The threshold that EFA provider will refill the internal rx pkt pool. + * When the number of internal rx pkts to post is lower than this threshold, + * the refill will be skipped. 
+ */ + size_t internal_rx_refill_threshold; }; extern struct efa_env efa_env; diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index d7a8fc5ddc2..b82741963ef 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -263,6 +263,8 @@ struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep); +int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep); + /** * @brief return whether this endpoint should write error cq entry for RNR. * diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 83d66a23991..12c3c519983 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -741,7 +741,11 @@ int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep) { int i, err; - if (ep->efa_rx_pkts_to_post == 0) + /** + * When efa_env.internal_rx_refill_threshold > efa_rdm_ep_get_rx_pool_size(ep), + * we should always refill when the pool is empty. 
+ */ + if (ep->efa_rx_pkts_to_post < MIN(efa_env.internal_rx_refill_threshold, efa_rdm_ep_get_rx_pool_size(ep))) return 0; assert(ep->efa_rx_pkts_to_post + ep->efa_rx_pkts_posted <= ep->efa_max_outstanding_rx_ops); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 375ada94683..1ac044ce00c 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1219,3 +1219,83 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res free(pkt_entry_vec); } + +static +void test_efa_rdm_ep_rx_refill_impl(struct efa_resource **state, int threshold, int rx_size) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_pke *pkt_entry; + int i; + size_t threshold_orig; + + if (threshold < 4 || rx_size < 4) { + fprintf(stderr, "Too small threshold or rx_size for this test\n"); + fail(); + } + + threshold_orig = efa_env.internal_rx_refill_threshold; + + efa_env.internal_rx_refill_threshold = threshold; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + resource->hints->rx_attr->size = rx_size; + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), + resource->hints, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + assert_int_equal(efa_rdm_ep_get_rx_pool_size(efa_rdm_ep), rx_size); + + /* Grow the rx pool and post rx pkts */ + efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep); + assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, efa_rdm_ep_get_rx_pool_size(efa_rdm_ep)); + + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); + for (i = 0; i < 4; i++) { + pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i); + assert_non_null(pkt_entry); + efa_rdm_pke_release_rx(pkt_entry); + } + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 4); + + efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep); + + /** + * efa_rx_pkts_to_post < 
FI_EFA_RX_REFILL_THRESHOLD + * pkts should NOT be refilled + */ + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 4); + assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size); + + /* releasing more pkts to reach the threshold or rx_size*/ + for (i = 4; i < MIN(rx_size, threshold); i++) { + pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i); + assert_non_null(pkt_entry); + efa_rdm_pke_release_rx(pkt_entry); + } + + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, i); + + efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep); + + /** + * efa_rx_pkts_to_post == min(FI_EFA_RX_REFILL_THRESHOLD, FI_EFA_RX_SIZE) + * pkts should be refilled + */ + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); + assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size + i); + + /* recover the original value */ + efa_env.internal_rx_refill_threshold = threshold_orig; +} + +void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(struct efa_resource **state) +{ + test_efa_rdm_ep_rx_refill_impl(state, 8, 64); +} + +void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource **state) +{ + test_efa_rdm_ep_rx_refill_impl(state, 128, 64); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 2232ea36059..2ada3f5d820 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -114,6 +114,8 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_eagain, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + 
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index d44368bc81f..96958b0255f 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -128,6 +128,8 @@ void test_efa_rdm_ep_close_discard_posted_recv(); void test_efa_rdm_ep_zcpy_recv_cancel(); void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); +void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); +void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll(); From 8e135b970738a79a1254dee349d97d6a10c07a63 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Tue, 10 Dec 2024 18:45:31 +0000 Subject: [PATCH 306/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- fabtests/man/man7/fabtests.7 | 28 +++++++++---- man/man1/fi_info.1 | 28 +++++++++---- man/man1/fi_pingpong.1 | 48 ++++++++++++++-------- man/man1/fi_strerror.1 | 26 +++++++++--- man/man3/fi_atomic.3 | 30 ++++++++++---- man/man3/fi_av.3 | 74 ++++++++++++++++++++-------------- man/man3/fi_av_set.3 | 24 ++++++++--- man/man3/fi_cm.3 | 24 ++++++++--- man/man3/fi_cntr.3 | 28 +++++++++---- man/man3/fi_collective.3 | 38 ++++++++++++------ man/man3/fi_control.3 | 22 ++++++++-- man/man3/fi_cq.3 | 44 +++++++++++++------- man/man3/fi_domain.3 | 48 ++++++++++++++-------- man/man3/fi_endpoint.3 | 78 
+++++++++++++++++++++--------------- man/man3/fi_eq.3 | 34 +++++++++++----- man/man3/fi_errno.3 | 20 +++++++-- man/man3/fi_fabric.3 | 32 ++++++++++----- man/man3/fi_getinfo.3 | 52 +++++++++++++++--------- man/man3/fi_mr.3 | 36 ++++++++++++----- man/man3/fi_msg.3 | 32 ++++++++++----- man/man3/fi_nic.3 | 28 +++++++++---- man/man3/fi_peer.3 | 30 ++++++++++---- man/man3/fi_poll.3 | 24 ++++++++--- man/man3/fi_profile.3 | 20 +++++++-- man/man3/fi_provider.3 | 28 +++++++++---- man/man3/fi_rma.3 | 30 ++++++++++---- man/man3/fi_tagged.3 | 32 ++++++++++----- man/man3/fi_trigger.3 | 22 ++++++++-- man/man3/fi_version.3 | 20 +++++++-- man/man7/fabric.7 | 52 +++++++++++++++--------- man/man7/fi_arch.7 | 18 ++++++++- man/man7/fi_cxi.7 | 78 +++++++++++++++++++++--------------- man/man7/fi_direct.7 | 22 ++++++++-- man/man7/fi_efa.7 | 56 +++++++++++++++++--------- man/man7/fi_guide.7 | 24 ++++++++--- man/man7/fi_hook.7 | 20 +++++++-- man/man7/fi_intro.7 | 28 +++++++++---- man/man7/fi_lnx.7 | 27 ++++++++++--- man/man7/fi_lpp.7 | 20 +++++++-- man/man7/fi_mrail.7 | 30 ++++++++++---- man/man7/fi_opx.7 | 78 +++++++++++++++++++++--------------- man/man7/fi_provider.7 | 54 ++++++++++++++++--------- man/man7/fi_psm2.7 | 28 +++++++++---- man/man7/fi_psm3.7 | 30 ++++++++++---- man/man7/fi_rxd.7 | 20 +++++++-- man/man7/fi_rxm.7 | 22 ++++++++-- man/man7/fi_setup.7 | 23 +++++++++-- man/man7/fi_shm.7 | 22 ++++++++-- man/man7/fi_sockets.7 | 20 +++++++-- man/man7/fi_tcp.7 | 20 +++++++-- man/man7/fi_ucx.7 | 20 +++++++-- man/man7/fi_udp.7 | 20 +++++++-- man/man7/fi_usnic.7 | 72 +++++++++++++++++++-------------- man/man7/fi_verbs.7 | 22 ++++++++-- 54 files changed, 1286 insertions(+), 520 deletions(-) diff --git a/fabtests/man/man7/fabtests.7 b/fabtests/man/man7/fabtests.7 index 22707d61522..acc82feb924 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH 
"fabtests" "7" "2024\-11\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fabtests" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -271,7 +285,7 @@ FI_ENORX) can be read by the application, if RNR happens. \f[I]fi_efa_rnr_queue_resend\f[R] This test modifies the RNR retry count (rnr_retry) to 0 via fi_setopt, and then tests RNR queue/re-send logic for different packet types. -To run the test, one needs to use \f[C]-c\f[R] option to specify the +To run the test, one needs to use \f[V]-c\f[R] option to specify the category of packet types. .SS Component tests .PP @@ -451,9 +465,9 @@ The default endpoint type is rdm. Allocate data buffers on the specified device, rather than in host memory. Valid options are ze, cuda and synapseai. +.TP *-a -.IP \[bu] 2 -: The name of a shared address vector. +The name of a shared address vector. This option only applies to tests that support shared address vectors. .TP \f[I]-B \f[R] @@ -465,9 +479,9 @@ endpoints to the server. .TP \f[I]-P \f[R] Specifies the port number of the peer endpoint, overriding the default. +.TP *-s -.IP \[bu] 2 -: Specifies the address of the local endpoint. +Specifies the address of the local endpoint. .TP *-F Specifies the address format. diff --git a/man/man1/fi_info.1 b/man/man1/fi_info.1 index b0d5f5aa8c9..657f11afe97 100644 --- a/man/man1/fi_info.1 +++ b/man/man1/fi_info.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_info" "1" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. 
ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_info" "1" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -56,7 +70,7 @@ For more information on address formats, see fi_getinfo(3). .TP \f[I]-p, \[en]provider=\f[R] Filter fabric interfaces by the provider implementation. -For a list of providers, see the \f[C]--list\f[R] option. +For a list of providers, see the \f[V]--list\f[R] option. .TP \f[I]-d, \[en]domain=\f[R] Filter interfaces to only those with the given domain name. @@ -130,7 +144,7 @@ provider: tcp \f[R] .fi .PP -To see the full fi_info structure, specify the \f[C]-v\f[R] option. +To see the full fi_info structure, specify the \f[V]-v\f[R] option. .IP .nf \f[C] @@ -223,7 +237,7 @@ fi_info: \f[R] .fi .PP -To see libfabric related environment variables \f[C]-e\f[R] option. +To see libfabric related environment variables \f[V]-e\f[R] option. .IP .nf \f[C] @@ -243,7 +257,7 @@ $ ./fi_info -e .fi .PP To see libfabric related environment variables with substring use -\f[C]-g\f[R] option. +\f[V]-g\f[R] option. .IP .nf \f[C] @@ -281,6 +295,6 @@ $ ./fi_info -g tcp .fi .SH SEE ALSO .PP -\f[C]fi_getinfo(3)\f[R], \f[C]fi_endpoint(3)\f[R] +\f[V]fi_getinfo(3)\f[R], \f[V]fi_endpoint(3)\f[R] .SH AUTHORS OpenFabrics. diff --git a/man/man1/fi_pingpong.1 b/man/man1/fi_pingpong.1 index 671ec872133..843db111cd7 100644 --- a/man/man1/fi_pingpong.1 +++ b/man/man1/fi_pingpong.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_pingpong" "1" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_pingpong" "1" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -21,7 +35,7 @@ fi_pingpong also displays aggregated statistics after each test run, and can additionally verify data integrity upon receipt. .PP By default, the datagram (FI_EP_DGRAM) endpoint is used for the test, -unless otherwise specified via \f[C]-e\f[R]. +unless otherwise specified via \f[V]-e\f[R]. .SH HOW TO RUN TESTS .PP Two copies of the program must be launched: first, one copy must be @@ -47,15 +61,15 @@ client$ fi_pingpong .PP The server and client must be able to communicate properly for the fi_pingpong utility to function. -If any of the \f[C]-e\f[R], \f[C]-I\f[R], \f[C]-S\f[R], or \f[C]-p\f[R] +If any of the \f[V]-e\f[R], \f[V]-I\f[R], \f[V]-S\f[R], or \f[V]-p\f[R] options are used, then they must be specified on the invocation for both the server and the client process. -If the \f[C]-d\f[R] option is specified on the server, then the client +If the \f[V]-d\f[R] option is specified on the server, then the client will select the appropriate domain if no hint is provided on the client side. -If the \f[C]-d\f[R] option is specified on the client, then it must also +If the \f[V]-d\f[R] option is specified on the client, then it must also be specified on the server. -If both the server and client specify the \f[C]-d\f[R] option and the +If both the server and client specify the \f[V]-d\f[R] option and the given domains cannot communicate, then the application will fail. .SS Control Messaging .TP @@ -110,19 +124,19 @@ Activate output debugging (warning: highly verbose) Displays help output for the pingpong test. 
.SH USAGE EXAMPLES .SS A simple example -.SS Server: \f[C]fi_pingpong -p \f[R] +.SS Server: \f[V]fi_pingpong -p \f[R] .PP -\f[C]server$ fi_pingpong -p sockets\f[R] -.SS Client: \f[C]fi_pingpong -p \f[R] +\f[V]server$ fi_pingpong -p sockets\f[R] +.SS Client: \f[V]fi_pingpong -p \f[R] .PP -\f[C]client$ fi_pingpong -p sockets 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p sockets 192.168.0.123\f[R] .SS An example with various options .SS Server: .PP -\f[C]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] +\f[V]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R] .SS Client: .PP -\f[C]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R] .PP Specifically, this will run a pingpong test with: .IP \[bu] 2 @@ -136,14 +150,14 @@ server node as 192.168.0.123 .SS A longer test .SS Server: .PP -\f[C]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] +\f[V]server$ fi_pingpong -p usnic -I 10000 -S all\f[R] .SS Client: .PP -\f[C]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] +\f[V]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R] .SH DEFAULTS .PP There is no default provider; if a provider is not specified via the -\f[C]-p\f[R] switch, the test will pick one from the list of available +\f[V]-p\f[R] switch, the test will pick one from the list of available providers (as returned by fi_getinfo(3)). .PP If no endpoint type is specified, `dgram' is used. @@ -178,6 +192,6 @@ client per second .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3) \f[C]fabric\f[R](7), +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3) \f[V]fabric\f[R](7), .SH AUTHORS OpenFabrics. 
diff --git a/man/man1/fi_strerror.1 b/man/man1/fi_strerror.1 index 80cb05cd760..6293860cab7 100644 --- a/man/man1/fi_strerror.1 +++ b/man/man1/fi_strerror.1 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_strerror" "1" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_strerror" "1" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -14,16 +28,16 @@ fi_strerror FI_ERROR_CODE .fi .SH DESCRIPTION .PP -Display the error string for the given numeric \f[C]FI_ERROR_CODE\f[R]. -\f[C]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal +Display the error string for the given numeric \f[V]FI_ERROR_CODE\f[R]. +\f[V]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal constant. -Although the \f[C]fi_strerror\f[R](3) library function only accepts +Although the \f[V]fi_strerror\f[R](3) library function only accepts positive error values, for convenience this utility accepts both positive and negative error values. .PP This is primarily a convenience tool for developers. .SH SEE ALSO .PP -\f[C]fabric\f[R](7) \f[C]fi_errno\f[R](3) +\f[V]fabric\f[R](7) \f[V]fi_errno\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_atomic.3 b/man/man3/fi_atomic.3 index 2054cbbfe1c..71e63395a8d 100644 --- a/man/man3/fi_atomic.3 +++ b/man/man3/fi_atomic.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_atomic" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. 
ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_atomic" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -129,7 +143,7 @@ Local data buffer to store initial value of remote buffer \f[I]desc / compare_desc / result_desc\f[R] Data descriptor associated with the local data buffer, local compare buffer, and local result buffer, respectively. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]dest_addr\f[R] Destination address for connectionless atomic operations. @@ -693,11 +707,11 @@ parameter specifying the tag. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -741,11 +755,11 @@ assigned to the transmitting and receiving endpoints. Both message and data ordering are required if the results of two atomic operations to the same memory buffers are to reflect the second operation acting on the results of the first. -See \f[C]fi_endpoint\f[R](3) for further details and message size +See \f[V]fi_endpoint\f[R](3) for further details and message size restrictions. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_rma\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_rma\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_av.3 b/man/man3/fi_av.3 index 41870d477c5..baff5058852 100644 --- a/man/man3/fi_av.3 +++ b/man/man3/fi_av.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_av" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_av" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -149,14 +163,14 @@ been deprecated, see below). See the NOTES section for AV restrictions on duplicate addresses. .PP \f[B]Deprecated\f[R]: AV operations may be set to operate asynchronously -by specifying the FI_EVENT flag to \f[C]fi_av_open\f[R]. +by specifying the FI_EVENT flag to \f[V]fi_av_open\f[R]. When requesting asynchronous operation, the application must first bind an event queue to the AV before inserting addresses. .SS fi_av_open .PP fi_av_open allocates or opens an address vector. The properties and behavior of the address vector are defined by -\f[C]struct fi_av_attr\f[R]. +\f[V]struct fi_av_attr\f[R]. .IP .nf \f[C] @@ -287,7 +301,7 @@ The context field in all completions will be the context specified to the insert call, and the data field in the final completion entry will report the number of addresses successfully inserted. If an error occurs during the asynchronous insertion, an error -completion entry is returned (see \f[C]fi_eq\f[R](3) for a discussion of +completion entry is returned (see \f[V]fi_eq\f[R](3) for a discussion of the fi_eq_err_entry error completion struct). The context field of the error completion will be the context that was specified in the insert call; the data field will contain the index of @@ -347,10 +361,10 @@ the call will return -FI_EBUSY. 
.SS fi_av_bind (deprecated) .PP Associates an event queue with the AV. -If an AV has been opened with \f[C]FI_EVENT\f[R], then an event queue +If an AV has been opened with \f[V]FI_EVENT\f[R], then an event queue must be bound to the AV before any insertion calls are attempted. Any calls to insert addresses before an event queue has been bound will -fail with \f[C]-FI_ENOEQ\f[R]. +fail with \f[V]-FI_ENOEQ\f[R]. Flags are reserved for future use and must be 0. .SS fi_av_insert .PP @@ -361,7 +375,7 @@ AV. Addresses inserted into an address vector must be in the same format as specified in the addr_format field of the fi_info struct provided when opening the corresponding domain. -When using the \f[C]FI_ADDR_STR\f[R] format, the \f[C]addr\f[R] +When using the \f[V]FI_ADDR_STR\f[R] format, the \f[V]addr\f[R] parameter should reference an array of strings (char **). .PP \f[B]Deprecated\f[R]: For AV\[cq]s of type FI_AV_MAP, once inserted @@ -395,14 +409,14 @@ buffer must remain valid until the insertion operation completes. Note that if fi_addr is NULL and synchronous operation is requested without using FI_SYNC_ERR flag, individual insertion failures cannot be reported and the application must use other calls, such as -\f[C]fi_av_lookup\f[R] to learn which specific addresses failed to +\f[V]fi_av_lookup\f[R] to learn which specific addresses failed to insert. .PP If the address vector is configured with authorization keys, the fi_addr parameter may be used as input to define the authorization keys associated with the endpoint addresses being inserted. This is done by setting the fi_addr to an authorization key fi_addr_t -generated from \f[C]fi_av_insert_auth_key\f[R] and setting the +generated from \f[V]fi_av_insert_auth_key\f[R] and setting the FI_AUTH_KEY flag. If the FI_AUTH_KEY flag is not set, addresses being inserted will not be associated with any authorization keys. @@ -416,8 +430,8 @@ authorization keys. 
These fi_addr_t\[cq]s can be used as the target for local data transfer operations. .PP -If the endpoint supports \f[C]FI_DIRECTED_RECV\f[R] or -\f[C]FI_TAGGED_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to +If the endpoint supports \f[V]FI_DIRECTED_RECV\f[R] or +\f[V]FI_TAGGED_DIRECTED_RECV\f[R], these fi_addr_t\[cq]s can be used to restrict receive buffers to a specific endpoint address and authorization key. .PP @@ -480,10 +494,10 @@ Node should be a string that corresponds to a hostname or network address. The service string corresponds to a textual representation of a transport address. -Applications may also pass in an \f[C]FI_ADDR_STR\f[R] formatted address +Applications may also pass in an \f[V]FI_ADDR_STR\f[R] formatted address as the node parameter. In such cases, the service parameter must be NULL. -See fi_getinfo.3 for details on using \f[C]FI_ADDR_STR\f[R]. +See fi_getinfo.3 for details on using \f[V]FI_ADDR_STR\f[R]. Supported flags are the same as for fi_av_insert. .SS fi_av_insertsym .PP @@ -527,7 +541,7 @@ Note that removing an address may not disable receiving data from the peer endpoint. fi_av_close will automatically cleanup any associated resource. .PP -If the address being removed came from \f[C]fi_av_insert_auth_key\f[R], +If the address being removed came from \f[V]fi_av_insert_auth_key\f[R], the address will only be removed if all endpoints, which have been enabled against the corresponding authorization key, have been closed. If all endpoints are not closed, -FI_EBUSY will be returned. @@ -577,8 +591,8 @@ fi_av_straddr returns a pointer to buf. .SS fi_av_insert_auth_key .PP This function associates authorization keys with an address vector. -This requires the domain to be opened with \f[C]FI_AV_AUTH_KEY\f[R]. -\f[C]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be +This requires the domain to be opened with \f[V]FI_AV_AUTH_KEY\f[R]. 
+\f[V]FI_AV_AUTH_KEY\f[R] enables endpoints and memory regions to be associated with authorization keys from the address vector. This behavior enables a single endpoint or memory region to be associated with multiple authorization keys. @@ -588,38 +602,38 @@ address vector authorization keys at that point in time. Later authorization key insertions will not propagate to already enabled endpoints and memory regions. .PP -The \f[C]auth_key\f[R] and \f[C]auth_key_size\f[R] parameters are used +The \f[V]auth_key\f[R] and \f[V]auth_key_size\f[R] parameters are used to input the authorization key into the address vector. The structure of the authorization key is provider specific. -If the \f[C]auth_key_size\f[R] does not align with provider specific +If the \f[V]auth_key_size\f[R] does not align with provider specific structure, -FI_EINVAL will be returned. .PP -The output of \f[C]fi_av_insert_auth_key\f[R] is an authorization key +The output of \f[V]fi_av_insert_auth_key\f[R] is an authorization key fi_addr_t handle representing all endpoint addresses against this specific authorization key. For all operations, including address vector, memory registration, and data transfers, which may accept an authorization key fi_addr_t as input, the FI_AUTH_KEY flag must be specified. Otherwise, the fi_addr_t will be treated as an fi_addr_t returned from -the \f[C]fi_av_insert\f[R] and related functions. +the \f[V]fi_av_insert\f[R] and related functions. .PP For endpoints enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict incoming messages to only endpoint addresses within the authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_recvmsg\f[R] and \f[C]fi_trecvmsg\f[R]. +\f[V]fi_recvmsg\f[R] and \f[V]fi_trecvmsg\f[R]. .PP For domains enabled with FI_DIRECTED_RECV, authorization key fi_addr_t\[cq]s can be used to restrict memory region access to only endpoint addresses within the authorization key. 
This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_mr_regattr\f[R]. +\f[V]fi_mr_regattr\f[R]. .PP These authorization key fi_addr_t\[cq]s can later be used an input for endpoint address insertion functions to generate an fi_addr_t for a specific endpoint address and authorization key. This will require passing in the FI_AUTH_KEY flag to -\f[C]fi_av_insert\f[R] and related functions. +\f[V]fi_av_insert\f[R] and related functions. .PP For address vectors configured with FI_AV_USER_ID and endpoints with FI_SOURCE_ERR, all subsequent FI_EADDRNOTAVAIL error events will return @@ -637,7 +651,7 @@ Flags are reserved for future use and must be 0. This functions returns the authorization key associated with a fi_addr_t. Acceptable fi_addr_t\[cq]s input are the output of -\f[C]fi_av_insert_auth_key\f[R] and AV address insertion functions. +\f[V]fi_av_insert_auth_key\f[R] and AV address insertion functions. The returned authorization key is in a provider specific format. On input, the auth_key_size parameter should indicate the size of the auth_key buffer. @@ -746,14 +760,14 @@ function. This function is used to set the group ID portion of an fi_addr_t. .SH RETURN VALUES .PP -Insertion calls, excluding \f[C]fi_av_insert_auth_key\f[R], for an AV +Insertion calls, excluding \f[V]fi_av_insert_auth_key\f[R], for an AV opened for synchronous operation will return the number of addresses that were successfully inserted. In the case of failure, the return value will be less than the number of addresses that was specified. .PP \f[B]Deprecated\f[R]: Insertion calls, excluding -\f[C]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous +\f[V]fi_av_insert_auth_key\f[R], for an AV opened for asynchronous operation (with FI_EVENT flag specified) will return FI_SUCCESS if the operation was successfully initiated. In the case of failure, a negative fabric errno will be returned. @@ -768,10 +782,10 @@ FI_ADDR_NOTAVAIL. 
.PP All other calls return FI_SUCCESS on success, or a negative value corresponding to fabric errno on error. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_av_set.3 b/man/man3/fi_av_set.3 index 04742ab2629..6e0b0030ee2 100644 --- a/man/man3/fi_av_set.3 +++ b/man/man3/fi_av_set.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_av_set" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_av_set" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -92,7 +106,7 @@ The creation and manipulation of an AV set is a local operation. No fabric traffic is exchanged between peers. As a result, each peer is responsible for creating matching AV sets as part of their collective membership definition. -See \f[C]fi_collective\f[R](3) for a discussion of membership models. +See \f[V]fi_collective\f[R](3) for a discussion of membership models. .SS fi_av_set .PP The fi_av_set call creates a new AV set. @@ -263,9 +277,9 @@ It is an error for a user to request an unsupported collective. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. 
.SH SEE ALSO .PP -\f[C]fi_av\f[R](3), \f[C]fi_collective\f[R](3) +\f[V]fi_av\f[R](3), \f[V]fi_collective\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cm.3 b/man/man3/fi_cm.3 index 7723f65eb7e..1c8c247d9a5 100644 --- a/man/man3/fi_cm.3 +++ b/man/man3/fi_cm.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cm" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cm" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -261,7 +275,7 @@ or an error will occur. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .SH NOTES .PP @@ -279,7 +293,7 @@ events, or as additional err_data to fi_eq_err_entry, in the case of a rejected connection. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cntr.3 b/man/man3/fi_cntr.3 index 2dcdb911498..7f2e2f3b058 100644 --- a/man/man3/fi_cntr.3 +++ b/man/man3/fi_cntr.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cntr" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. 
ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cntr" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -91,14 +105,14 @@ That is, a counter actually stores two distinct values, with error completions updating an error specific value. .PP Counters are updated following the completion event semantics defined in -\f[C]fi_cq\f[R](3). +\f[V]fi_cq\f[R](3). The timing of the update is based on the type of transfer and any specified operation flags. .SS fi_cntr_open .PP fi_cntr_open allocates a new fabric counter. The properties and behavior of the counter are defined by -\f[C]struct fi_cntr_attr\f[R]. +\f[V]struct fi_cntr_attr\f[R]. .IP .nf \f[C] @@ -278,7 +292,7 @@ On error, a negative value corresponding to fabric errno is returned. fi_cntr_read / fi_cntr_readerr Returns the current value of the counter. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH NOTES .PP In order to support a variety of counter implementations, updates made @@ -300,7 +314,7 @@ fi_cntr_set / fi_cntr_seterr and results of related operations are reflected in the observed value of the counter. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_collective.3 b/man/man3/fi_collective.3 index 58e3121c6b2..ead102d60c2 100644 --- a/man/man3/fi_collective.3 +++ b/man/man3/fi_collective.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_collective" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. 
ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_collective" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -152,7 +166,7 @@ be used for required input. .PP In general collective operations can be thought of as coordinated atomic operations between a set of peer endpoints. -Readers should refer to the \f[C]fi_atomic\f[R](3) man page for details +Readers should refer to the \f[V]fi_atomic\f[R](3) man page for details on the atomic operations and datatypes defined by libfabric. .PP A collective operation is a group communication exchange. @@ -199,7 +213,7 @@ provider by creating and configuring an address vector set (AV set). An AV set represents an ordered subset of addresses in an address vector (AV). Details on creating and configuring an AV set are available in -\f[C]fi_av_set\f[R](3). +\f[V]fi_av_set\f[R](3). .PP Once an AV set has been programmed with the collective membership information, an endpoint is joined to the set. @@ -258,7 +272,7 @@ Applications must call fi_close on the collective group to disconnect the endpoint from the group. After a join operation has completed, the fi_mc_addr call may be used to retrieve the address associated with the multicast group. -See \f[C]fi_cm\f[R](3) for additional details on fi_mc_addr(). +See \f[V]fi_cm\f[R](3) for additional details on fi_mc_addr(). .SS Barrier (fi_barrier) .PP The fi_barrier operation provides a mechanism to synchronize peers. @@ -509,7 +523,7 @@ struct fi_collective_attr { \f[R] .fi .PP -For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[R](3). +For a description of struct fi_atomic_attr, see \f[V]fi_atomic\f[R](3). .TP \f[I]op\f[R] On input, this specifies the atomic operation involved with the @@ -552,7 +566,7 @@ collective operation through the provider. .PP Collective operations map to underlying fi_atomic operations. 
For a discussion of atomic completion semantics, see -\f[C]fi_atomic\f[R](3). +\f[V]fi_atomic\f[R](3). The completion, ordering, and atomicity of collective operations match those defined for point to point atomic operations. .SH FLAGS @@ -567,11 +581,11 @@ collective operation. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EOPNOTSUPP\f[R] @@ -587,11 +601,11 @@ As such, they follow most of the conventions and restrictions as peer to peer atomic operations. This includes data atomicity, data alignment, and message ordering semantics. -See \f[C]fi_atomic\f[R](3) for additional information on the datatypes +See \f[V]fi_atomic\f[R](3) for additional information on the datatypes and operations defined for atomic and collective operations. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_av\f[R](3), \f[C]fi_atomic\f[R](3), -\f[C]fi_cm\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_av\f[R](3), \f[V]fi_atomic\f[R](3), +\f[V]fi_cm\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_control.3 b/man/man3/fi_control.3 index 65b0890e0f8..2a6eec2f644 100644 --- a/man/man3/fi_control.3 +++ b/man/man3/fi_control.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_control" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_control" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -50,7 +64,7 @@ header files (\[cq]rdma/fi_ext_*.h\[cq]). Please refer to the provider man pages for details. .SH SEE ALSO .PP -\f[C]fi_endpoint\f[R](3), \f[C]fi_cm\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_cq\f[R](3), \f[C]fi_eq\f[R](3), +\f[V]fi_endpoint\f[R](3), \f[V]fi_cm\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_cq\f[R](3), \f[V]fi_eq\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_cq.3 b/man/man3/fi_cq.3 index f3bce7ce489..232c9dfad0d 100644 --- a/man/man3/fi_cq.3 +++ b/man/man3/fi_cq.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cq" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_cq" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -123,7 +137,7 @@ Unlike event queues, completion queues are associated with a resource domain and may be offloaded entirely in provider hardware. .PP The properties and behavior of a completion queue are defined by -\f[C]struct fi_cq_attr\f[R]. +\f[V]struct fi_cq_attr\f[R]. .IP .nf \f[C] @@ -354,8 +368,9 @@ Multiple completions may be retrieved from a CQ in a single call. The maximum number of entries to return is limited to the specified count parameter, with the number of entries successfully read from the CQ returned by the call. -(See return values section below.) A count value of 0 may be used to -drive progress on associated endpoints when manual progress is enabled. +(See return values section below.) 
+A count value of 0 may be used to drive progress on associated endpoints +when manual progress is enabled. .PP CQs are optimized to report operations which have completed successfully. @@ -429,7 +444,7 @@ fi_cq_readerr is a non-blocking call, returning immediately whether an error completion was found or not. .PP Error information is reported to the user through -\f[C]struct fi_cq_err_entry\f[R]. +\f[V]struct fi_cq_err_entry\f[R]. The format of this structure is defined below. .IP .nf @@ -522,8 +537,9 @@ Flags are set for all relevant completions. .TP \f[I]len\f[R] This len field applies to completed receive operations (e.g.\ fi_recv, -fi_trecv, etc.) and the completed write with remote cq data on the -responder side (e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). +fi_trecv, etc.) +and the completed write with remote cq data on the responder side +(e.g.\ fi_write, with FI_REMOTE_CQ_DATA flag). It indicates the size of transferred \f[I]message\f[R] data \[en] i.e.\ how many data bytes were placed into the associated receive/target buffer by a corresponding fi_send/fi_tsend/fi_write et al call. @@ -954,7 +970,7 @@ When heterogenous memory is involved, the concept of memory domains come into play. Memory domains identify the physical separation of memory, which may or may not be accessible through the same virtual address space. -See the \f[C]fi_mr\f[R](3) man page for further details on memory +See the \f[V]fi_mr\f[R](3) man page for further details on memory domains. .PP Completion ordering and data visibility are only well-defined for @@ -1014,7 +1030,7 @@ As a result, applications can request a lower completion semantic when posting receives. That indicates to the provider that the application will be responsible for handling any device specific flush operations that might be needed. -See \f[C]fi_msg\f[R](3) FLAGS. +See \f[V]fi_msg\f[R](3) FLAGS. 
.PP For data transfers that do not generate a completion at the target, such as RMA or atomics, it is the responsibility of the application to ensure @@ -1117,11 +1133,11 @@ returns -FI_EAGAIN. : Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_eq\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_domain.3 b/man/man3/fi_domain.3 index c8f9721e562..be3d6c8af5d 100644 --- a/man/man3/fi_domain.3 +++ b/man/man3/fi_domain.3 @@ -1,7 +1,21 @@ -.\"t -.\" Automatically generated by Pandoc 2.9.2.1 +'\" t +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_domain" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_domain" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -76,7 +90,7 @@ parameter. .PP Similar to fi_domain, but accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening peer domain. -See \f[C]fi_peer\f[R](3). +See \f[V]fi_peer\f[R](3). .SS fi_open_ops .PP fi_open_ops is used to open provider specific interfaces. @@ -173,9 +187,9 @@ through the event queue. If an event queue is not bound to the domain with the FI_REG_MR flag, then memory registration requests complete synchronously. 
.PP -See \f[C]fi_av_bind\f[R](3), \f[C]fi_ep_bind\f[R](3), -\f[C]fi_mr_bind\f[R](3), \f[C]fi_pep_bind\f[R](3), and -\f[C]fi_scalable_ep_bind\f[R](3) for more information. +See \f[V]fi_av_bind\f[R](3), \f[V]fi_ep_bind\f[R](3), +\f[V]fi_mr_bind\f[R](3), \f[V]fi_pep_bind\f[R](3), and +\f[V]fi_scalable_ep_bind\f[R](3) for more information. .SS fi_close .PP The fi_close call is used to release all resources associated with a @@ -184,7 +198,7 @@ All objects associated with the opened domain must be released prior to calling fi_close, otherwise the call will return -FI_EBUSY. .SH DOMAIN ATTRIBUTES .PP -The \f[C]fi_domain_attr\f[R] structure defines the set of attributes +The \f[V]fi_domain_attr\f[R] structure defines the set of attributes associated with a domain. .IP .nf @@ -649,7 +663,7 @@ size as the endpoint queue(s) that are bound to it. .SS AV Type (av_type) .PP Specifies the type of address vectors that are usable with this domain. -For additional details on AV type, see \f[C]fi_av\f[R](3). +For additional details on AV type, see \f[V]fi_av\f[R](3). The following values may be specified. .TP \f[I]FI_AV_MAP\f[R] (deprecated) @@ -673,7 +687,7 @@ optimal AV type supported by this domain. .SS Memory Registration Mode (mr_mode) .PP Defines memory registration specific mode bits used with this domain. -Full details on MR mode options are available in \f[C]fi_mr\f[R](3). +Full details on MR mode options are available in \f[V]fi_mr\f[R](3). The following values may be specified. .TP \f[I]FI_MR_ALLOCATED\f[R] @@ -854,7 +868,7 @@ If this domain capability is not set, address vectors cannot be opened with FI_AV_USER_ID. Note that FI_AV_USER_ID can still be supported through the AV insert calls without this domain capability set. -See \f[C]fi_av\f[R](3). +See \f[V]fi_av\f[R](3). .TP \f[I]FI_PEER\f[R] Specifies that the domain must support importing resources to be used in @@ -885,7 +899,7 @@ provider, for example. 
Indicates that the domain supports the ability to share address vectors among multiple processes using the named address vector feature. .PP -See \f[C]fi_getinfo\f[R](3) for a discussion on primary versus secondary +See \f[V]fi_getinfo\f[R](3) for a discussion on primary versus secondary capabilities. .SS Default authorization key (auth_key) .PP @@ -932,7 +946,7 @@ cache or lookup tables. .PP This specifies the default traffic class that will be associated any endpoints created within the domain. -See \f[C]fi_endpoint\f[R](3) for additional information. +See \f[V]fi_endpoint\f[R](3) for additional information. .SS Max Authorization Keys per Endpoint (max_ep_auth_key) .PP The maximum number of authorization keys which can be supported per @@ -941,7 +955,7 @@ connectionless endpoint. .PP The maximum value that a peer group may be assigned, inclusive. Valid peer group id\[cq]s must be between 0 and max_group_id. -See \f[C]fi_av\f[R](3) for additional information on peer groups and +See \f[V]fi_av\f[R](3) for additional information on peer groups and their use. Users may request support for peer groups by setting this to a non-zero value. @@ -953,7 +967,7 @@ the application. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -972,7 +986,7 @@ lightly loaded systems, without an administrator configuring system resources appropriately for the installed provider(s). .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_av\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_mr\f[R](3) \f[C]fi_peer\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_av\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_mr\f[R](3) \f[V]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index cbc4a28a885..f03197d02b2 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_endpoint" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_endpoint" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -263,7 +277,7 @@ been used. .PP Similar to fi_endpoint, buf accepts an extra parameter \f[I]flags\f[R]. Mainly used for opening endpoints that use peer transfer feature. -See \f[C]fi_peer\f[R](3) +See \f[V]fi_peer\f[R](3) .SS fi_close .PP Closes an endpoint and release all resources associated with it. @@ -576,7 +590,7 @@ FI_HMEM_P2P_DISABLED: Peer to peer support should not be used. fi_setopt() will return -FI_EOPNOTSUPP if the mode requested cannot be supported by the provider. The FI_HMEM_DISABLE_P2P environment variable discussed in -\f[C]fi_mr\f[R](3) takes precedence over this setopt option. +\f[V]fi_mr\f[R](3) takes precedence over this setopt option. .RE \[bu] .RS 2 .TP @@ -609,10 +623,10 @@ Define the maximum message size that can be transferred by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. 
.RE \[bu] .RS 2 .TP @@ -621,10 +635,10 @@ Define the maximum message size that can be transferred by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -633,10 +647,10 @@ Define the maximum message size that can be transferred by the endpoint via a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -645,10 +659,10 @@ Define the maximum data size that can be transferred by the endpoint via a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]ep_attr->max_msg_size\f[R]. +\f[V]ep_attr->max_msg_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]ep_attr->max_msg_size\f[R] should be used. +In that case, \f[V]ep_attr->max_msg_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -657,10 +671,10 @@ Define the maximum message size that can be injected by the endpoint in a single untagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. 
Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -669,10 +683,10 @@ Define the maximum message size that can be injected by the endpoint in a single tagged message. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -681,10 +695,10 @@ Define the maximum data size that can be injected by the endpoint in a single RMA operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE \[bu] .RS 2 .TP @@ -693,10 +707,10 @@ Define the maximum data size that can be injected by the endpoint in a single atomic operation. The size is limited by the endpoint\[cq]s configuration and the provider\[cq]s capabilities, and must be less than or equal to -\f[C]tx_attr->inject_size\f[R]. +\f[V]tx_attr->inject_size\f[R]. Providers that don\[cq]t support this option will return -FI_ENOPROTOOPT. -In that case, \f[C]tx_attr->inject_size\f[R] should be used. +In that case, \f[V]tx_attr->inject_size\f[R] should be used. .RE .SS fi_tc_dscp_set .PP @@ -1779,7 +1793,7 @@ value of transmit or receive context attributes of an endpoint. 
\f[I]FI_COMMIT_COMPLETE\f[R] Indicates that a completion should not be generated (locally or at the peer) until the result of an operation have been made persistent. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_COMPLETION\f[R] Indicates that a completion queue entry should be written for data @@ -1792,7 +1806,7 @@ See the fi_ep_bind section above for more detail. \f[I]FI_DELIVERY_COMPLETE\f[R] Indicates that a completion should be generated when the operation has been processed by the destination endpoint(s). -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_INJECT\f[R] Indicates that all outbound data buffers should be returned to the @@ -1807,7 +1821,7 @@ This limit is indicated using inject_size (see inject_size above). \f[I]FI_INJECT_COMPLETE\f[R] Indicates that a completion should be generated when the source buffer(s) may be reused. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .TP \f[I]FI_MULTICAST\f[R] Indicates that data transfers will target multicast addresses by @@ -1831,7 +1845,7 @@ space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). \f[I]FI_TRANSMIT_COMPLETE\f[R] Indicates that a completion should be generated when the transmit operation has completed relative to the local provider. -See \f[C]fi_cq\f[R](3) for additional details on completion semantics. +See \f[V]fi_cq\f[R](3) for additional details on completion semantics. .SH NOTES .PP Users should call fi_close to release all resources allocated to the @@ -1840,10 +1854,10 @@ fabric endpoint. Endpoints allocated with the FI_CONTEXT or FI_CONTEXT2 mode bits set must typically provide struct fi_context(2) as their per operation context parameter. 
-(See fi_getinfo.3 for details.) However, when FI_SELECTIVE_COMPLETION is -enabled to suppress CQ completion entries, and an operation is initiated -without the FI_COMPLETION flag set, then the context parameter is -ignored. +(See fi_getinfo.3 for details.) +However, when FI_SELECTIVE_COMPLETION is enabled to suppress CQ +completion entries, and an operation is initiated without the +FI_COMPLETION flag set, then the context parameter is ignored. An application does not need to pass in a valid struct fi_context(2) into such data transfers. .PP @@ -1882,7 +1896,7 @@ submitted for processing. For fi_setopt/fi_getopt, a return value of -FI_ENOPROTOOPT indicates the provider does not support the requested option. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EDOMAIN\f[R] @@ -1896,8 +1910,8 @@ The endpoint has not been configured with necessary completion queue. The endpoint\[cq]s state does not permit the requested operation. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) -\f[C]fi_msg\f[R](3), \f[C]fi_tagged\f[R](3), \f[C]fi_rma\f[R](3) -\f[C]fi_peer\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) +\f[V]fi_msg\f[R](3), \f[V]fi_tagged\f[R](3), \f[V]fi_rma\f[R](3) +\f[V]fi_peer\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_eq.3 b/man/man3/fi_eq.3 index 0622ffbbf62..7351e8f0cac 100644 --- a/man/man3/fi_eq.3 +++ b/man/man3/fi_eq.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_eq" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_eq" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -109,7 +123,7 @@ as listening for connection requests. fi_eq_open allocates a new event queue. .PP The properties and behavior of an event queue are defined by -\f[C]struct fi_eq_attr\f[R]. +\f[V]struct fi_eq_attr\f[R]. .IP .nf \f[C] @@ -259,7 +273,7 @@ These include the following types of events: memory registration, address vector resolution, and multicast joins. .PP Control requests report their completion by inserting a -\f[C]struct fi_eq_entry\f[R] into the EQ. +\f[V]struct fi_eq_entry\f[R] into the EQ. The format of this structure is: .IP .nf @@ -283,7 +297,7 @@ The context field will be set to the context specified as part of the operation, if available, otherwise the context will be associated with the fabric descriptor. The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .TP @@ -293,7 +307,7 @@ setup or tear down connections between endpoints. There are three connection notification events: FI_CONNREQ, FI_CONNECTED, and FI_SHUTDOWN. Connection notifications are reported using -\f[C]struct fi_eq_cm_entry\f[R]: +\f[V]struct fi_eq_cm_entry\f[R]: .IP .nf \f[C] @@ -432,7 +446,7 @@ The context field will be set to the context specified as part of the operation. .PP The data field will be set as described in the man page for the -corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a +corresponding object type (e.g., see \f[V]fi_av\f[R](3) for a description of how asynchronous address vector insertions are completed). .PP @@ -558,10 +572,10 @@ fi_eq_strerror Returns a character string interpretation of the provider specific error returned with a completion. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. 
+Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), \f[C]fi_poll\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), \f[V]fi_poll\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_errno.3 b/man/man3/fi_errno.3 index be13272dd97..6175403b1ed 100644 --- a/man/man3/fi_errno.3 +++ b/man/man3/fi_errno.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_errno" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_errno" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -184,6 +198,6 @@ Receiver not ready, no receive buffers available Memory registration limit exceeded .SH SEE ALSO .PP -\f[C]fabric\f[R](7) +\f[V]fabric\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_fabric.3 b/man/man3/fi_fabric.3 index cc2a0e63219..f36f961a10b 100644 --- a/man/man3/fi_fabric.3 +++ b/man/man3/fi_fabric.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_fabric" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_fabric" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -146,11 +160,11 @@ The data parameter is ignored. 
.TP \f[I]FI_TYPE_EQ_EVENT\f[R] uint32_t event parameter returned from fi_eq_read(). -See \f[C]fi_eq(3)\f[R] for a list of known values. +See \f[V]fi_eq(3)\f[R] for a list of known values. .TP \f[I]FI_TYPE_CQ_EVENT_FLAGS\f[R] uint64_t flags field in fi_cq_xxx_entry structures. -See \f[C]fi_cq(3)\f[R] for valid flags. +See \f[V]fi_cq(3)\f[R] for valid flags. .TP \f[I]FI_TYPE_MR_MODE\f[R] struct fi_domain_attr::mr_mode flags @@ -245,7 +259,7 @@ these environment variables in a production setting. Version information for the fabric provider, in a major.minor format. The use of the FI_MAJOR() and FI_MINOR() version macros may be used to extract the major and minor version data. -See \f[C]fi_version(3)\f[R]. +See \f[V]fi_version(3)\f[R]. .PP In case of an utility provider layered over a core provider, the version would always refer to that of the utility provider. @@ -253,16 +267,16 @@ would always refer to that of the utility provider. .PP The interface version requested by the application. This value corresponds to the version parameter passed into -\f[C]fi_getinfo(3)\f[R]. +\f[V]fi_getinfo(3)\f[R]. .SH RETURN VALUE .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_endpoint\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_endpoint\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index 8c531815b94..a0faf6d1121 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_getinfo" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_getinfo" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -218,8 +232,8 @@ manner. The fi_info::handle field is also used by fi_endpoint() and fi_reject() calls when processing connection requests or to inherit another endpoint\[cq]s attributes. -See \f[C]fi_eq\f[R](3), \f[C]fi_reject\f[R](3), and -\f[C]fi_endpoint\f[R](3). +See \f[V]fi_eq\f[R](3), \f[V]fi_reject\f[R](3), and +\f[V]fi_endpoint\f[R](3). The info->handle field will be ignored by fi_dupinfo and fi_freeinfo. .TP \f[I]tx_attr - transmit context attributes\f[R] @@ -252,7 +266,7 @@ set. On output, the actual endpoint attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[C]fi_endpoint\f[R](3) for details. +See \f[V]fi_endpoint\f[R](3) for details. .TP \f[I]domain_attr - domain attributes\f[R] Optionally supplied domain attributes. @@ -262,7 +276,7 @@ be set. On output, the actual domain attributes that can be provided will be returned. Output values will be greater than or equal to requested input values. -See \f[C]fi_domain\f[R](3) for details. +See \f[V]fi_domain\f[R](3) for details. .TP \f[I]fabric_attr - fabric attributes\f[R] Optionally supplied fabric attributes. @@ -271,14 +285,14 @@ When provided as hints, requested values of struct fi_fabric_attr should be set. 
On output, the actual fabric attributes that can be provided will be returned. -See \f[C]fi_fabric\f[R](3) for details. +See \f[V]fi_fabric\f[R](3) for details. .TP \f[I]nic - network interface details\f[R] Optional attributes related to the hardware NIC associated with the specified fabric, domain, and endpoint data. This field is only valid for providers where the corresponding attributes are closely associated with a hardware NIC. -See \f[C]fi_nic\f[R](3) for details. +See \f[V]fi_nic\f[R](3) for details. .SH CAPABILITIES .PP Interface capabilities are obtained by OR-ing the following flags @@ -310,12 +324,12 @@ Requests that the provider support the association of a user specified identifier with each address vector (AV) address. User identifiers are returned with completion data in place of the AV address. -See \f[C]fi_domain\f[R](3) and \f[C]fi_av\f[R](3) for more details. +See \f[V]fi_domain\f[R](3) and \f[V]fi_av\f[R](3) for more details. .TP \f[I]FI_COLLECTIVE\f[R] Requests support for collective operations. Endpoints that support this capability support the collective operations -defined in \f[C]fi_collective\f[R](3). +defined in \f[V]fi_collective\f[R](3). .TP \f[I]FI_DIRECTED_RECV\f[R] Requests that the communication endpoint use the source address of an @@ -497,7 +511,7 @@ endpoint as send-only or receive-only. \f[I]FI_TRIGGER\f[R] Indicates that the endpoint should support triggered operations. Endpoints support this capability must meet the usage model as described -by \f[C]fi_trigger\f[R](3). +by \f[V]fi_trigger\f[R](3). .TP \f[I]FI_WRITE\f[R] Indicates that the user requires an endpoint capable of initiating @@ -508,7 +522,7 @@ This flag requires that FI_RMA and/or FI_ATOMIC be set. Specifies that the endpoint should support transfers that may be initiated from heterogenous computation devices, such as GPUs. This flag requires that FI_TRIGGER be set. -For additional details on XPU triggers see \f[C]fi_trigger\f[R](3). 
+For additional details on XPU triggers see \f[V]fi_trigger\f[R](3). .PP Capabilities may be grouped into three general categories: primary, secondary, and primary modifiers. @@ -612,8 +626,8 @@ application for access domains opened with this capability. This flag is defined for compatibility and is ignored if the application version is 1.5 or later and the domain mr_mode is set to anything other than FI_MR_BASIC or FI_MR_SCALABLE. -See the domain attribute mr_mode \f[C]fi_domain\f[R](3) and -\f[C]fi_mr\f[R](3). +See the domain attribute mr_mode \f[V]fi_domain\f[R](3) and +\f[V]fi_mr\f[R](3). .TP \f[I]FI_MSG_PREFIX\f[R] Message prefix mode indicates that an application will provide buffer @@ -673,7 +687,7 @@ these operations. A provider may support one or more of the following addressing formats. In some cases, a selected addressing format may need to be translated or mapped into an address which is native to the fabric. -See \f[C]fi_av\f[R](3). +See \f[V]fi_av\f[R](3). .TP \f[I]FI_ADDR_EFA\f[R] Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format. @@ -761,7 +775,7 @@ This flag is often used with passive endpoints. fi_getinfo() returns 0 on success. On error, fi_getinfo() returns a negative value corresponding to fabric errno. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .PP fi_allocinfo() returns a pointer to a new fi_info structure on success, or NULL on error. @@ -816,11 +830,11 @@ by fi_getinfo. If neither node, service or hints are provided, then fi_getinfo simply returns the list all available communication interfaces. .PP -Multiple threads may call \f[C]fi_getinfo\f[R] simultaneously, without +Multiple threads may call \f[V]fi_getinfo\f[R] simultaneously, without any requirement for serialization. 
.SH SEE ALSO .PP -\f[C]fi_open\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), -\f[C]fi_nic\f[R](3) \f[C]fi_trigger\f[R](3) +\f[V]fi_open\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), +\f[V]fi_nic\f[R](3) \f[V]fi_trigger\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index e2797a7b7c9..4d11d894ab9 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_mr" "3" "2024\-10\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_mr" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -703,7 +717,7 @@ specifying the FI_MR_DMABUF flag. The number of entries in the mr_iov array. The maximum number of memory buffers that may be associated with a single memory region is specified as the mr_iov_limit domain attribute. -See \f[C]fi_domain(3)\f[R]. +See \f[V]fi_domain(3)\f[R]. .SS access .PP Indicates the type of \f[I]operations\f[R] that the local or a peer @@ -785,7 +799,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key_size must equal -\f[C]sizeof(struct fi_mr_auth_key)\f[R]. +\f[V]sizeof(struct fi_mr_auth_key)\f[R]. .SS auth_key .PP Indicates the key to associate with this memory registration. @@ -798,7 +812,7 @@ This field is ignored unless the fabric is opened with API version 1.5 or greater. .PP If the domain is opened with FI_AV_AUTH_KEY, auth_key must point to a -user-defined \f[C]struct fi_mr_auth_key\f[R]. +user-defined \f[V]struct fi_mr_auth_key\f[R]. 
.SS iface .PP Indicates the software interfaces used by the application to allocate @@ -927,7 +941,7 @@ keys in the AV. .PP If the domain was opened with FI_DIRECTED_RECV, addr can be used to limit the memory region to a specific fi_addr_t, including -fi_addr_t\[cq]s return from \f[C]fi_av_insert_auth_key\f[R]. +fi_addr_t\[cq]s return from \f[V]fi_av_insert_auth_key\f[R]. .SH NOTES .PP Direct access to an application\[cq]s memory by a remote peer requires @@ -1057,7 +1071,7 @@ For example, the physical pages referenced by a virtual address range could migrate between host memory and GPU memory, depending on which computational unit is actively using it. .PP -See the \f[C]fi_endpoint\f[R](3) and \f[C]fi_cq\f[R](3) man pages for +See the \f[V]fi_endpoint\f[R](3) and \f[V]fi_cq\f[R](3) man pages for addition discussion on message, data, and completion ordering semantics, including the impact of memory domains. .SH RETURN VALUES @@ -1065,7 +1079,7 @@ including the impact of memory domains. Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_ENOKEY\f[R] @@ -1172,8 +1186,8 @@ Some level of control over the cache is possible through the above mentioned environment variables. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_rma\f[R](3), \f[C]fi_msg\f[R](3), -\f[C]fi_atomic\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_rma\f[R](3), \f[V]fi_msg\f[R](3), +\f[V]fi_atomic\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index 5919225afc4..708288ee5bc 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_msg" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_msg" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -65,7 +79,7 @@ Count of vectored data entries. .TP \f[I]desc\f[R] Descriptor associated with the data buffer. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent message. @@ -142,7 +156,7 @@ parameter to a remote endpoint as a single message. The fi_sendmsg call supports data transfers over both connected and connectionless endpoints, with the ability to control the send operation per call through the use of flags. -The fi_sendmsg function takes a \f[C]struct fi_msg\f[R] as input. +The fi_sendmsg function takes a \f[V]struct fi_msg\f[R] as input. .IP .nf \f[C] @@ -265,7 +279,7 @@ Note that an entry to the associated receive completion queue will always be generated when the buffer has been consumed, even if other receive completions have been suppressed (i.e.\ the Rx context has been configured for FI_SELECTIVE_COMPLETION). -See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3). +See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). .TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_sendmsg. @@ -280,7 +294,7 @@ tracked by the provider. 
For receive operations, indicates that a completion may be generated as soon as the message has been processed by the local provider, even if the message data may not be visible to all processing elements. -See \f[C]fi_cq\f[R](3) for target side completion semantics. +See \f[V]fi_cq\f[R](3) for target side completion semantics. .TP \f[I]FI_DELIVERY_COMPLETE\f[R] Applies to fi_sendmsg. @@ -326,7 +340,7 @@ buffer length. .PP Returns 0 on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .PP See the discussion below for details handling FI_EAGAIN. .SH ERRORS @@ -359,7 +373,7 @@ acknowledgements or flow control messages may need to be processed in order to resume execution. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_nic.3 b/man/man3/fi_nic.3 index e64dfe31473..1a8eab67a09 100644 --- a/man/man3/fi_nic.3 +++ b/man/man3/fi_nic.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_nic" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_nic" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -9,7 +23,7 @@ fi_nic - Fabric network interface card attributes .PP The fid_nic structure defines attributes for a struct fi_info that is directly associated with underlying networking hardware and may be -returned directly from calling \f[C]fi_getinfo\f[R](3). +returned directly from calling \f[V]fi_getinfo\f[R](3). The format of fid_nic and the related substructures are defined below. .PP Note that not all fields of all structures may be available. @@ -135,7 +149,7 @@ Ethernet or InfiniBand. .PP Provider attributes reference provider specific details of the device. These attributes are both provider and device specific. -The attributes can be interpreted by \f[C]fi_tostr\f[R](3). +The attributes can be interpreted by \f[V]fi_tostr\f[R](3). Applications may also use the other attribute fields, such as related fi_fabric_attr: prov_name field, to determine an appropriate structure to cast the attributes. @@ -145,10 +159,10 @@ specific header file included with libfabric package. .SH NOTES .PP The fid_nic structure is returned as part of a call to -\f[C]fi_getinfo\f[R](3). -It is automatically freed as part of calling \f[C]fi_freeinfo\f[R](3) +\f[V]fi_getinfo\f[R](3). +It is automatically freed as part of calling \f[V]fi_freeinfo\f[R](3) .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3) +\f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_peer.3 b/man/man3/fi_peer.3 index 24b6464f9f3..8661ec75b72 100644 --- a/man/man3/fi_peer.3 +++ b/man/man3/fi_peer.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_peer" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. 
ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_peer" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .TP @@ -273,8 +287,8 @@ If manual progress is needed on the peer CQ, the owner should drive progress by using the fi_cq_read() function with the buf parameter set to NULL and count equal 0. The peer provider should set other functions that attempt to read the -peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) to return --FI_ENOSYS. +peer\[cq]s CQ (i.e.\ fi_cq_readerr, fi_cq_sread, etc.) +to return -FI_ENOSYS. .SS fi_ops_cq_owner::write() .PP This call directs the owner to insert new completions into the CQ. @@ -365,8 +379,8 @@ Similar to the peer CQ, if manual progress is needed on the peer counter, the owner should drive progress by using the fi_cntr_read() and the fi_cntr_read() should do nothing but progress the peer cntr. The peer provider should set other functions that attempt to access the -peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) to return --FI_ENOSYS. +peer\[cq]s cntr (i.e.\ fi_cntr_readerr, fi_cntr_set, etc.) +to return -FI_ENOSYS. .SS fi_ops_cntr_owner::inc() .PP This call directs the owner to increment the value of the cntr. @@ -801,9 +815,9 @@ callbacks. .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fi_provider\f[R](7), \f[C]fi_provider\f[R](3), \f[C]fi_cq\f[R](3), +\f[V]fi_provider\f[R](7), \f[V]fi_provider\f[R](3), \f[V]fi_cq\f[R](3), .SH AUTHORS OpenFabrics. 
diff --git a/man/man3/fi_poll.3 b/man/man3/fi_poll.3 index a3ee5cbbcb0..b689bf59598 100644 --- a/man/man3/fi_poll.3 +++ b/man/man3/fi_poll.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_poll" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_poll" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -315,7 +329,7 @@ or fid. Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .TP fi_poll On success, if events are available, returns the number of entries @@ -392,7 +406,7 @@ The use of the fi_trywait() function is still required if accessing wait objects directly. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_eq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_eq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_profile.3 b/man/man3/fi_profile.3 index 234a5a10f54..3eaa8532bb3 100644 --- a/man/man3/fi_profile.3 +++ b/man/man3/fi_profile.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_profile" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_profile" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -323,6 +337,6 @@ be returned. For fi_profile_query_vars and fi_profile_query_events, a positive return value indicates the number of variables or events returned in the list. .PP -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_provider.3 b/man/man3/fi_provider.3 index 4814ef7b6d8..41e0289f423 100644 --- a/man/man3/fi_provider.3 +++ b/man/man3/fi_provider.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_provider" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_provider" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -183,7 +197,7 @@ the service or resource to which they correspond. The mr_cache object references the internal memory registration cache used by the different providers. Additional information on the cache is available in the -\f[C]fi_mr(3)\f[R] man page. +\f[V]fi_mr(3)\f[R] man page. .TP \f[I]logging\f[R] The logging object references the internal logging subsystem used by the @@ -193,8 +207,8 @@ Can be opened only once and only the last import is used if imported multiple times. .SS fi_import .PP -This helper function is a combination of \f[C]fi_open\f[R] and -\f[C]fi_import_fid\f[R]. +This helper function is a combination of \f[V]fi_open\f[R] and +\f[V]fi_import_fid\f[R]. It may be used to import a fabric object created and owned by the libfabric user. 
This allows the upper level libraries or the application to override or @@ -264,9 +278,9 @@ For integrated providers .PP Returns FI_SUCCESS on success. On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) \f[C]fi_mr\f[R](3), +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) \f[V]fi_mr\f[R](3), .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_rma.3 b/man/man3/fi_rma.3 index 236b922f418..39f2d3a52ec 100644 --- a/man/man3/fi_rma.3 +++ b/man/man3/fi_rma.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rma" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rma" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -78,7 +92,7 @@ FI_MR_SCALABLE. Protection key associated with the remote memory. .TP \f[I]desc\f[R] -Descriptor associated with the local data buffer See \f[C]fi_mr\f[R](3). +Descriptor associated with the local data buffer See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the operation. @@ -175,7 +189,7 @@ struct fi_rma_iov { .PP The write inject call is an optimized version of fi_write. It provides similar completion semantics as fi_inject -\f[C]fi_msg\f[R](3). +\f[V]fi_msg\f[R](3). .SS fi_writedata .PP The write data call is similar to fi_write, but allows for the sending @@ -276,15 +290,15 @@ operation (inclusive) to the posting of a subsequent fenced operation .PP Returns 0 on success. 
On error, a negative value corresponding to fabric errno is returned. -Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]rdma/fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index 7e11b18b037..32624c6fc5f 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_tagged" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_tagged" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -74,7 +88,7 @@ Mask of bits to ignore applied to the tag for receive operations. .TP \f[I]desc\f[R] Memory descriptor associated with the data buffer. -See \f[C]fi_mr\f[R](3). +See \f[V]fi_mr\f[R](3). .TP \f[I]data\f[R] Remote CQ data to transfer with the sent data. @@ -199,7 +213,7 @@ struct fi_msg_tagged { .PP The tagged inject call is an optimized version of fi_tsend. It provides similar completion semantics as fi_inject -\f[C]fi_msg\f[R](3). +\f[V]fi_msg\f[R](3). 
.SS fi_tsenddata .PP The tagged send data call is similar to fi_tsend, but allows for the @@ -287,7 +301,7 @@ Note that an entry to the associated receive completion queue will always be generated when the buffer has been consumed, even if other receive completions have been suppressed (i.e.\ the Rx context has been configured for FI_SELECTIVE_COMPLETION). -See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3). +See the FI_MULTI_RECV completion flag \f[V]fi_cq\f[R](3). .TP \f[I]FI_INJECT_COMPLETE\f[R] Applies to fi_tsendmsg. @@ -381,11 +395,11 @@ ignored. The tagged send and receive calls return 0 on success. On error, a negative value corresponding to fabric \f[I]errno \f[R] is returned. -Fabric errno values are defined in \f[C]fi_errno.h\f[R]. +Fabric errno values are defined in \f[V]fi_errno.h\f[R]. .SH ERRORS .TP \f[I]-FI_EAGAIN\f[R] -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .TP \f[I]-FI_EINVAL\f[R] @@ -395,7 +409,7 @@ Indicates that an invalid argument was supplied by the user. Indicates that an unspecified error occurred. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3), \f[V]fi_cq\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_trigger.3 b/man/man3/fi_trigger.3 index 38e85f16bfb..0e18caa6399 100644 --- a/man/man3/fi_trigger.3 +++ b/man/man3/fi_trigger.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_trigger" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_trigger" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -199,7 +213,7 @@ If a specific request is not supported by the provider, it will fail the operation with -FI_ENOSYS. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_mr\f[R](3), -\f[C]fi_alias\f[R](3), \f[C]fi_cntr\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), \f[V]fi_mr\f[R](3), +\f[V]fi_alias\f[R](3), \f[V]fi_cntr\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man3/fi_version.3 b/man/man3/fi_version.3 index b046adf9132..cb94827f08c 100644 --- a/man/man3/fi_version.3 +++ b/man/man3/fi_version.3 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_version" "3" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_version" "3" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -34,6 +48,6 @@ The upper 16-bits of the version correspond to the major number, and the lower 16-bits correspond with the minor number. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index b928c04ed89..49c45d804c5 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fabric" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. 
ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fabric" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -16,7 +30,7 @@ fabric - Fabric Interface Library Libfabric is a high-performance fabric software library designed to provide low-latency interfaces to fabric hardware. For an in-depth discussion of the motivation and design see -\f[C]fi_guide\f[R](7). +\f[V]fi_guide\f[R](7). .SH OVERVIEW .PP Libfabric provides `process direct I/O' to application software @@ -37,7 +51,7 @@ All fabric hardware devices and their software drivers are required to support this framework. Devices and the drivers that plug into the libfabric framework are referred to as fabric providers, or simply providers. -Provider details may be found in \f[C]fi_provider\f[R](7). +Provider details may be found in \f[V]fi_provider\f[R](7). .TP \f[I]Fabric Interfaces\f[R] The second component is a set of communication operations. @@ -282,18 +296,18 @@ If the list begins with the `\[ha]' symbol, then the list will be negated. .PP Example: To enable the udp and tcp providers only, set: -\f[C]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] +\f[V]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] .PP When libfabric is installed, DL providers are put under the \f[I]default provider path\f[R], which is determined by how libfabric is built and installed. Usually the default provider path is -\f[C]/lib/libfabric\f[R] or -\f[C]/lib64/libfabric\f[R]. +\f[V]/lib/libfabric\f[R] or +\f[V]/lib64/libfabric\f[R]. By default, libfabric tries to find DL providers in the following order: .IP "1." 3 Use `dlopen' to load provider libraries named -\f[C]lib-fi.so\f[R] for all providers enabled at build time. +\f[V]lib-fi.so\f[R] for all providers enabled at build time. The search path of `ld.so' is used to locate the files. This step is skipped if libfabric is configured with the option `\[en]enable-restricted-dl'. 
@@ -363,7 +377,7 @@ can be used to retrieve information about which providers are available in the system. Additionally, it can retrieve a list of all environment variables that may be used to configure libfabric and each provider. -See \f[C]fi_info\f[R](1) for more details. +See \f[V]fi_info\f[R](1) for more details. .SH ENVIRONMENT VARIABLE CONTROLS .PP Core features of libfabric and its providers may be configured by an @@ -400,22 +414,22 @@ may not be available in a child process because of copy on write restrictions. .SS CUDA deadlock .PP -In some cases, calls to \f[C]cudaMemcpy()\f[R] within libfabric may +In some cases, calls to \f[V]cudaMemcpy()\f[R] within libfabric may result in a deadlock. This typically occurs when a CUDA kernel blocks until a -\f[C]cudaMemcpy\f[R] on the host completes. +\f[V]cudaMemcpy\f[R] on the host completes. Applications which can cause such behavior can restrict Libfabric\[cq]s ability to invoke CUDA API operations with the endpoint option -\f[C]FI_OPT_CUDA_API_PERMITTED\f[R]. -See \f[C]fi_endpoint\f[R](3) for more details. +\f[V]FI_OPT_CUDA_API_PERMITTED\f[R]. +See \f[V]fi_endpoint\f[R](3) for more details. .PP Another mechanism which can be used to avoid deadlock is Nvidia\[cq]s GDRCopy. Using GDRCopy requires an external library and kernel module available at https://github.com/NVIDIA/gdrcopy. Libfabric must be configured with GDRCopy support using the -\f[C]--with-gdrcopy\f[R] option, and be run with -\f[C]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. +\f[V]--with-gdrcopy\f[R] option, and be run with +\f[V]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R]. This may not be supported by all providers. 
.SH ABI CHANGES .PP @@ -509,9 +523,9 @@ Added new fields to the following attributes: Added max_group_id .SH SEE ALSO .PP -\f[C]fi_info\f[R](1), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3), -\f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_av\f[R](3), -\f[C]fi_eq\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_cntr\f[R](3), -\f[C]fi_mr\f[R](3) +\f[V]fi_info\f[R](1), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3), +\f[V]fi_endpoint\f[R](3), \f[V]fi_domain\f[R](3), \f[V]fi_av\f[R](3), +\f[V]fi_eq\f[R](3), \f[V]fi_cq\f[R](3), \f[V]fi_cntr\f[R](3), +\f[V]fi_mr\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_arch.7 b/man/man7/fi_arch.7 index fe62ebd155b..21dc5ee4b21 100644 --- a/man/man7/fi_arch.7 +++ b/man/man7/fi_arch.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_arch" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_arch" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .IP .nf diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index 336b716e982..fd90c654791 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -1,7 +1,21 @@ -.\"t -.\" Automatically generated by Pandoc 2.9.2.1 +'\" t +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_cxi" "7" "2024\-11\-25" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_cxi" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -176,7 +190,7 @@ Classes. .PP While a libfabric user provided authorization key is optional, it is highly encouraged that libfabric users provide an authorization key -through the domain attribute hints during \f[C]fi_getinfo()\f[R]. +through the domain attribute hints during \f[V]fi_getinfo()\f[R]. How libfabric users acquire the authorization key may vary between the users and is outside the scope of this document. .PP @@ -192,18 +206,18 @@ authorization key using them. .IP \[bu] 2 \f[I]SLINGSHOT_VNIS\f[R]: Comma separated list of VNIs. The CXI provider will only use the first VNI if multiple are provide. -Example: \f[C]SLINGSHOT_VNIS=234\f[R]. +Example: \f[V]SLINGSHOT_VNIS=234\f[R]. .IP \[bu] 2 \f[I]SLINGSHOT_DEVICES\f[R]: Comma separated list of device names. Each device index will use the same index to lookup the service ID in \f[I]SLINGSHOT_SVC_IDS\f[R]. -Example: \f[C]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. +Example: \f[V]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. .IP \[bu] 2 \f[I]SLINGSHOT_SVC_IDS\f[R]: Comma separated list of pre-configured CXI service IDs. Each service ID index will use the same index to lookup the CXI device in \f[I]SLINGSHOT_DEVICES\f[R]. -Example: \f[C]SLINGSHOT_SVC_IDS=5,6\f[R]. +Example: \f[V]SLINGSHOT_SVC_IDS=5,6\f[R]. .PP \f[B]Note:\f[R] How valid VNIs and device services are configured is outside the responsibility of the CXI provider. @@ -580,7 +594,7 @@ into the fi_control(FI_QUEUE_WORK) critical path. The following subsections outline the CXI provider fork support. 
.SS RDMA and Fork Overview .PP -Under Linux, \f[C]fork()\f[R] is implemented using copy-on-write (COW) +Under Linux, \f[V]fork()\f[R] is implemented using copy-on-write (COW) pages, so the only penalty that it incurs is the time and memory required to duplicate the parent\[cq]s page tables, mark all of the process\[cq]s page structs as read only and COW, and create a unique @@ -623,22 +637,22 @@ The crux of the issue is the parent issuing forks while trying to do RDMA operations to registered memory regions. Excluding software RDMA emulation, two options exist for RDMA NIC vendors to resolve this data corruption issue. -- Linux \f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC +- Linux \f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC support for on-demand paging (ODP) .SS Linux madvise() MADV_DONTFORK and MADV_DOFORK .PP The generic (i.e.\ non-vendor specific) RDMA NIC solution to the Linux COW fork policy and RDMA problem is to use the following -\f[C]madvise()\f[R] operations during memory registration and +\f[V]madvise()\f[R] operations during memory registration and deregistration: - MADV_DONTFORK: Do not make the pages in this range -available to the child after a \f[C]fork()\f[R]. +available to the child after a \f[V]fork()\f[R]. This is useful to prevent copy-on-write semantics from changing the physical location of a page if the parent writes to it after a -\f[C]fork()\f[R]. +\f[V]fork()\f[R]. (Such page relocations cause problems for hardware that DMAs into the -page.) - MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the -default behavior, whereby a mapping is inherited across -\f[C]fork()\f[R]. +page.) +- MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the default +behavior, whereby a mapping is inherited across \f[V]fork()\f[R]. .PP In the Linux kernel, MADV_DONTFORK will result in the virtual memory area struct (VMA) being marked with the VM_DONTCOPY flag. 
@@ -649,14 +663,14 @@ Should the child reference the virtual address corresponding to the VMA which was not duplicated, it will segfault. .PP In the previous example, if Process A issued -\f[C]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing +\f[V]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing RDMA memory registration, the physical address 0x1000 would have remained with Process A. This would prevent the Process A data corruption as well. If Process B were to reference virtual address 0xffff0000, it will segfault due to the hole in the virtual address space. .PP -Using \f[C]madvise()\f[R] with MADV_DONTFORK may be problematic for +Using \f[V]madvise()\f[R] with MADV_DONTFORK may be problematic for applications performing RDMA and page aliasing. Paging aliasing is where the parent process uses part or all of a page to share information with the child process. @@ -710,7 +724,7 @@ The CXI provider is subjected to the Linux COW fork policy and RDMA issues described in section \f[I]RDMA and Fork Overview\f[R]. To prevent data corruption with fork, the CXI provider supports the following options: - CXI specific fork environment variables to enable -\f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* +\f[V]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - ODP Support* .PP **Formal ODP support pending.* .SS CXI Specific Fork Environment Variables @@ -718,27 +732,27 @@ following options: - CXI specific fork environment variables to enable The CXI software stack has two environment variables related to fork: 0 CXI_FORK_SAFE: Enables base fork safe support. With this environment variable set, regardless of value, libcxi will -issue \f[C]madvise()\f[R] with MADV_DONTFORK on the virtual address +issue \f[V]madvise()\f[R] with MADV_DONTFORK on the virtual address range being registered for RDMA. 
-In addition, libcxi always align the \f[C]madvise()\f[R] to the system
+In addition, libcxi always align the \f[V]madvise()\f[R] to the system
 default page size.
 On x86, this is 4 KiB.
-To prevent redundant \f[C]madvise()\f[R] calls with MADV_DONTFORK
+To prevent redundant \f[V]madvise()\f[R] calls with MADV_DONTFORK
 against the same virtual address region, reference counting is used
-against each tracked \f[C]madvise()\f[R] region.
-In addition, libcxi will spilt and merge tracked \f[C]madvise()\f[R]
+against each tracked \f[V]madvise()\f[R] region.
+In addition, libcxi will split and merge tracked \f[V]madvise()\f[R]
 regions if needed.
 Once the reference count reaches zero, libcxi will call
-\f[C]madvise()\f[R] with MADV_DOFORK, and no longer track the region.
+\f[V]madvise()\f[R] with MADV_DOFORK, and no longer track the region.
 - CXI_FORK_SAFE_HP: With this environment variable set, in conjunction
 with CXI_FORK_SAFE, libcxi will not assume the page size is system
 default page size.
-Instead, libcxi will walk \f[C]/proc//smaps\f[R] to determine the
-correct page size and align the \f[C]madvise()\f[R] calls accordingly.
+Instead, libcxi will walk \f[V]/proc//smaps\f[R] to determine the
+correct page size and align the \f[V]madvise()\f[R] calls accordingly.
 This environment variable should be set if huge pages are being used
 for RDMA.
 To amortize the per memory registration walk of
-\f[C]/proc//smaps\f[R], the libfabric MR cache should be used.
+\f[V]/proc//smaps\f[R], the libfabric MR cache should be used.
 .PP
 Setting these environment variables will prevent data corruption when
 the parent issues a fork.
@@ -772,7 +786,7 @@ transfer.
 The following is the CXI provider fork support guidance: - Enable
 CXI_FORK_SAFE.
 If huge pages are also used, CXI_FORK_SAFE_HP should be enabled as
 well.
-Since enabling this will result in \f[C]madvice()\f[R] with
+Since enabling this will result in \f[V]madvise()\f[R] with
 MADV_DONTFORK, the following steps should be taken to prevent a child
 process segfault: - Avoid using stack memory for RDMA - Avoid child
 process having to access a virtual address range the parent process is
@@ -1605,7 +1619,7 @@ It can only be changed prior to any MR being created.
 .PP
 CXI domain extensions have been named \f[I]FI_CXI_DOM_OPS_6\f[R].
 The flags parameter is ignored.
-The fi_open_ops function takes a \f[C]struct fi_cxi_dom_ops\f[R].
+The fi_open_ops function takes a \f[V]struct fi_cxi_dom_ops\f[R].
 See an example of usage below:
 .IP
 .nf
@@ -1698,7 +1712,7 @@ removed from the domain opts prior to software release 2.2.
 .PP
 CXI counter extensions have been named \f[I]FI_CXI_COUNTER_OPS\f[R].
 The flags parameter is ignored.
-The fi_open_ops function takes a \f[C]struct fi_cxi_cntr_ops\f[R].
+The fi_open_ops function takes a \f[V]struct fi_cxi_cntr_ops\f[R].
 See an example of usage below.
 .IP
 .nf
@@ -1821,7 +1835,7 @@ memory operation as a PCIe operation as compared to a NIC operation.
 The CXI provider extension flag FI_CXI_PCIE_AMO is used to signify this.
 .PP
 Since not all libfabric atomic memory operations can be executed as a
-PCIe atomic memory operation, \f[C]fi_query_atomic()\f[R] could be used
+PCIe atomic memory operation, \f[V]fi_query_atomic()\f[R] could be used
 to query if a given libfabric atomic memory operation could be executed
 as PCIe atomic memory operation.
 .PP
@@ -2139,6 +2153,6 @@ In this case, the target NIC is reachable.
 FI_EIO: Catch all errno.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7),
+\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7),
 .SH AUTHORS
 OpenFabrics.
diff --git a/man/man7/fi_direct.7 b/man/man7/fi_direct.7 index 3f9c6a34870..a419ebf1931 100644 --- a/man/man7/fi_direct.7 +++ b/man/man7/fi_direct.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_direct" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_direct" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -71,7 +85,7 @@ The provider sets FI_LOCAL_MR for fi_info:mode. See fi_getinfo for additional details. .SH SEE ALSO .PP -\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), -\f[C]fi_domain\f[R](3) +\f[V]fi_getinfo\f[R](3), \f[V]fi_endpoint\f[R](3), +\f[V]fi_domain\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index ed99b5a3e8a..79f2e7f0d92 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_efa" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_efa" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -95,7 +109,7 @@ No support for counters for the DGRAM endpoint. No support for inject. .SS zero-copy receive mode .IP \[bu] 2 -The receive operation cannot be cancelled via \f[C]fi_cancel()\f[R]. +The receive operation cannot be cancelled via \f[V]fi_cancel()\f[R]. 
.IP \[bu] 2 Zero-copy receive mode can be enabled only if SHM transfer is disabled. .IP \[bu] 2 @@ -166,12 +180,12 @@ If endpoint is not able to support this feature, it will return .PP The efa provider exports extensions for operations that are not provided by the standard libfabric interface. -These extensions are available via the \[lq]\f[C]fi_ext_efa.h\f[R]\[rq] +These extensions are available via the \[lq]\f[V]fi_ext_efa.h\f[R]\[rq] header file. .SS Domain Operation Extension .PP -Domain operation extension is obtained by calling \f[C]fi_open_ops\f[R] -(see \f[C]fi_domain(3)\f[R]) +Domain operation extension is obtained by calling \f[V]fi_open_ops\f[R] +(see \f[V]fi_domain(3)\f[R]) .IP .nf \f[C] @@ -180,9 +194,9 @@ int fi_open_ops(struct fid *domain, const char *name, uint64_t flags, \f[R] .fi .PP -and requesting \f[C]FI_EFA_DOMAIN_OPS\f[R] in \f[C]name\f[R]. -\f[C]fi_open_ops\f[R] returns \f[C]ops\f[R] as the pointer to the -function table \f[C]fi_efa_ops_domain\f[R] defined as follows: +and requesting \f[V]FI_EFA_DOMAIN_OPS\f[R] in \f[V]name\f[R]. +\f[V]fi_open_ops\f[R] returns \f[V]ops\f[R] as the pointer to the +function table \f[V]fi_efa_ops_domain\f[R] defined as follows: .IP .nf \f[C] @@ -224,20 +238,20 @@ FI_EFA_MR_ATTR_RDMA_RECV_IC_ID: rdma_recv_ic_id has a valid value. \f[I]recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for receive operation. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RECV_IC_ID\f[R] bit. .TP \f[I]rdma_read_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA read operation. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RDMA_READ_IC_ID\f[R] bit. 
.TP \f[I]rdma_recv_ic_id\f[R] Physical interconnect used by the device to reach the MR for RDMA write receive. -It is only valid when \f[C]ic_id_validity\f[R] has the -\f[C]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. +It is only valid when \f[V]ic_id_validity\f[R] has the +\f[V]FI_EFA_MR_ATTR_RDMA_RECV_IC_ID\f[R] bit. .SS Return value .PP \f[B]query_mr()\f[R] returns 0 on success, or the value of errno on @@ -245,7 +259,7 @@ failure (which indicates the failure reason). .SH Traffic Class (tclass) in EFA .PP To prioritize the messages from a given endpoint, user can specify -\f[C]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the +\f[V]fi_info->tx_attr->tclass = FI_TC_LOW_LATENCY\f[R] in the fi_endpoint() call to set the service level in rdma-core. All other tclass values will be ignored. .SH RUNTIME PARAMETERS @@ -328,7 +342,7 @@ to a peer after a receiver not ready error. Enable SHM provider to provide the communication across all intra-node processes. SHM transfer will be disabled in the case where -\f[C]ptrace protection\f[R] is turned on. +\f[V]ptrace protection\f[R] is turned on. You can turn it off to enable shm transfer. .PP FI_EFA_ENABLE_SHM_TRANSFER is parsed during the fi_domain call and is @@ -421,8 +435,14 @@ Use device\[cq]s unsolicited write recv functionality when it\[cq]s available. (Default: 1). Setting this environment variable to 0 can disable this feature. +.TP +\f[I]FI_EFA_INTERNAL_RX_REFILL_THRESHOLD\f[R] +The threshold that EFA provider will refill the internal rx pkt pool. +(Default: 8). +When the number of internal rx pkts to post is lower than this +threshold, the refill will be skipped. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. 
diff --git a/man/man7/fi_guide.7 b/man/man7/fi_guide.7 index 3917b17b1ea..2e9ac7ada01 100644 --- a/man/man7/fi_guide.7 +++ b/man/man7/fi_guide.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_guide" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_guide" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -20,16 +34,16 @@ This guide describes the libfabric architecture and interfaces. Due to the length of the guide, it has been broken into multiple pages. These sections are: .TP -\f[I]Introduction \f[BI]\f[CBI]fi_intro\f[BI]\f[I](7)\f[R] +\f[I]Introduction \f[VI]fi_intro\f[I](7)\f[R] This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. .TP -\f[I]Architecture \f[BI]\f[CBI]fi_arch\f[BI]\f[I](7)\f[R] +\f[I]Architecture \f[VI]fi_arch\f[I](7)\f[R] This describes the exposed architecture of libfabric, including the object-model and their related operations .TP -\f[I]Setup \f[BI]\f[CBI]fi_setup\f[BI]\f[I](7)\f[R] +\f[I]Setup \f[VI]fi_setup\f[I](7)\f[R] This provides basic bootstrapping and setup for using the libfabric API. .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_hook.7 b/man/man7/fi_hook.7 index 714c8a1f46b..18eb9a9dbce 100644 --- a/man/man7/fi_hook.7 +++ b/man/man7/fi_hook.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_hook" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. 
+.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_hook" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -166,6 +180,6 @@ Application that use FI_TRIGGER operations that attempt to hook calls will likely crash. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_intro.7 b/man/man7/fi_intro.7 index c6965739ae0..3a6dd2507ba 100644 --- a/man/man7/fi_intro.7 +++ b/man/man7/fi_intro.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_intro" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_intro" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -8,7 +22,7 @@ fi_intro - libfabric introduction .SH OVERVIEW .PP This introduction is part of the libfabric\[cq]s programmer\[cq]s guide. -See \f[C]fi_guide\f[R](7). +See \f[V]fi_guide\f[R](7). This section provides insight into the motivation for the libfabric design and underlying networking features that are being exposed through the API. @@ -1124,9 +1138,9 @@ If an application is using 1000 endpoints and posts 100 buffers, each 4 KB, that results in 400 MB of memory space being consumed to receive data. (We can start to realize that by eliminating memory copies, one of the -trade offs is increased memory consumption.) While 400 MB seems like a -lot of memory, there is less than half a megabyte allocated to a single -receive queue. +trade offs is increased memory consumption.) 
+While 400 MB seems like a lot of memory, there is less than half a +megabyte allocated to a single receive queue. At today\[cq]s networking speeds, that amount of space can be consumed within milliseconds. The result is that if only a few endpoints are in use, the application @@ -1415,6 +1429,6 @@ but it does allow for optimizing network utilization. Libfabric is well architected to support the previously discussed features. For further information on the libfabric architecture, see the next -programmer\[cq]s guide section: \f[C]fi_arch\f[R](7). +programmer\[cq]s guide section: \f[V]fi_arch\f[R](7). .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 index 8caddfcebf2..c3826b3a383 100644 --- a/man/man7/fi_lnx.7 +++ b/man/man7/fi_lnx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_lnx" "7" "2024\-10\-24" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_lnx" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -147,9 +161,10 @@ variables: This environment variable is used to specify which providers to link. This must be set in order for the LNX provider to return a list of fi_info blocks in the fi_getinfo() call. -The format which must be used is: ++\&... As mentioned earlier currently -LNX supports linking only two providers the first of which is SHM -followed by one other provider for inter-node operations +The format which must be used is: ++\&... 
+As mentioned earlier currently LNX supports linking only two providers +the first of which is SHM followed by one other provider for inter-node +operations .TP \f[I]FI_LNX_DISABLE_SHM\f[R] By default this environment variable is set to 0. @@ -170,6 +185,6 @@ off SRQ support by setting this environment variable to 0. It is 1 by default. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_lpp.7 b/man/man7/fi_lpp.7 index 05e0d9a3f03..1d09963ca44 100644 --- a/man/man7/fi_lpp.7 +++ b/man/man7/fi_lpp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_lpp" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_lpp" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -74,6 +88,6 @@ Use the memcpy implementation in the system libc rather than provider-specific memcpy. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_mrail.7 b/man/man7/fi_mrail.7 index 2939f3b6f30..8af1a0f16a7 100644 --- a/man/man7/fi_mrail.7 +++ b/man/man7/fi_mrail.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_mrail" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. 
ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_mrail" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -83,18 +97,18 @@ Deprecated. Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[R]. .TP \f[I]FI_OFI_MRAIL_CONFIG\f[R] -Comma separated list of \f[C]:\f[R] pairs, sorted in -ascending order of \f[C]\f[R]. +Comma separated list of \f[V]:\f[R] pairs, sorted in +ascending order of \f[V]\f[R]. Each pair indicated the rail sharing policy to be used for messages up -to the size \f[C]\f[R] and not covered by all previous pairs. -The value of \f[C]\f[R] can be \f[I]fixed\f[R] (a fixed rail is +to the size \f[V]\f[R] and not covered by all previous pairs. +The value of \f[V]\f[R] can be \f[I]fixed\f[R] (a fixed rail is used), \f[I]round-robin\f[R] (one rail per message, selected in round-robin fashion), or \f[I]striping\f[R] (striping across all the rails). -The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[R]. +The default configuration is \f[V]16384:fixed,ULONG_MAX:striping\f[R]. The value ULONG_MAX can be input as -1. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index 0653d54a3ef..e77be3efd17 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_opx" "7" "2024\-10\-18" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_opx" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -174,35 +188,35 @@ Defaults to \[lq]No\[rq] \f[I]FI_OPX_HFI_SELECT\f[R] String. Controls how OPX chooses which HFI to use when opening a context. -Has two forms: - \f[C]\f[R] Force OPX provider to use -\f[C]hfi-unit\f[R]. -- \f[C][,[,...,]]\f[R] Select HFI based -on first matching \f[C]selector\f[R] +Has two forms: - \f[V]\f[R] Force OPX provider to use +\f[V]hfi-unit\f[R]. +- \f[V][,[,...,]]\f[R] Select HFI based +on first matching \f[V]selector\f[R] .PP -Where \f[C]selector\f[R] is one of the following forms: - -\f[C]default\f[R] to use the default logic - \f[C]fixed:\f[R] -to fix to one \f[C]hfi-unit\f[R] - -\f[C]::\f[R] +Where \f[V]selector\f[R] is one of the following forms: - +\f[V]default\f[R] to use the default logic - \f[V]fixed:\f[R] +to fix to one \f[V]hfi-unit\f[R] - +\f[V]::\f[R] .PP -The above fields have the following meaning: - \f[C]selector-type\f[R] +The above fields have the following meaning: - \f[V]selector-type\f[R] The selector criteria the caller opening the context is evaluated against. -- \f[C]hfi-unit\f[R] The HFI to use if the caller matches the selector. -- \f[C]selector-data\f[R] Data the caller must match (e.g.\ NUMA node +- \f[V]hfi-unit\f[R] The HFI to use if the caller matches the selector. +- \f[V]selector-data\f[R] Data the caller must match (e.g.\ NUMA node ID). .PP -Where \f[C]selector-type\f[R] is one of the following: - \f[C]numa\f[R] +Where \f[V]selector-type\f[R] is one of the following: - \f[V]numa\f[R] True when caller is local to the NUMA node ID given by -\f[C]selector-data\f[R]. -- \f[C]core\f[R] True when caller is local to the CPU core given by -\f[C]selector-data\f[R]. +\f[V]selector-data\f[R]. +- \f[V]core\f[R] True when caller is local to the CPU core given by +\f[V]selector-data\f[R]. 
.PP -And \f[C]selector-data\f[R] is one of the following: - \f[C]value\f[R] -The specific value to match - \f[C]-\f[R] +And \f[V]selector-data\f[R] is one of the following: - \f[V]value\f[R] +The specific value to match - \f[V]-\f[R] Matches with any value in that range .PP In the second form, when opening a context, OPX uses the -\f[C]hfi-unit\f[R] of the first-matching selector. +\f[V]hfi-unit\f[R] of the first-matching selector. Selectors are evaluated left-to-right. OPX will return an error if the caller does not match any selector. .PP @@ -218,27 +232,27 @@ For the second form, as which HFI is selected depends on properties of the caller, deterministic HFI selection requires deterministic caller properties. E.g. -for the \f[C]numa\f[R] selector, if the caller can migrate between NUMA +for the \f[V]numa\f[R] selector, if the caller can migrate between NUMA domains, then HFI selection will not be deterministic. .PP The logic used will always be the first valid in a selector list. -For example, \f[C]default\f[R] and \f[C]fixed\f[R] will match all +For example, \f[V]default\f[R] and \f[V]fixed\f[R] will match all callers, so if either are in the beginning of a selector list, you will -only use \f[C]fixed\f[R] or \f[C]default\f[R] regardles of if there are +only use \f[V]fixed\f[R] or \f[V]default\f[R] regardles of if there are any more selectors. .PP -Examples: - \f[C]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts +Examples: - \f[V]FI_OPX_HFI_SELECT=0\f[R] all callers will open contexts on HFI 0. -- \f[C]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. -- \f[C]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] +- \f[V]FI_OPX_HFI_SELECT=1\f[R] all callers will open contexts on HFI 1. +- \f[V]FI_OPX_HFI_SELECT=numa:0:0,numa:1:1,numa:0:2,numa:1:3\f[R] callers local to NUMA nodes 0 and 2 will use HFI 0, callers local to NUMA domains 1 and 3 will use HFI 1. 
-- \f[C]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA +- \f[V]FI_OPX_HFI_SELECT=numa:0:0-3,default\f[R] callers local to NUMA nodes 0 thru 3 (including 0 and 3) will use HFI 0, and all else will use default selection logic. -- \f[C]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core +- \f[V]FI_OPX_HFI_SELECT=core:1:0,fixed:0\f[R] callers local to CPU core 0 will use HFI 1, and all others will use HFI 0. -- \f[C]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use +- \f[V]FI_OPX_HFI_SELECT=default,core:1:0\f[R] all callers will use default HFI selection logic. .TP \f[I]FI_OPX_DELIVERY_COMPLETION_THRESHOLD\f[R] @@ -298,9 +312,9 @@ This feature is not currently supported. \f[I]FI_OPX_PROG_AFFINITY\f[R] String. This sets the affinity to be used for any progress threads. -Set as a colon-separated triplet as \f[C]start:end:stride\f[R], where +Set as a colon-separated triplet as \f[V]start:end:stride\f[R], where stride controls the interval between selected cores. -For example, \f[C]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores +For example, \f[V]1:5:2\f[R] will have cores 1, 3, and 5 as valid cores for progress threads. By default no affinity is set. .TP @@ -344,6 +358,6 @@ Needs to be set to 1 in case of mixed network. Default is 0. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_provider.7 b/man/man7/fi_provider.7 index a47af04f404..48547f0479d 100644 --- a/man/man7/fi_provider.7 +++ b/man/man7/fi_provider.7 @@ -1,13 +1,27 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_provider" "7" "2024\-10\-25" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. 
ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_provider" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP fi_provider - Fabric Interface Providers .SH OVERVIEW .PP -See \f[C]fi_arch\f[R](7) for a brief description of how providers fit +See \f[V]fi_arch\f[R](7) for a brief description of how providers fit into the libfabric architecture. .PP Conceptually, a fabric provider implements and maps the libfabric API @@ -74,56 +88,56 @@ This list is not exhaustive. .TP \f[I]CXI\f[R] Provider for Cray\[cq]s Slingshot network. -See \f[C]fi_cxi\f[R](7) for more information. +See \f[V]fi_cxi\f[R](7) for more information. .TP \f[I]EFA\f[R] A provider for the Amazon EC2 Elastic Fabric Adapter (EFA) (https://aws.amazon.com/hpc/efa/), a custom-built OS bypass hardware interface for inter-instance communication on EC2. -See \f[C]fi_efa\f[R](7) for more information. +See \f[V]fi_efa\f[R](7) for more information. .TP \f[I]LPP\f[R] A provider runs on FabreX PCIe networks. -See \f[C]fi_lpp\f[R](7) for more information. +See \f[V]fi_lpp\f[R](7) for more information. .TP \f[I]OPX\f[R] Supports Omni-Path networking from Cornelis Networks. -See \f[C]fi_opx\f[R](7) for more information. +See \f[V]fi_opx\f[R](7) for more information. .TP \f[I]PSM2\f[R] Older provider for Omni-Path networks. -See \f[C]fi_psm2\f[R](7) for more information. +See \f[V]fi_psm2\f[R](7) for more information. .TP \f[I]PSM3\f[R] Provider for Ethernet networking from Intel. -See \f[C]fi_psm3\f[R](7) for more information. +See \f[V]fi_psm3\f[R](7) for more information. .TP \f[I]SHM\f[R] A provider for intra-node communication using shared memory. -See \f[C]fi_shm\f[R](7) for more information. +See \f[V]fi_shm\f[R](7) for more information. .TP \f[I]TCP\f[R] A provider which runs over the TCP/IP protocol and is available on multiple operating systems. 
This provider enables develop of libfabric applications on most platforms. -See \f[C]fi_tcp\f[R](7) for more information. +See \f[V]fi_tcp\f[R](7) for more information. .TP \f[I]UCX\f[R] A provider which runs over the UCX library which is currently supported by Infiniband fabrics from NVIDIA. -See \f[C]fi_ucx\f[R](7) for more information. +See \f[V]fi_ucx\f[R](7) for more information. .TP \f[I]UDP\f[R] A provider which runs over the UDP/IP protocol and is available on multiple operating systems. This provider enables develop of libfabric applications on most platforms. -See \f[C]fi_udp\f[R](7) for more information. +See \f[V]fi_udp\f[R](7) for more information. .TP \f[I]Verbs\f[R] This provider targets RDMA NICs for both Linux and Windows platforms. -See \f[C]fi_verbs\f[R](7) for more information. +See \f[V]fi_verbs\f[R](7) for more information. .SH Utility Providers .PP Utility providers are named with a starting prefix of \[lq]ofi_\[rq]. @@ -136,17 +150,17 @@ simpler endpoint type. .PP Utility providers show up as part of the return\[cq]s provider\[cq]s name. -See \f[C]fi_fabric\f[R](3). +See \f[V]fi_fabric\f[R](3). Utility providers are enabled automatically for core providers that do not support the feature set requested by an application. .TP \f[I]RxM\f[R] Implements RDM endpoint semantics over MSG endpoints. -See \f[C]fi_rxm\f[R](7) for more information. +See \f[V]fi_rxm\f[R](7) for more information. .TP \f[I]RxD\f[R] Implements RDM endpoint semantis over DGRAM endpoints. -See \f[C]fi_rxd\f[R](7) for more information. +See \f[V]fi_rxd\f[R](7) for more information. .SH Hooking Providers .PP Hooking providers are mostly used for debugging purposes. @@ -157,7 +171,7 @@ Hooking providers can layer over all other providers and intercept, or hook, their calls in order to perform some dedicated task, such as gathering performance data on call paths or providing debug output. .PP -See \f[C]fi_hook\f[R](7) for more information. 
+See \f[V]fi_hook\f[R](7) for more information. .SH Offload Providers .PP Offload providers start with the naming prefix \[lq]off_\[rq]. @@ -180,9 +194,9 @@ Future releases of the provider will allow linking any number of providers and provide the users with the ability to influence the way the providers are utilized for traffic load. .PP -See \f[C]fi_lnx\f[R](7) for more information. +See \f[V]fi_lnx\f[R](7) for more information. .SH SEE ALSO .PP -\f[C]fabric\f[R](7) \f[C]fi_provider\f[R](3) +\f[V]fabric\f[R](7) \f[V]fi_provider\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm2.7 b/man/man7/fi_psm2.7 index 8233bc835ab..95009032ff2 100644 --- a/man/man7/fi_psm2.7 +++ b/man/man7/fi_psm2.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_psm2" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_psm2" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -214,11 +228,11 @@ See \f[I]FI_PSM2_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[C][:[:]][,[:[:]]]*\f[R], -where each triplet \f[C]::\f[R] defines a block of +\f[V][:[:]][,[:[:]]]*\f[R], +where each triplet \f[V]::\f[R] defines a block of core_ids. -Both \f[C]\f[R] and \f[C]\f[R] can be either the -\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). +Both \f[V]\f[R] and \f[V]\f[R] can be either the +\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. .TP @@ -324,6 +338,6 @@ Valid parameter names are defined in the header file \f[I]rdma/fi_ext_psm2.h\f[R]. 
.SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm3\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm3\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 index 892fea3f805..172eabb1aa8 100644 --- a/man/man7/fi_psm3.7 +++ b/man/man7/fi_psm3.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_psm3" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_psm3" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -11,7 +25,7 @@ The \f[I]psm3\f[R] provider implements a Performance Scaled Messaging capability which supports most verbs UD and sockets devices. Additional features and optimizations can be enabled when running over Intel\[cq]s E810 Ethernet NICs and/or using Intel\[cq]s rendezvous -kernel module (\f[C]rv\f[R]). +kernel module (\f[V]rv\f[R]). PSM 3.x fully integrates the OFI provider and the underlying PSM3 protocols/implementation and only exports the OFI APIs. .SH SUPPORTED FEATURES @@ -209,11 +223,11 @@ See \f[I]FI_PSM3_PROG_AFFINITY\f[R]. When set, specify the set of CPU cores to set the progress thread affinity to. The format is -\f[C][:[:]][,[:[:]]]*\f[R], -where each triplet \f[C]::\f[R] defines a block of +\f[V][:[:]][,[:[:]]]*\f[R], +where each triplet \f[V]::\f[R] defines a block of core_ids. -Both \f[C]\f[R] and \f[C]\f[R] can be either the -\f[C]core_id\f[R] (when >=0) or \f[C]core_id - num_cores\f[R] (when <0). +Both \f[V]\f[R] and \f[V]\f[R] can be either the +\f[V]core_id\f[R] (when >=0) or \f[V]core_id - num_cores\f[R] (when <0). .PP By default affinity is not set. 
.TP @@ -304,6 +318,6 @@ Notice that if the provider is compiled with macro runtime option will be disabled. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm2\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_psm2\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxd.7 b/man/man7/fi_rxd.7 index 1bf65d50151..11a2ac85f18 100644 --- a/man/man7/fi_rxd.7 +++ b/man/man7/fi_rxd.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rxd" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rxd" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -61,6 +75,6 @@ Maximum number of packets (per peer) to send at a time. Default: 128 .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_rxm.7 b/man/man7/fi_rxm.7 index c683dde87b1..294f43eba2c 100644 --- a/man/man7/fi_rxm.7 +++ b/man/man7/fi_rxm.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_rxm" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_rxm" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -212,7 +226,7 @@ to only required values. 
.PP The data transfer API may return -FI_EAGAIN during on-demand connection setup of the core provider FI_MSG_EP. -See \f[C]fi_msg\f[R](3) for a detailed description of handling +See \f[V]fi_msg\f[R](3) for a detailed description of handling FI_EAGAIN. .SH Troubleshooting / Known issues .PP @@ -229,6 +243,6 @@ The workaround is to use shared receive contexts for the MSG provider (FI_OFI_RXM_MSG_TX_SIZE / FI_OFI_RXM_MSG_RX_SIZE). .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_setup.7 b/man/man7/fi_setup.7 index 44cad0029c6..752ace93874 100644 --- a/man/man7/fi_setup.7 +++ b/man/man7/fi_setup.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_setup" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_setup" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -459,8 +473,9 @@ libfabric defines a unique threading model. The libfabric design is heavily influenced by object-oriented programming concepts. A multi-threaded application must determine how libfabric objects -(domains, endpoints, completion queues, etc.) will be allocated among -its threads, or if any thread can access any object. +(domains, endpoints, completion queues, etc.) +will be allocated among its threads, or if any thread can access any +object. For example, an application may spawn a new thread to handle each new connected endpoint. 
The domain threading field provides a mechanism for an application to diff --git a/man/man7/fi_shm.7 b/man/man7/fi_shm.7 index ff7c5f241bb..f28656601d4 100644 --- a/man/man7/fi_shm.7 +++ b/man/man7/fi_shm.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_shm" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_shm" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -89,7 +103,7 @@ was provided by the application), no supplemental information is required to make it unique and it will remain with only the application-defined address. Note that the actual endpoint name will not include the FI_ADDR_STR -\[dq]*://\[dq] prefix since it cannot be included in any shared memory +\[lq]*://\[rq] prefix since it cannot be included in any shared memory region names. The provider will strip off the prefix before setting the endpoint name. As a result, the addresses \[lq]fi_prefix1://my_node:my_service\[rq] and @@ -204,6 +218,6 @@ different systems. Default 262144 .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_sockets.7 b/man/man7/fi_sockets.7 index 1af06fc6eba..133fdaedb65 100644 --- a/man/man7/fi_sockets.7 +++ b/man/man7/fi_sockets.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_sockets" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. 
+.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_sockets" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -128,6 +142,6 @@ The recommended parameters for large scale runs are \f[I]FI_SOCKETS_DEF_CQ_SZ\f[R], \f[I]FI_SOCKETS_DEF_EQ_SZ\f[R]. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_tcp.7 b/man/man7/fi_tcp.7 index 7d291ccae3e..b25c0958206 100644 --- a/man/man7/fi_tcp.7 +++ b/man/man7/fi_tcp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_tcp" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_tcp" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -114,6 +128,6 @@ from the tcp provider. This will provide the best performance. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_ucx.7 b/man/man7/fi_ucx.7 index a4928235e12..440a76b429a 100644 --- a/man/man7/fi_ucx.7 +++ b/man/man7/fi_ucx.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_ucx" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. 
ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_ucx" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -52,6 +66,6 @@ any). Check request leak (default: disabled). .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_udp.7 b/man/man7/fi_udp.7 index 8da3619cb8d..a020f7a54c3 100644 --- a/man/man7/fi_udp.7 +++ b/man/man7/fi_udp.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_udp" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_udp" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -51,6 +65,6 @@ No support for counters. No runtime parameters are currently defined. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3) +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), \f[V]fi_getinfo\f[R](3) .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_usnic.7 b/man/man7/fi_usnic.7 index c351d23f01b..e9104cfd0bd 100644 --- a/man/man7/fi_usnic.7 +++ b/man/man7/fi_usnic.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_usnic" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. 
ftr VBI CBI +.\} +.TH "fi_usnic" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -29,7 +43,7 @@ installing libnl from RPM or other packaging system, install the If you have libnl (either v1 or v3) installed in a non-standard location (e.g., not in /usr/lib or /usr/lib64), you may need to tell libfabric\[cq]s configure where to find libnl via the -\f[C]--with-libnl=DIR\f[R] command line option (where DIR is the +\f[V]--with-libnl=DIR\f[R] command line option (where DIR is the installation prefix of the libnl package). .RE .IP \[bu] 2 @@ -56,7 +70,7 @@ In particular, there are known bugs in RDM support in the presence of congestion or packet loss (issue 1621). RMA is not yet supported. .IP \[bu] 2 -\f[C]fi_provider\f[R](7) lists requirements for all providers. +\f[V]fi_provider\f[R](7) lists requirements for all providers. The following limitations exist in the \f[I]usnic\f[R] provider: .RS 2 .IP \[bu] 2 @@ -69,13 +83,13 @@ CM operations. Passive endpoints only support listen, setname, and getname CM operations. .IP \[bu] 2 -\f[I]FI_EP_DGRAM\f[R] endpoints support \f[C]fi_sendmsg()\f[R] and -\f[C]fi_recvmsg()\f[R], but some flags are ignored. -\f[C]fi_sendmsg()\f[R] supports \f[C]FI_INJECT\f[R] and -\f[C]FI_COMPLETION\f[R]. -\f[C]fi_recvmsg()\f[R] supports \f[C]FI_MORE\f[R]. +\f[I]FI_EP_DGRAM\f[R] endpoints support \f[V]fi_sendmsg()\f[R] and +\f[V]fi_recvmsg()\f[R], but some flags are ignored. +\f[V]fi_sendmsg()\f[R] supports \f[V]FI_INJECT\f[R] and +\f[V]FI_COMPLETION\f[R]. +\f[V]fi_recvmsg()\f[R] supports \f[V]FI_MORE\f[R]. .IP \[bu] 2 -Address vectors only support \f[C]FI_AV_MAP\f[R]. +Address vectors only support \f[V]FI_AV_MAP\f[R]. .IP \[bu] 2 No counters are supported. .IP \[bu] 2 @@ -119,19 +133,19 @@ file. 
Version 2 of the \[lq]fabric getinfo\[rq] extension was introduced in Libfabric release v1.3.0 and can be used to retrieve IP and SR-IOV information about a usNIC device obtained from the -\f[C]fi_getinfo\f[R](3) function. +\f[V]fi_getinfo\f[R](3) function. .PP The \[lq]fabric getinfo\[rq] extension is obtained by calling -\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[R] to +\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_FABRIC_OPS_1\f[R] to get the usNIC fabric extension operations. -The \f[C]getinfo\f[R] function accepts a version parameter that can be +The \f[V]getinfo\f[R] function accepts a version parameter that can be used to select different versions of the extension. The information returned by the \[lq]fabric getinfo\[rq] extension is -accessible through a \f[C]fi_usnic_info\f[R] struct that uses a version +accessible through a \f[V]fi_usnic_info\f[R] struct that uses a version tagged union. The accessed union member must correspond with the requested version. It is recommended that applications explicitly request a version rather -than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[R]. +than using the header provided \f[V]FI_EXT_USNIC_INFO_VERSION\f[R]. Although there is a version 1 of the extension, its use is discouraged, and it may not be available in future releases. .SS Compatibility issues @@ -244,8 +258,8 @@ struct fi_usnic_info_v1 { .fi .PP Version 1 of the \[lq]fabric getinfo\[rq] extension can be used by -explicitly requesting it in the call to \f[C]getinfo\f[R] and accessing -the \f[C]v1\f[R] portion of the \f[C]fi_usnic_info.ui\f[R] union. +explicitly requesting it in the call to \f[V]getinfo\f[R] and accessing +the \f[V]v1\f[R] portion of the \f[V]fi_usnic_info.ui\f[R] union. Use of version 1 is not recommended and it may be removed from future releases. .PP @@ -327,7 +341,7 @@ Libfabric release v1.0.0 and can be used to retrieve the network distance of an address. 
.PP The \[lq]get_distance\[rq] extension is obtained by calling -\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_AV_OPS_1\f[R] to get +\f[V]fi_open_ops\f[R] and requesting \f[V]FI_USNIC_AV_OPS_1\f[R] to get the usNIC address vector extension operations. .IP .nf @@ -343,9 +357,9 @@ Address vector Destination address .TP \f[I]metric\f[R] -On output this will contain \f[C]-1\f[R] if the destination host is -unreachable, \f[C]0\f[R] is the destination host is locally connected, -and \f[C]1\f[R] otherwise. +On output this will contain \f[V]-1\f[R] if the destination host is +unreachable, \f[V]0\f[R] is the destination host is locally connected, +and \f[V]1\f[R] otherwise. .PP See fi_ext_usnic.h for more details. .SH VERSION DIFFERENCES @@ -355,28 +369,28 @@ The release of libfabric v1.4 introduced a new naming convention for fabric and domain. However the usNIC provider remains backward compatible with applications supporting the old scheme and decides which one to use based on the -version passed to \f[C]fi_getinfo\f[R]: +version passed to \f[V]fi_getinfo\f[R]: .IP \[bu] 2 -When \f[C]FI_VERSION(1,4)\f[R] or higher is used: +When \f[V]FI_VERSION(1,4)\f[R] or higher is used: .RS 2 .IP \[bu] 2 fabric name is the network address with the CIDR notation (i.e., -\f[C]a.b.c.d/e\f[R]) +\f[V]a.b.c.d/e\f[R]) .IP \[bu] 2 -domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) +domain name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) .RE .IP \[bu] 2 -When a lower version number is used, like \f[C]FI_VERSION(1, 3)\f[R], it +When a lower version number is used, like \f[V]FI_VERSION(1, 3)\f[R], it follows the same behavior the usNIC provider exhibited in libfabric <= v1.3: .RS 2 .IP \[bu] 2 -fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R]) +fabric name is the usNIC Linux interface name (i.e., \f[V]usnic_X\f[R]) .IP \[bu] 2 -domain name is \f[C]NULL\f[R] +domain name is \f[V]NULL\f[R] .RE .SH SEE ALSO .PP -\f[C]fabric\f[R](7), 
\f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_open_ops\f[R](3), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_verbs.7 b/man/man7/fi_verbs.7 index c5ffb67d719..1a5e3f11794 100644 --- a/man/man7/fi_verbs.7 +++ b/man/man7/fi_verbs.7 @@ -1,6 +1,20 @@ -.\" Automatically generated by Pandoc 2.9.2.1 +.\" Automatically generated by Pandoc 3.1.3 .\" -.TH "fi_verbs" "7" "2024\-10\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.\" Define V font for inline verbatim, using C font in formats +.\" that render this, and otherwise B font. +.ie "\f[CB]x\f[]"x" \{\ +. ftr V B +. ftr VI BI +. ftr VB B +. ftr VBI BI +.\} +.el \{\ +. ftr V CR +. ftr VI CI +. ftr VB CB +. ftr VBI CBI +.\} +.TH "fi_verbs" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -147,7 +161,7 @@ to be re-mapped when the process is forked (MADV_DONTFORK). .PP The XRC transport is intended to be used when layered with the RXM provider and requires the use of shared receive contexts. -See \f[C]fi_rxm\f[R](7). +See \f[V]fi_rxm\f[R](7). To enable XRC, the following environment variables must usually be set: FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX. .SH RUNTIME PARAMETERS @@ -280,6 +294,6 @@ post excess receives without draining the CQ. CQ overruns can make the MSG endpoints unusable. .SH SEE ALSO .PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), +\f[V]fabric\f[R](7), \f[V]fi_provider\f[R](7), .SH AUTHORS OpenFabrics. From 45b7b2446cb1093bb3efae7138114fed96670797 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Tue, 10 Dec 2024 15:08:03 -0800 Subject: [PATCH 307/393] man: Fix MarkDown format issue in fi_lnx man page The '<' symbol has special meaning and must be escaped. 
Signed-off-by: Jianxin Xiong --- man/fi_lnx.7.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fi_lnx.7.md b/man/fi_lnx.7.md index f52a08840dc..6d83e914c34 100644 --- a/man/fi_lnx.7.md +++ b/man/fi_lnx.7.md @@ -132,7 +132,7 @@ The *LNX* provider checks for the following environment variables: : This environment variable is used to specify which providers to link. This must be set in order for the LNX provider to return a list of fi_info blocks in the fi_getinfo() call. The format which must be used is: - ++... As mentioned earlier currently LNX supports linking + \+\+... As mentioned earlier currently LNX supports linking only two providers the first of which is SHM followed by one other provider for inter-node operations From f03fe010b869f110e488dc3d4e8dda08870341cf Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Wed, 11 Dec 2024 00:35:19 +0000 Subject: [PATCH 308/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_lnx.7 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man7/fi_lnx.7 b/man/man7/fi_lnx.7 index c3826b3a383..21ebfcbe09a 100644 --- a/man/man7/fi_lnx.7 +++ b/man/man7/fi_lnx.7 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_lnx" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_lnx" "7" "2024\-12\-11" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -161,7 +161,7 @@ variables: This environment variable is used to specify which providers to link. This must be set in order for the LNX provider to return a list of fi_info blocks in the fi_getinfo() call. -The format which must be used is: ++\&... +The format which must be used is: ++\&... 
As mentioned earlier currently LNX supports linking only two providers the first of which is SHM followed by one other provider for inter-node operations From 0a76ac41d9080d1c55cbb042c4d0de0c241bb068 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 8 Sep 2024 09:51:04 -0700 Subject: [PATCH 309/393] hmem/cuda: avoid stub loading at runtime When the CUDA toolkit is installed, a set of "stub" libraries are installed under /usr/local/cuda*/lib64/stubs/. These libraries include a SONAME field with a `.1' suffix, but the filenames of these stubs are bare. eg: > $ readelf -d /usr/local/cuda-12.5/lib64/stubs/libnvidia-ml.so | grep soname > 0x000000000000000e (SONAME) Library soname: [libnvidia-ml.so.1] The CUDA toolkit does not include any library file with the name `libnvidia-ml.so.1` (or `libcuda.so.1`, etc.), as these are provided by the driver package. This disconnect between the stub filename in the toolkit and the SONAME within it is done intentionally to allow linking with the stub at build time, while ensuring it's never loaded at runtime. In normal dynamic linking cases (ie: without dlopen), the SONAME field of `libnvidia-ml.so.1` is used in the DT_NEEDED tag, where that filename can only come from a driver package and this ensures that the stub library will never match. Match the same behavior and provide `.1` suffixes to dlopen where appropriate for NVIDIA libraries. 
Signed-off-by: Nicholas Sielicki --- fabtests/common/hmem_cuda.c | 4 ++-- src/hmem_cuda.c | 15 +++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c index 2f02b6f474c..e4aef962fb6 100644 --- a/fabtests/common/hmem_cuda.c +++ b/fabtests/common/hmem_cuda.c @@ -157,9 +157,9 @@ int ft_cuda_init(void) goto err; } - cuda_handle = dlopen("libcuda.so", RTLD_NOW); + cuda_handle = dlopen("libcuda.so.1", RTLD_NOW); if (!cuda_handle) { - FT_ERR("Failed to dlopen libcuda.so\n"); + FT_ERR("Failed to dlopen libcuda.so.1\n"); goto err_dlclose_cudart; } diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c index 0580bdb24f1..ec626bdada5 100644 --- a/src/hmem_cuda.c +++ b/src/hmem_cuda.c @@ -487,22 +487,17 @@ static int cuda_hmem_dl_init(void) return -FI_ENOSYS; } - cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW); + cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW); if (!cuda_attr.driver_handle) { FI_WARN(&core_prov, FI_LOG_CORE, - "Failed to dlopen libcuda.so\n"); + "Failed to dlopen libcuda.so.1\n"); goto err_dlclose_cuda_runtime; } - cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW); + cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW); if (!cuda_attr.nvml_handle) { - FI_INFO(&core_prov, FI_LOG_CORE, - "Failed to dlopen libnvidia-ml.so. 
Trying libnvidia-ml.so.1\n"); - cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW); - if (!cuda_attr.nvml_handle) { - FI_WARN(&core_prov, FI_LOG_CORE, - "Failed to dlopen libnvidia-ml.so or libnvidia-ml.so.1, bypassing nvml calls\n"); - } + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to dlopen libnvidia-ml.so.1, bypassing nvml calls\n"); } CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN) From c312085e6c961dac3f9ec075fae6e89460c5d55a Mon Sep 17 00:00:00 2001 From: Dariusz Sciebura Date: Thu, 28 Nov 2024 16:27:08 +0000 Subject: [PATCH 310/393] prov/tcp: Fix race in writing to xnet_ep_fi_ops xnet_endpoint updates the static global variable: xnet_ep_fi_ops. This naturally causes a data race. This PR eliminates the write. Signed-off-by: Dariusz Sciebura --- prov/tcp/src/xnet_ep.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prov/tcp/src/xnet_ep.c b/prov/tcp/src/xnet_ep.c index 64772fef0aa..0ff5723d9d2 100644 --- a/prov/tcp/src/xnet_ep.c +++ b/prov/tcp/src/xnet_ep.c @@ -677,7 +677,7 @@ static struct fi_ops xnet_ep_fi_ops = { .close = xnet_ep_close, .bind = xnet_ep_bind, .control = xnet_ep_ctrl, - .ops_open = fi_no_ops_open, + .ops_open = xnet_ep_ops_open, }; static int xnet_ep_getopt(fid_t fid, int level, int optname, @@ -828,7 +828,6 @@ int xnet_endpoint(struct fid_domain *domain, struct fi_info *info, (*ep_fid)->msg = &xnet_msg_ops; (*ep_fid)->rma = &xnet_rma_ops; (*ep_fid)->tagged = &xnet_tagged_ops; - (*ep_fid)->fid.ops->ops_open = xnet_ep_ops_open; return 0; err3: From de520c2a9019b1fbf1f032536a438e3837e8bdca Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 11 Dec 2024 10:24:11 -0800 Subject: [PATCH 311/393] prov/efa: Add unit tests for efa_msg Use FI_EP_RDM as ep type and override efa_msg_ops. 
Signed-off-by: Jessie Yang --- prov/efa/Makefile.include | 3 +- prov/efa/test/efa_unit_test_mocks.c | 12 ++ prov/efa/test/efa_unit_test_mocks.h | 5 + prov/efa/test/efa_unit_test_msg.c | 267 ++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 9 + prov/efa/test/efa_unit_tests.h | 9 + 6 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 prov/efa/test/efa_unit_test_msg.c diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 81e0fab0aed..6be0ad452f2 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -148,7 +148,8 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_runt.c \ prov/efa/test/efa_unit_test_mr.c \ prov/efa/test/efa_unit_test_rdm_peer.c \ - prov/efa/test/efa_unit_test_pke.c + prov/efa/test/efa_unit_test_pke.c \ + prov/efa/test/efa_unit_test_msg.c efa_CPPFLAGS += -I$(top_srcdir)/include -I$(top_srcdir)/prov/efa/test $(cmocka_CPPFLAGS) diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index 75dd2bad732..20dd538602c 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -88,6 +88,12 @@ void efa_mock_ibv_wr_send_verify_handshake_pkt_local_host_id_and_save_wr(struct return efa_mock_ibv_wr_send_save_wr(qp); } +void efa_mock_ibv_wr_send_imm_save_wr(struct ibv_qp_ex *qp, __be32 imm_data) +{ + g_ibv_submitted_wr_id_vec[g_ibv_submitted_wr_id_cnt] = (void *)qp->wr_id; + g_ibv_submitted_wr_id_cnt++; +} + void efa_mock_ibv_wr_set_inline_data_list_no_op(struct ibv_qp_ex *qp, size_t num_buf, const struct ibv_data_buf *buf_list) @@ -207,6 +213,12 @@ bool efa_mock_efa_device_support_unsolicited_write_recv() return mock(); } +int efa_mock_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + return mock(); +} + struct efa_unit_test_mocks g_efa_unit_test_mocks = { .local_host_id = 0, .peer_host_id = 0, diff --git a/prov/efa/test/efa_unit_test_mocks.h 
b/prov/efa/test/efa_unit_test_mocks.h index 3e764c91fb1..7143869a2fa 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -74,6 +74,8 @@ uint32_t efa_mock_ibv_read_wc_flags_return_mock(struct ibv_cq_ex *current); bool efa_mock_efadv_wc_is_unsolicited(struct efadv_cq *efadv_cq); +void efa_mock_ibv_wr_send_imm_save_wr(struct ibv_qp_ex *qp, __be32 imm_data); + ssize_t __real_ofi_copy_from_hmem_iov(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, const struct iovec *hmem_iov, @@ -93,6 +95,9 @@ int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope); bool efa_mock_efa_device_support_unsolicited_write_recv(void); +int efa_mock_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + struct efa_unit_test_mocks { uint64_t local_host_id; diff --git a/prov/efa/test/efa_unit_test_msg.c b/prov/efa/test/efa_unit_test_msg.c new file mode 100644 index 00000000000..81781aeb6d6 --- /dev/null +++ b/prov/efa/test/efa_unit_test_msg.c @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All + * rights reserved. 
*/ + +#include "efa_unit_tests.h" +#include "ofi_util.h" + +extern struct fi_ops_msg efa_msg_ops; + +static void test_efa_msg_recv_prep(struct efa_resource *resource, + fi_addr_t *addr) +{ + struct ibv_qp *ibv_qp; + struct efa_ep_addr raw_addr; + struct efa_base_ep *base_ep; + size_t raw_addr_len = sizeof(raw_addr); + int ret; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + resource->ep->msg = &efa_msg_ops; + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qp = base_ep->qp->ibv_qp; + ibv_qp->context->ops.post_recv = &efa_mock_ibv_post_recv; + will_return(efa_mock_ibv_post_recv, 0); + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, + NULL /* context */); + assert_int_equal(ret, 1); +} + +void test_efa_msg_fi_recv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + void *desc; + + test_efa_msg_recv_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(send_buff.mr); + + ret = fi_recv(resource->ep, send_buff.buff, send_buff.size, desc, addr, + NULL /* context */); + assert_int_equal(ret, 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_recvv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct iovec iov; + fi_addr_t addr; + int ret; + void *desc; + + test_efa_msg_recv_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + + ret = fi_recvv(resource->ep, &iov, &desc, 1, addr, NULL /* context */); + assert_int_equal(ret, 0); + + efa_unit_test_buff_destruct(&send_buff); 
+} + +void test_efa_msg_fi_recvmsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + void *desc; + struct iovec iov; + struct fi_msg msg = {0}; + + test_efa_msg_recv_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + efa_unit_test_construct_msg(&msg, &iov, 1, addr, NULL, 0, &desc); + + ret = fi_recvmsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +static void test_efa_msg_send_prep(struct efa_resource *resource, + fi_addr_t *addr) +{ + struct ibv_qp_ex *ibv_qpx; + struct efa_ep_addr raw_addr; + struct efa_base_ep *base_ep; + size_t raw_addr_len = sizeof(raw_addr); + int ret; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + resource->ep->msg = &efa_msg_ops; + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qpx = base_ep->qp->ibv_qp_ex; + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, + NULL /* context */); + assert_int_equal(ret, 1); + + ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; + /* this mock will save the send work request (wr) in a global list */ + ibv_qpx->wr_send = &efa_mock_ibv_wr_send_save_wr; + ibv_qpx->wr_send_imm = &efa_mock_ibv_wr_send_imm_save_wr; + ibv_qpx->wr_set_inline_data_list = &efa_mock_ibv_wr_set_inline_data_list_no_op; + ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; + ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; + ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; +} + +void test_efa_msg_fi_send(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff 
send_buff; + fi_addr_t addr; + void *desc; + int ret; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(send_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_send(resource->ep, send_buff.buff, send_buff.size, desc, addr, + NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_sendv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + struct iovec iov; + void *desc; + int ret; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_sendv(resource->ep, &iov, &desc, 1, addr, NULL); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_sendmsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + struct iovec iov; + void *desc; + int ret; + struct fi_msg msg = {0}; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + desc = fi_mr_desc(send_buff.mr); + + efa_unit_test_construct_msg(&msg, &iov, 1, addr, NULL, 0, &desc); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_sendmsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_senddata(struct efa_resource **state) +{ + struct efa_resource *resource 
= *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + void *desc; + int ret; + uint64_t data = 0x1234567890ABCDEF; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(send_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_senddata(resource->ep, send_buff.buff, send_buff.size, desc, + data, addr, NULL); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_inject(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 32); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_inject(resource->ep, send_buff.buff, send_buff.size, addr); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +void test_efa_msg_fi_injectdata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + fi_addr_t addr; + int ret; + uint64_t data = 0x1234567890ABCDEF; + + test_efa_msg_send_prep(resource, &addr); + efa_unit_test_buff_construct(&send_buff, resource, 32); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_injectdata(resource->ep, send_buff.buff, send_buff.size, data, + addr); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&send_buff); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 2ada3f5d820..1bdcda622a2 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -208,6 +208,15 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_peer_keep_pke_in_overflow_list, efa_unit_test_mocks_setup, 
efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_append_overflow_pke_to_recvwin, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_pke_handle_longcts_rtm_send_completion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_recvv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_recvmsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_send, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_sendv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_sendmsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_senddata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_msg_fi_injectdata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 96958b0255f..1a75a5cbfcf 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -222,6 +222,15 @@ void test_efa_rdm_peer_move_overflow_pke_to_recvwin(); void test_efa_rdm_peer_keep_pke_in_overflow_list(); void test_efa_rdm_peer_append_overflow_pke_to_recvwin(); void test_efa_rdm_pke_handle_longcts_rtm_send_completion(); +void test_efa_msg_fi_recv(); +void test_efa_msg_fi_recvv(); +void test_efa_msg_fi_recvmsg(); +void test_efa_msg_fi_send(); +void test_efa_msg_fi_sendv(); +void 
test_efa_msg_fi_sendmsg(); +void test_efa_msg_fi_senddata(); +void test_efa_msg_fi_inject(); +void test_efa_msg_fi_injectdata(); static inline int efa_unit_test_get_dlist_length(struct dlist_entry *head) From f17ff491701bff08ef185b4d901cc6433a514a3b Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 12 Dec 2024 00:52:47 +0000 Subject: [PATCH 312/393] prov/efa: Add tracepoints for efa_msg and efa_rma This patch adjust various tp providers and events: 1. efa_tracepoint_wr_id_post_* should be renamed as efa_rdm_* because it interpretes the wr id as efa_rdm_ope which is rdm specific. 2. Removed the "ope" field in post_wr_id event class, because it is not used by any analysis today and prevent it from being used by the efa_msg/rma interface which doesn't have ope. 2. Add send/recv/read/write_begin_msg_context events for efa tp provider. Renamed the MSG_ARGS and MSG_FIELDS macros in efa_rdm_tracepoint.h to have RDM prefix. 3. Make efa_rdm_pke.c use efa_rdm_tracepoint_wr_id to generate doorbell ringing events. For efa_msg.c and efa_rma.c, use efa_tracepoint directly. 
Signed-off-by: Shi Jin --- prov/efa/src/efa_msg.c | 10 +++++-- prov/efa/src/efa_rma.c | 8 ++++++ prov/efa/src/efa_tp.h | 16 +++++------ prov/efa/src/efa_tp_def.h | 35 +++++++++++++++++++++-- prov/efa/src/rdm/efa_rdm_pke.c | 10 +++---- prov/efa/src/rdm/efa_rdm_tracepoint_def.h | 16 +++++------ 6 files changed, 69 insertions(+), 26 deletions(-) diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index bbef0eb0569..7920afbf531 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -69,6 +69,8 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi ssize_t err, post_recv_err; size_t i, wr_index = base_ep->recv_wr_index; + efa_tracepoint(recv_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + if (wr_index >= base_ep->info->rx_attr->size) { EFA_INFO(FI_LOG_EP_DATA, "recv_wr_index exceeds the rx limit, " @@ -119,6 +121,8 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi if (flags & FI_MORE) return 0; + efa_tracepoint(post_recv, wr->wr_id, (uintptr_t)msg->context); + err = ibv_post_recv(qp->ibv_qp, &base_ep->efa_recv_wr_vec[0].wr, &bad_wr); if (OFI_UNLIKELY(err)) { /* On failure, ibv_post_recv() return positive errno. 
@@ -187,6 +191,8 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi size_t len, i; int ret = 0; + efa_tracepoint(send_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + dump_msg(msg, "send"); conn = efa_av_addr_to_conn(base_ep->av, msg->addr); @@ -248,9 +254,7 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, conn->ep_addr->qkey); -#if HAVE_LTTNG - efa_tracepoint_wr_id_post_send((void *)msg->context); -#endif + efa_tracepoint(post_send, qp->ibv_qp_ex->wr_id, (uintptr_t)msg->context); if (!(flags & FI_MORE)) { ret = ibv_wr_complete(qp->ibv_qp_ex); diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index 468ea2e1f76..a7bad7d3877 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -73,6 +73,8 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, #endif int i, err = 0; + efa_tracepoint(read_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + assert(msg->iov_count > 0 && msg->iov_count <= base_ep->domain->info->tx_attr->iov_limit); assert(msg->rma_iov_count > 0 && @@ -105,6 +107,8 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + efa_tracepoint(post_read, qp->ibv_qp_ex->wr_id, (uintptr_t)msg->context); + if (!(flags & FI_MORE)) { err = ibv_wr_complete(qp->ibv_qp_ex); base_ep->is_wr_started = false; @@ -205,6 +209,8 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, size_t len; int i, err = 0; + efa_tracepoint(write_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + qp = base_ep->qp; if (!base_ep->is_wr_started) { ibv_wr_start(qp->ibv_qp_ex); @@ -243,6 +249,8 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah, conn->ep_addr->qpn, 
conn->ep_addr->qkey); + efa_tracepoint(post_write, qp->ibv_qp_ex->wr_id, (uintptr_t)msg->context); + if (!(flags & FI_MORE)) { err = ibv_wr_complete(qp->ibv_qp_ex); base_ep->is_wr_started = false; diff --git a/prov/efa/src/efa_tp.h b/prov/efa/src/efa_tp.h index dd2f32f79fb..ce9151a8619 100644 --- a/prov/efa/src/efa_tp.h +++ b/prov/efa/src/efa_tp.h @@ -25,40 +25,40 @@ /* tracelog() is similar to tracef(), but with a log level param */ #define efa_tracelog lttng_ust_tracelog -static inline void efa_tracepoint_wr_id_post_send(const void *wr_id) +static inline void efa_rdm_tracepoint_wr_id_post_send(const void *wr_id) { struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; struct efa_rdm_ope *ope = pkt_entry->ope; if (!ope) return; - efa_tracepoint(post_send, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); + efa_tracepoint(post_send, (size_t) wr_id, (size_t) ope->cq_entry.op_context); } -static inline void efa_tracepoint_wr_id_post_recv(const void *wr_id) +static inline void efa_rdm_tracepoint_wr_id_post_recv(const void *wr_id) { struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; struct efa_rdm_ope *ope = pkt_entry->ope; if (!ope) return; - efa_tracepoint(post_recv, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); + efa_tracepoint(post_recv, (size_t) wr_id, (size_t) ope->cq_entry.op_context); } -static inline void efa_tracepoint_wr_id_post_read(const void *wr_id) +static inline void efa_rdm_tracepoint_wr_id_post_read(const void *wr_id) { struct efa_rdm_pke *pkt_entry = (struct efa_rdm_pke *) wr_id; struct efa_rdm_ope *ope = pkt_entry->ope; if (!ope) return; - efa_tracepoint(post_read, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); + efa_tracepoint(post_read, (size_t) wr_id, (size_t) ope->cq_entry.op_context); } -static inline void efa_tracepoint_wr_id_post_write(const void *wr_id) +static inline void efa_rdm_tracepoint_wr_id_post_write(const void *wr_id) { struct efa_rdm_pke *pkt_entry = (struct 
efa_rdm_pke *) wr_id; struct efa_rdm_ope *ope = pkt_entry->ope; if (!ope) return; - efa_tracepoint(post_write, (size_t) wr_id, (size_t) ope, (size_t) ope->cq_entry.op_context); + efa_tracepoint(post_write, (size_t) wr_id, (size_t) ope->cq_entry.op_context); } #else diff --git a/prov/efa/src/efa_tp_def.h b/prov/efa/src/efa_tp_def.h index 46617d2d2a7..d05dec67f27 100644 --- a/prov/efa/src/efa_tp_def.h +++ b/prov/efa/src/efa_tp_def.h @@ -18,14 +18,45 @@ #define X_PKT_ARGS \ size_t, wr_id, \ - size_t, efa_rdm_ope, \ size_t, context #define X_PKT_FIELDS \ lttng_ust_field_integer_hex(size_t, wr_id, wr_id) \ - lttng_ust_field_integer_hex(size_t, efa_rdm_ope, efa_rdm_ope) \ lttng_ust_field_integer_hex(size_t, context, context) +#define MSG_ARGS \ + size_t, msg_ctx, \ + size_t, addr + +#define MSG_FIELDS \ + lttng_ust_field_integer_hex(size_t, msg_ctx, msg_ctx) \ + lttng_ust_field_integer_hex(size_t, addr, addr) + +LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_TP_PROV, msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS), + LTTNG_UST_TP_FIELDS(MSG_FIELDS)) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + send_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, send_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + recv_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, recv_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + read_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, read_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, msg_context, EFA_TP_PROV, + write_begin_msg_context, + LTTNG_UST_TP_ARGS(MSG_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, write_begin_msg_context, 
LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + + LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_TP_PROV, post_wr_id, LTTNG_UST_TP_ARGS(X_PKT_ARGS), LTTNG_UST_TP_FIELDS(X_PKT_FIELDS)) diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 6b97eccda1c..06e7e2abd7a 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -439,7 +439,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, #endif #if HAVE_LTTNG - efa_tracepoint_wr_id_post_send((void *)pkt_entry); + efa_rdm_tracepoint_wr_id_post_send((void *)pkt_entry); #endif } @@ -510,7 +510,7 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, } #if HAVE_LTTNG - efa_tracepoint_wr_id_post_read((void *)pkt_entry); + efa_rdm_tracepoint_wr_id_post_read((void *)pkt_entry); #endif err = ibv_wr_complete(qp->ibv_qp_ex); @@ -602,7 +602,7 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) } #if HAVE_LTTNG - efa_tracepoint_wr_id_post_write((void *)pkt_entry); + efa_rdm_tracepoint_wr_id_post_write((void *)pkt_entry); #endif if (!(txe->fi_flags & FI_MORE)) { @@ -650,7 +650,7 @@ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, if (i > 0) ep->base_ep.efa_recv_wr_vec[i-1].wr.next = &recv_wr->wr; #if HAVE_LTTNG - efa_tracepoint_wr_id_post_recv(pke_vec[i]); + efa_rdm_tracepoint_wr_id_post_recv(pke_vec[i]); #endif } @@ -699,7 +699,7 @@ ssize_t efa_rdm_pke_user_recvv(struct efa_rdm_pke **pke_vec, if (wr_index > 0) ep->base_ep.user_recv_wr_vec[wr_index - 1].wr.next = &recv_wr->wr; #if HAVE_LTTNG - efa_tracepoint_wr_id_post_recv(pke_vec[i]); + efa_rdm_tracepoint_wr_id_post_recv(pke_vec[i]); #endif wr_index++; } diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h index b814e957372..24e2edec270 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h +++ b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h @@ -65,36 +65,36 @@ LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, x_entry, EFA_RDM_TP_PROV, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS)) 
LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, runtread_read_posted, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) -#define MSG_ARGS \ +#define RDM_MSG_ARGS \ size_t, msg_ctx, \ size_t, addr -#define MSG_FIELDS \ +#define RDM_MSG_FIELDS \ lttng_ust_field_integer_hex(size_t, msg_ctx, msg_ctx) \ lttng_ust_field_integer_hex(size_t, addr, addr) LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_RDM_TP_PROV, msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS), - LTTNG_UST_TP_FIELDS(MSG_FIELDS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS), + LTTNG_UST_TP_FIELDS(RDM_MSG_FIELDS)) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, send_begin_msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, send_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, recv_begin_msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, recv_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, read_begin_msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, read_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_RDM_TP_PROV, msg_context, EFA_RDM_TP_PROV, write_begin_msg_context, - LTTNG_UST_TP_ARGS(MSG_ARGS)) + LTTNG_UST_TP_ARGS(RDM_MSG_ARGS)) LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_RDM_TP_PROV, write_begin_msg_context, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) #define CQ_ENTRY_ARGS \ From 2d4ac0e646555c9496ae72b1c6689f54a1b5159e Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Fri, 13 Dec 2024 10:01:24 -0800 Subject: [PATCH 313/393] configure: Bump the version to 2.1.0a1 Signed-off-by: Jianxin Xiong --- configure.ac | 2 +- fabtests/configure.ac | 2 +- include/windows/config.h | 2 +- 3 files 
changed, 3 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index c024d2d222b..2c5f194e6f9 100644 --- a/configure.ac +++ b/configure.ac @@ -9,7 +9,7 @@ dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ([2.60]) -AC_INIT([libfabric], [2.0.0rc1], [ofiwg@lists.openfabrics.org]) +AC_INIT([libfabric], [2.1.0a1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_SRCDIR([src/fabric.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) diff --git a/fabtests/configure.ac b/fabtests/configure.ac index 9c864c1dd10..29f816b4993 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -5,7 +5,7 @@ dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.57) -AC_INIT([fabtests], [2.0.0rc1], [ofiwg@lists.openfabrics.org]) +AC_INIT([fabtests], [2.1.0a1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) AC_CONFIG_HEADERS(config.h) diff --git a/include/windows/config.h b/include/windows/config.h index 3fbef1b09a4..b3676930873 100644 --- a/include/windows/config.h +++ b/include/windows/config.h @@ -256,7 +256,7 @@ #define PACKAGE_TARNAME PACKAGE /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0.0rc1" +#define PACKAGE_VERSION "2.1.0a1" /* Define to the full name and version of this package. 
*/ #define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION From ebca5ecc8a85991ff8f4d29bbd2cade382995055 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 13 Dec 2024 15:57:51 -0800 Subject: [PATCH 314/393] prov/efa: Add unit tests for efa_rma Signed-off-by: Jessie Yang --- prov/efa/Makefile.include | 3 +- prov/efa/test/efa_unit_test_common.c | 16 ++ prov/efa/test/efa_unit_test_mocks.c | 15 ++ prov/efa/test/efa_unit_test_mocks.h | 7 + prov/efa/test/efa_unit_test_rma.c | 273 +++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 9 + prov/efa/test/efa_unit_tests.h | 15 ++ 7 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 prov/efa/test/efa_unit_test_rma.c diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 6be0ad452f2..980f3430644 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -149,7 +149,8 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_mr.c \ prov/efa/test/efa_unit_test_rdm_peer.c \ prov/efa/test/efa_unit_test_pke.c \ - prov/efa/test/efa_unit_test_msg.c + prov/efa/test/efa_unit_test_msg.c \ + prov/efa/test/efa_unit_test_rma.c efa_CPPFLAGS += -I$(top_srcdir)/include -I$(top_srcdir)/prov/efa/test $(cmocka_CPPFLAGS) diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index 772bd0608c9..47cae69f20b 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -35,6 +35,22 @@ void efa_unit_test_construct_tmsg(struct fi_msg_tagged *tmsg, struct iovec *iov, tmsg->ignore = ignore; } +void efa_unit_test_construct_msg_rma(struct fi_msg_rma *msg, struct iovec *iov, + void **desc, size_t iov_count, + fi_addr_t addr, struct fi_rma_iov *rma_iov, + size_t rma_iov_count, void *context, + uint64_t data) +{ + msg->msg_iov = iov; + msg->desc = desc; + msg->iov_count = iov_count; + msg->addr = addr; + msg->rma_iov = rma_iov; + msg->rma_iov_count = rma_iov_count; + msg->context = context; + msg->data = data; +} 
+ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) { struct fi_info *hints; diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index 20dd538602c..d05ded33e0f 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -219,6 +219,21 @@ int efa_mock_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, return mock(); } +void efa_mock_ibv_wr_rdma_read_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr) +{ + g_ibv_submitted_wr_id_vec[g_ibv_submitted_wr_id_cnt] = (void *)qp->wr_id; + g_ibv_submitted_wr_id_cnt++; +} + +void efa_mock_ibv_wr_rdma_write_imm_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, + __be32 imm_data) +{ + g_ibv_submitted_wr_id_vec[g_ibv_submitted_wr_id_cnt] = (void *) qp->wr_id; + g_ibv_submitted_wr_id_cnt++; +} + struct efa_unit_test_mocks g_efa_unit_test_mocks = { .local_host_id = 0, .peer_host_id = 0, diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index 7143869a2fa..3c256a24075 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -98,6 +98,13 @@ bool efa_mock_efa_device_support_unsolicited_write_recv(void); int efa_mock_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); +void efa_mock_ibv_wr_rdma_read_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); + +void efa_mock_ibv_wr_rdma_write_imm_save_wr(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, + __be32 imm_data); + struct efa_unit_test_mocks { uint64_t local_host_id; diff --git a/prov/efa/test/efa_unit_test_rma.c b/prov/efa/test/efa_unit_test_rma.c new file mode 100644 index 00000000000..40be70ec219 --- /dev/null +++ b/prov/efa/test/efa_unit_test_rma.c @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. 
All + * rights reserved. */ + +#include "efa_unit_tests.h" +#include "ofi_util.h" + +extern struct fi_ops_rma efa_rma_ops; + +static void test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) +{ + struct ibv_qp_ex *ibv_qpx; + struct efa_ep_addr raw_addr; + struct efa_base_ep *base_ep; + size_t raw_addr_len = sizeof(raw_addr); + int ret; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + resource->ep->rma = &efa_rma_ops; + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qpx = base_ep->qp->ibv_qp_ex; + ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; + /* this mock will save the send work request (wr) in a global list */ + ibv_qpx->wr_rdma_read = &efa_mock_ibv_wr_rdma_read_save_wr; + ibv_qpx->wr_rdma_write = &efa_mock_ibv_wr_rdma_write_save_wr; + ibv_qpx->wr_rdma_write_imm = &efa_mock_ibv_wr_rdma_write_imm_save_wr; + ibv_qpx->wr_set_inline_data_list = + &efa_mock_ibv_wr_set_inline_data_list_no_op; + ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; + ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; + ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; + + ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, + NULL /* context */); + assert_int_equal(ret, 1); +} + +void test_efa_rma_read(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t src_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &src_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_read(resource->ep, local_buff.buff, local_buff.size, desc, + src_addr, remote_addr, remote_key, NULL 
/* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_readv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + fi_addr_t src_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &src_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_readv(resource->ep, &iov, &desc, 1, src_addr, remote_addr, + remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_readmsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t src_addr; + void *desc; + int ret; + + test_efa_rma_prep(resource, &src_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, src_addr, + &rma_iov, 1, NULL, 0); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_readmsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_write(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + void 
*desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_write(resource->ep, local_buff.buff, local_buff.size, desc, + dest_addr, remote_addr, remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writev(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + fi_addr_t dest_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writev(resource->ep, &iov, &desc, 1, dest_addr, remote_addr, + remote_key, NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writemsg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t dest_addr; + void *desc; + int ret; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, dest_addr, 
&rma_iov, + 1, NULL, 0); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writemsg(resource->ep, &msg, 0); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writedata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + void *desc; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + desc = fi_mr_desc(local_buff.mr); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writedata(resource->ep, local_buff.buff, local_buff.size, desc, + 0, dest_addr, remote_addr, remote_key, + NULL /* context */); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_inject_write(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_inject_write(resource->ep, local_buff.buff, local_buff.size, + dest_addr, remote_addr, remote_key); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_inject_writedata(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + fi_addr_t dest_addr; + int ret; + uint64_t remote_addr = 0x87654321; + uint64_t remote_key = 123456; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, 
resource, 32 /* buff_size */); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_inject_writedata(resource->ep, local_buff.buff, + local_buff.size, 0, dest_addr, remote_addr, + remote_key); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 1bdcda622a2..8330e650f5f 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -217,6 +217,15 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_msg_fi_senddata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_msg_fi_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_msg_fi_injectdata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_read, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_readv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_readmsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writev, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writemsg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_inject_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_inject_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git 
a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 1a75a5cbfcf..246dd563a42 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -60,6 +60,12 @@ void efa_unit_test_construct_tmsg(struct fi_msg_tagged *tmsg, struct iovec *iov, void **desc, uint64_t tag, uint64_t ignore); +void efa_unit_test_construct_msg_rma(struct fi_msg_rma *msg, struct iovec *iov, + void **desc, size_t iov_count, + fi_addr_t addr, struct fi_rma_iov *rma_iov, + size_t rma_iov_count, void *context, + uint64_t data); + void new_temp_file(char *template, size_t len); struct efa_unit_test_buff { @@ -231,6 +237,15 @@ void test_efa_msg_fi_sendmsg(); void test_efa_msg_fi_senddata(); void test_efa_msg_fi_inject(); void test_efa_msg_fi_injectdata(); +void test_efa_rma_read(); +void test_efa_rma_readv(); +void test_efa_rma_readmsg(); +void test_efa_rma_write(); +void test_efa_rma_writev(); +void test_efa_rma_writemsg(); +void test_efa_rma_writedata(); +void test_efa_rma_inject_write(); +void test_efa_rma_inject_writedata(); static inline int efa_unit_test_get_dlist_length(struct dlist_entry *head) From 17d9cf2c5a168bc22b6e98d2220cd7f802861233 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 17 Dec 2024 01:35:47 +0000 Subject: [PATCH 315/393] util/pingpong: close mr after ep close pingpong doesn't support FI_MR_ENDPOINT today, so the mr is associated with domain instead of ep. It is unsafe to close mr before closing ep because it can cause an EBUSY error when there are outstanding recvs of the mr posted to the ep/qp. This patch fixes this issue by moving the mr close after the ep close. 
Signed-off-by: Shi Jin --- util/pingpong.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/pingpong.c b/util/pingpong.c index 68a554c8752..9597d305121 100644 --- a/util/pingpong.c +++ b/util/pingpong.c @@ -1875,12 +1875,12 @@ static void pp_free_res(struct ct_pingpong *ct) { PP_DEBUG("Freeing resources of test suite\n"); - if (ct->mr != &(ct->no_mr)) - PP_CLOSE_FID(ct->mr); PP_CLOSE_FID(ct->ep); PP_CLOSE_FID(ct->pep); PP_CLOSE_FID(ct->rxcq); PP_CLOSE_FID(ct->txcq); + if (ct->mr != &(ct->no_mr)) + PP_CLOSE_FID(ct->mr); PP_CLOSE_FID(ct->av); PP_CLOSE_FID(ct->eq); PP_CLOSE_FID(ct->domain); From 2c200a4c712b7e92aa5b33c183e33a3c7a8d0661 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 12 Dec 2024 11:39:37 -0800 Subject: [PATCH 316/393] contrib/intel/jenkins: Update slurm partitions for new head node Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index f5ce711e2f3..4a7cce17058 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -572,7 +572,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - slurm_batch("totodile", "1", + slurm_batch("water", "1", "${env.LOG_DIR}/build_mpich_water_log", """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=water""" @@ -761,7 +761,7 @@ pipeline { } for (def mpi in MPIS) { run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "grass", "bulbasaur", "2", "${mpi}") + "grass", "bulbasaur,ivysaur", "2", "${mpi}") } } } @@ -778,7 +778,8 @@ pipeline { } for (def mpi in MPIS) { run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "water", "totodile", "2", "${mpi}") + "water", "squirtle,wartortle,articuno", "2", + "${mpi}") } } } @@ -821,13 +822,15 @@ pipeline { script { dir (RUN_LOCATION) { run_middleware([["verbs", null]], "oneCCL", - "oneccl", "water", 
"totodile", "2") + "oneccl", "water", + "squirtle,wartortle,articuno", "2") run_middleware([["shm", null]], "oneCCL", - "oneccl", "grass", "bulbasaur,chikorita", "1") + "oneccl", "grass", "bulbasaur,ivysaur", "1") run_middleware([["psm3", null]], "oneCCL", - "oneccl", "water", "totodile", "2") + "oneccl", "water", + "squirtle,wartortle,articuno", "2") run_middleware([["tcp", null]], "oneCCL", - "oneccl", "grass", "bulbasaur,chikorita", "2") + "oneccl", "grass", "bulbasaur,ivysaur", "2") run_middleware([["shm", null]], "oneCCL_DSA", "oneccl", "electric", "pikachu", "1", null, null, """CCL_ATL_SHM=1 FI_SHM_DISABLE_CMA=1 \ From 2f94ead2c5dd9f11d71980fffee148d64898f3a6 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 5 Dec 2024 09:07:56 -0800 Subject: [PATCH 317/393] contrib/intel/jenkins: Uplevel pre-build directory Uplevel pre-build directory so that it is not scp'd Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 4a7cce17058..3849c18f2cb 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -103,6 +103,14 @@ def run_ci(stage_name, config_name) { """ } +def build_ci(config_name) { + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python run.py \ + --output=${env.CUSTOM_WORKSPACE}/pre-build \ + --job=${config_name} + """ +} + def gather_logs(cluster, key, dest, source) { def address = "${env.USER}@${cluster}" @@ -441,7 +449,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_water.json") + build_ci("pr_build_water.json") } } } @@ -450,7 +458,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_grass.json") + build_ci("pr_build_grass.json") } } } @@ -459,7 +467,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", 
"pr_build_electric.json") + build_ci("pr_build_electric.json") } } } @@ -468,7 +476,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_cyndaquil.json") + build_ci("pr_build_cyndaquil.json") } } } @@ -477,7 +485,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_quilava.json") + build_ci("pr_build_quilava.json") } } } @@ -499,7 +507,7 @@ pipeline { } bootstrap_ci() dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_daos.json") + build_ci("pr_build_daos.json") } } } @@ -522,7 +530,7 @@ pipeline { } bootstrap_ci() dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_fire.json") + build_ci("pr_build_fire.json") } } } @@ -536,7 +544,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_shmem_water.json") + build_ci("pr_build_shmem_water.json") } } } @@ -545,7 +553,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_shmem_grass.json") + build_ci("pr_build_shmem_grass.json") } } } @@ -554,7 +562,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_ompi_water.json") + build_ci("pr_build_ompi_water.json") } } } @@ -563,7 +571,7 @@ pipeline { steps { script { dir (CI_LOCATION) { - run_ci("pre-build", "pr_build_ompi_grass.json") + build_ci("pr_build_ompi_grass.json") } } } From e4a7c574b6269da22cea8b1bfc707bb26fa75814 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 17 Dec 2024 11:03:21 -0800 Subject: [PATCH 318/393] contrib/intel/jenkins: Force Cleanup in Post Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 3849c18f2cb..0ac828fe704 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -995,25 +995,6 @@ pipeline { summarize("all") } } - } - aborted { - node ('daos_head') { - dir ("${DELETE_LOCATION}/middlewares") 
{ deleteDir() } - } - node ('ze') { - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - sh "scancel --jobname=\"${SLURM_JOB_NAME}\"" - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - success { - script { - if (DO_RUN) { - CI_summarize(verbose=true) - summarize("all", verbose=true, release=false, - send_mail=env.WEEKLY.toBoolean()) - } - } node ('daos_head') { dir("${env.WORKSPACE}") { deleteDir() } dir("${env.WORKSPACE}@tmp") { deleteDir() } From 8c33b6f5ad5310c62249e4b8a16da29f0cdd74fd Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 17 Dec 2024 11:05:44 -0800 Subject: [PATCH 319/393] contrib/intel/jenkins: Cleanup trailing whitespace Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 0ac828fe704..cdc29596028 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -17,7 +17,6 @@ def run_python(version, command, output=null) { } def slurm_batch(partition, node_num, output, command) { - try { sh """sbatch --partition=${partition} -N ${node_num} \ --wait -o ${output} --open-mode=append \ @@ -113,7 +112,6 @@ def build_ci(config_name) { def gather_logs(cluster, key, dest, source) { def address = "${env.USER}@${cluster}" - try { sh "scp -r -i ${key} ${address}:${source}/* ${dest}/" } catch (Exception e) { From e5fe96ed96df94f5dca21f24662cd60bfee9af51 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 17 Dec 2024 11:09:37 -0800 Subject: [PATCH 320/393] contrib/intel/jenkins: Re-order stages to put slow ones first Put slow stages first so they start executing and other tests can complete in parallel while the slow one is running. 
Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 70 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index cdc29596028..3e176da7d06 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -605,6 +605,41 @@ pipeline { stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { + stage('mpichtestsuite-tcp') { + steps { + script { + dir (RUN_LOCATION) { + def providers = [['tcp', null]] + def MPIS = ["mpich"] + if (env.WEEKLY.toBoolean()) { + MPIS = ["impi", "mpich"] + } + for (def mpi in MPIS) { + run_middleware(providers, "mpichtestsuite", "mpichtestsuite", + "grass", "bulbasaur,ivysaur", "2", "${mpi}") + } + } + } + } + } + stage('mpichtestsuite-verbs') { + steps { + script { + dir (RUN_LOCATION) { + def providers = [["verbs","rxm"]] + def MPIS = ["mpich"] + if (env.WEEKLY.toBoolean()) { + MPIS = ["impi", "mpich"] + } + for (def mpi in MPIS) { + run_middleware(providers, "mpichtestsuite", "mpichtestsuite", + "water", "squirtle,wartortle,articuno", "2", + "${mpi}") + } + } + } + } + } stage ('CI_mpi_verbs-rxm_imb') { steps { script { @@ -756,41 +791,6 @@ pipeline { } } } - stage('mpichtestsuite-tcp') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [['tcp', null]] - def MPIS = ["mpich"] - if (env.WEEKLY.toBoolean()) { - MPIS = ["impi", "mpich"] - } - for (def mpi in MPIS) { - run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "grass", "bulbasaur,ivysaur", "2", "${mpi}") - } - } - } - } - } - stage('mpichtestsuite-verbs') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["verbs","rxm"]] - def MPIS = ["mpich"] - if (env.WEEKLY.toBoolean()) { - MPIS = ["impi", "mpich"] - } - for (def mpi in MPIS) { - run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "water", "squirtle,wartortle,articuno", "2", - "${mpi}") - } - } - } - } - 
} stage('SHMEM_grass') { steps { script { From 9b7f27c93303d1db2d49cc3e73b60771ebeac769 Mon Sep 17 00:00:00 2001 From: Nikhil Nanal Date: Tue, 17 Dec 2024 11:19:12 -0800 Subject: [PATCH 321/393] fabtests: corrected flags argumet type in ft_sendmsg/ft_recvmsg functions fixed fabtests send and recv functions to use flags argument type as uint64_t instead of int as the underlying fi calls use uint64_t. removed declaration of unused function ft_writemsg from shared.h Also fixed functions calling ft_sendmsg and ft_recvmsg touse uint64_t for flags Signed-off-by: Nikhil Nanal --- fabtests/benchmarks/benchmark_shared.c | 8 +++++--- fabtests/common/shared.c | 4 ++-- fabtests/include/shared.h | 6 ++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index ad0d6dca035..7a4ee95ae89 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -425,7 +425,7 @@ static int rma_bw_rx_comp() return ft_tx(ep, remote_fi_addr, FT_RMA_SYNC_MSG_BYTES, &tx_ctx); } -static int set_fi_more_flag(int i, int j, int flags) +static uint64_t set_fi_more_flag(int i, int j, uint64_t flags) { if (j < opts.window_size - 1 && i >= opts.warmup_iterations && i < opts.iterations + opts.warmup_iterations - 1) { @@ -438,7 +438,8 @@ static int set_fi_more_flag(int i, int j, int flags) int bandwidth(void) { - int ret, i, j, flags = 0; + int ret, i, j; + uint64_t flags = 0; size_t inject_size = fi->tx_attr->inject_size; ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, @@ -579,7 +580,8 @@ static int bw_rma_comp(enum ft_rma_opcodes rma_op, int num_completions) int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) { - int ret, i, j, flags = 0; + int ret, i, j; + uint64_t flags = 0; size_t offset, inject_size = fi->tx_attr->inject_size; ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, diff --git a/fabtests/common/shared.c 
b/fabtests/common/shared.c index f485afb484d..4d88d2d5f3c 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -2923,7 +2923,7 @@ int ft_tx_msg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, void } int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, - void *buf, size_t size, void *ctx, int flags) + void *buf, size_t size, void *ctx, uint64_t flags) { struct fi_msg msg; struct fi_msg_tagged tagged_msg; @@ -2963,7 +2963,7 @@ int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, - size_t size, void *ctx, int flags) + size_t size, void *ctx, uint64_t flags) { struct fi_msg msg; struct fi_msg_tagged tagged_msg; diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 679241b9a2d..e4dc4b3c643 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -630,11 +630,9 @@ int ft_get_cq_comp(struct fid_cq *cq, uint64_t *cur, uint64_t total, int timeout int ft_get_cntr_comp(struct fid_cntr *cntr, uint64_t total, int timeout); int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, - void *buf, size_t size, void *ctx, int flags); + void *buf, size_t size, void *ctx, uint64_t flags); int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, - void *buf, size_t size, void *ctx, int flags); -int ft_writemsg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, - void *ctx, struct fi_rma_iov *remote, int flags); + void *buf, size_t size, void *ctx, uint64_t flags); int ft_tx_msg(struct fid_ep *ep, fi_addr_t fi_addr, void *buf, size_t size, void *ctx, uint64_t flags); int ft_cq_read_verify(struct fid_cq *cq, void *op_context); From 482e47479eca9afe5fb0390db8e0e8d7e02363c7 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 17 Dec 2024 15:48:30 -0800 Subject: [PATCH 322/393] contrib/intel/jenkins: Do not run pipeline for unauthorized users Lookup a all teams and users in the ofiwg github team. 
If the submitter is not in the list of users then deny them Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 3e176da7d06..7c956d50a79 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -397,6 +397,23 @@ pipeline { } } } + stage ('bootstrap-ci') { + steps { + script { + bootstrap_ci() + } + } + } + stage('check-authorization') { + steps { + script { + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python ${CI_LOCATION}/authorize.py \ + --author=${env.CHANGE_AUTHOR} + """ + } + } + } stage ('opt-out') { steps { script { @@ -433,13 +450,6 @@ pipeline { } } } - stage ('bootstrap-ci') { - steps { - script { - bootstrap_ci() - } - } - } stage ('build-libfabric') { when { equals expected: true, actual: DO_RUN } parallel { From fde856998fcdaf2c679bfa5d35e39766e1529805 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 19 Dec 2024 06:51:31 -0800 Subject: [PATCH 323/393] prov/lpp: add check for atomics lpp includes stdatomic.h but does not include a check for it in the configure so can cause a build to fail on a system without it Signed-off-by: Alexia Ingerson --- prov/lpp/configure.m4 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/prov/lpp/configure.m4 b/prov/lpp/configure.m4 index 7c447a16cfa..54f864454f7 100644 --- a/prov/lpp/configure.m4 +++ b/prov/lpp/configure.m4 @@ -11,8 +11,11 @@ AC_DEFUN([FI_LPP_CONFIGURE],[ lpp_happy=0 have_lpp_thread_safe=1 + AC_CHECK_DECL([HAVE_ATOMICS], [atomics_happy=1], [atomics_happy=0]) + AS_IF([test "x$macos" = "x1"],[lpp_happy=0], [test x$host_cpu != xx86_64],[lpp_happy=0], + [test x$atomics_happy == "x0"],[lpp_happy=0], [test x"$enable_lpp" != x"no"],[ lpp_happy=1 AC_SUBST(lpp_INCLUDES) From 442fa8991317a004343c7504f7b40c9d87844c58 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: 
Fri, 20 Dec 2024 10:24:28 -0800 Subject: [PATCH 324/393] prov/shm: fix name compare bug Could result in a peer getting incorrectly unmmaped Signed-off-by: Alexia Ingerson --- prov/shm/src/smr_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/shm/src/smr_util.c b/prov/shm/src/smr_util.c index 0c5de80e2a0..0372c7e0597 100644 --- a/prov/shm/src/smr_util.c +++ b/prov/shm/src/smr_util.c @@ -599,7 +599,7 @@ void smr_map_del(struct smr_map *map, int64_t id) assert(id >= 0 && id < SMR_MAX_PEERS); pthread_mutex_lock(&ep_list_lock); dlist_foreach_container(&ep_name_list, struct smr_ep_name, name, entry) { - if (strcmp(name->name, map->peers[id].peer.name)) { + if (!strcmp(name->name, map->peers[id].peer.name)) { local = true; break; } From f893f5f88eb5cf0f4cf4e0154edc91e0b6f8b7bf Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Thu, 26 Dec 2024 21:25:58 +0000 Subject: [PATCH 325/393] fabtests: Bugfixes for neuron This commit fixes the following bugs in neuron fabtests 1. The neuron accelerator detection is broken on some OSs because the full path of the executable `neuron-ls` was not used 2. Before this commit, each pytest worker was assigned a single neuron core. This works on multi node tests but fails on single node tests because a neuron core can only be opened by a single process. This commit assigns two different neuron cores to each pytest worker for client-server tests: one for the server and one for the client. Trn1 has 2 cores per neuron device and Trn2 has 8 cores per neuron device, so this assignment works for both. 3. When running in serial mode, the env var PYTEST_XDIST_WORKER is not set, so the NEURON_RT_VISIBLE_CORES env var is also not set. This causes the server to occupy all neuron cores and the client fails. So this commit assigns device 0 to the server and client when running with one worker. 
Signed-off-by: Sai Sunku --- fabtests/pytest/common.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/fabtests/pytest/common.py b/fabtests/pytest/common.py index a6f50fcc9f4..ef3ad8b22da 100644 --- a/fabtests/pytest/common.py +++ b/fabtests/pytest/common.py @@ -68,7 +68,7 @@ def num_cuda_devices(ip): @functools.lru_cache(10) @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def num_neuron_devices(ip): - proc = run("ssh {} neuron-ls -j".format(ip), shell=True, + proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, encoding="utf-8") @@ -84,7 +84,7 @@ def num_neuron_devices(ip): @functools.lru_cache(10) @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def num_neuron_cores_on_device(ip, device_id): - proc = run("ssh {} neuron-ls -j".format(ip), shell=True, + proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, encoding="utf-8") @@ -97,7 +97,7 @@ def num_neuron_cores_on_device(ip, device_id): @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def is_neuron_device_available(ip, device_id): - proc = run("ssh {} neuron-ls -j".format(ip), shell=True, + proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, encoding="utf-8") @@ -455,19 +455,26 @@ def prepare_base_command(self, command_type, executable, if "PYTEST_XDIST_WORKER" in os.environ: worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", "")) hmem_device_id = worker_id % num_hmem - if host_memory_type == "cuda": - command += " -i {}".format(hmem_device_id) - else: - assert host_memory_type == "neuron" - num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id) + else: + 
hmem_device_id = 0 + + if host_memory_type == "cuda": + command += " -i {}".format(hmem_device_id) + else: + assert host_memory_type == "neuron" + num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id) + if command_type == "server": additional_environment = "NEURON_RT_VISIBLE_CORES={}".format( hmem_device_id * num_cores) - wait_until_neuron_device_available(host_ip, hmem_device_id) + else: + additional_environment = "NEURON_RT_VISIBLE_CORES={}".format( + hmem_device_id * num_cores + 1) + wait_until_neuron_device_available(host_ip, hmem_device_id) - if self._cmdline_args.provider == "efa": - import efa.efa_common - efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem) - command += " -d {}-rdm".format(efa_device) + if self._cmdline_args.provider == "efa": + import efa.efa_common + efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem) + command += " -d {}-rdm".format(efa_device) return command, additional_environment From c3f9e2134bfc8524ac80a1306835bbfdc8b4dcb3 Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Mon, 16 Dec 2024 11:13:35 -0500 Subject: [PATCH 326/393] prov/efa: Decouple AV entry from endpoint Before this change, the EFA AV entry contained a reference to efa_rdm_peer which is specific to a given endpoint. This member also prevented binding a single AV to multiple endpoints. This change removes efa_rdm_peer from AV entry by adding a hashmap to the endpoint that maps fi_addr to efa_rdm_peer. And it also enables multiple EFA endpoints to bind to the same AV. 
Co-authored-by: Shi Jin Signed-off-by: Sai Sunku --- fabtests/pytest/efa/test_multi_ep.py | 9 +++++- prov/efa/src/efa_av.c | 42 +++++++++++----------------- prov/efa/src/efa_av.h | 2 +- prov/efa/src/efa_base_ep.c | 10 ------- prov/efa/src/efa_errno.h | 3 +- prov/efa/src/rdm/efa_rdm_ep.h | 7 +++++ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 17 +++++++++++ prov/efa/src/rdm/efa_rdm_ep_utils.c | 14 +++++++++- prov/efa/src/rdm/efa_rdm_peer.c | 40 ++++++++++++++++++++++++++ prov/efa/src/rdm/efa_rdm_peer.h | 18 ++++++++++++ prov/efa/test/efa_unit_test_av.c | 36 ++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 3 ++ 13 files changed, 163 insertions(+), 39 deletions(-) diff --git a/fabtests/pytest/efa/test_multi_ep.py b/fabtests/pytest/efa/test_multi_ep.py index 634529f0067..bf34a5cca28 100644 --- a/fabtests/pytest/efa/test_multi_ep.py +++ b/fabtests/pytest/efa/test_multi_ep.py @@ -2,10 +2,17 @@ @pytest.mark.functional @pytest.mark.parametrize("shared_cq", [True, False]) -def test_multi_ep(cmdline_args, shared_cq): +def test_multi_ep_cq(cmdline_args, shared_cq): from common import ClientServerTest cmd = "fi_multi_ep -e rdm" if shared_cq: cmd += " -Q" test = ClientServerTest(cmdline_args, cmd) test.run() + +@pytest.mark.functional +def test_multi_ep_av(cmdline_args): + from common import ClientServerTest + cmd = "fi_multi_ep -e rdm -A" + test = ClientServerTest(cmdline_args, cmd) + test.run() diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 0b692ed21a8..5ee81de7ebd 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -243,9 +243,6 @@ void efa_ah_release(struct efa_av *av, struct efa_ah *ah) } } -static -void efa_conn_release(struct efa_av *av, struct efa_conn *conn); - /** * @brief initialize the rdm related resources of an efa_conn object * @@ -266,18 +263,11 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn, bool insert_shm_ int err, ret; char smr_name[EFA_SHM_NAME_MAX]; size_t 
smr_name_len; - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; assert(av->ep_type == FI_EP_RDM); assert(conn->ep_addr); - /* currently multiple EP bind to same av is not supported */ - assert(!dlist_empty(&av->util_av.ep_list)); - efa_rdm_ep = container_of(av->util_av.ep_list.next, struct efa_rdm_ep, base_ep.util_ep.av_entry); - - peer = &conn->rdm_peer; - efa_rdm_peer_construct(peer, efa_rdm_ep, conn); + conn->shm_fi_addr = FI_ADDR_NOTAVAIL; /* * The efa_conn_rdm_init() call can be made in two situations: @@ -315,8 +305,8 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn, bool insert_shm_ * av. The efa provider should still use peer->shm_fiaddr for transmissions * through shm ep. */ - peer->shm_fiaddr = conn->fi_addr; - ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &peer->shm_fiaddr, FI_AV_USER_ID, NULL); + conn->shm_fi_addr = conn->fi_addr; + ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &conn->shm_fi_addr, FI_AV_USER_ID, NULL); if (OFI_UNLIKELY(ret != 1)) { EFA_WARN(FI_LOG_AV, "Failed to insert address to shm provider's av: %s\n", @@ -326,11 +316,10 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn, bool insert_shm_ EFA_INFO(FI_LOG_AV, "Successfully inserted %s to shm provider's av. 
efa_fiaddr: %ld shm_fiaddr = %ld\n", - smr_name, conn->fi_addr, peer->shm_fiaddr); + smr_name, conn->fi_addr, conn->shm_fi_addr); - assert(peer->shm_fiaddr < efa_env.shm_av_size); + assert(conn->shm_fi_addr < efa_env.shm_av_size); av->shm_used++; - peer->is_local = 1; } return 0; @@ -350,26 +339,29 @@ void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn) int err; struct efa_rdm_peer *peer; struct efa_rdm_ep *ep; + struct dlist_entry *entry, *tmp; assert(av->ep_type == FI_EP_RDM); peer = &conn->rdm_peer; - if (peer->is_local && av->shm_rdm_av) { - err = fi_av_remove(av->shm_rdm_av, &peer->shm_fiaddr, 1, 0); + if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) { + err = fi_av_remove(av->shm_rdm_av, &conn->shm_fi_addr, 1, 0); if (err) { EFA_WARN(FI_LOG_AV, "remove address from shm av failed! err=%d\n", err); } else { av->shm_used--; - assert(peer->shm_fiaddr < efa_env.shm_av_size); + assert(conn->shm_fi_addr < efa_env.shm_av_size); } } - /* - * We need peer->shm_fiaddr to remove shm address from shm av table, - * so efa_rdm_peer_clear must be after removing shm av table. - */ - ep = dlist_empty(&av->util_av.ep_list) ? 
NULL : container_of(av->util_av.ep_list.next, struct efa_rdm_ep, base_ep.util_ep.av_entry); - efa_rdm_peer_destruct(peer, ep); + dlist_foreach_safe(&av->util_av.ep_list, entry, tmp) { + ep = container_of(entry, struct efa_rdm_ep, base_ep.util_ep.av_entry); + peer = efa_rdm_peer_map_lookup(&ep->fi_addr_to_peer_map, conn->fi_addr); + if (peer) { + efa_rdm_peer_destruct(peer, ep); + efa_rdm_peer_map_remove(&ep->fi_addr_to_peer_map, conn->fi_addr, peer); + } + } } /* diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 75acd87fdd7..b1624398be0 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -27,6 +27,7 @@ struct efa_conn { fi_addr_t fi_addr; fi_addr_t util_av_fi_addr; struct efa_rdm_peer rdm_peer; + fi_addr_t shm_fi_addr; }; struct efa_av_entry { @@ -60,7 +61,6 @@ struct efa_prv_reverse_av { struct efa_av { struct fid_av *shm_rdm_av; struct efa_domain *domain; - struct efa_base_ep *base_ep; size_t used; size_t shm_used; enum fi_av_type type; diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 7e7b6b4a910..5db06721ad9 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -8,15 +8,6 @@ int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) { - /* - * Binding multiple endpoints to a single AV is currently not - * supported. 
- */ - if (av->base_ep) { - EFA_WARN(FI_LOG_EP_CTRL, - "Address vector already has endpoint bound to it.\n"); - return -FI_ENOSYS; - } if (base_ep->domain != av->domain) { EFA_WARN(FI_LOG_EP_CTRL, "Address vector doesn't belong to same domain as EP.\n"); @@ -29,7 +20,6 @@ int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) } base_ep->av = av; - base_ep->av->base_ep = base_ep; return 0; } diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 029c35d4a07..5d3769d32d6 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -107,7 +107,8 @@ _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \ _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \ - _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) + _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) \ + _(4127, PEER_MAP_ENTRY_POOL_EXHAUSTED, Peer map entry pool exhausted) /** @} */ diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index b82741963ef..aecb391ec55 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -41,6 +41,10 @@ struct efa_rdm_ep_queued_copy { #define EFA_RDM_EP_MAX_WR_PER_IBV_POST_SEND (4096) #define EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV (8192) +struct efa_rdm_peer_map { + struct efa_rdm_peer_map_entry *head; +}; + struct efa_rdm_ep { struct efa_base_ep base_ep; @@ -187,6 +191,9 @@ struct efa_rdm_ep { struct dlist_entry entry; /* the count of opes queued before handshake is made with their peers */ size_t ope_queued_before_handshake_cnt; + + struct ofi_bufpool *peer_map_entry_pool; /* bufpool to hold fi_addr->efa_rdm_peer key-value pairs */ + struct efa_rdm_peer_map fi_addr_to_peer_map; /* Hashmap to find efa_rdm_peer given fi_addr */ 
}; int efa_rdm_ep_flush_queued_blocking_copy_to_hmem(struct efa_rdm_ep *ep); diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 47b3f53f9bd..7f75caf8e92 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -307,7 +307,18 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) if (ret) goto err_free; + ret = ofi_bufpool_create(&ep->peer_map_entry_pool, + sizeof(struct efa_rdm_peer_map_entry), + EFA_RDM_BUFPOOL_ALIGNMENT, + 0, /* no limit to max cnt */ + /* Don't track usage, because endpoint can be closed without removing entries from AV */ + EFA_MIN_AV_SIZE, OFI_BUFPOOL_NO_TRACK); + if (ret) + goto err_free; + efa_rdm_rxe_map_construct(&ep->rxe_map); + efa_rdm_peer_map_construct(&ep->fi_addr_to_peer_map); + return 0; err_free: @@ -341,6 +352,9 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) if (ep->efa_tx_pkt_pool) ofi_bufpool_destroy(ep->efa_tx_pkt_pool); + if (ep->peer_map_entry_pool) + ofi_bufpool_destroy(ep->peer_map_entry_pool); + return ret; } @@ -828,6 +842,9 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) if (efa_rdm_ep->rx_atomrsp_pool) ofi_bufpool_destroy(efa_rdm_ep->rx_atomrsp_pool); + + if (efa_rdm_ep->peer_map_entry_pool) + ofi_bufpool_destroy(efa_rdm_ep->peer_map_entry_pool); } /* diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 12c3c519983..6fc841f2600 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -96,14 +96,26 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr) { struct util_av_entry *util_av_entry; struct efa_av_entry *av_entry; + struct efa_rdm_peer *peer; if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL)) return NULL; + peer = efa_rdm_peer_map_lookup(&ep->fi_addr_to_peer_map, addr); + if (peer) + return peer; + util_av_entry = ofi_bufpool_get_ibuf(ep->base_ep.util_ep.av->av_entry_pool, addr); av_entry = 
(struct efa_av_entry *)util_av_entry->data; - return av_entry->conn.ep_addr ? &av_entry->conn.rdm_peer : NULL; + + if (av_entry->conn.ep_addr) { + peer = efa_rdm_peer_map_insert(&ep->fi_addr_to_peer_map, addr, ep); + efa_rdm_peer_construct(peer, ep, &av_entry->conn); + return peer; + } + + return NULL; } /** diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 3e8e3dff774..7c82c943835 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -31,6 +31,11 @@ void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, st dlist_init(&peer->txe_list); dlist_init(&peer->rxe_list); dlist_init(&peer->overflow_pke_list); + + if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL) { + peer->shm_fiaddr = conn->shm_fi_addr; + peer->is_local = 1; + } } /** @@ -111,6 +116,41 @@ void efa_rdm_peer_destruct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep) #endif } +struct efa_rdm_peer *efa_rdm_peer_map_insert(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_ep *ep) { + struct efa_rdm_peer_map_entry *map_entry; + struct efa_rdm_peer *peer; + + map_entry = ofi_buf_alloc(ep->peer_map_entry_pool); + if (OFI_UNLIKELY(!map_entry)) { + EFA_WARN(FI_LOG_CQ, + "Map entries for EFA AV to peer mapping exhausted.\n"); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_PEER_MAP_ENTRY_POOL_EXHAUSTED); + return NULL; + } + + map_entry->key = addr; + peer = &map_entry->efa_rdm_peer; + + HASH_ADD(hh, peer_map->head, key, sizeof(addr), map_entry); + + return peer; +} + +struct efa_rdm_peer *efa_rdm_peer_map_lookup(struct efa_rdm_peer_map *peer_map, fi_addr_t addr) { + struct efa_rdm_peer_map_entry *map_entry; + + HASH_FIND(hh, peer_map->head, &addr, sizeof(addr), map_entry); + return map_entry ? 
&map_entry->efa_rdm_peer : NULL; +} + +void efa_rdm_peer_map_remove(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_peer *peer) { + struct efa_rdm_peer_map_entry *map_entry; + + HASH_FIND(hh, peer_map->head, &addr, sizeof(addr), map_entry); + HASH_DEL(peer_map->head, map_entry); + ofi_buf_free(map_entry); +} + /** * @brief run incoming packet_entry through reorder buffer * queue the packet entry if msg_id is larger than expected. diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index 8c2703fc140..cc4deefa0ae 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -75,6 +75,12 @@ struct efa_rdm_peer { struct efa_rdm_peer_user_recv_qp user_recv_qp; }; +struct efa_rdm_peer_map_entry { + uint64_t key; + struct efa_rdm_peer efa_rdm_peer; + UT_hash_handle hh; +}; + /** * @brief check for peer's RDMA_READ support, assuming HANDSHAKE has already occurred * @@ -269,6 +275,12 @@ bool efa_both_support_zero_hdr_data_transfer(struct efa_rdm_ep *ep, struct efa_r (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP)); } +static inline +void efa_rdm_peer_map_construct(struct efa_rdm_peer_map *peer_map) +{ + peer_map->head = NULL; +} + struct efa_conn; void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn); @@ -287,4 +299,10 @@ size_t efa_rdm_peer_get_runt_size(struct efa_rdm_peer *peer, struct efa_rdm_ep * int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_rdm_ope *ope); +struct efa_rdm_peer *efa_rdm_peer_map_insert(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_ep *ep); + +struct efa_rdm_peer *efa_rdm_peer_map_lookup(struct efa_rdm_peer_map *peer_map, fi_addr_t addr); + +void efa_rdm_peer_map_remove(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_peer *peer); + #endif /* EFA_RDM_PEER_H */ diff --git a/prov/efa/test/efa_unit_test_av.c 
b/prov/efa/test/efa_unit_test_av.c index 9ca730d0b6e..6e11ee5c177 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -74,3 +74,39 @@ void test_av_insert_duplicate_gid(struct efa_resource **state) assert_int_equal(num_addr, 1); assert_int_not_equal(addr1, addr2); } + +/** + * @brief This test verifies that multiple endpoints can bind to the same AV + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_multiple_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fid_ep *ep2, *ep3; + int ret; + + /* Resource construct function creates and binds 1 EP to the AV */ + efa_unit_test_resource_construct(resource, FI_EP_RDM); + + /* Create and bind two new endpoints to the same AV */ + fi_endpoint(resource->domain, resource->info, &ep2, NULL); + ret = fi_ep_bind(ep2, &resource->av->fid, 0); + assert_int_equal(ret, 0); + + fi_endpoint(resource->domain, resource->info, &ep3, NULL); + ret = fi_ep_bind(ep3, &resource->av->fid, 0); + assert_int_equal(ret, 0); + + /* Bind the two new endpoints to the same CQ and enable them */ + fi_ep_bind(ep2, &resource->cq->fid, FI_SEND | FI_RECV); + ret = fi_enable(ep2); + assert_int_equal(ret, 0); + + fi_ep_bind(ep3, &resource->cq->fid, FI_SEND | FI_RECV); + ret = fi_enable(ep3); + assert_int_equal(ret, 0); + + fi_close(&ep2->fid); + fi_close(&ep3->fid); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 8330e650f5f..8d90a988bb9 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -80,6 +80,7 @@ int main(void) const struct CMUnitTest efa_unit_tests[] = { cmocka_unit_test_setup_teardown(test_av_insert_duplicate_raw_addr, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_av_insert_duplicate_gid, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_av_multiple_ep, 
efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_device_construct_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_ignore_missing_host_id_file, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_has_valid_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 246dd563a42..b3a6fcbedee 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -40,6 +40,8 @@ void efa_unit_test_resource_construct_ep_not_enabled( struct efa_resource *resource, enum fi_ep_type ep_type); void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled( struct efa_resource *resource, enum fi_ep_type ep_type); +void efa_unit_test_resource_construct_no_av_no_cq_and_ep_not_enabled( + struct efa_resource *resource, enum fi_ep_type ep_type); void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, uint32_t fi_version, struct fi_info *hints, @@ -100,6 +102,7 @@ void efa_unit_test_handshake_pkt_construct(struct efa_rdm_pke *pkt_entry, struct /* test cases */ void test_av_insert_duplicate_raw_addr(); void test_av_insert_duplicate_gid(); +void test_av_multiple_ep(); void test_efa_device_construct_error_handling(); void test_efa_rdm_ep_ignore_missing_host_id_file(); void test_efa_rdm_ep_has_valid_host_id(); From a3eb8e277d9b22edb8d7519e48de3f52e98ce11a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:28:40 +0000 Subject: [PATCH 327/393] build(deps): bump github/codeql-action from 3.27.6 to 3.27.9 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.27.6 to 3.27.9. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/aa578102511db1f4524ed59b8cc2bae4f6e88195...df409f7d9260372bd5f19e5b04e83cb3c43714ae) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index a42caa6a4d6..771ad835f61 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 + uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 + uses: github/codeql-action/autobuild@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 # â„šī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 + uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 409f3fb9bcb..e5925ff9bd4 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@aa578102511db1f4524ed59b8cc2bae4f6e88195 # v3.27.6 + uses: github/codeql-action/upload-sarif@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: sarif_file: results.sarif From 7ae9698cfbcb2d7e4c0a69c364c87ccee0ecf936 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 30 Dec 2024 20:32:53 +0000 Subject: [PATCH 328/393] man/fi_setup: Complete partial sentence Signed-off-by: Seth Zegelstein --- man/fi_setup.7.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/man/fi_setup.7.md b/man/fi_setup.7.md index a5afa99b33c..75f60a05642 100644 --- a/man/fi_setup.7.md +++ b/man/fi_setup.7.md @@ -135,11 +135,11 @@ requested, a provider must support a capability if it is asked for or fail the fi_getinfo request. A provider may optionally report non-requested secondary capabilities if doing so would not compromise performance or security. That is, a provider may grant an application a secondary capability, -whether the application. 
The most commonly accessed secondary capability bits -indicate if provider communication is restricted to the local node Ifor example, -the shared memory provider only supports local communication) and/or remote -nodes (which can be the case for NICs that lack loopback support). Other -secondary capability bits mostly deal with features targeting highly-scalable +regardless of whether the application requested it. The most commonly accessed +secondary capability bits indicate if provider communication is restricted to the +local node (for example, the shared memory provider only supports local communication) +and/or remote nodes (which can be the case for NICs that lack loopback support). +Other secondary capability bits mostly deal with features targeting highly-scalable applications, but may not be commonly supported across multiple providers. Because different providers support different sets of capabilities, applications From 90f3ba917312a83fb82e513d3af6aa76710fff23 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Tue, 31 Dec 2024 16:09:49 +0000 Subject: [PATCH 329/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_setup.7 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/man/man7/fi_setup.7 b/man/man7/fi_setup.7 index 752ace93874..1c7e42dfd44 100644 --- a/man/man7/fi_setup.7 +++ b/man/man7/fi_setup.7 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_setup" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_setup" "7" "2024\-12\-31" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -173,9 +173,9 @@ fail the fi_getinfo request. A provider may optionally report non-requested secondary capabilities if doing so would not compromise performance or security. That is, a provider may grant an application a secondary capability, -whether the application. +regardless of whether the application requested it. 
The most commonly accessed secondary capability bits indicate if -provider communication is restricted to the local node Ifor example, the +provider communication is restricted to the local node (for example, the shared memory provider only supports local communication) and/or remote nodes (which can be the case for NICs that lack loopback support). Other secondary capability bits mostly deal with features targeting From 6e4daf164090e59ed808794b0d739cf1c39fd4ac Mon Sep 17 00:00:00 2001 From: Soumendu Satapathy Date: Tue, 17 Dec 2024 12:41:26 -0600 Subject: [PATCH 330/393] prov/cxi: Remove CXI_MAP_IOVA_ALLOC flag. Remove all CXI_MAP_IOVA_ALLOC references from libfabric. Signed-off-by: Soumendu Satapathy --- prov/cxi/include/cxip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index b8e42ca0a69..0a73fc90582 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -1076,7 +1076,7 @@ struct cxip_eq { }; #define CXIP_EQ_MAP_FLAGS \ - (CXI_MAP_WRITE | CXI_MAP_PIN | CXI_MAP_IOVA_ALLOC) + (CXI_MAP_WRITE | CXI_MAP_PIN) /* * RMA request From ed5560a27db86a6aa625b7362150fbe3b086c0cd Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 20 Dec 2024 00:31:08 +0000 Subject: [PATCH 331/393] prov/efa: Detect unsolicited write recv support status on both sides Currently, when local support unsolicited write recv while the peer doesn't support it, the peer will crash because it expects to get a valid wr_id for IBV_WC_RECV_RDMA_WITH_IMM op code. This peer crash can cause weird error message on sender side's cq when it is still sending data to it. When local doesn't support unsolicited write recv while the peer support it, local will get cq error for the rdma op as "Unexpected status" as well. This patch makes the initiator of rdma write imm detect the unsolicited write recv support status on both sides. 
If there is inconsistency, the initiator will return error with clear error messages that instruct the mitigation. Signed-off-by: Shi Jin --- prov/efa/docs/efa_rdm_protocol_v4.md | 18 ++++++ prov/efa/src/rdm/efa_rdm_ep.h | 6 ++ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 3 + prov/efa/src/rdm/efa_rdm_peer.h | 17 +++++ prov/efa/src/rdm/efa_rdm_protocol.h | 3 +- prov/efa/src/rdm/efa_rdm_rma.c | 18 ++++++ prov/efa/src/rdm/efa_rdm_util.c | 91 +++++++++++++++++---------- prov/efa/src/rdm/efa_rdm_util.h | 2 + prov/efa/test/efa_unit_test_ep.c | 94 ++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 2 + prov/efa/test/efa_unit_tests.h | 2 + 11 files changed, 221 insertions(+), 35 deletions(-) diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 9016ec00958..1877156779b 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -68,6 +68,12 @@ Chapter 4 "extra features/requests" describes the extra features/requests define * Section 4.6 describe the extra feature: RDMA-Write based message transfer. + * Section 4.7 describe the extra feature: Long read and runting read nack protocol. + + * Section 4.8 describe the extra feature: User receive QP. + + * Section 4.9 describe the extra feature: Unsolicited write recv. + Chapter 5 "What's not covered?" describes the contents that are intentionally left out of this document because they are considered "implementation details". @@ -323,6 +329,7 @@ Table: 2.1 a list of extra features/requests | 5 | RDMA-Write based data transfer | extra feature | libfabric 1.18.0 | Section 4.6 | | 6 | Read nack packets | extra feature | libfabric 1.20.0 | Section 4.7 | | 7 | User recv QP | extra feature & request| libfabric 1.22.0 | Section 4.8 | +| 8 | Unsolicited write recv | extra feature | libfabric 1.22.0 | Section 4.9 | How does protocol v4 maintain backward compatibility when extra features/requests are introduced? @@ -1611,6 +1618,17 @@ zero-copy receive mode. 
If a receiver gets RTM packets delivered to its default QP, it raises an error because it requests all RTM packets must be delivered to its user recv QP. +### 4.9 Unsolicited write recv + +The "Unsolicited write recv" is an extra feature that was +introduced with the libfabric 1.22.0. When this feature is on, rdma-write +with immediate data will not consume an rx buffer on the responder side. It is +defined as an extra feature because there is a set of requirements (firmware, +EFA kernel module and rdma-core) to be met before an endpoint can use the unsolicited +write recv capability, therefore an endpoint cannot assume the other party supports +unsolicited write recv. The rdma-write with immediate data cannot be issued if there +is a discrepancy on this feature between local and peer. + ## 5. What's not covered? The purpose of this document is to define the communication protocol. Therefore, it is intentionally written diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index aecb391ec55..9b198026d1b 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -455,4 +455,10 @@ static inline int efa_rdm_attempt_to_sync_memops_ioc(struct efa_rdm_ep *ep, stru return err; } +static inline +bool efa_rdm_ep_support_unsolicited_write_recv(struct efa_rdm_ep *ep) +{ + return ep->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; +} + #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 7f75caf8e92..2574e493cbb 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1071,6 +1071,9 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE; + if (efa_rdm_use_unsolicited_write_recv()) + ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + if (ep->use_zcpy_rx) { /* * When zcpy rx is enabled, an extra QP is created to diff --git a/prov/efa/src/rdm/efa_rdm_peer.h 
b/prov/efa/src/rdm/efa_rdm_peer.h index cc4deefa0ae..21585051921 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -115,6 +115,23 @@ bool efa_rdm_peer_support_rdma_write(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_RDMA_WRITE); } +/** + * @brief check for peer's unsolicited write support, assuming HANDSHAKE has already occurred + * + * @param[in] peer A peer which we have already received a HANDSHAKE from + * @return bool The peer's unsolicited write recv support + */ +static inline +bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer) +{ + /* Unsolicited write recv is an extra feature defined in version 4 (the base version). + * Because it is an extra feature, an EP will assume the peer does not support + * it before a handshake packet was received. + */ + return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) && + (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV); +} + static inline bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer) { diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index 1b94b5338d1..8840ce5f401 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -40,7 +40,8 @@ struct efa_ep_addr { #define EFA_RDM_EXTRA_FEATURE_RDMA_WRITE BIT_ULL(5) #define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6) #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7) -#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8 +#define EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV BIT_ULL(8) +#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 9 /* * The length of 64-bit extra_info array used in efa_rdm_ep * and efa_rdm_peer diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 36b2d5171da..cfb399ef055 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -370,6 +370,24 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, 
struct efa_rdm_ope *txe) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); if (efa_rdm_rma_should_write_using_rdma(ep, txe, txe->peer)) { + /** + * Unsolicited write recv is a feature that makes rdma-write with + * imm not consume an rx buffer on the responder side, and this + * feature requires consistent support status on both sides. + */ + if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && + (efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + EFA_WARN(FI_LOG_EP_DATA, + "Inconsistent support status detected on unsolicited write recv.\n" + "My support status: %d, peer support status: %d. %s.\n" + "This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n" + "Please use consistent software versions on both hosts, or disable the unsolicited write " + "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", + efa_rdm_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), + ep->err_msg); + return -FI_EOPNOTSUPP; + } efa_rdm_ope_prepare_to_post_write(txe); return efa_rdm_ope_post_remote_write(txe); } diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 02880c09dfd..0175b3884a9 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -97,6 +97,53 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) } } +/** + * @brief Construct a message that contains the local and peer information, + * including the efa address and the host id. + * + * @param ep EFA RDM endpoint + * @param addr Remote peer fi_addr_t + * @param msg the ptr of the msg to be constructed (needs to be allocated already!) 
+ * @param base_msg ptr to the base msg that will show at the beginning of msg + * @param msg_len the length of the message + * @return int 0 on success, negative integer on failure + */ +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + size_t len = 0; + int ret; + struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); + + len = sizeof(ep_addr_str); + efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + + if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { + strcpy(peer_host_id_str, "N/A"); + } + + ret = snprintf(msg, msg_len, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + + if (ret < 0 || ret > msg_len - 1) { + return -FI_EINVAL; + } + + if (strlen(msg) >= msg_len) { + return -FI_ENOBUFS; + } + + return FI_SUCCESS; +} + /** * @brief Write the error message and return its byte length * @param[in] ep EFA RDM endpoint @@ -108,42 +155,18 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) */ int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) { - char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; - char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - char 
local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - const char *base_msg = efa_strerror(prov_errno); - size_t len = 0; - struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); - - *buf = NULL; - *buflen = 0; - - len = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); - len = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); - - if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { - strcpy(local_host_id_str, "N/A"); - } - - if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { - strcpy(peer_host_id_str, "N/A"); - } - - int ret = snprintf(ep->err_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", - base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + const char *base_msg = efa_strerror(prov_errno); + int ret; - if (ret < 0 || ret > EFA_RDM_ERROR_MSG_BUFFER_LENGTH - 1) { - return -FI_EINVAL; - } + *buf = NULL; + *buflen = 0; - if (strlen(ep->err_msg) >= EFA_RDM_ERROR_MSG_BUFFER_LENGTH) { - return -FI_ENOBUFS; - } + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + if (ret) + return ret; - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buf = ep->err_msg; + *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; - return 0; + return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index a2ba0083295..b79bafb4e85 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -19,6 +19,8 @@ bool efa_rdm_get_use_device_rdma(uint32_t fabric_api_version); void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, 
fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); + int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); #ifdef ENABLE_EFA_POISONING diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 1ac044ce00c..3adc8a136f9 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -659,6 +659,79 @@ void test_efa_rdm_ep_read_queue_before_handshake(struct efa_resource **state) test_efa_rdm_ep_rma_queue_before_handshake(state, ofi_op_read_req); } +/** + * @brief When local support unsolicited write, but the peer doesn't, fi_writedata + * (use rdma-write with imm) should fail as FI_EINVAL + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t peer_addr; + int num_addr; + const int buf_len = 8; + char buf[8] = {0}; + int err; + uint64_t rma_addr, rma_key; + struct efa_rdm_peer *peer; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA; + resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 22), + resource->hints, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /** + * TODO: It's better to mock this function + * so we can test on platform that doesn't + * support rdma-write. 
+ */ + if (!(efa_rdm_ep_support_rdma_write(efa_rdm_ep))) + skip(); + + /* Make local ep support unsolicited write recv */ + efa_rdm_ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + + /* create a fake peer */ + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); + assert_int_equal(num_addr, 1); + + /* create a fake rma_key and address. fi_read should return before + * they are needed. */ + rma_key = 0x1234; + rma_addr = (uint64_t) &buf; + + /* + * Fake a peer that has made handshake and + * does not support unsolicited write recv + */ + peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + peer->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_RDMA_WRITE; + peer->extra_info[0] &= ~EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + /* make sure shm is not used */ + peer->is_local = false; + + err = fi_writedata(resource->ep, buf, buf_len, + NULL, /* desc, not required */ + 0x1234, + peer_addr, + rma_addr, + rma_key, + NULL); /* context */ + assert_int_equal(err, -FI_EOPNOTSUPP); +} + /** * @brief verify that when shm was used to send a small message (<4k), no copy was performed. 
* @@ -1299,3 +1372,24 @@ void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource { test_efa_rdm_ep_rx_refill_impl(state, 128, 64); } + +/** + * @brief when unsolicited write recv is supported (by device + env), + * efa_rdm_ep_support_unsolicited_write_recv + * should return true, otherwise it should return false + * + * @param[in] state struct efa_resource that is managed by the framework + * @param[in] is_supported support status + */ +void test_efa_rdm_ep_support_unsolicited_write_recv(struct efa_resource **state) +{ + struct efa_rdm_ep *efa_rdm_ep; + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_int_equal(efa_rdm_use_unsolicited_write_recv(), + efa_rdm_ep_support_unsolicited_write_recv(efa_rdm_ep)); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 8d90a988bb9..017f4e65ded 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -117,6 +117,8 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_support_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), 
cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index b3a6fcbedee..4a796e5385f 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -139,6 +139,8 @@ void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); +void test_efa_rdm_ep_support_unsolicited_write_recv(); +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll(); From 1e3cca62ff490fd44e3eecb7b61a1349164e8aa6 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 31 Dec 2024 10:51:38 -0800 Subject: [PATCH 332/393] prov/efa: Move fork handler installation to efa_domain_open efa_fork_support_enable_if_requested was moved to EFA_INI, so efa_fork_support_install_fork_handler can be registered at any stage that is later. Move efa_fork_support_install_fork_handler back to efa_domain_open to avoid installing fork handler for non-EFA provider during fi_getinfo's provider discovery process. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_domain.c | 10 ++++++++++ prov/efa/src/efa_user_info.c | 18 ------------------ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index e6cab857af3..6af35775ae0 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -290,6 +290,16 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, efa_domain->util_domain.domain_fid.ops = &efa_ops_domain_dgram; } +#ifndef _WIN32 + err = efa_fork_support_install_fork_handler(); + if (err) { + EFA_WARN(FI_LOG_CORE, + "Unable to install fork handler: %s\n", + strerror(-err)); + return err; + } +#endif + dlist_insert_tail(&efa_domain->list_entry, &g_efa_domain_list); return 0; diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 129c038ee21..e152f2adc23 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -610,24 +610,6 @@ int efa_getinfo(uint32_t version, const char *node, struct fi_info *dgram_info_list, *rdm_info_list; int err; -#ifndef _WIN32 - /* - * TODO: - * It'd be better to install this during provider init (since that's - * only invoked once) but fork() is currently called by nvml_init in - * other provider's ini (which calls ofi_hmem_init) after efa provider init. - * This can move to the provider init after we get rid of that fork() in - * ofi_hmem_init(). - */ - err = efa_fork_support_install_fork_handler(); - if (err) { - EFA_WARN(FI_LOG_CORE, - "Unable to install fork handler: %s\n", - strerror(-err)); - return err; - } -#endif - if (hints && hints->ep_attr && hints->ep_attr->type == FI_EP_DGRAM) return efa_user_info_get_dgram(version, node, service, flags, hints, info); From 007b747b29dab1a5fcf23831850058481b7ae5d0 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 16 Dec 2024 21:08:49 -0600 Subject: [PATCH 333/393] prov/cxi: Ignore drop count during init When MRs and EPs should ignore drop count during init. 
Failing to do this may result in enable hangs. Signed-off-by: Ian Ziemba --- prov/cxi/src/cxip_ctrl.c | 2 +- prov/cxi/src/cxip_mr.c | 5 +++-- prov/cxi/src/cxip_msg_hpc.c | 3 ++- prov/cxi/src/cxip_msg_rnr.c | 2 +- prov/cxi/src/cxip_rdzv_pte.c | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c index e54572fd53e..bb543b6409a 100644 --- a/prov/cxi/src/cxip_ctrl.c +++ b/prov/cxi/src/cxip_ctrl.c @@ -694,7 +694,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } ret = cxip_pte_set_state(ep_obj->ctrl.pte, ep_obj->ctrl.tgq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret) { /* This is a bug, we have exclusive access to this CMDQ. */ CXIP_WARN("Failed to enqueue command: %d\n", ret); diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index a1d1c5e2829..7fbd6e6187a 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -342,7 +342,8 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) goto err_pte_free; } - ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, 0); + ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, + CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { /* This is a bug, we have exclusive access to this CMDQ. */ CXIP_WARN("Failed to enqueue command: %d\n", ret); @@ -532,7 +533,7 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) } ret = cxip_pte_set_state(_mr->pte, ep_obj->ctrl.tgq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { /* This is a bug, we have exclusive access to this CMDQ. */ CXIP_WARN("Failed to enqueue command: %d\n", ret); diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c index 4980a3fd3b0..faf2b52b9fc 100644 --- a/prov/cxi/src/cxip_msg_hpc.c +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -3930,7 +3930,8 @@ static int cxip_rxc_hpc_msg_init(struct cxip_rxc *rxc_base) } /* Start accepting Puts. 
*/ - ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, state, 0); + ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, state, + CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); goto free_oflow_buf; diff --git a/prov/cxi/src/cxip_msg_rnr.c b/prov/cxi/src/cxip_msg_rnr.c index 7b4415ea1e8..434968ecd92 100644 --- a/prov/cxi/src/cxip_msg_rnr.c +++ b/prov/cxi/src/cxip_msg_rnr.c @@ -382,7 +382,7 @@ static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) /* Start accepting Puts. */ ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); goto free_pte; diff --git a/prov/cxi/src/cxip_rdzv_pte.c b/prov/cxi/src/cxip_rdzv_pte.c index d99bda07f5c..ab2af82230f 100644 --- a/prov/cxi/src/cxip_rdzv_pte.c +++ b/prov/cxi/src/cxip_rdzv_pte.c @@ -265,7 +265,7 @@ static int cxip_rdzv_base_pte_alloc(struct cxip_txc_hpc *txc, /* Set to enable, event will be processed on link */ ret = cxip_pte_set_state(base_pte->pte, txc->rx_cmdq, - C_PTLTE_ENABLED, 0); + C_PTLTE_ENABLED, CXIP_PTE_IGNORE_DROPS); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to enqueue enable command: %d:%s\n", ret, fi_strerror(-ret)); From 4285d1070f0b5f927dcf4d467960c7e66e2655b0 Mon Sep 17 00:00:00 2001 From: John Heemstra Date: Mon, 6 Jan 2025 11:03:06 -0500 Subject: [PATCH 334/393] prov/cxi: disable retry logic for experimental collectives Signed-off-by: John Heemstra --- prov/cxi/src/cxip_coll.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index ae804e3779f..2bff682369f 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -3,7 +3,7 @@ * * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. 
- * Copyright (c) 2020-2024 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020-2025 Hewlett Packard Enterprise Development LP * Support for accelerated collective reductions. */ @@ -2021,7 +2021,9 @@ bool _is_red_timed_out(struct cxip_coll_reduction *reduction) reduction->red_id); return true; } - return _tsexp(&reduction->tv_expires); + + /* disable timeout logic for now */ + return false; } /* Root node state machine progress. From 9171a4e82e93050ef4de16dfd59318cecc1ef1f0 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 2 Jan 2025 18:32:07 +0000 Subject: [PATCH 335/393] prov/efa: Add missing locks in efa_msg and efa_rma efa_post_send, efa_post_write, efa_post_read accesses base_ep->is_wr_started bool which needs to be protected by a lock, otherwise there will be a race condition when multiple threads to call them. Same issue with efa_post_recv which accesses the recv_wr_index This patch adds the required locking to protect these resources. This lock is a no-op unless FI_THREAD_SAFE. 
Signed-off-by: Shi Jin --- prov/efa/src/efa_msg.c | 21 +++++++++++++++------ prov/efa/src/efa_rma.c | 17 ++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index 7920afbf531..fbd4adb2bd9 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -67,10 +67,12 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi struct ibv_recv_wr *wr; uintptr_t addr; ssize_t err, post_recv_err; - size_t i, wr_index = base_ep->recv_wr_index; + size_t i, wr_index; efa_tracepoint(recv_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); + ofi_genlock_lock(&base_ep->util_ep.lock); + wr_index = base_ep->recv_wr_index; if (wr_index >= base_ep->info->rx_attr->size) { EFA_INFO(FI_LOG_EP_DATA, "recv_wr_index exceeds the rx limit, " @@ -118,8 +120,10 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi base_ep->recv_wr_index++; - if (flags & FI_MORE) - return 0; + if (flags & FI_MORE) { + err = 0; + goto out; + } efa_tracepoint(post_recv, wr->wr_id, (uintptr_t)msg->context); @@ -134,6 +138,9 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi base_ep->recv_wr_index = 0; +out: + ofi_genlock_unlock(&base_ep->util_ep.lock); + return err; out_err: @@ -148,6 +155,8 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi base_ep->recv_wr_index = 0; + ofi_genlock_unlock(&base_ep->util_ep.lock); + return err; } @@ -209,6 +218,7 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi assert(len <= base_ep->info->ep_attr->max_msg_size); + ofi_genlock_lock(&base_ep->util_ep.lock); if (!base_ep->is_wr_started) { ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; @@ -260,10 +270,9 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi ret = ibv_wr_complete(qp->ibv_qp_ex); base_ep->is_wr_started = false; } - if 
(OFI_UNLIKELY(ret)) - return ret; - return 0; + ofi_genlock_unlock(&base_ep->util_ep.lock); + return ret; } static ssize_t efa_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index a7bad7d3877..052e2aa89d7 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -83,6 +83,9 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, base_ep->domain->device->max_rdma_size); qp = base_ep->qp; + + ofi_genlock_lock(&base_ep->util_ep.lock); + if (!base_ep->is_wr_started) { ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; @@ -113,10 +116,9 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, err = ibv_wr_complete(qp->ibv_qp_ex); base_ep->is_wr_started = false; } - if (OFI_UNLIKELY(err)) - return err; - return 0; + ofi_genlock_unlock(&base_ep->util_ep.lock); + return err; } static @@ -212,6 +214,9 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, efa_tracepoint(write_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); qp = base_ep->qp; + + ofi_genlock_lock(&base_ep->util_ep.lock); + if (!base_ep->is_wr_started) { ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; @@ -256,10 +261,8 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, base_ep->is_wr_started = false; } - if (OFI_UNLIKELY(err)) - return err; - - return 0; + ofi_genlock_unlock(&base_ep->util_ep.lock); + return err; } ssize_t efa_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, From b9b0c324c9989b7a65dd56072902de0ed75c0677 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 31 Dec 2024 09:41:48 -0800 Subject: [PATCH 336/393] prov/efa: Correct the error code for IBV_WC_RECV_RDMA_WITH_IMM Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 
294bef21dec..9437c18b253 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -139,7 +139,7 @@ void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( EFA_WARN(FI_LOG_CQ, "Unable to write a cq entry for remote for RECV_RDMA operation: %s\n", fi_strerror(-ret)); - efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_SHM_CQ_ENTRY); + efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_RECV_COMP); } efa_cntr_report_rx_completion(&ep->base_ep.util_ep, flags); From 52a023f786b2c2715bb3e1d14623130b00f4ec02 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 19 Dec 2024 15:06:19 -0800 Subject: [PATCH 337/393] prov/efa: Move efa_rdm_cq_wc_is_unsolicited to efa_cq Move efa_use_unsolicited_write_recv to efa.h so they can be used outside rdm. Signed-off-by: Jessie Yang --- prov/efa/src/efa.h | 6 ++++++ prov/efa/src/efa_base_ep.c | 2 +- prov/efa/src/efa_cq.h | 26 +++++++++++++++++++++++++- prov/efa/src/rdm/efa_rdm_cq.c | 27 ++------------------------- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 2 +- prov/efa/src/rdm/efa_rdm_rma.c | 2 +- prov/efa/src/rdm/efa_rdm_util.h | 5 ----- prov/efa/test/efa_unit_test_cq.c | 2 +- prov/efa/test/efa_unit_test_ep.c | 2 +- 9 files changed, 38 insertions(+), 36 deletions(-) diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index e8325330406..4d8e982355c 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -221,4 +221,10 @@ static inline void efa_perfset_end(struct efa_rdm_ep *ep, size_t index) #define efa_perfset_end(ep, index) do {} while (0) #endif +static inline +bool efa_use_unsolicited_write_recv() +{ + return efa_env.use_unsolicited_write_recv && efa_device_support_unsolicited_write_recv(); +} + #endif /* EFA_H */ diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 5db06721ad9..a40268cc120 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -183,7 +183,7 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, 
init_attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; } #if HAVE_CAPS_UNSOLICITED_WRITE_RECV - if (efa_rdm_use_unsolicited_write_recv()) + if (efa_use_unsolicited_write_recv()) efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; #endif efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; diff --git a/prov/efa/src/efa_cq.h b/prov/efa/src/efa_cq.h index 238e769cc93..26366d5094c 100644 --- a/prov/efa/src/efa_cq.h +++ b/prov/efa/src/efa_cq.h @@ -136,7 +136,7 @@ static inline int efa_cq_ibv_cq_ex_open(struct fi_cq_attr *attr, }; #if HAVE_CAPS_UNSOLICITED_WRITE_RECV - if (efa_rdm_use_unsolicited_write_recv()) + if (efa_use_unsolicited_write_recv()) efadv_cq_init_attr.wc_flags |= EFADV_WC_EX_WITH_IS_UNSOLICITED; #endif @@ -176,3 +176,27 @@ static inline int efa_cq_ibv_cq_ex_open(struct fi_cq_attr *attr, &init_attr_ex, ibv_ctx, ibv_cq_ex, ibv_cq_ex_type); } #endif + +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV +/** + * @brief Check whether a completion consumes recv buffer + * + * @param ibv_cq_ex extended ibv cq + * @return true the wc consumes a recv buffer + * @return false the wc doesn't consume a recv buffer + */ +static inline +bool efa_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) +{ + return efa_use_unsolicited_write_recv() && efadv_wc_is_unsolicited(efadv_cq_from_ibv_cq_ex(ibv_cq_ex)); +} + +#else + +static inline +bool efa_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) +{ + return false; +} + +#endif diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 9437c18b253..622f9b71fee 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -72,29 +72,6 @@ static struct fi_ops efa_rdm_cq_fi_ops = { }; -#if HAVE_CAPS_UNSOLICITED_WRITE_RECV -/** - * @brief Check whether a completion consumes recv buffer - * - * @param ibv_cq_ex extended ibv cq - * @return true the wc consumes a recv buffer - * @return false the wc doesn't consume a recv buffer - */ -static inline -bool 
efa_rdm_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) -{ - return efa_rdm_use_unsolicited_write_recv() && efadv_wc_is_unsolicited(efadv_cq_from_ibv_cq_ex(ibv_cq_ex)); -} - -#else - -static inline -bool efa_rdm_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) -{ - return false; -} - -#endif /** * @brief handle rdma-core CQ completion resulted from IBV_WRITE_WITH_IMM * @@ -148,7 +125,7 @@ void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( * For unsolicited wc, pkt_entry can be NULL, so we can only * access it for solicited wc. */ - if (!efa_rdm_cq_wc_is_unsolicited(ibv_cq_ex)) { + if (!efa_cq_wc_is_unsolicited(ibv_cq_ex)) { /** * Recv with immediate will consume a pkt_entry, but the pkt is not * filled, so free the pkt_entry and record we have one less posted @@ -494,7 +471,7 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) break; case IBV_WC_RECV: /* fall through */ case IBV_WC_RECV_RDMA_WITH_IMM: - if (efa_rdm_cq_wc_is_unsolicited(ibv_cq->ibv_cq_ex)) { + if (efa_cq_wc_is_unsolicited(ibv_cq->ibv_cq_ex)) { EFA_WARN(FI_LOG_CQ, "Receive error %s (%d) for unsolicited write recv", efa_strerror(prov_errno), prov_errno); efa_base_ep_write_eq_error(&ep->base_ep, to_fi_errno(prov_errno), prov_errno); diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 2574e493cbb..bd3babc0818 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1071,7 +1071,7 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE; - if (efa_rdm_use_unsolicited_write_recv()) + if (efa_use_unsolicited_write_recv()) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; if (ep->use_zcpy_rx) { diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index cfb399ef055..a10e37edabc 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -384,7 +384,7 @@ ssize_t 
efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) "This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n" "Please use consistent software versions on both hosts, or disable the unsolicited write " "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", - efa_rdm_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), + efa_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), ep->err_msg); return -FI_EOPNOTSUPP; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index b79bafb4e85..f52496b195e 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -32,10 +32,5 @@ static inline void efa_rdm_poison_mem_region(void *ptr, size_t size) } #endif -static inline -bool efa_rdm_use_unsolicited_write_recv() -{ - return efa_env.use_unsolicited_write_recv && efa_device_support_unsolicited_write_recv(); -} #endif /* _EFA_RDM_UTIL_H */ diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 75e32b39773..3d72a6460c1 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -338,7 +338,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; #if HAVE_CAPS_UNSOLICITED_WRITE_RECV - if (efa_rdm_use_unsolicited_write_recv()) { + if (efa_use_unsolicited_write_recv()) { efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; will_return(efa_mock_efadv_wc_is_unsolicited, false); } diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 3adc8a136f9..d64139b986c 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1390,6 +1390,6 @@ void test_efa_rdm_ep_support_unsolicited_write_recv(struct efa_resource **state) efa_rdm_ep = 
container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_int_equal(efa_rdm_use_unsolicited_write_recv(), + assert_int_equal(efa_use_unsolicited_write_recv(), efa_rdm_ep_support_unsolicited_write_recv(efa_rdm_ep)); } From f3f3f0ace82daf35927ba5f17f98054ee1f65a8b Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 30 Dec 2024 12:44:20 -0800 Subject: [PATCH 338/393] prov/efa: Move raw addr functions Move efa_rdm_ep_raw_addr_str, efa_rdm_ep_get_peer_raw_addr, efa_rdm_ep_get_peer_raw_addr_str to base_ep.h so they can be used by efa-raw. Signed-off-by: Jessie Yang --- prov/efa/src/efa_base_ep.c | 38 +++++++++++++++++++++++++++ prov/efa/src/efa_base_ep.h | 9 +++++++ prov/efa/src/rdm/efa_rdm_ep.h | 6 ----- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 2 +- prov/efa/src/rdm/efa_rdm_ep_utils.c | 40 ----------------------------- prov/efa/src/rdm/efa_rdm_pke_cmd.c | 8 +++--- prov/efa/src/rdm/efa_rdm_util.c | 4 +-- 7 files changed, 54 insertions(+), 53 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index a40268cc120..56bd82bd87e 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -472,3 +472,41 @@ void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t pro prov_errno, efa_strerror(prov_errno)); abort(); } + +const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, size_t *buflen) +{ + return ofi_straddr(buf, buflen, FI_ADDR_EFA, &base_ep->src_addr); +} + +/** + * @brief return peer's raw address in #efa_ep_addr + * + * @param[in] ep end point + * @param[in] addr libfabric address + * @returns + * If peer exists, return peer's raw addrress as pointer to #efa_ep_addr; + * Otherwise, return NULL + */ +struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, fi_addr_t addr) +{ + struct efa_av *efa_av; + struct efa_conn *efa_conn; + + efa_av = base_ep->av; + efa_conn = efa_av_addr_to_conn(efa_av, addr); + return efa_conn ? 
efa_conn->ep_addr : NULL; +} + +/** + * @brief return peer's raw address in a readable string + * + * @param[in] base_ep end point + * @param[in] addr libfabric address + * @param[out] buf a buffer to be used to store string + * @param[in,out] buflen length of `buf` as input. length of the string as output. + * @return a string with peer's raw address + */ +const char *efa_base_ep_get_peer_raw_addr_str(struct efa_base_ep *base_ep, fi_addr_t addr, char *buf, size_t *buflen) +{ + return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_base_ep_get_peer_raw_addr(base_ep, addr)); +} diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 820ced150c2..3562a64fe34 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -99,4 +99,13 @@ void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t prov_errno); +const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, + size_t *buflen); + +struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, + fi_addr_t addr); + +const char *efa_base_ep_get_peer_raw_addr_str(struct efa_base_ep *base_ep, + fi_addr_t addr, char *buf, + size_t *buflen); #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 9b198026d1b..a3429756b30 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -203,12 +203,6 @@ int efa_rdm_ep_flush_queued_blocking_copy_to_hmem(struct efa_rdm_ep *ep); struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep); -const char *efa_rdm_ep_raw_addr_str(struct efa_rdm_ep *ep, char *buf, size_t *buflen); - -struct efa_ep_addr *efa_rdm_ep_get_peer_raw_addr(struct efa_rdm_ep *ep, fi_addr_t addr); - -const char *efa_rdm_ep_get_peer_raw_addr_str(struct efa_rdm_ep *ep, fi_addr_t addr, char *buf, size_t *buflen); - struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr); int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr); diff --git 
a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index bd3babc0818..fbebfd93455 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1344,7 +1344,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) efa_rdm_ep_set_extra_info(ep); ep_addr_strlen = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &ep_addr_strlen); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &ep_addr_strlen); EFA_INFO(FI_LOG_EP_CTRL, "libfabric %s efa endpoint created! address: %s\n", fi_tostr("1", FI_TYPE_VERSION), ep_addr_str); diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 6fc841f2600..2d87b48911d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -25,31 +25,6 @@ struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep) return &ep->base_ep.src_addr; } -const char *efa_rdm_ep_raw_addr_str(struct efa_rdm_ep *ep, char *buf, size_t *buflen) -{ - return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_rdm_ep_raw_addr(ep)); -} - -/** - * @brief return peer's raw address in #efa_ep_addr - * - * @param[in] ep end point - * @param[in] addr libfabric address - * @returns - * If peer exists, return peer's raw addrress as pointer to #efa_ep_addr; - * Otherwise, return NULL - * @relates efa_rdm_peer - */ -struct efa_ep_addr *efa_rdm_ep_get_peer_raw_addr(struct efa_rdm_ep *ep, fi_addr_t addr) -{ - struct efa_av *efa_av; - struct efa_conn *efa_conn; - - efa_av = ep->base_ep.av; - efa_conn = efa_av_addr_to_conn(efa_av, addr); - return efa_conn ? efa_conn->ep_addr : NULL; -} - /** * @brief return peer's ahn * @@ -69,21 +44,6 @@ int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr) return efa_conn ? 
efa_conn->ah->ahn : -1; } -/** - * @brief return peer's raw address in a reable string - * - * @param[in] ep end point - * @param[in] addr libfabric address - * @param[out] buf a buffer tat to be used to store string - * @param[in,out] buflen length of `buf` as input. length of the string as output. - * @relates efa_rdm_peer - * @return a string with peer's raw address - */ -const char *efa_rdm_ep_get_peer_raw_addr_str(struct efa_rdm_ep *ep, fi_addr_t addr, char *buf, size_t *buflen) -{ - return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_rdm_ep_get_peer_raw_addr(ep, addr)); -} - /** * @brief get pointer to efa_rdm_peer structure for a given libfabric address * diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index f095cc1f772..b8baf5c2935 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -453,9 +453,9 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) memset(&ep_addr_str, 0, sizeof(ep_addr_str)); memset(&peer_addr_str, 0, sizeof(peer_addr_str)); buflen = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &buflen); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &buflen); buflen = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, pkt_entry->addr, peer_addr_str, &buflen); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, pkt_entry->addr, peer_addr_str, &buflen); EFA_WARN(FI_LOG_CQ, "While sending a handshake packet, an error occurred." " Our address: %s, peer address: %s\n", @@ -712,7 +712,7 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) memset(&ep_addr_str, 0, sizeof(ep_addr_str)); buflen = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &buflen); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &buflen); EFA_WARN(FI_LOG_CQ, "Packet receive error from non TX/RX packet. 
Our address: %s\n", ep_addr_str); @@ -751,7 +751,7 @@ fi_addr_t efa_rdm_pke_insert_addr(struct efa_rdm_pke *pkt_entry, void *raw_addr) char self_raw_addr_str[OFI_ADDRSTRLEN]; size_t buflen = OFI_ADDRSTRLEN; - efa_rdm_ep_raw_addr_str(ep, self_raw_addr_str, &buflen); + efa_base_ep_raw_addr_str(&ep->base_ep, self_raw_addr_str, &buflen); EFA_WARN(FI_LOG_CQ, "Host %s received a packet with invalid protocol version %d.\n" "This host can only support protocol version %d and above.\n", diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 0175b3884a9..d8ec83b9305 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -118,9 +118,9 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); len = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); + efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &len); len = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, addr, peer_addr_str, &len); if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { strcpy(local_host_id_str, "N/A"); From a190ce4f25714aa53f956f4142dacd2a4433d602 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 30 Dec 2024 16:36:36 -0800 Subject: [PATCH 339/393] prov/efa: Remove err_msg from efa_rdm_ep Use a temporary char array to hold error message. err_data can be safely released after ofi_cq_write_err returns. No need to use a persistent field in ep. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa_base_ep.h | 1 + prov/efa/src/rdm/efa_rdm_cq.c | 4 ++-- prov/efa/src/rdm/efa_rdm_ep.h | 2 -- prov/efa/src/rdm/efa_rdm_ope.c | 10 ++++++++-- prov/efa/src/rdm/efa_rdm_rma.c | 5 +++-- prov/efa/src/rdm/efa_rdm_util.c | 12 +++++------- prov/efa/src/rdm/efa_rdm_util.h | 2 +- prov/efa/test/efa_unit_test_cq.c | 2 +- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 3562a64fe34..86657c5dc12 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -14,6 +14,7 @@ #define EFA_QP_DEFAULT_SERVICE_LEVEL 0 #define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8 +#define EFA_ERROR_MSG_BUFFER_LENGTH 1024 #define efa_rx_flags(efa_base_ep) ((efa_base_ep)->util_ep.rx_op_flags) #define efa_tx_flags(efa_base_ep) ((efa_base_ep)->util_ep.tx_op_flags) diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 622f9b71fee..5a18ef17003 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -348,11 +348,11 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * QP and we cannot cancel that. 
*/ if (OFI_UNLIKELY(ep->use_zcpy_rx && efa_rdm_pkt_type_is_rtm(pkt_type))) { - void *errbuf; + char errbuf[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; size_t errbuf_len; /* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */ - if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, &errbuf, &errbuf_len)) + if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, errbuf, &errbuf_len)) EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf); efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX); efa_rdm_pke_release_rx(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index a3429756b30..1b888e182a4 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -10,7 +10,6 @@ #include "efa_base_ep.h" #include "efa_rdm_rxe_map.h" -#define EFA_RDM_ERROR_MSG_BUFFER_LENGTH 1024 /** @brief Information of a queued copy. * @@ -186,7 +185,6 @@ struct efa_rdm_ep { bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */ bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */ - char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. 
fi_cq_readerr */ struct efa_rdm_pke **pke_vec; struct dlist_entry entry; /* the count of opes queued before handshake is made with their peers */ diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index f24d9c0150e..58a0f51ecaa 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -556,6 +556,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) struct dlist_entry *tmp; struct efa_rdm_pke *pkt_entry; int write_cq_err; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; assert(rxe->type == EFA_RDM_RXE); @@ -603,8 +604,10 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) err_entry.data = rxe->cq_entry.data; err_entry.tag = rxe->cq_entry.tag; if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, prov_errno, - &err_entry.err_data, &err_entry.err_data_size))) { + err_msg, &err_entry.err_data_size))) { err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; } EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", @@ -660,6 +663,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) struct dlist_entry *tmp; struct efa_rdm_pke *pkt_entry; int write_cq_err; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; ep = txe->ep; memset(&err_entry, 0, sizeof(err_entry)); @@ -695,8 +699,10 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) err_entry.data = txe->cq_entry.data; err_entry.tag = txe->cq_entry.tag; if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, prov_errno, - &err_entry.err_data, &err_entry.err_data_size))) { + err_msg, &err_entry.err_data_size))) { err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; } EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index a10e37edabc..87267f6d8ae 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -360,6 
+360,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) bool delivery_complete_requested; int ctrl_type, iface, use_p2p; size_t max_eager_rtw_data_size; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; /* * A handshake is required to choose the correct protocol (whether to use device write/read). @@ -377,7 +378,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) */ if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && (efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { - (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, err_msg, "", EFA_ERROR_MSG_BUFFER_LENGTH); EFA_WARN(FI_LOG_EP_DATA, "Inconsistent support status detected on unsolicited write recv.\n" "My support status: %d, peer support status: %d. %s.\n" @@ -385,7 +386,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) "Please use consistent software versions on both hosts, or disable the unsolicited write " "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", efa_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), - ep->err_msg); + err_msg); return -FI_EOPNOTSUPP; } efa_rdm_ope_prepare_to_post_write(txe); diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index d8ec83b9305..c9d65061e1b 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -149,24 +149,22 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, * @param[in] ep EFA RDM endpoint * @param[in] addr Remote peer fi_addr_t * @param[in] prov_errno EFA provider * error code(must be positive) - * @param[out] buf Pointer to the address of error data written by this function + * @param[out] err_msg Pointer to 
the address of error message written by this function * @param[out] buflen Pointer to the returned error data size * @return A status code. 0 if the error data was written successfully, otherwise a negative FI error code. */ -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen) { const char *base_msg = efa_strerror(prov_errno); int ret; - *buf = NULL; - *buflen = 0; + *buflen = 0; - ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, err_msg, base_msg, EFA_ERROR_MSG_BUFFER_LENGTH); if (ret) return ret; - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buflen = EFA_ERROR_MSG_BUFFER_LENGTH; return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index f52496b195e..7c3daa3432f 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -21,7 +21,7 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen); #ifdef ENABLE_EFA_POISONING static inline void efa_rdm_poison_mem_region(void *ptr, size_t size) diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 3d72a6460c1..29a06fc1579 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -155,7 +155,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource 
*resource, assert_int_equal(ret, -FI_EAVAIL); /* Allocate memory to read CQ error */ - cq_err_entry.err_data_size = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + cq_err_entry.err_data_size = EFA_ERROR_MSG_BUFFER_LENGTH; cq_err_entry.err_data = malloc(cq_err_entry.err_data_size); assert_non_null(cq_err_entry.err_data); From cefc67d5c6a25fce785337c5b53eb2242a4170c6 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 18 Dec 2024 15:36:19 -0800 Subject: [PATCH 340/393] prov/efa: Implement the cq progress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename efa_dgram_cq.c to efa_cq.c and move it to prov/efa/src as a common CQ interface for efa-raw’s rdm and dgram ep type. Create efa_cq_progress and poll ibv cq directly instead of using efa_dgram_ep_progress. Remove rcq and scq, write completion to util_ep.rx_cq and tx_cq. Construct the error message for cq_err_entry.err_data. Signed-off-by: Jessie Yang --- libfabric.vcxproj | 2 +- prov/efa/Makefile.include | 3 +- prov/efa/src/dgram/efa_dgram_cq.c | 339 --------------------- prov/efa/src/dgram/efa_dgram_cq.h | 28 -- prov/efa/src/dgram/efa_dgram_ep.c | 138 ++------- prov/efa/src/dgram/efa_dgram_ep.h | 3 - prov/efa/src/efa_av.c | 6 +- prov/efa/src/efa_av.h | 2 +- prov/efa/src/efa_cntr.c | 20 +- prov/efa/src/efa_cq.c | 470 ++++++++++++++++++++++++++++++ prov/efa/src/efa_cq.h | 69 +++++ prov/efa/src/efa_domain.c | 3 +- prov/efa/src/efa_msg.c | 5 +- prov/efa/src/efa_rma.c | 4 +- prov/efa/src/rdm/efa_rdm_cq.h | 2 +- prov/efa/test/efa_unit_test_cq.c | 217 +++++++++++++- prov/efa/test/efa_unit_tests.c | 4 + prov/efa/test/efa_unit_tests.h | 4 + 18 files changed, 824 insertions(+), 495 deletions(-) delete mode 100644 prov/efa/src/dgram/efa_dgram_cq.c delete mode 100644 prov/efa/src/dgram/efa_dgram_cq.h create mode 100644 prov/efa/src/efa_cq.c diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 3eef3ef0521..9acba798776 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -886,8 +886,8 @@ 
+ - diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 980f3430644..db5e44df1f0 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -49,8 +49,8 @@ _efa_files = \ prov/efa/src/efa_cntr.c \ prov/efa/src/efa_msg.c \ prov/efa/src/efa_rma.c \ + prov/efa/src/efa_cq.c \ prov/efa/src/dgram/efa_dgram_ep.c \ - prov/efa/src/dgram/efa_dgram_cq.c \ prov/efa/src/rdm/efa_rdm_peer.c \ prov/efa/src/rdm/efa_rdm_cq.c \ prov/efa/src/rdm/efa_rdm_ep_utils.c \ @@ -95,7 +95,6 @@ _efa_headers = \ prov/efa/src/efa_env.h \ prov/efa/src/fi_ext_efa.h \ prov/efa/src/dgram/efa_dgram_ep.h \ - prov/efa/src/dgram/efa_dgram_cq.h \ prov/efa/src/rdm/efa_rdm_peer.h \ prov/efa/src/rdm/efa_rdm_cq.h \ prov/efa/src/rdm/efa_rdm_ep.h \ diff --git a/prov/efa/src/dgram/efa_dgram_cq.c b/prov/efa/src/dgram/efa_dgram_cq.c deleted file mode 100644 index d046549bd66..00000000000 --- a/prov/efa/src/dgram/efa_dgram_cq.c +++ /dev/null @@ -1,339 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ - -#include -#include -#include "config.h" -#include -#include "dgram/efa_dgram_ep.h" -#include "efa.h" -#include "efa_cq.h" -#include "efa_av.h" -#include "efa_dgram_cq.h" -#include - -struct efa_wc { - struct ibv_wc ibv_wc; - /* Source address */ - uint16_t efa_ah; -}; - -struct efa_wce { - struct slist_entry entry; - struct efa_wc wc; -}; - -#define EFA_WCE_CNT 1024 - -static inline uint64_t efa_dgram_cq_opcode_to_fi_flags(enum ibv_wc_opcode opcode) { - switch (opcode) { - case IBV_WC_SEND: - return FI_SEND | FI_MSG; - case IBV_WC_RECV: - return FI_RECV | FI_MSG; - default: - assert(0); - return 0; - } -} - -static inline uint32_t efa_dgram_cq_api_version(struct efa_dgram_cq *cq) { - return cq->domain->fabric->util_fabric.fabric_fid.api_version; -} - -ssize_t efa_dgram_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, - uint64_t flags) -{ - struct efa_dgram_cq *cq; - uint32_t api_version; - - cq = container_of(cq_fid, struct efa_dgram_cq, util_cq.cq_fid); - - ofi_spin_lock(&cq->lock); - - if (!cq->ibv_cq_ex->status) - goto err; - - api_version = efa_dgram_cq_api_version(cq); - - entry->op_context = (void *)(uintptr_t)cq->ibv_cq_ex->wr_id; - entry->flags = efa_dgram_cq_opcode_to_fi_flags(ibv_wc_read_opcode(cq->ibv_cq_ex)); - entry->err = FI_EIO; - entry->prov_errno = ibv_wc_read_vendor_err(cq->ibv_cq_ex); - EFA_WARN(FI_LOG_CQ, "Work completion status: %s\n", efa_strerror(entry->prov_errno)); - - ofi_spin_unlock(&cq->lock); - - /* We currently don't have err_data to give back to the user. 
*/ - if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) - entry->err_data_size = 0; - - return sizeof(*entry); -err: - ofi_spin_unlock(&cq->lock); - return -FI_EAGAIN; -} - -static void efa_dgram_cq_read_context_entry(struct ibv_cq_ex *ibv_cqx, int i, void *buf) -{ - struct fi_cq_entry *entry = buf; - - entry[i].op_context = (void *)ibv_cqx->wr_id; -} - -static void efa_dgram_cq_read_msg_entry(struct ibv_cq_ex *ibv_cqx, int i, void *buf) -{ - struct fi_cq_msg_entry *entry = buf; - - entry[i].op_context = (void *)(uintptr_t)ibv_cqx->wr_id; - entry[i].flags = efa_dgram_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); - entry[i].len = ibv_wc_read_byte_len(ibv_cqx); -} - -static void efa_dgram_cq_read_data_entry(struct ibv_cq_ex *ibv_cqx, int i, void *buf) -{ - struct fi_cq_data_entry *entry = buf; - - entry[i].op_context = (void *)ibv_cqx->wr_id; - entry[i].flags = efa_dgram_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); - entry[i].data = 0; - entry[i].len = ibv_wc_read_byte_len(ibv_cqx); -} - -/** - * @brief Convert an error code from CQ poll API, e.g. `ibv_start_poll`, `ibv_end_poll`. - * The returned error code must be 0 (success) or negative (error). - * As a special case, if input error code is ENOENT (there was no item on CQ), we should return -FI_EAGAIN. - * @param[in] err Return value from `ibv_start_poll` or `ibv_end_poll` - * @returns Converted error code - */ -static inline ssize_t efa_dgram_cq_ibv_poll_error_to_fi_error(ssize_t err) { - if (err == ENOENT) { - return -FI_EAGAIN; - } - - if (err > 0) { - return -err; - } - - return err; -} - -ssize_t efa_dgram_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, - fi_addr_t *src_addr) -{ - bool should_end_poll = false; - struct efa_dgram_cq *cq; - struct efa_av *av; - ssize_t err = 0; - size_t num_cqe = 0; /* Count of read entries */ - uint32_t qp_num, src_qp, slid; - - /* Initialize an empty ibv_poll_cq_attr struct for ibv_start_poll. 
- * EFA expects .comp_mask = 0, or otherwise returns EINVAL. - */ - struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; - - cq = container_of(cq_fid, struct efa_dgram_cq, util_cq.cq_fid); - - ofi_spin_lock(&cq->lock); - - /* Call ibv_start_poll only once regardless of count == 0 */ - err = ibv_start_poll(cq->ibv_cq_ex, &poll_cq_attr); - should_end_poll = !err; - - while (!err && num_cqe < count) { - if (cq->ibv_cq_ex->status) { - err = -FI_EAVAIL; - break; - } - - if (src_addr) { - qp_num = ibv_wc_read_qp_num(cq->ibv_cq_ex); - src_qp = ibv_wc_read_src_qp(cq->ibv_cq_ex); - slid = ibv_wc_read_slid(cq->ibv_cq_ex); - av = cq->domain->qp_table[qp_num & cq->domain->qp_table_sz_m1]->base_ep->av; - - src_addr[num_cqe] = efa_av_reverse_lookup_dgram(av, slid, src_qp); - } - - cq->read_entry(cq->ibv_cq_ex, num_cqe, buf); - num_cqe++; - - err = ibv_next_poll(cq->ibv_cq_ex); - } - - err = efa_dgram_cq_ibv_poll_error_to_fi_error(err); - - if (should_end_poll) - ibv_end_poll(cq->ibv_cq_ex); - - ofi_spin_unlock(&cq->lock); - - return num_cqe ? num_cqe : err; -} - -static const char *efa_dgram_cq_strerror(struct fid_cq *cq_fid, - int prov_errno, - const void *err_data, - char *buf, size_t len) -{ - return err_data - ? 
(const char *) err_data - : efa_strerror(prov_errno); -} - -static struct fi_ops_cq efa_dgram_cq_ops = { - .size = sizeof(struct fi_ops_cq), - .read = ofi_cq_read, - .readfrom = ofi_cq_readfrom, - .readerr = ofi_cq_readerr, - .sread = fi_no_cq_sread, - .sreadfrom = fi_no_cq_sreadfrom, - .signal = fi_no_cq_signal, - .strerror = efa_dgram_cq_strerror -}; - -static int efa_dgram_cq_control(fid_t fid, int command, void *arg) -{ - int ret = 0; - - switch (command) { - default: - ret = -FI_ENOSYS; - break; - } - - return ret; -} - -static int efa_dgram_cq_close(fid_t fid) -{ - struct efa_dgram_cq *cq; - int ret; - - cq = container_of(fid, struct efa_dgram_cq, util_cq.cq_fid.fid); - - ofi_bufpool_destroy(cq->wce_pool); - - ofi_spin_destroy(&cq->lock); - - ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq_ex)); - if (ret) - return ret; - - ret = ofi_cq_cleanup(&cq->util_cq); - if (ret) - return ret; - - free(cq); - - return 0; -} - -static struct fi_ops efa_dgram_cq_fi_ops = { - .size = sizeof(struct fi_ops), - .close = efa_dgram_cq_close, - .bind = fi_no_bind, - .control = efa_dgram_cq_control, - .ops_open = fi_no_ops_open, -}; - -/** - * @brief Create and set cq->ibv_cq_ex - * - * @param[in] cq Pointer to the efa_dgram_cq. cq->ibv_cq_ex must be NULL. - * @param[in] attr Pointer to fi_cq_attr. - * @param[out] Return code = 0 if successful, or negative otherwise. 
- */ -static inline int efa_dgram_cq_set_ibv_cq_ex(struct efa_dgram_cq *cq, struct fi_cq_attr *attr) -{ - enum ibv_cq_ex_type ibv_cq_ex_type; - - if (cq->ibv_cq_ex) { - EFA_WARN(FI_LOG_CQ, "CQ already has attached ibv_cq_ex\n"); - return -FI_EALREADY; - } - - return efa_cq_ibv_cq_ex_open(attr, cq->domain->device->ibv_ctx, - &cq->ibv_cq_ex, &ibv_cq_ex_type); -} - -int efa_dgram_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, - struct fid_cq **cq_fid, void *context) -{ - struct efa_dgram_cq *cq; - int err; - - if (attr->wait_obj != FI_WAIT_NONE) - return -FI_ENOSYS; - - cq = calloc(1, sizeof(*cq)); - if (!cq) - return -FI_ENOMEM; - - err = ofi_cq_init(&efa_prov, domain_fid, attr, &cq->util_cq, - &ofi_cq_progress, context); - if (err) { - EFA_WARN(FI_LOG_CQ, "Unable to create UTIL_CQ\n"); - goto err_free_cq; - } - - cq->domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); - - err = efa_dgram_cq_set_ibv_cq_ex(cq, attr); - if (err) { - EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ\n"); - err = -FI_EINVAL; - goto err_free_util_cq; - } - - err = ofi_bufpool_create(&cq->wce_pool, sizeof(struct efa_wce), 16, 0, - EFA_WCE_CNT, 0); - if (err) { - EFA_WARN(FI_LOG_CQ, "Failed to create wce_pool\n"); - goto err_destroy_cq; - } - - switch (attr->format) { - case FI_CQ_FORMAT_UNSPEC: - case FI_CQ_FORMAT_CONTEXT: - cq->read_entry = efa_dgram_cq_read_context_entry; - cq->entry_size = sizeof(struct fi_cq_entry); - break; - case FI_CQ_FORMAT_MSG: - cq->read_entry = efa_dgram_cq_read_msg_entry; - cq->entry_size = sizeof(struct fi_cq_msg_entry); - break; - case FI_CQ_FORMAT_DATA: - cq->read_entry = efa_dgram_cq_read_data_entry; - cq->entry_size = sizeof(struct fi_cq_data_entry); - break; - case FI_CQ_FORMAT_TAGGED: - default: - err = -FI_ENOSYS; - goto err_destroy_pool; - } - - ofi_spin_init(&cq->lock); - - *cq_fid = &cq->util_cq.cq_fid; - (*cq_fid)->fid.fclass = FI_CLASS_CQ; - (*cq_fid)->fid.context = context; - (*cq_fid)->fid.ops = 
&efa_dgram_cq_fi_ops; - (*cq_fid)->ops = &efa_dgram_cq_ops; - - return 0; - -err_destroy_pool: - ofi_bufpool_destroy(cq->wce_pool); -err_destroy_cq: - ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq_ex)); -err_free_util_cq: - ofi_cq_cleanup(&cq->util_cq); -err_free_cq: - free(cq); - return err; -} diff --git a/prov/efa/src/dgram/efa_dgram_cq.h b/prov/efa/src/dgram/efa_dgram_cq.h deleted file mode 100644 index fbb986d3f72..00000000000 --- a/prov/efa/src/dgram/efa_dgram_cq.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#ifndef EFA_DGRAM_CQ_H -#define EFA_DGRAM_CQ_H - -typedef void (*efa_dgram_cq_read_entry)(struct ibv_cq_ex *ibv_cqx, int index, void *buf); - -struct efa_dgram_cq { - struct util_cq util_cq; - struct efa_domain *domain; - size_t entry_size; - efa_dgram_cq_read_entry read_entry; - ofi_spin_t lock; - struct ofi_bufpool *wce_pool; - uint32_t flags; /* User defined capability mask */ - - struct ibv_cq_ex *ibv_cq_ex; -}; - -int efa_dgram_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, - struct fid_cq **cq_fid, void *context); - -ssize_t efa_dgram_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr); - -ssize_t efa_dgram_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, uint64_t flags); - -#endif \ No newline at end of file diff --git a/prov/efa/src/dgram/efa_dgram_ep.c b/prov/efa/src/dgram/efa_dgram_ep.c index 635d5e7a9b6..3119b8bee72 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.c +++ b/prov/efa/src/dgram/efa_dgram_ep.c @@ -4,12 +4,11 @@ #include "config.h" #include "efa_dgram_ep.h" -#include "efa_dgram_cq.h" #include "efa.h" #include "efa_av.h" +#include "efa_cq.h" #include -#define efa_dgram_cq_PROGRESS_ENTRIES 500 static int efa_dgram_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) @@ -71,8 +70,9 @@ static int 
efa_dgram_ep_close(fid_t fid) static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { struct efa_dgram_ep *ep; - struct efa_dgram_cq *cq; + struct efa_cq *cq; struct efa_av *av; + struct efa_domain *efa_domain; struct util_eq *eq; struct util_cntr *cntr; int ret; @@ -94,24 +94,15 @@ static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) if (!(flags & (FI_RECV | FI_TRANSMIT))) return -FI_EBADFLAGS; - cq = container_of(bfid, struct efa_dgram_cq, util_cq.cq_fid); - if (ep->base_ep.domain != cq->domain) + cq = container_of(bfid, struct efa_cq, util_cq.cq_fid); + efa_domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); + if (ep->base_ep.domain != efa_domain) return -FI_EINVAL; ret = ofi_ep_bind_cq(&ep->base_ep.util_ep, &cq->util_cq, flags); if (ret) return ret; - if (flags & FI_RECV) { - if (ep->rcq) - return -EINVAL; - ep->rcq = cq; - } - if (flags & FI_TRANSMIT) { - if (ep->scq) - return -EINVAL; - ep->scq = cq; - } break; case FI_CLASS_AV: av = container_of(bfid, struct efa_av, util_av.av_fid.fid); @@ -186,46 +177,47 @@ static int efa_dgram_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) static int efa_dgram_ep_enable(struct fid_ep *ep_fid) { struct ibv_qp_init_attr_ex attr_ex = { 0 }; - struct ibv_pd *ibv_pd; struct efa_dgram_ep *ep; + struct efa_cq *scq, *rcq; int err; ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - if (!ep->scq && !ep->rcq) { + scq = ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_cq, util_cq) : NULL; + rcq = ep->base_ep.util_ep.rx_cq ? 
container_of(ep->base_ep.util_ep.rx_cq, struct efa_cq, util_cq) : NULL; + + if (!scq && !rcq) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a send or receive completion queue\n"); return -FI_ENOCQ; } - if (!ep->scq && ofi_send_allowed(ep->base_ep.info->caps)) { + if (!scq && ofi_needs_tx(ep->base_ep.info->caps)) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); return -FI_ENOCQ; } - if (!ep->rcq && ofi_recv_allowed(ep->base_ep.info->caps)) { + if (!rcq && ofi_needs_rx(ep->base_ep.info->caps)) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. (FI_RECV)\n"); return -FI_ENOCQ; } - if (ep->scq) { + if (scq) { attr_ex.cap.max_send_wr = ep->base_ep.info->tx_attr->size; attr_ex.cap.max_send_sge = ep->base_ep.info->tx_attr->iov_limit; - attr_ex.send_cq = ibv_cq_ex_to_cq(ep->scq->ibv_cq_ex); - ibv_pd = ep->scq->domain->ibv_pd; + attr_ex.send_cq = ibv_cq_ex_to_cq(scq->ibv_cq.ibv_cq_ex); } else { - attr_ex.send_cq = ibv_cq_ex_to_cq(ep->rcq->ibv_cq_ex); - ibv_pd = ep->rcq->domain->ibv_pd; + attr_ex.send_cq = ibv_cq_ex_to_cq(rcq->ibv_cq.ibv_cq_ex); } - if (ep->rcq) { + if (rcq) { attr_ex.cap.max_recv_wr = ep->base_ep.info->rx_attr->size; attr_ex.cap.max_recv_sge = ep->base_ep.info->rx_attr->iov_limit; - attr_ex.recv_cq = ibv_cq_ex_to_cq(ep->rcq->ibv_cq_ex); + attr_ex.recv_cq = ibv_cq_ex_to_cq(rcq->ibv_cq.ibv_cq_ex); } else { - attr_ex.recv_cq = ibv_cq_ex_to_cq(ep->scq->ibv_cq_ex); + attr_ex.recv_cq = ibv_cq_ex_to_cq(scq->ibv_cq.ibv_cq_ex); } attr_ex.cap.max_inline_data = @@ -234,7 +226,7 @@ static int efa_dgram_ep_enable(struct fid_ep *ep_fid) assert(EFA_EP_TYPE_IS_DGRAM(ep->base_ep.domain->info)); attr_ex.qp_type = IBV_QPT_UD; attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; - attr_ex.pd = ibv_pd; + attr_ex.pd = container_of(ep->base_ep.util_ep.domain, struct efa_domain, util_domain)->ibv_pd; attr_ex.qp_context = ep; 
attr_ex.sq_sig_all = 1; @@ -277,89 +269,19 @@ static struct fi_ops efa_dgram_ep_ops = { .ops_open = fi_no_ops_open, }; -static void efa_dgram_ep_progress_internal(struct efa_dgram_ep *ep, struct efa_dgram_cq *efa_dgram_cq) +/** + * @brief progress engine for the EFA dgram endpoint + * + * This function now a no-op. + * + * @param[in] util_ep The endpoint FID to progress + */ +static +void efa_ep_progress_no_op(struct util_ep *util_ep) { - struct util_cq *cq; - struct fi_cq_tagged_entry cq_entry[efa_dgram_cq_PROGRESS_ENTRIES] = {0}; - struct fi_cq_tagged_entry *temp_cq_entry; - struct fi_cq_err_entry cq_err_entry = {0}; - fi_addr_t src_addr[efa_dgram_cq_PROGRESS_ENTRIES]; - uint64_t flags; - int i; - ssize_t ret, err; - - cq = &efa_dgram_cq->util_cq; - flags = ep->base_ep.util_ep.caps; - - VALGRIND_MAKE_MEM_DEFINED(&cq_entry, sizeof(cq_entry)); - - ret = efa_dgram_cq_readfrom(&cq->cq_fid, cq_entry, efa_dgram_cq_PROGRESS_ENTRIES, - (flags & FI_SOURCE) ? src_addr : NULL); - if (ret == -FI_EAGAIN) - return; - - if (OFI_UNLIKELY(ret < 0)) { - if (OFI_UNLIKELY(ret != -FI_EAVAIL)) { - EFA_WARN(FI_LOG_CQ, "no error available errno: %ld\n", ret); - efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_DGRAM_CQ_READ); - return; - } - - err = efa_dgram_cq_readerr(&cq->cq_fid, &cq_err_entry, flags); - if (OFI_UNLIKELY(err < 0)) { - EFA_WARN(FI_LOG_CQ, "unable to read error entry errno: %ld\n", err); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, cq_err_entry.prov_errno); - return; - } - - ofi_cq_write_error(cq, &cq_err_entry); - return; - } - - temp_cq_entry = (struct fi_cq_tagged_entry *)cq_entry; - for (i = 0; i < ret; i++) { - (flags & FI_SOURCE) ? 
- ofi_cq_write_src(cq, temp_cq_entry->op_context, - temp_cq_entry->flags, - temp_cq_entry->len, - temp_cq_entry->buf, - temp_cq_entry->data, - temp_cq_entry->tag, - src_addr[i]) : - ofi_cq_write(cq, temp_cq_entry->op_context, - temp_cq_entry->flags, - temp_cq_entry->len, - temp_cq_entry->buf, - temp_cq_entry->data, - temp_cq_entry->tag); - - temp_cq_entry = (struct fi_cq_tagged_entry *) - ((uint8_t *)temp_cq_entry + efa_dgram_cq->entry_size); - } return; } -void efa_dgram_ep_progress(struct util_ep *ep) -{ - struct efa_dgram_ep *efa_dgram_ep; - struct efa_dgram_cq *rcq; - struct efa_dgram_cq *scq; - - efa_dgram_ep = container_of(ep, struct efa_dgram_ep, base_ep.util_ep); - rcq = efa_dgram_ep->rcq; - scq = efa_dgram_ep->scq; - - ofi_genlock_lock(&ep->lock); - - if (rcq) - efa_dgram_ep_progress_internal(efa_dgram_ep, rcq); - - if (scq && scq != rcq) - efa_dgram_ep_progress_internal(efa_dgram_ep, scq); - - ofi_genlock_unlock(&ep->lock); -} - static struct fi_ops_atomic efa_dgram_ep_atomic_ops = { .size = sizeof(struct fi_ops_atomic), .write = fi_no_atomic_write, @@ -433,7 +355,7 @@ int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, if (!ep) return -FI_ENOMEM; - ret = efa_base_ep_construct(&ep->base_ep, domain_fid, user_info, efa_dgram_ep_progress, context); + ret = efa_base_ep_construct(&ep->base_ep, domain_fid, user_info, efa_ep_progress_no_op, context); if (ret) goto err_ep_destroy; diff --git a/prov/efa/src/dgram/efa_dgram_ep.h b/prov/efa/src/dgram/efa_dgram_ep.h index b01db81f57e..18ab0dc8703 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.h +++ b/prov/efa/src/dgram/efa_dgram_ep.h @@ -8,9 +8,6 @@ struct efa_dgram_ep { struct efa_base_ep base_ep; - - struct efa_dgram_cq *rcq; - struct efa_dgram_cq *scq; }; int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *info, diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 5ee81de7ebd..4b1d2f70442 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -53,7 
+53,7 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) struct util_av_entry *util_av_entry; struct efa_av_entry *efa_av_entry; - if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC)) + if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) return NULL; if (av->type == FI_AV_MAP) { @@ -70,7 +70,7 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) } /** - * @brief find fi_addr for dgram endpoint + * @brief find fi_addr for efa endpoint * * @param[in] av address vector * @param[in] ahn address handle number @@ -78,7 +78,7 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) * @return On success, return fi_addr to the peer who send the packet * If no such peer exist, return FI_ADDR_NOTAVAIL */ -fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn) +fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) { struct efa_cur_reverse_av *cur_entry; struct efa_cur_reverse_av_key cur_key; diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index b1624398be0..acf7e58e320 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -86,6 +86,6 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr); fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, uint16_t qpn, struct efa_rdm_pke *pkt_entry); -fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn); +fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn); #endif \ No newline at end of file diff --git a/prov/efa/src/efa_cntr.c b/prov/efa/src/efa_cntr.c index fa1f548c525..8082ae76fd1 100644 --- a/prov/efa/src/efa_cntr.c +++ b/prov/efa/src/efa_cntr.c @@ -178,6 +178,24 @@ static void efa_rdm_cntr_progress(struct util_cntr *cntr) ofi_genlock_unlock(&cntr->ep_list_lock); } +static void efa_cntr_progress(struct util_cntr *cntr) +{ + struct util_ep *ep; + struct fid_list_entry 
*fid_entry; + struct dlist_entry *item; + + ofi_genlock_lock(&cntr->ep_list_lock); + dlist_foreach(&cntr->ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + ep = container_of(fid_entry->fid, struct util_ep, ep_fid.fid); + if (ep->tx_cq) + efa_cq_progress(ep->tx_cq); + if (ep->rx_cq && ep->rx_cq != ep->tx_cq) + efa_cq_progress(ep->rx_cq); + } + ofi_genlock_unlock(&cntr->ep_list_lock); +} + int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr_fid, void *context) { @@ -199,7 +217,7 @@ int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, cntr_progress_func = efa_domain->info->ep_attr->type == FI_EP_RDM ? efa_rdm_cntr_progress - : ofi_cntr_progress; + : efa_cntr_progress; ret = ofi_cntr_init(&efa_prov, domain, attr, &cntr->util_cntr, cntr_progress_func, context); diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c new file mode 100644 index 00000000000..a5b737d89ac --- /dev/null +++ b/prov/efa/src/efa_cq.c @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include +#include +#include "config.h" +#include +#include "dgram/efa_dgram_ep.h" +#include "efa.h" +#include "efa_av.h" +#include "efa_cntr.h" +#include "efa_cq.h" +#include + + +static inline uint64_t efa_cq_opcode_to_fi_flags(enum ibv_wc_opcode opcode) { + switch (opcode) { + case IBV_WC_SEND: + return FI_SEND | FI_MSG; + case IBV_WC_RECV: + return FI_RECV | FI_MSG; + case IBV_WC_RDMA_WRITE: + return FI_RMA | FI_WRITE; + case IBV_WC_RECV_RDMA_WITH_IMM: + return FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE; + case IBV_WC_RDMA_READ: + return FI_RMA | FI_READ; + default: + assert(0); + return 0; + } +} + +static void efa_cq_construct_cq_entry(struct ibv_cq_ex *ibv_cqx, + struct fi_cq_tagged_entry *entry) +{ + entry->op_context = (void *)ibv_cqx->wr_id; + entry->flags = efa_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); + entry->len = ibv_wc_read_byte_len(ibv_cqx); + entry->buf = NULL; + entry->data = 0; + entry->tag = 0; + + if (ibv_wc_read_wc_flags(ibv_cqx) & IBV_WC_WITH_IMM) { + entry->flags |= FI_REMOTE_CQ_DATA; + entry->data = ibv_wc_read_imm_data(ibv_cqx); + } +} + +/** + * @brief handle the situation that a TX/RX operation encountered error + * + * This function does the following to handle error: + * + * 1. write an error cq entry for the operation, if writing + * CQ error entry failed, it will write eq entry. + * + * 2. increase error counter. + * + * 3. 
print warning message with self and peer's raw address + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] err positive libfabric error code + * @param[in] prov_errno positive EFA provider specific error code + * @param[in] is_tx if the error is for TX or RX operation + */ +static void efa_cq_handle_error(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex, int err, + int prov_errno, bool is_tx) +{ + struct fi_cq_err_entry err_entry; + fi_addr_t addr; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; + int write_cq_err; + + memset(&err_entry, 0, sizeof(err_entry)); + efa_cq_construct_cq_entry(ibv_cq_ex, (struct fi_cq_tagged_entry *) &err_entry); + err_entry.err = err; + err_entry.prov_errno = prov_errno; + + if (is_tx) + // TODO: get correct peer addr for TX operation + addr = FI_ADDR_NOTAVAIL; + else + addr = efa_av_reverse_lookup(base_ep->av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex)); + + if (OFI_UNLIKELY(efa_write_error_msg(base_ep, addr, prov_errno, + err_msg, + &err_entry.err_data_size))) { + err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; + } + + EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", + err_entry.err, + err_entry.err_data + ? (const char *) err_entry.err_data + : efa_strerror(err_entry.prov_errno), + err_entry.prov_errno); + + efa_show_help(err_entry.prov_errno); + + efa_cntr_report_error(&base_ep->util_ep, err_entry.flags); + write_cq_err = ofi_cq_write_error(is_tx ? base_ep->util_ep.tx_cq : + base_ep->util_ep.rx_cq, + &err_entry); + if (write_cq_err) { + EFA_WARN( + FI_LOG_CQ, + "Error writing error cq entry when handling %s error\n", + is_tx ? 
"TX" : "RX"); + efa_base_ep_write_eq_error(base_ep, err, prov_errno); + } +} + +/** + * @brief handle the event that a TX request has been completed + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] cq_entry fi_cq_tagged_entry + */ +static void efa_cq_handle_tx_completion(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex, + struct fi_cq_tagged_entry *cq_entry) +{ + struct util_cq *tx_cq = base_ep->util_ep.tx_cq; + int ret = 0; + + /* NULL wr_id means no FI_COMPLETION flag */ + if (!ibv_cq_ex->wr_id) + return; + + /* TX completions should not send peer address to util_cq */ + if (base_ep->util_ep.caps & FI_SOURCE) + ret = ofi_cq_write_src(tx_cq, cq_entry->op_context, + cq_entry->flags, cq_entry->len, + cq_entry->buf, cq_entry->data, + cq_entry->tag, FI_ADDR_NOTAVAIL); + else + ret = ofi_cq_write(tx_cq, cq_entry->op_context, cq_entry->flags, + cq_entry->len, cq_entry->buf, cq_entry->data, + cq_entry->tag); + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, "Unable to write send completion: %s\n", + fi_strerror(-ret)); + efa_cq_handle_error(base_ep, ibv_cq_ex, -ret, + FI_EFA_ERR_WRITE_SEND_COMP, true); + } +} + +/** + * @brief handle the event that a RX request has been completed + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] cq_entry fi_cq_tagged_entry + */ +static void efa_cq_handle_rx_completion(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex, + struct fi_cq_tagged_entry *cq_entry) +{ + struct util_cq *rx_cq = base_ep->util_ep.rx_cq; + fi_addr_t src_addr; + int ret = 0; + + /* NULL wr_id means no FI_COMPLETION flag */ + if (!ibv_cq_ex->wr_id) + return; + + if (base_ep->util_ep.caps & FI_SOURCE) { + src_addr = efa_av_reverse_lookup(base_ep->av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex)); + ret = ofi_cq_write_src(rx_cq, cq_entry->op_context, + cq_entry->flags, cq_entry->len, + cq_entry->buf, cq_entry->data, + cq_entry->tag, src_addr); 
+ } else { + ret = ofi_cq_write(rx_cq, cq_entry->op_context, cq_entry->flags, + cq_entry->len, cq_entry->buf, cq_entry->data, + cq_entry->tag); + } + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, "Unable to write recv completion: %s\n", + fi_strerror(-ret)); + efa_cq_handle_error(base_ep, ibv_cq_ex, -ret, + FI_EFA_ERR_WRITE_RECV_COMP, false); + } +} + +/** + * @brief handle rdma-core CQ completion resulted from IBV_WRITE_WITH_IMM + * + * This function handles hardware-assisted RDMA writes with immediate data at + * remote endpoint. These do not have a packet context, nor do they have a + * connid available. + * + * @param[in] base_ep efa_base_ep + * @param[in] ibv_cq_ex extended ibv cq + */ +static void +efa_cq_proc_ibv_recv_rdma_with_imm_completion(struct efa_base_ep *base_ep, + struct ibv_cq_ex *ibv_cq_ex) +{ + struct util_cq *rx_cq = base_ep->util_ep.rx_cq; + int ret; + fi_addr_t src_addr; + uint32_t imm_data = ibv_wc_read_imm_data(ibv_cq_ex); + uint32_t len = ibv_wc_read_byte_len(ibv_cq_ex); + uint64_t flags = FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE; + + if (base_ep->util_ep.caps & FI_SOURCE) { + src_addr = efa_av_reverse_lookup(base_ep->av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex)); + ret = ofi_cq_write_src(rx_cq, NULL, flags, len, NULL, imm_data, + 0, src_addr); + } else { + ret = ofi_cq_write(rx_cq, NULL, flags, len, NULL, imm_data, 0); + } + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, + "Unable to write a cq entry for remote for RECV_RDMA " + "operation: %s\n", + fi_strerror(-ret)); + efa_base_ep_write_eq_error(base_ep, -ret, + FI_EFA_ERR_WRITE_RECV_COMP); + } +} + +/** + * @brief poll rdma-core cq and process the cq entry + * + * @param[in] cqe_to_process Max number of cq entry to poll and process. + * A negative number means to poll until cq empty. 
+ * @param[in] util_cq util_cq + */ +void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct util_cq *util_cq) +{ + bool should_end_poll = false; + struct efa_base_ep *base_ep; + struct efa_cq *cq; + struct efa_domain *efa_domain; + struct fi_cq_tagged_entry cq_entry = {0}; + struct fi_cq_err_entry err_entry; + ssize_t err = 0; + size_t num_cqe = 0; /* Count of read entries */ + int prov_errno, opcode; + + /* Initialize an empty ibv_poll_cq_attr struct for ibv_start_poll. + * EFA expects .comp_mask = 0, or otherwise returns EINVAL. + */ + struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; + + cq = container_of(util_cq, struct efa_cq, util_cq); + efa_domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); + + /* Call ibv_start_poll only once */ + err = ibv_start_poll(cq->ibv_cq.ibv_cq_ex, &poll_cq_attr); + should_end_poll = !err; + + while (!err) { + base_ep = efa_domain->qp_table[ibv_wc_read_qp_num(cq->ibv_cq.ibv_cq_ex) & efa_domain->qp_table_sz_m1]->base_ep; + opcode = ibv_wc_read_opcode(cq->ibv_cq.ibv_cq_ex); + if (cq->ibv_cq.ibv_cq_ex->status) { + prov_errno = ibv_wc_read_vendor_err(cq->ibv_cq.ibv_cq_ex); + switch (opcode) { + case IBV_WC_SEND: /* fall through */ + case IBV_WC_RDMA_WRITE: /* fall through */ + case IBV_WC_RDMA_READ: + efa_cq_handle_error(base_ep, cq->ibv_cq.ibv_cq_ex, + to_fi_errno(prov_errno), + prov_errno, true); + break; + case IBV_WC_RECV: /* fall through */ + case IBV_WC_RECV_RDMA_WITH_IMM: + if (efa_cq_wc_is_unsolicited(cq->ibv_cq.ibv_cq_ex)) { + EFA_WARN(FI_LOG_CQ, + "Receive error %s (%d) for " + "unsolicited write recv", + efa_strerror(prov_errno), + prov_errno); + efa_base_ep_write_eq_error( + base_ep, + to_fi_errno(prov_errno), + prov_errno); + break; + } + efa_cq_handle_error(base_ep, cq->ibv_cq.ibv_cq_ex, + to_fi_errno(prov_errno), + prov_errno, false); + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, "Unhandled op code %d\n", opcode); + assert(0 && "Unhandled op code"); + } + break; + } + + 
efa_cq_construct_cq_entry(cq->ibv_cq.ibv_cq_ex, &cq_entry); + + switch (opcode) { + case IBV_WC_SEND: /* fall through */ + case IBV_WC_RDMA_WRITE: /* fall through */ + case IBV_WC_RDMA_READ: + efa_cq_handle_tx_completion(base_ep, cq->ibv_cq.ibv_cq_ex, &cq_entry); + efa_cntr_report_tx_completion(&base_ep->util_ep, cq_entry.flags); + break; + case IBV_WC_RECV: + efa_cq_handle_rx_completion(base_ep, cq->ibv_cq.ibv_cq_ex, &cq_entry); + efa_cntr_report_rx_completion(&base_ep->util_ep, cq_entry.flags); + break; + case IBV_WC_RECV_RDMA_WITH_IMM: + efa_cq_proc_ibv_recv_rdma_with_imm_completion( + base_ep, cq->ibv_cq.ibv_cq_ex); + efa_cntr_report_rx_completion(&base_ep->util_ep, cq_entry.flags); + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, + "Unhandled cq type\n"); + assert(0 && "Unhandled cq type"); + } + + num_cqe++; + if (num_cqe == cqe_to_process) { + break; + } + + err = ibv_next_poll(cq->ibv_cq.ibv_cq_ex); + } + + if (err && err != ENOENT) { + err = err > 0 ? err : -err; + prov_errno = ibv_wc_read_vendor_err(cq->ibv_cq.ibv_cq_ex); + EFA_WARN(FI_LOG_CQ, + "Unexpected error when polling ibv cq, err: %s (%zd) " + "prov_errno: %s (%d)\n", + fi_strerror(err), err, efa_strerror(prov_errno), + prov_errno); + efa_show_help(prov_errno); + err_entry = (struct fi_cq_err_entry) { + .err = err, + .prov_errno = prov_errno, + .op_context = NULL, + }; + ofi_cq_write_error(&cq->util_cq, &err_entry); + } + + if (should_end_poll) + ibv_end_poll(cq->ibv_cq.ibv_cq_ex); +} + +static const char *efa_cq_strerror(struct fid_cq *cq_fid, + int prov_errno, + const void *err_data, + char *buf, size_t len) +{ + return err_data + ? 
(const char *) err_data + : efa_strerror(prov_errno); +} + +static struct fi_ops_cq efa_cq_ops = { + .size = sizeof(struct fi_ops_cq), + .read = ofi_cq_read, + .readfrom = ofi_cq_readfrom, + .readerr = ofi_cq_readerr, + .sread = fi_no_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = efa_cq_strerror +}; + +void efa_cq_progress(struct util_cq *cq) +{ + efa_cq_poll_ibv_cq(efa_env.efa_cq_read_size, cq); +} + +static int efa_cq_close(fid_t fid) +{ + struct efa_cq *cq; + int ret; + + cq = container_of(fid, struct efa_cq, util_cq.cq_fid.fid); + + if (cq->ibv_cq.ibv_cq_ex) { + ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", + fi_strerror(-ret)); + return ret; + } + cq->ibv_cq.ibv_cq_ex = NULL; + } + + ret = ofi_cq_cleanup(&cq->util_cq); + if (ret) + return ret; + + free(cq); + + return 0; +} + +static struct fi_ops efa_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = efa_cq_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + + +int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context) +{ + struct efa_cq *cq; + struct efa_domain *efa_domain; + int err, retv; + + if (attr->wait_obj != FI_WAIT_NONE) + return -FI_ENOSYS; + + cq = calloc(1, sizeof(*cq)); + if (!cq) + return -FI_ENOMEM; + + err = ofi_cq_init(&efa_prov, domain_fid, attr, &cq->util_cq, + &efa_cq_progress, context); + if (err) { + EFA_WARN(FI_LOG_CQ, "Unable to create UTIL_CQ\n"); + goto err_free_cq; + } + + efa_domain = container_of(cq->util_cq.domain, struct efa_domain, + util_domain); + err = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, + &cq->ibv_cq.ibv_cq_ex, + &cq->ibv_cq.ibv_cq_ex_type); + if (err) { + EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", fi_strerror(err)); + goto err_free_util_cq; + } + + *cq_fid = &cq->util_cq.cq_fid; + (*cq_fid)->fid.fclass = 
FI_CLASS_CQ; + (*cq_fid)->fid.context = context; + (*cq_fid)->fid.ops = &efa_cq_fi_ops; + (*cq_fid)->ops = &efa_cq_ops; + + return 0; + +err_free_util_cq: + retv = ofi_cq_cleanup(&cq->util_cq); + if (retv) + EFA_WARN(FI_LOG_CQ, "Unable to close util cq: %s\n", + fi_strerror(-retv)); +err_free_cq: + free(cq); + return err; +} diff --git a/prov/efa/src/efa_cq.h b/prov/efa/src/efa_cq.h index 26366d5094c..8d328d8e7fd 100644 --- a/prov/efa/src/efa_cq.h +++ b/prov/efa/src/efa_cq.h @@ -18,6 +18,11 @@ struct efa_ibv_cq_poll_list_entry { struct efa_ibv_cq *cq; }; +struct efa_cq { + struct util_cq util_cq; + struct efa_ibv_cq ibv_cq; +}; + /* * Control header with completion data. CQ data length is static. */ @@ -177,6 +182,11 @@ static inline int efa_cq_ibv_cq_ex_open(struct fi_cq_attr *attr, } #endif +int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, + struct fid_cq **cq_fid, void *context); + +void efa_cq_progress(struct util_cq *cq); + #if HAVE_CAPS_UNSOLICITED_WRITE_RECV /** * @brief Check whether a completion consumes recv buffer @@ -200,3 +210,62 @@ bool efa_cq_wc_is_unsolicited(struct ibv_cq_ex *ibv_cq_ex) } #endif + +/** + * @brief Write the error message and return its byte length + * @param[in] ep EFA base endpoint + * @param[in] addr Remote peer fi_addr_t + * @param[in] prov_errno EFA provider * error code(must be positive) + * @param[out] err_msg Pointer to the address of error message written by + * this function + * @param[out] buflen Pointer to the returned error data size + * @return A status code. 0 if the error data was written successfully, + * otherwise a negative FI error code. 
+ */ +static inline int efa_write_error_msg(struct efa_base_ep *ep, fi_addr_t addr, + int prov_errno, char *err_msg, + size_t *buflen) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + const char *base_msg = efa_strerror(prov_errno); + size_t len = 0; + uint64_t local_host_id; + + *buflen = 0; + + len = sizeof(ep_addr_str); + efa_base_ep_raw_addr_str(ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_base_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + + local_host_id = efa_get_host_id(efa_env.host_id_file); + if (!local_host_id || + EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, + EFA_HOST_ID_STRING_LENGTH + 1, + "i-%017lx", local_host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + /* efa-raw cannot get peer host id without a handshake */ + strcpy(peer_host_id_str, "N/A"); + + int ret = snprintf(err_msg, EFA_ERROR_MSG_BUFFER_LENGTH, + "%s My EFA addr: %s My host id: %s Peer EFA addr: " + "%s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, + peer_addr_str, peer_host_id_str); + + if (ret < 0 || ret > EFA_ERROR_MSG_BUFFER_LENGTH - 1) { + return -FI_EINVAL; + } + + if (strlen(err_msg) >= EFA_ERROR_MSG_BUFFER_LENGTH) { + return -FI_ENOBUFS; + } + + *buflen = EFA_ERROR_MSG_BUFFER_LENGTH; + + return 0; +} diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 6af35775ae0..17e948c7eef 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -12,7 +12,6 @@ #include "rdm/efa_rdm_cq.h" #include "rdm/efa_rdm_atomic.h" #include "dgram/efa_dgram_ep.h" -#include "dgram/efa_dgram_cq.h" struct dlist_entry g_efa_domain_list; @@ -33,7 +32,7 @@ static struct fi_ops efa_ops_domain_fid = { static struct fi_ops_domain efa_ops_domain_dgram = { .size = sizeof(struct fi_ops_domain), .av_open = efa_av_open, - .cq_open = efa_dgram_cq_open, + .cq_open = 
efa_cq_open, .endpoint = efa_dgram_ep_open, .scalable_ep = fi_no_scalable_ep, .cntr_open = efa_cntr_open, diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index fbd4adb2bd9..c2af757e112 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -99,9 +99,9 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi } wr = &base_ep->efa_recv_wr_vec[wr_index].wr; - wr->wr_id = (uintptr_t)msg->context; wr->num_sge = msg->iov_count; wr->sg_list = base_ep->efa_recv_wr_vec[wr_index].sge; + wr->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? msg->context : NULL); for (i = 0; i < msg->iov_count; i++) { addr = (uintptr_t)msg->msg_iov[i].iov_base; @@ -224,7 +224,8 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi base_ep->is_wr_started = true; } - qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; + qp->ibv_qp_ex->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? msg->context : NULL); + if (flags & FI_REMOTE_CQ_DATA) { ibv_wr_send_imm(qp->ibv_qp_ex, msg->data); } else { diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index 052e2aa89d7..cf4987c34eb 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -90,7 +90,7 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; } - qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; + qp->ibv_qp_ex->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? msg->context : NULL); /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); @@ -221,7 +221,7 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; } - qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context; + qp->ibv_qp_ex->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? 
msg->context : NULL); if (flags & FI_REMOTE_CQ_DATA) { ibv_wr_rdma_write_imm(qp->ibv_qp_ex, msg->rma_iov[0].key, diff --git a/prov/efa/src/rdm/efa_rdm_cq.h b/prov/efa/src/rdm/efa_rdm_cq.h index 932c57109d7..a56d62dac40 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.h +++ b/prov/efa/src/rdm/efa_rdm_cq.h @@ -9,8 +9,8 @@ struct efa_rdm_cq { struct util_cq util_cq; - struct fid_cq *shm_cq; struct efa_ibv_cq ibv_cq; + struct fid_cq *shm_cq; struct dlist_entry ibv_cq_poll_list; bool need_to_scan_ep_list; }; diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 29a06fc1579..df415f7cd9a 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -3,8 +3,8 @@ #include "efa_unit_tests.h" #include "dgram/efa_dgram_ep.h" -#include "dgram/efa_dgram_cq.h" #include "rdm/efa_rdm_cq.h" +#include "efa_av.h" /** * @brief implementation of test cases for fi_cq_read() works with empty device CQ for given endpoint type @@ -27,7 +27,7 @@ void test_impl_cq_read_empty_cq(struct efa_resource *resource, enum fi_ep_type e struct efa_dgram_ep *efa_dgram_ep; efa_dgram_ep = container_of(resource->ep, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - ibv_cqx = efa_dgram_ep->rcq->ibv_cq_ex; + ibv_cqx = container_of(efa_dgram_ep->base_ep.util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; } else { struct efa_rdm_ep *efa_rdm_ep; @@ -811,3 +811,216 @@ void test_ibv_cq_ex_read_ignore_removed_peer() skip(); } #endif + +static void test_efa_cq_read(struct efa_resource *resource, fi_addr_t *addr, + int ibv_wc_opcode, int status, int vendor_error) +{ + int ret; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_ep_addr raw_addr; + struct ibv_cq_ex *ibv_cqx; + struct ibv_qp_ex *ibv_qpx; + struct efa_base_ep *base_ep; + + efa_unit_test_resource_construct(resource, FI_EP_DGRAM); + + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_qpx = base_ep->qp->ibv_qp_ex; + + ret = fi_getname(&resource->ep->fid, 
&raw_addr, &raw_addr_len); + assert_int_equal(ret, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + ret = fi_av_insert(resource->av, &raw_addr, 1, addr, 0 /* flags */, NULL /* context */); + assert_int_equal(ret, 1); + + ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; + /* this mock will save the send work request (wr) in a global list */ + ibv_qpx->wr_send = &efa_mock_ibv_wr_send_save_wr; + ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; + ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; + ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; + + base_ep->qp->ibv_qp->context->ops.post_recv = &efa_mock_ibv_post_recv; + will_return_maybe(efa_mock_ibv_post_recv, 0); + + if (ibv_wc_opcode == IBV_WC_RECV) { + ibv_cqx = container_of(base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->wr_id = (uintptr_t)12345; + will_return(efa_mock_ibv_start_poll_return_mock, 0); + ibv_cqx->status = status; + } else { + ibv_cqx = container_of(base_ep->util_ep.tx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; + /* this mock will set ibv_cq_ex->wr_id to the wr_id of the head of global send_wr, + * and set ibv_cq_ex->status to mock value */ + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status; + will_return(efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status, status); + } + + ibv_cqx->next_poll = &efa_mock_ibv_next_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + will_return_maybe(efa_mock_ibv_end_poll_check_mock, NULL); + will_return_maybe(efa_mock_ibv_next_poll_return_mock, 0); + will_return_maybe(efa_mock_ibv_read_opcode_return_mock, ibv_wc_opcode); + 
will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, base_ep->qp->qp_num); + will_return_maybe(efa_mock_ibv_read_vendor_err_return_mock, vendor_error); +#if HAVE_EFADV_CQ_EX + ibv_cqx->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + ibv_cqx->read_slid = &efa_mock_ibv_read_slid_return_mock; + ibv_cqx->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; + ibv_cqx->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; + will_return_maybe(efa_mock_ibv_read_byte_len_return_mock, 4096); + will_return_maybe(efa_mock_ibv_read_slid_return_mock, efa_av_addr_to_conn(base_ep->av, *addr)->ah->ahn); + will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); + will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0); +#endif +} + +/** + * @brief test EFA CQ's fi_cq_read() works properly when rdma-core return + * success status for send operation. + */ +void test_efa_cq_read_send_success(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct fi_cq_data_entry cq_entry; + fi_addr_t addr; + int ret; + + test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_SUCCESS, 0); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_send(resource->ep, send_buff.buff, send_buff.size, + fi_mr_desc(send_buff.mr), addr, (void *) 12345); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + ret = fi_cq_read(resource->cq, &cq_entry, 1); + /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */ + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + assert_int_equal(ret, 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief test EFA CQ's fi_cq_read() works properly when rdma-core return + * success status for recv operation. 
+ */ +void test_efa_cq_read_recv_success(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff recv_buff; + struct fi_cq_data_entry cq_entry; + fi_addr_t addr; + int ret; + + test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_SUCCESS, 0); + efa_unit_test_buff_construct(&recv_buff, resource, 4096 /* buff_size */); + + ret = fi_recv(resource->ep, recv_buff.buff, recv_buff.size, + fi_mr_desc(recv_buff.mr), addr, NULL); + assert_int_equal(ret, 0); + + ret = fi_cq_read(resource->cq, &cq_entry, 1); + assert_int_equal(ret, 1); + + efa_unit_test_buff_destruct(&recv_buff); +} + +static void efa_cq_check_cq_err_entry(struct efa_resource *resource, int vendor_error) { + struct fi_cq_err_entry cq_err_entry = {0}; + const char *strerror; + int ret; + + /* Allocate memory to read CQ error */ + cq_err_entry.err_data_size = EFA_ERROR_MSG_BUFFER_LENGTH; + cq_err_entry.err_data = malloc(cq_err_entry.err_data_size); + assert_non_null(cq_err_entry.err_data); + + ret = fi_cq_readerr(resource->cq, &cq_err_entry, 0); + assert_true(cq_err_entry.err_data_size > 0); + strerror = fi_cq_strerror(resource->cq, cq_err_entry.prov_errno, + cq_err_entry.err_data, NULL, 0); + + assert_int_equal(ret, 1); + assert_int_not_equal(cq_err_entry.err, FI_SUCCESS); + assert_int_equal(cq_err_entry.prov_errno, vendor_error); + assert_true(strlen(strerror) > 0); +} + +/** + * @brief test EFA CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core return bad status for send. + * + * When the send operation failed, fi_cq_read() should return -FI_EAVAIL, which means error available. + * then user should call fi_cq_readerr() to get an error CQ entry that contain error code. 
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_efa_cq_read_send_failure(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct fi_cq_data_entry cq_entry; + fi_addr_t addr; + int ret; + + test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_GENERAL_ERR, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); + + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_send(resource->ep, send_buff.buff, send_buff.size, + fi_mr_desc(send_buff.mr), addr, (void *) 12345); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + ret = fi_cq_read(resource->cq, &cq_entry, 1); + /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */ + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + assert_int_equal(ret, -FI_EAVAIL); + + efa_cq_check_cq_err_entry(resource, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief test EFA CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core return bad status for recv. + * + * When the recv operation failed, fi_cq_read() should return -FI_EAVAIL, which means error available. + * then user should call fi_cq_readerr() to get an error CQ entry that contain error code. 
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_efa_cq_read_recv_failure(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff recv_buff; + struct fi_cq_data_entry cq_entry; + fi_addr_t addr; + int ret; + + test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_GENERAL_ERR, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + efa_unit_test_buff_construct(&recv_buff, resource, 4096 /* buff_size */); + + ret = fi_recv(resource->ep, recv_buff.buff, recv_buff.size, + fi_mr_desc(recv_buff.mr), addr, NULL); + assert_int_equal(ret, 0); + + ret = fi_cq_read(resource->cq, &cq_entry, 1); + assert_int_equal(ret, -FI_EAVAIL); + + efa_cq_check_cq_err_entry(resource, + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + + efa_unit_test_buff_destruct(&recv_buff); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 017f4e65ded..3e3ba43ef04 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -229,6 +229,10 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rma_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_inject_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_inject_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_send_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_send_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git 
a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 4a796e5385f..86bef64edab 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -251,6 +251,10 @@ void test_efa_rma_writemsg(); void test_efa_rma_writedata(); void test_efa_rma_inject_write(); void test_efa_rma_inject_writedata(); +void test_efa_cq_read_send_success(); +void test_efa_cq_read_recv_success(); +void test_efa_cq_read_send_failure(); +void test_efa_cq_read_recv_failure(); static inline int efa_unit_test_get_dlist_length(struct dlist_entry *head) From d4ee2cfc3f2dde7263ed34f33361272b2fbb6503 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 7 Jan 2025 11:41:27 -0800 Subject: [PATCH 341/393] prov/efa: Add missing mock for wc_is_unsolicited in unit test Signed-off-by: Jessie Yang --- prov/efa/test/efa_unit_test_cq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index df415f7cd9a..c5b93cd5e66 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -878,6 +878,12 @@ static void test_efa_cq_read(struct efa_resource *resource, fi_addr_t *addr, will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0); #endif +#if HAVE_CAPS_UNSOLICITED_WRITE_RECV + if (efa_use_unsolicited_write_recv()) { + efadv_cq_from_ibv_cq_ex(ibv_cqx)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + will_return_maybe(efa_mock_efadv_wc_is_unsolicited, false); + } +#endif } /** From 1eb88f646c9752b399553bb397995d03ee73d33f Mon Sep 17 00:00:00 2001 From: Steve Welch Date: Sun, 8 Dec 2024 14:15:06 -0600 Subject: [PATCH 342/393] prov/cxi: Fix CQ wait FD logic Implement cxi managed internal wait FDs. EP bound to a CQ(s) with wait_obj will allocate their own internal CXI wait object and add sysfs_notify FD to the CQ. 
Fix CQ trywait logic to correctly enable h/w EQ interrupts and include control EQ which may require progress be initiated. NETCASSINI-6749 Signed-off-by: Steve Welch --- man/fi_cxi.7.md | 4 +- prov/cxi/include/cxip.h | 19 +- prov/cxi/src/cxip_cq.c | 223 +++++++++++++++-------- prov/cxi/src/cxip_ctrl.c | 147 +-------------- prov/cxi/src/cxip_ep.c | 175 +++++++++++++++--- prov/cxi/src/cxip_evtq.c | 5 +- prov/cxi/src/cxip_fabric.c | 32 +++- prov/cxi/src/cxip_rxc.c | 3 +- prov/cxi/src/cxip_txc.c | 3 +- prov/cxi/test/cxip_test_common.c | 1 + prov/cxi/test/tagged.c | 300 +++++++++++++++++++++++++++++-- 11 files changed, 646 insertions(+), 266 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index c2cbffe2b52..0109d19e9ed 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -80,7 +80,9 @@ The CXI provider supports FI_THREAD_SAFE and FI_THREAD_DOMAIN threading models. The CXI provider supports FI_WAIT_FD and FI_WAIT_POLLFD CQ wait object types. FI_WAIT_UNSPEC will default to FI_WAIT_FD. However FI_WAIT_NONE should achieve -the lowest latency and reduce interrupt overhead. +the lowest latency and reduce interrupt overhead. NOTE: A process may return +from a epoll_wait/poll when provider progress is required and a CQ event may +not be available. ## Additional Features diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 0a73fc90582..34a4a9d242c 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -1420,8 +1420,8 @@ struct cxip_cq { */ struct ofi_genlock ep_list_lock; - /* Internal CXI wait object allocated only if required. */ - struct cxil_wait_obj *priv_wait; + /* CXI CQ wait object EPs are maintained in epoll FD */ + int ep_fd; /* CXI specific fields. 
*/ struct cxip_domain *domain; @@ -2428,6 +2428,10 @@ struct cxip_ep_obj { struct cxip_txc *txc; struct cxip_rxc *rxc; + /* Internal support for CQ wait object */ + struct cxil_wait_obj *priv_wait; + int wait_fd; + /* ASIC version associated with EP/Domain */ enum cassini_version asic_ver; @@ -3148,7 +3152,8 @@ static inline bool cxip_cmdq_match(struct cxip_cmdq *cmdq, uint16_t vni, } int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, - size_t num_events, size_t num_fc_events); + size_t num_events, size_t num_fc_events, + struct cxil_wait_obj *priv_wait); void cxip_evtq_fini(struct cxip_evtq *eq); int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, @@ -3228,6 +3233,9 @@ int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src); int cxip_cq_req_error(struct cxip_req *req, size_t olen, int err, int prov_errno, void *err_data, size_t err_data_size, fi_addr_t src_addr); +int cxip_cq_add_wait_fd(struct cxip_cq *cq, int wait_fd, int events); +void cxip_cq_del_wait_fd(struct cxip_cq *cq, int wait_fd); + int proverr2errno(int err); struct cxip_req *cxip_evtq_req_alloc(struct cxip_evtq *evtq, int remap, void *req_ctx); @@ -3235,9 +3243,9 @@ void cxip_evtq_req_free(struct cxip_req *req); void cxip_evtq_progress(struct cxip_evtq *evtq); void cxip_ep_progress(struct fid *fid); -int cxip_ep_peek(struct fid *fid); void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj); +int cxip_cq_trywait(struct cxip_cq *cq); void cxip_cq_progress(struct cxip_cq *cq); void cxip_util_cq_progress(struct util_cq *util_cq); int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, @@ -3266,8 +3274,7 @@ void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj); void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj); void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj); -void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj); -int cxip_ep_ctrl_trywait(void *arg); +int cxip_ep_trywait(struct 
cxip_ep_obj *ep_obj, struct cxip_cq *cq); int cxip_av_set(struct fid_av *av, struct fi_av_set_attr *attr, struct fid_av_set **av_set_fid, void * context); diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c index 1c6504c90bc..f55eb27141f 100644 --- a/prov/cxi/src/cxip_cq.c +++ b/prov/cxi/src/cxip_cq.c @@ -184,37 +184,33 @@ static const char *cxip_cq_strerror(struct fid_cq *cq, int prov_errno, return errmsg; } -/* - * cxip_cq_trywait - Return success if able to block waiting for CQ events. - */ -static int cxip_cq_trywait(void *arg) +int cxip_cq_trywait(struct cxip_cq *cq) { - struct cxip_cq *cq = (struct cxip_cq *)arg; struct fid_list_entry *fid_entry; struct dlist_entry *item; + struct cxip_ep *ep; - assert(cq->util_cq.wait); - - if (!cq->priv_wait) { + if (cq->ep_fd < 0) { CXIP_WARN("No CXI wait object\n"); return -FI_EINVAL; } + ofi_genlock_lock(&cq->util_cq.cq_lock); + if (!ofi_cirque_isempty(cq->util_cq.cirq)) { + ofi_genlock_unlock(&cq->util_cq.cq_lock); + return -FI_EAGAIN; + } + ofi_genlock_unlock(&cq->util_cq.cq_lock); + ofi_genlock_lock(&cq->ep_list_lock); dlist_foreach(&cq->util_cq.ep_list, item) { fid_entry = container_of(item, struct fid_list_entry, entry); - if (cxip_ep_peek(fid_entry->fid)) { - ofi_genlock_unlock(&cq->ep_list_lock); + ep = container_of(fid_entry->fid, struct cxip_ep, ep.fid); - return -FI_EAGAIN; - } - } + if (!ep->ep_obj->priv_wait) + continue; - /* Clear wait, and check for any events */ - cxil_clear_wait_obj(cq->priv_wait); - dlist_foreach(&cq->util_cq.ep_list, item) { - fid_entry = container_of(item, struct fid_list_entry, entry); - if (cxip_ep_peek(fid_entry->fid)) { + if (cxip_ep_trywait(ep->ep_obj, cq)) { ofi_genlock_unlock(&cq->ep_list_lock); return -FI_EAGAIN; @@ -256,21 +252,12 @@ static int cxip_cq_close(struct fid *fid) { struct cxip_cq *cq = container_of(fid, struct cxip_cq, util_cq.cq_fid.fid); - int ret; if (ofi_atomic_get32(&cq->util_cq.ref)) return -FI_EBUSY; - if (cq->priv_wait) { - ret = 
ofi_wait_del_fd(cq->util_cq.wait, - cxil_get_wait_obj_fd(cq->priv_wait)); - if (ret) - CXIP_WARN("Wait FD delete error: %d\n", ret); - - ret = cxil_destroy_wait_obj(cq->priv_wait); - if (ret) - CXIP_WARN("Release CXI wait object failed: %d\n", ret); - } + if (cq->ep_fd >= 0) + close(cq->ep_fd); ofi_cq_cleanup(&cq->util_cq); ofi_genlock_destroy(&cq->ep_list_lock); @@ -281,14 +268,116 @@ static int cxip_cq_close(struct fid *fid) return 0; } +static int cxip_cq_signal(struct fid_cq *cq_fid) +{ + return -FI_ENOSYS; +} + +static int cxip_cq_control(fid_t fid, int command, void *arg) +{ + struct cxip_cq *cq = container_of(fid, struct cxip_cq, util_cq.cq_fid); + struct fi_wait_pollfd *pollfd; + int ret; + + switch (command) { + case FI_GETWAIT: + if (cq->ep_fd < 0) { + ret = -FI_ENODATA; + break; + } + if (cq->attr.wait_obj == FI_WAIT_FD) { + *(int *) arg = cq->ep_fd; + return FI_SUCCESS; + } + + pollfd = arg; + if (pollfd->nfds >= 1) { + pollfd->fd[0].fd = cq->ep_fd; + pollfd->fd[0].events = POLLIN; + pollfd->nfds = 1; + + ret = FI_SUCCESS; + } else { + ret = -FI_ETOOSMALL; + } + break; + case FI_GETWAITOBJ: + *(enum fi_wait_obj *) arg = cq->attr.wait_obj; + ret = FI_SUCCESS; + break; + default: + ret = -FI_ENOSYS; + break; + } + + return ret; +} + +static ssize_t cxip_cq_sreadfrom(struct fid_cq *cq_fid, void *buf, + size_t count, fi_addr_t *src_addr, + const void *cond, int timeout) +{ + struct cxip_cq *cq = container_of(cq_fid, struct cxip_cq, + util_cq.cq_fid); + struct epoll_event ev; + uint64_t endtime; + ssize_t ret; + + if (!cq->attr.wait_obj) + return -FI_EINVAL; + + endtime = ofi_timeout_time(timeout); + + do { + ret = fi_cq_readfrom(cq_fid, buf, count, src_addr); + if (ret != -FI_EAGAIN) + break; + + if (ofi_adjust_timeout(endtime, &timeout)) + return -FI_EAGAIN; + + ret = cxip_cq_trywait(cq); + if (ret == -FI_EAGAIN) { + ret = 0; + continue; + } + assert(ret == FI_SUCCESS); + + memset(&ev, 0, sizeof(ev)); + ret = epoll_wait(cq->ep_fd, &ev, 1, timeout); + if 
(ret > 0) + ret = 0; + + } while (!ret); + + return ret == -FI_ETIMEDOUT ? -FI_EAGAIN : ret; +} + +static ssize_t cxip_cq_sread(struct fid_cq *cq_fid, void *buf, size_t count, + const void *cond, int timeout) +{ + return cxip_cq_sreadfrom(cq_fid, buf, count, NULL, cond, timeout); +} + static struct fi_ops cxip_cq_fi_ops = { .size = sizeof(struct fi_ops), .close = cxip_cq_close, .bind = fi_no_bind, - .control = ofi_cq_control, + .control = cxip_cq_control, .ops_open = fi_no_ops_open, }; +static struct fi_ops_cq cxip_cq_ops = { + .size = sizeof(struct fi_ops_cq), + .read = ofi_cq_read, + .readfrom = ofi_cq_readfrom, + .readerr = ofi_cq_readerr, + .sread = cxip_cq_sread, + .sreadfrom = cxip_cq_sreadfrom, + .signal = cxip_cq_signal, + .strerror = ofi_cq_strerror, +}; + static struct fi_cq_attr cxip_cq_def_attr = { .flags = 0, .format = FI_CQ_FORMAT_CONTEXT, @@ -348,50 +437,35 @@ static int cxip_cq_verify_attr(struct fi_cq_attr *attr) return FI_SUCCESS; } -/* - * cxip_cq_alloc_priv_wait - Allocate an internal wait channel for the CQ. 
- */ -static int cxip_cq_alloc_priv_wait(struct cxip_cq *cq) +/* EP adds wait FD to the CQ epoll FD */ +int cxip_cq_add_wait_fd(struct cxip_cq *cq, int wait_fd, int events) { + struct epoll_event ev = { + .events = events, + }; int ret; - int wait_fd; - - assert(cq->domain); - - /* Not required or already created */ - if (!cq->util_cq.wait || cq->priv_wait) - return FI_SUCCESS; - - ret = cxil_alloc_wait_obj(cq->domain->lni->lni, &cq->priv_wait); - if (ret) { - CXIP_WARN("Allocation of internal wait object failed %d\n", - ret); - return ret; - } - wait_fd = cxil_get_wait_obj_fd(cq->priv_wait); - ret = fi_fd_nonblock(wait_fd); - if (ret) { - CXIP_WARN("Unable to set CQ wait non-blocking mode: %d\n", ret); - goto destroy_wait; - } + ret = epoll_ctl(cq->ep_fd, EPOLL_CTL_ADD, wait_fd, &ev); + if (ret < 0) { + ret = errno; + CXIP_WARN("EP wait FD add to CQ failed %d\n", ret); - ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, POLLIN, - cxip_cq_trywait, cq, &cq->util_cq.cq_fid.fid); - if (ret) { - CXIP_WARN("Add FD of internal wait object failed: %d\n", ret); - goto destroy_wait; + return -FI_EINVAL; } - CXIP_DBG("Add CQ private wait object, CQ intr FD: %d\n", wait_fd); - return FI_SUCCESS; +} -destroy_wait: - cxil_destroy_wait_obj(cq->priv_wait); - cq->priv_wait = NULL; +/* EP deletes wait FD from the CQ epoll FD */ +void cxip_cq_del_wait_fd(struct cxip_cq *cq, int wait_fd) +{ + int ret; - return ret; + ret = epoll_ctl(cq->ep_fd, EPOLL_CTL_DEL, wait_fd, NULL); + if (ret < 0) { + ret = errno; + CXIP_WARN("EP wait FD delete from CQ failed %d\n", ret); + } } /* @@ -402,6 +476,7 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, { struct cxip_domain *cxi_dom; struct cxip_cq *cxi_cq; + struct fi_cq_attr temp_attr; int ret; if (!domain || !cq) @@ -425,7 +500,10 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, cxi_cq->attr = *attr; } - ret = ofi_cq_init(&cxip_prov, domain, &cxi_cq->attr, &cxi_cq->util_cq, + /* CXI does not use common 
code internal wait object */ + temp_attr = cxi_cq->attr; + temp_attr.wait_obj = FI_WAIT_NONE; + ret = ofi_cq_init(&cxip_prov, domain, &temp_attr, &cxi_cq->util_cq, cxip_util_cq_progress, context); if (ret != FI_SUCCESS) { CXIP_WARN("ofi_cq_init() failed: %d\n", ret); @@ -434,9 +512,10 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; cxi_cq->util_cq.cq_fid.fid.ops = &cxip_cq_fi_ops; - + cxi_cq->util_cq.cq_fid.ops = &cxip_cq_ops; cxi_cq->domain = cxi_dom; cxi_cq->ack_batch_size = cxip_env.eq_ack_batch_size; + cxi_cq->ep_fd = -1; /* Optimize locking when possible */ if (cxi_dom->util_domain.threading == FI_THREAD_DOMAIN || @@ -445,11 +524,11 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, else ofi_genlock_init(&cxi_cq->ep_list_lock, OFI_LOCK_SPINLOCK); - if (cxi_cq->util_cq.wait) { - ret = cxip_cq_alloc_priv_wait(cxi_cq); - if (ret != FI_SUCCESS) { - CXIP_WARN("Unable to allocate CXI wait obj: %d\n", - ret); + if (cxi_cq->attr.wait_obj) { + cxi_cq->ep_fd = epoll_create1(0); + if (cxi_cq->ep_fd < 0) { + CXIP_WARN("Unable to open epoll FD: %s\n", + strerror(errno)); goto err_wait_alloc; } } diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c index bb543b6409a..03b117b7ef4 100644 --- a/prov/cxi/src/cxip_ctrl.c +++ b/prov/cxi/src/cxip_ctrl.c @@ -406,36 +406,6 @@ void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tgt_evtq, false, true); } -/* - * cxip_ep_ctrl_trywait() - Return 0 if no events need to be progressed. 
- */ -int cxip_ep_ctrl_trywait(void *arg) -{ - struct cxip_ep_obj *ep_obj = (struct cxip_ep_obj *)arg; - - if (!ep_obj->ctrl.wait) { - CXIP_WARN("No CXI ep_obj wait object\n"); - return -FI_EINVAL; - } - - if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq) || - cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) - return -FI_EAGAIN; - - ofi_genlock_lock(&ep_obj->lock); - cxil_clear_wait_obj(ep_obj->ctrl.wait); - - if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq) || - cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) { - ofi_genlock_unlock(&ep_obj->lock); - - return -FI_EAGAIN; - } - ofi_genlock_unlock(&ep_obj->lock); - - return FI_SUCCESS; -} - static void cxip_eq_ctrl_eq_free(void *eq_buf, struct cxi_md *eq_md, struct cxi_eq *eq) { @@ -484,7 +454,7 @@ static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, /* ep_obj->ctrl.wait will be NULL if not required */ ret = cxil_alloc_evtq(ep_obj->domain->lni->lni, *eq_md, &eq_attr, - ep_obj->ctrl.wait, NULL, eq); + ep_obj->priv_wait, NULL, eq); if (ret) goto err_free_eq_md; @@ -500,107 +470,6 @@ static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, return ret; } -/* - * cxip_ep_wait_required() - return true if base EP wait object is required. 
- */ -static bool cxip_ctrl_wait_required(struct cxip_ep_obj *ep_obj) -{ - if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->priv_wait) - return true; - - if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->priv_wait) - return true; - - return false; -} - -/* - * cxip_ep_ctrl_del_wait() - Delete control FD object - */ -void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj) -{ - int wait_fd; - - wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl.wait); - - if (ep_obj->txc->send_cq) { - ofi_wait_del_fd(ep_obj->txc->send_cq->util_cq.wait, wait_fd); - CXIP_DBG("Deleted control HW EQ FD: %d from CQ: %p\n", - wait_fd, ep_obj->txc->send_cq); - } - - if (ep_obj->rxc->recv_cq && - ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) { - ofi_wait_del_fd(ep_obj->rxc->recv_cq->util_cq.wait, wait_fd); - CXIP_DBG("Deleted control HW EQ FD: %d from CQ %p\n", - wait_fd, ep_obj->rxc->recv_cq); - } -} - -/* - * cxip_ep_ctrl_add_wait() - Add control FD to CQ object - */ -int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) -{ - struct cxip_cq *cq; - int wait_fd; - int ret; - - ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, - &ep_obj->ctrl.wait); - if (ret) { - CXIP_WARN("Control wait object allocation failed: %d\n", ret); - return -FI_ENOMEM; - } - - wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl.wait); - ret = fi_fd_nonblock(wait_fd); - if (ret) { - CXIP_WARN("Unable to set control wait non-blocking: %d, %s\n", - ret, fi_strerror(-ret)); - goto err; - } - - cq = ep_obj->txc->send_cq; - if (cq) { - ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, - POLLIN, cxip_ep_ctrl_trywait, ep_obj, - &cq->util_cq.cq_fid.fid); - if (ret) { - CXIP_WARN("TX CQ add FD failed: %d, %s\n", - ret, fi_strerror(-ret)); - goto err; - } - } - - if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq != cq) { - cq = ep_obj->rxc->recv_cq; - - ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, - POLLIN, cxip_ep_ctrl_trywait, ep_obj, - &cq->util_cq.cq_fid.fid); - if (ret) { - CXIP_WARN("RX CQ add FD failed: %d, %s\n", - ret, 
fi_strerror(-ret)); - goto err_add_fd; - } - } - - CXIP_DBG("Added control EQ private wait object, intr FD: %d\n", - wait_fd); - - return FI_SUCCESS; - -err_add_fd: - if (ep_obj->txc->send_cq) - ofi_wait_del_fd(ep_obj->txc->send_cq->util_cq.wait, wait_fd); -err: - cxil_destroy_wait_obj(ep_obj->ctrl.wait); - ep_obj->ctrl.wait = NULL; - - return ret; -} - /* * cxip_ep_ctrl_init() - Initialize endpoint control resources. * @@ -624,20 +493,6 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) if (ep_obj->domain->mr_match_events) pt_opts.en_event_match = 1; - /* If CQ(s) are using a wait object, then control event - * queues need to unblock CQ poll as well. CQ will add the - * associated FD to the CQ FD list. - */ - if (cxip_ctrl_wait_required(ep_obj)) { - ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, - &ep_obj->ctrl.wait); - if (ret) { - CXIP_WARN("EP ctrl wait object alloc failed: %d\n", - ret); - return ret; - } - } - ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * sc_page_size, &ep_obj->ctrl.tx_evtq_buf, &ep_obj->ctrl.tx_evtq_buf_md, diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index aebec245ef7..48333d02ae2 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -187,26 +187,6 @@ void cxip_ep_progress(struct fid *fid) } } -/* - * cxip_ep_peek() - Peek at EP event queues - * - * Return whether the associated EP event queues are empty. - */ -int cxip_ep_peek(struct fid *fid) -{ - struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); - struct cxip_ep_obj *ep_obj = ep->ep_obj; - - if (ep_obj->txc->tx_evtq.eq && - cxi_eq_peek_event(ep_obj->txc->tx_evtq.eq)) - return -FI_EAGAIN; - if (ep_obj->rxc->rx_evtq.eq && - cxi_eq_peek_event(ep_obj->rxc->rx_evtq.eq)) - return -FI_EAGAIN; - - return FI_SUCCESS; -} - /* * fi_ep_get_unexpected_msgs() - Get unexpected message information, exposed * via domain open ops. 
@@ -491,6 +471,134 @@ ssize_t cxip_ep_cancel(fid_t fid, void *context) return ret; } +/* + * cxip_ep_destroy_priv_wait - Free an internal wait channel for the EP. + */ +static void cxip_ep_destroy_priv_wait(struct cxip_ep_obj *ep_obj) +{ + assert(ep_obj->priv_wait); + + if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->attr.wait_obj) + cxip_cq_del_wait_fd(ep_obj->txc->send_cq, ep_obj->wait_fd); + + if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->attr.wait_obj && + ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) + cxip_cq_del_wait_fd(ep_obj->rxc->recv_cq, ep_obj->wait_fd); + + cxil_destroy_wait_obj(ep_obj->priv_wait); + + ep_obj->priv_wait = NULL; + ep_obj->wait_fd = -1; +} + +/* + * cxip_ep_alloc_priv_wait - Allocate an internal wait channel for the EP. + */ +static int cxip_ep_alloc_priv_wait(struct cxip_ep_obj *ep_obj) +{ + bool tx_cq_added = false; + int ret; + + assert(ep_obj->priv_wait == NULL); + + ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, &ep_obj->priv_wait); + if (ret) { + CXIP_WARN("Alloc of EP internal wait object failed %d\n", + ret); + return ret; + } + + ep_obj->wait_fd = cxil_get_wait_obj_fd(ep_obj->priv_wait); + ret = fi_fd_nonblock(ep_obj->wait_fd); + if (ret) { + CXIP_WARN("Unable to set EP wait non-blocking mode: %d\n", ret); + goto destroy_wait; + } + + if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->attr.wait_obj) { + ret = cxip_cq_add_wait_fd(ep_obj->txc->send_cq, ep_obj->wait_fd, + EPOLLPRI | POLLERR); + if (ret) + goto destroy_wait; + + tx_cq_added = true; + } + + if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->attr.wait_obj && + ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) { + ret = cxip_cq_add_wait_fd(ep_obj->rxc->recv_cq, ep_obj->wait_fd, + EPOLLPRI | POLLERR); + if (ret) { + if (tx_cq_added) + cxip_cq_del_wait_fd(ep_obj->txc->send_cq, + ep_obj->wait_fd); + goto destroy_wait; + } + } + + CXIP_DBG("Add EP private wait object, EP intr FD: %d\n", + ep_obj->wait_fd); + + return FI_SUCCESS; + +destroy_wait: + 
cxil_destroy_wait_obj(ep_obj->priv_wait); + ep_obj->priv_wait = NULL; + ep_obj->wait_fd = -1; + + return ret; +} + +/* + * cxip_ep_trywait() - Determine if hardware events are waiting to be processed + * for EP based on CQ. + */ +int cxip_ep_trywait(struct cxip_ep_obj *ep_obj, struct cxip_cq *cq) +{ + assert(ep_obj->priv_wait); + + ofi_genlock_lock(&ep_obj->lock); + cxil_clear_wait_obj(ep_obj->priv_wait); + + /* Enable any currently disabled EQ interrupts, if events are + * ready shortcut and return. + */ + if ((ep_obj->txc->send_cq == cq || + ep_obj->rxc->recv_cq == cq) && ep_obj->txc->tx_evtq.eq) { + cxi_eq_int_enable(ep_obj->txc->tx_evtq.eq); + ep_obj->txc->tx_evtq.unacked_events = 0; + + if (cxi_eq_peek_event(ep_obj->txc->tx_evtq.eq)) + goto ready; + } + + if (ep_obj->rxc->recv_cq == cq && ep_obj->rxc->rx_evtq.eq) { + cxi_eq_int_enable(ep_obj->rxc->rx_evtq.eq); + ep_obj->rxc->rx_evtq.unacked_events = 0; + + if (cxi_eq_peek_event(ep_obj->rxc->rx_evtq.eq)) + goto ready; + } + + /* Side band control messages can also require progress */ + cxi_eq_int_enable(ep_obj->ctrl.tx_evtq); + if (cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) + goto ready; + + cxi_eq_int_enable(ep_obj->ctrl.tgt_evtq); + if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq)) + goto ready; + + ofi_genlock_unlock(&ep_obj->lock); + + return FI_SUCCESS; + +ready: + ofi_genlock_unlock(&ep_obj->lock); + + return -FI_EAGAIN; +} + /* * cxip_ep_enable() - Enable standard EP. */ @@ -504,10 +612,23 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) if (ep_obj->enabled) goto unlock; + /* Allocate an EP internal wait object if a CQ is bound with a + * wait object specified. 
+ */ + if ((ep_obj->txc->send_cq && ep_obj->txc->send_cq->attr.wait_obj) || + (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->attr.wait_obj)) { + ret = cxip_ep_alloc_priv_wait(ep_obj); + if (ret) { + CXIP_WARN("EP internal wait alloc failed %s\n", + fi_strerror(-ret)); + goto unlock; + } + } + if (!ep_obj->av) { CXIP_WARN("Endpoint must be bound to an AV\n"); ret = -FI_ENOAV; - goto unlock; + goto free_wait; } assert(ep_obj->domain->enabled); @@ -517,7 +638,7 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) ret = cxip_av_auth_key_get_vnis(ep_obj->av, &ep_obj->vnis, &ep_obj->vni_count); if (ret) - goto unlock; + goto free_wait; ret = cxip_portals_table_alloc(ep_obj->domain->lni, ep_obj->vnis, ep_obj->vni_count, @@ -541,7 +662,7 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate portals table: %d\n", ret); - goto unlock; + goto free_wait; } } @@ -625,6 +746,10 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) ep_obj->vni_count); ep_obj->vnis = NULL; } +free_wait: + if (ep_obj->priv_wait) + cxip_ep_destroy_priv_wait(ep_obj); + unlock: ofi_genlock_unlock(&ep_obj->lock); @@ -688,6 +813,8 @@ int cxip_free_endpoint(struct cxip_ep *ep) cxip_txc_close(ep); cxip_rxc_close(ep); cxip_ep_disable(ep_obj); + if (ep_obj->priv_wait) + cxip_ep_destroy_priv_wait(ep_obj); ofi_genlock_unlock(&ep_obj->lock); ofi_atomic_dec32(&ep_obj->domain->ref); @@ -695,6 +822,7 @@ int cxip_free_endpoint(struct cxip_ep *ep) cxip_txc_free(ep_obj->txc); cxip_rxc_free(ep_obj->rxc); + free(ep_obj); ep->ep_obj = NULL; @@ -1277,6 +1405,7 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, ep_obj->tgq_size = hints->rx_attr->size; ep_obj->tx_attr = *hints->tx_attr; ep_obj->rx_attr = *hints->rx_attr; + ep_obj->wait_fd = -1; ep_obj->asic_ver = cxip_dom->iface->info->cassini_version; diff --git a/prov/cxi/src/cxip_evtq.c b/prov/cxi/src/cxip_evtq.c index e4c90c31980..42384ca85a8 100644 --- a/prov/cxi/src/cxip_evtq.c +++ 
b/prov/cxi/src/cxip_evtq.c @@ -457,7 +457,8 @@ static size_t cxip_evtq_get_queue_size(struct cxip_cq *cq, size_t num_events) #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, - size_t num_events, size_t num_fc_events) + size_t num_events, size_t num_fc_events, + struct cxil_wait_obj *priv_wait) { struct cxi_eq_attr eq_attr = { .reserved_slots = num_fc_events, @@ -561,7 +562,7 @@ int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, /* cq->priv_wait is NULL if not backed by wait object */ ret = cxil_alloc_evtq(cq->domain->lni->lni, evtq->md, &eq_attr, - cq->priv_wait, NULL, &evtq->eq); + priv_wait, NULL, &evtq->eq); if (ret) { CXIP_WARN("Failed to allocated EQ: %d\n", ret); goto err_unmap_eq_buf; diff --git a/prov/cxi/src/cxip_fabric.c b/prov/cxi/src/cxip_fabric.c index c8528cf829c..b9eede784a4 100644 --- a/prov/cxi/src/cxip_fabric.c +++ b/prov/cxi/src/cxip_fabric.c @@ -24,13 +24,41 @@ int cxip_eq_def_sz = CXIP_EQ_DEF_SZ; static int read_default_params; +static int cxip_trywait(struct fid_fabric *fabric, struct fid **fids, + int count) +{ + struct cxip_cq *cq; + int ret; + int i; + + for (i = 0; i < count; i++) { + switch (fids[i]->fclass) { + case FI_CLASS_CQ: + cq = container_of(fids[i], struct cxip_cq, + util_cq.cq_fid.fid); + ret = cxip_cq_trywait(cq); + if (ret) + return ret; + break; + case FI_CLASS_EQ: + case FI_CLASS_CNTR: + case FI_CLASS_WAIT: + return -FI_ENOSYS; + default: + return -FI_EINVAL; + } + } + + return FI_SUCCESS; +} + static struct fi_ops_fabric cxip_fab_ops = { .size = sizeof(struct fi_ops_fabric), .domain = cxip_domain, .passive_ep = fi_no_passive_ep, .eq_open = cxip_eq_open, - .wait_open = ofi_wait_fd_open, - .trywait = ofi_trywait, + .wait_open = fi_no_wait_open, + .trywait = cxip_trywait, }; static int cxip_fabric_close(fid_t fid) diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c index cc3f3e9f91a..8051ccdcade 100644 --- a/prov/cxi/src/cxip_rxc.c +++ 
b/prov/cxi/src/cxip_rxc.c @@ -127,7 +127,8 @@ static int rxc_msg_init(struct cxip_rxc *rxc) /* Base message initialization */ num_events = cxip_rxc_get_num_events(rxc); - ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1); + ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1, + rxc->ep_obj->priv_wait); if (ret) { CXIP_WARN("Failed to initialize RXC event queue: %d, %s\n", ret, fi_strerror(-ret)); diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c index fdbd64af604..49d19fd6b58 100644 --- a/prov/cxi/src/cxip_txc.c +++ b/prov/cxi/src/cxip_txc.c @@ -328,7 +328,8 @@ int cxip_txc_enable(struct cxip_txc *txc) num_events = cxip_txc_get_num_events(txc); - ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0); + ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0, + txc->ep_obj->priv_wait); if (ret) { CXIP_WARN("Failed to initialize TX event queue: %d, %s\n", ret, fi_strerror(-ret)); diff --git a/prov/cxi/test/cxip_test_common.c b/prov/cxi/test/cxip_test_common.c index b0fe3ccd622..fd3fec4b5c5 100644 --- a/prov/cxi/test/cxip_test_common.c +++ b/prov/cxi/test/cxip_test_common.c @@ -774,6 +774,7 @@ void cxit_setup_enabled_ep_fd(void) cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->threading = FI_THREAD_SAFE; cxit_setup_ep(); diff --git a/prov/cxi/test/tagged.c b/prov/cxi/test/tagged.c index e711767f308..b486a340f60 100644 --- a/prov/cxi/test/tagged.c +++ b/prov/cxi/test/tagged.c @@ -5475,7 +5475,262 @@ Test(tagged_src_err, addr) TestSuite(tagged_cq_wait, .init = cxit_setup_rma_fd, .fini = cxit_teardown_rma_fd, - .timeout = CXIT_DEFAULT_TIMEOUT); + .timeout = 20); + +Test(tagged_cq_wait, timeout_poll) +{ + struct fid *fids[1]; + int cq_fd; + int ret; + struct pollfd fds; + int timeout = 100; + uint64_t end_ms; + uint64_t start_ms; + + sleep(1); + + ret = fi_control(&cxit_rx_cq->fid, FI_GETWAIT, &cq_fd); + 
cr_assert_eq(ret, FI_SUCCESS, "Get RX CQ wait FD %d", ret); + + fids[0] = &cxit_rx_cq->fid; + ret = fi_trywait(cxit_fabric, fids, 1); + cr_assert_eq(ret, FI_SUCCESS, "Unexpected fi_trywait return %d\n", + ret); + + fds.fd = cq_fd; + fds.events = POLLIN; + start_ms = ofi_gettime_ms(); + ret = poll(&fds, 1, timeout); + cr_assert_eq(ret, 0, "Poll did not timed out, %d", ret); + end_ms = ofi_gettime_ms(); + cr_assert(end_ms >= start_ms + timeout, + "Timeout too short %ld ms asked for %d ms", + end_ms - start_ms, timeout); +} + +Test(tagged_cq_wait, timeout_epoll) +{ + struct epoll_event ev = { + .events = EPOLLIN, + .data.u32 = 0, + }; + int ret; + int epfd; + int waitfd; + struct fid *fids[1]; + int timeout = 100; + uint64_t end_ms; + uint64_t start_ms; + + sleep(1); + + epfd = epoll_create1(0); + cr_assert(epfd >= 0, "epoll_create1() failed %s\n", + strerror(errno)); + + ret = fi_control(&cxit_tx_cq->fid, FI_GETWAIT, &waitfd); + cr_assert(ret == FI_SUCCESS, "get FD for wait object failed %s\n", + strerror(errno)); + + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, waitfd, &ev); + cr_assert(ret == 0, "epoll_ctl failed %s\n", strerror(errno)); + + fids[0] = &cxit_tx_cq->fid; + ret = fi_trywait(cxit_fabric, fids, 1); + cr_assert(ret == FI_SUCCESS, "fi_trywait failed %s\n", + fi_strerror(-ret)); + + /* Ensure timeout since events should not be outsanding */ + memset(&ev, 0, sizeof(ev)); + start_ms = ofi_gettime_ms(); + ret = epoll_wait(epfd, &ev, 1, timeout); + cr_assert(ret == 0, "epoll_wait did not timeout\n"); + end_ms = ofi_gettime_ms(); + cr_assert(end_ms >= start_ms + timeout, + "Timeout too short %ld ms asked for %d ms", + end_ms - start_ms, timeout); + + close(epfd); +} + +Test(tagged_cq_wait, timeout_sread) +{ + int ret; + int timeout = 100; + struct fi_cq_tagged_entry rx_cqe; + uint64_t end_ms; + uint64_t start_ms = ofi_gettime_ms(); + + /* No events should be available. Timeout returns -FI_EAGAIN. 
*/ + ret = fi_cq_sread(cxit_rx_cq, &rx_cqe, 1, NULL, timeout); + cr_assert_eq(ret, -FI_EAGAIN, "Poll did not timed out, %s", + fi_strerror(ret)); + end_ms = ofi_gettime_ms(); + cr_assert(end_ms >= start_ms + timeout, + "Timeout too short %ld ms asked for %d ms", + end_ms - start_ms, timeout); +} + +struct simple_rx_wait { + bool epoll; + bool ux_msg; +}; + +static void *simple_rx_worker(void *data) +{ + struct simple_rx_wait *arg = (struct simple_rx_wait *) data; + struct fid *fids[1]; + int ret; + int recv_len = 64; + uint8_t *recv_buf; + struct fi_cq_tagged_entry rx_cqe; + fi_addr_t from; + int cq_fd; + struct epoll_event ev = { + .events = EPOLLIN, + .data.u32 = 0, + }; + int epfd; + struct pollfd fds; + int tries = 0; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + ret = fi_control(&cxit_rx_cq->fid, FI_GETWAIT, &cq_fd); + cr_assert_eq(ret, FI_SUCCESS, "Get CQ wait FD %d", cq_fd); + + fids[0] = &cxit_rx_cq->fid; + + /* We want to block waiting for the recv event */ + if (arg->epoll) { + epfd = epoll_create1(0); + cr_assert(epfd >= 0, "epoll_create1() failed %s", + strerror(errno)); + + ev.data.fd = cq_fd; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cq_fd, &ev); + cr_assert_eq(ret, 0, "epoll_ctl() failed %s", strerror(errno)); + } + + /* For UX message tests, trywait should return -FI_EAGAIN */ +cqe_not_ready: + ret = fi_trywait(cxit_fabric, fids, 1); + if (arg->ux_msg) { + cr_assert_eq(ret, -FI_EAGAIN, "UX event not ready, ret %s\n", + fi_strerror(-ret)); + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "UX message not received\n"); + goto done; + } + + /* No event should be pending, nothing sent yet */ + if (tries == 0) + cr_assert_eq(ret, FI_SUCCESS, "RX CQ event pending ret %d", ret); + + /* Wait for 
message */ + if (ret == FI_SUCCESS) { + if (arg->epoll) { + struct epoll_event evs[1] = {}; + + ret = epoll_wait(epfd, evs, 1, 5000); + } else { + fds.fd = cq_fd; + fds.events = POLLIN; + ret = poll(&fds, 1, 5000); + } + cr_assert(ret != 0, "RX poll timed out, ret %d\n", ret); + cr_assert(ret > 0, "Unexpected poll error %d\n", ret); + } + + /* We can get woken up for the send event, so -FI_EAGAIN + * is possible. Make sure no more than two wakeups occur. + */ + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == -FI_EAGAIN && ++tries < 2) + goto cqe_not_ready; + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + +done: + free(recv_buf); + pthread_exit(NULL); +} + +void simple_rx_wait(bool epoll, bool ux_msg) +{ + pthread_t rx_thread; + pthread_attr_t attr = {}; + int ret; + int i; + int send_len = 64; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct simple_rx_wait arg = { + .epoll = epoll, + .ux_msg = ux_msg, + }; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + if (!arg.ux_msg) { + /* Start processing receives */ + ret = pthread_create(&rx_thread, &attr, simple_rx_worker, &arg); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Make sure receive is posted and thread is polling */ + sleep(1); + } + + /* Send 64 byte message to self */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + if (arg.ux_msg) { + /* Start processing receives */ + ret = pthread_create(&rx_thread, &attr, simple_rx_worker, &arg); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + } + + ret = pthread_join(rx_thread, NULL); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + free(send_buf); +} + 
+Test(tagged_cq_wait, simple_rx_epoll) +{ + simple_rx_wait(true, false); +} + +Test(tagged_cq_wait, simple_rx_epoll_ux) +{ + simple_rx_wait(true, true); +} + +Test(tagged_cq_wait, simple_rx_poll) +{ + simple_rx_wait(false, false); +} + +Test(tagged_cq_wait, simple_rx_poll_ux) +{ + simple_rx_wait(false, true); +} struct fd_params { size_t length; @@ -5500,36 +5755,54 @@ static void *tagged_cq_wait_evt_worker(void *data) struct fid *fids[1]; int cq_fd; size_t completions = 0; + struct epoll_event ev = { + .events = EPOLLIN, + .data.u32 = 0, + }; + int epfd; args = (struct tagged_cq_wait_event_args *)data; if (args->poll) { + epfd = epoll_create1(0); + cr_assert(epfd >= 0, "epoll_create1() failed %s", + strerror(errno)); + ret = fi_control(&args->cq->fid, FI_GETWAIT, &cq_fd); cr_assert_eq(ret, FI_SUCCESS, "Get CQ wait FD %d", ret); - fids[0] = &args->cq->fid; + + ev.data.fd = cq_fd; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cq_fd, &ev); + cr_assert_eq(ret, 0, "epoll_ctl() failed %s", + strerror(errno)); } while (completions < args->io_num) { if (args->poll) { + fids[0] = &args->cq->fid; ret = fi_trywait(cxit_fabric, fids, 1); if (ret == FI_SUCCESS) { - struct pollfd fds; - - fds.fd = cq_fd; - fds.events = POLLIN; + struct epoll_event evs[1] = {}; - ret = poll(&fds, 1, args->timeout); - cr_assert_neq(ret, 0, "Poll timed out"); + ret = epoll_wait(epfd, evs, 1, args->timeout); + cr_assert_neq(ret, 0, "%s CQ poll timed out", + args->cq == cxit_tx_cq ? + "TX" : "RX"); cr_assert_eq(ret, 1, "Poll error"); } + ret = fi_cq_read(args->cq, &args->cqe[completions], 1); if (ret == 1) completions++; + + sched_yield(); } else { ret = fi_cq_sread(args->cq, &args->cqe[completions], 1, NULL, args->timeout); - cr_assert_eq(ret, 1, "Completion not received\n"); + cr_assert_eq(ret, 1, + "%s completion not received ret %d\n", + args->cq == cxit_tx_cq ? 
"TX" : "RX", ret); completions++; } } @@ -5577,7 +5850,7 @@ void do_cq_wait(struct fd_params *param) struct tagged_thread_args *rx_args; pthread_t tx_thread; pthread_t rx_thread; - pthread_attr_t attr; + pthread_attr_t attr = {}; struct tagged_cq_wait_event_args tx_evt_args = { .cq = cxit_tx_cq, .io_num = param->num_ios, @@ -5650,14 +5923,14 @@ void do_cq_wait(struct fd_params *param) /* Sends last for expected messaging */ if (!param->ux_msg) { - /* Make sure receive has blocked */ + /* Make RX process first */ sleep(1); - cq_wait_post_sends(tx_args, param); /* Start processing Send events */ ret = pthread_create(&tx_thread, &attr, tagged_cq_wait_evt_worker, (void *)&tx_evt_args); + cq_wait_post_sends(tx_args, param); } /* Wait for the RX/TX event threads to complete */ @@ -5689,11 +5962,13 @@ void do_cq_wait(struct fd_params *param) free(rx_args); } +/* Test multiple threads using poll or sread on both CQ */ ParameterizedTestParameters(tagged_cq_wait, wait_fd) { size_t param_sz; static struct fd_params params[] = { + /* Test direct FI_WAIT_FD polling */ {.length = 1024, .num_ios = 4, .timeout = 5000, @@ -5702,6 +5977,7 @@ ParameterizedTestParameters(tagged_cq_wait, wait_fd) .num_ios = 4, .timeout = 5000, .poll = true}, + /* Test indirect FI_WAIT_FD polling via fi_cq_sread */ {.length = 1024, .num_ios = 4, .timeout = 5000, From 96631a200c7ccfac39214e285149f003262d462b Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Wed, 8 Jan 2025 04:06:54 +0000 Subject: [PATCH 343/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_cxi.7 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 index fd90c654791..e6b9c6717e3 100644 --- a/man/man7/fi_cxi.7 +++ b/man/man7/fi_cxi.7 @@ -15,7 +15,7 @@ . ftr VB CB . 
ftr VBI CBI .\} -.TH "fi_cxi" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_cxi" "7" "2025\-01\-08" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -97,6 +97,8 @@ types. FI_WAIT_UNSPEC will default to FI_WAIT_FD. However FI_WAIT_NONE should achieve the lowest latency and reduce interrupt overhead. +NOTE: A process may return from a epoll_wait/poll when provider progress +is required and a CQ event may not be available. .SS Additional Features .PP The CXI provider also supports the following capabilities and features: From 7f77ff59c4f6b53c586f564319d3fb12c32d53f2 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 8 Jan 2025 02:20:42 +0000 Subject: [PATCH 344/393] contrib/aws: Skip default checkout Skip the default checkout of the workspace because we explicitly checkout out the workspace after we clean it. Signed-off-by: Seth Zegelstein Co-authored-by: Nathan Na --- contrib/aws/Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 2edbb0c5945..4fb0128a759 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -131,6 +131,7 @@ pipeline { options { buildDiscarder(logRotator(daysToKeepStr: "90")) timeout(time: 10, unit: 'HOURS') + skipDefaultCheckout() } environment { // AWS region where the cluster is created From c8c36e4e603e5d53a2c8cf6c6a5a5af3747aa8c7 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 8 Jan 2025 02:25:02 +0000 Subject: [PATCH 345/393] contrib/aws: Remove debug print Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 4fb0128a759..bbb9aa8cd34 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -152,7 +152,6 @@ pipeline { stage("Download and extract PortaFiducia") { steps { script { - sh 'printenv' download_and_extract_portafiducia('PortaFiducia') } } From 
d0d27257da86e3255957ea0879cd41f90d093cc3 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Mon, 6 Jan 2025 13:10:53 -0800 Subject: [PATCH 346/393] prov/efa: Remove inline write logic for rma inject We previously set the inject size to 0 to prevent using inline write, but fabtests can use inject rma for 0 size message. Make rma inject temporarily return FI_ENOSYS before firmware supports inline write. Signed-off-by: Jessie Yang --- prov/efa/src/efa_rma.c | 80 ++++++------------------------- prov/efa/test/efa_unit_test_rma.c | 39 +++++++++++---- prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 4 files changed, 47 insertions(+), 74 deletions(-) diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index cf4987c34eb..8fee3a2021b 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -200,7 +200,6 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, struct efa_conn *conn; #ifndef _WIN32 struct ibv_sge sge_list[msg->iov_count]; - struct ibv_data_buf inline_data_list[msg->iov_count]; #else /* MSVC compiler does not support array declarations with runtime size, so hardcode * the expected iov_limit/max_sq_sge from the lower-level efa provider. 
@@ -208,9 +207,14 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, struct ibv_sge sge_list[EFA_DEV_ATTR_MAX_WR_SGE]; struct ibv_data_buf inline_data_list[EFA_DEV_ATTR_MAX_WR_SGE]; #endif - size_t len; int i, err = 0; + if (flags & FI_INJECT) { + EFA_WARN(FI_LOG_EP_DATA, + "FI_INJECT is not supported by efa rma yet.\n"); + return -FI_ENOSYS; + } + efa_tracepoint(write_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); qp = base_ep->qp; @@ -230,24 +234,13 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, ibv_wr_rdma_write(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); } - len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); - if (len <= base_ep->domain->device->efa_attr.inline_buf_size && - len <= base_ep->inject_rma_size && - (!msg->desc || !efa_mr_is_hmem(msg->desc[0]))) { - for (i = 0; i < msg->iov_count; i++) { - inline_data_list[i].addr = msg->msg_iov[i].iov_base; - inline_data_list[i].length = msg->msg_iov[i].iov_len; - } - ibv_wr_set_inline_data_list(qp->ibv_qp_ex, msg->iov_count, inline_data_list); - } else { - for (i = 0; i < msg->iov_count; ++i) { - sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; - sge_list[i].length = msg->msg_iov[i].iov_len; - assert(msg->desc && msg->desc[i]); - sge_list[i].lkey = ((struct efa_mr *)msg->desc[i])->ibv_mr->lkey; - } - ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); + for (i = 0; i < msg->iov_count; ++i) { + sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base; + sge_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc && msg->desc[i]); + sge_list[i].lkey = ((struct efa_mr *)msg->desc[i])->ibv_mr->lkey; } + ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list); conn = efa_av_addr_to_conn(base_ep->av, msg->addr); assert(conn && conn->ep_addr); @@ -348,51 +341,6 @@ ssize_t efa_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, return efa_rma_post_write(base_ep, &msg, FI_REMOTE_CQ_DATA | 
efa_tx_flags(base_ep)); } -ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t addr, uint64_t key) -{ - struct fi_msg_rma msg; - struct iovec iov; - struct fi_rma_iov rma_iov; - struct efa_base_ep *base_ep; - int err; - - base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - assert(len <= base_ep->inject_rma_size); - err = efa_rma_check_cap(base_ep); - if (err) - return err; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); - EFA_SETUP_MSG_RMA(msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, 0); - - return efa_rma_post_write(base_ep, &msg, FI_INJECT); -} - -ssize_t efa_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, - size_t len, uint64_t data, fi_addr_t dest_addr, - uint64_t addr, uint64_t key) -{ - struct fi_msg_rma msg; - struct iovec iov; - struct fi_rma_iov rma_iov; - struct efa_base_ep *base_ep; - int err; - - base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - assert(len <= base_ep->inject_rma_size); - err = efa_rma_check_cap(base_ep); - if (err) - return err; - - EFA_SETUP_IOV(iov, buf, len); - EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); - EFA_SETUP_MSG_RMA(msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, data); - - return efa_rma_post_write(base_ep, &msg, FI_INJECT | FI_REMOTE_CQ_DATA); -} - struct fi_ops_rma efa_dgram_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_no_rma_read, @@ -414,7 +362,7 @@ struct fi_ops_rma efa_rma_ops = { .write = efa_rma_write, .writev = efa_rma_writev, .writemsg = efa_rma_writemsg, - .inject = efa_rma_inject_write, + .inject = fi_no_rma_inject, .writedata = efa_rma_writedata, - .injectdata = efa_rma_inject_writedata, + .injectdata = fi_no_rma_injectdata, }; diff --git a/prov/efa/test/efa_unit_test_rma.c b/prov/efa/test/efa_unit_test_rma.c index 40be70ec219..cb42a8528fd 100644 --- a/prov/efa/test/efa_unit_test_rma.c +++ b/prov/efa/test/efa_unit_test_rma.c @@ -25,8 +25,6 @@ 
static void test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) ibv_qpx->wr_rdma_read = &efa_mock_ibv_wr_rdma_read_save_wr; ibv_qpx->wr_rdma_write = &efa_mock_ibv_wr_rdma_write_save_wr; ibv_qpx->wr_rdma_write_imm = &efa_mock_ibv_wr_rdma_write_imm_save_wr; - ibv_qpx->wr_set_inline_data_list = - &efa_mock_ibv_wr_set_inline_data_list_no_op; ibv_qpx->wr_set_sge_list = &efa_mock_ibv_wr_set_sge_list_no_op; ibv_qpx->wr_set_ud_addr = &efa_mock_ibv_wr_set_ud_addr_no_op; ibv_qpx->wr_complete = &efa_mock_ibv_wr_complete_no_op; @@ -241,11 +239,9 @@ void test_efa_rma_inject_write(struct efa_resource **state) test_efa_rma_prep(resource, &dest_addr); efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); - assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_inject_write(resource->ep, local_buff.buff, local_buff.size, dest_addr, remote_addr, remote_key); - assert_int_equal(ret, 0); - assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + assert_int_equal(ret, -FI_ENOSYS); efa_unit_test_buff_destruct(&local_buff); } @@ -262,12 +258,39 @@ void test_efa_rma_inject_writedata(struct efa_resource **state) test_efa_rma_prep(resource, &dest_addr); efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); - assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_inject_writedata(resource->ep, local_buff.buff, local_buff.size, 0, dest_addr, remote_addr, remote_key); - assert_int_equal(ret, 0); - assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + assert_int_equal(ret, -FI_ENOSYS); + + efa_unit_test_buff_destruct(&local_buff); +} + +void test_efa_rma_writemsg_with_inject(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t dest_addr; + void *desc; + int ret; + + test_efa_rma_prep(resource, &dest_addr); + efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + + iov.iov_base = 
local_buff.buff; + iov.iov_len = local_buff.size; + desc = fi_mr_desc(local_buff.mr); + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, dest_addr, &rma_iov, + 1, NULL, 0); + + ret = fi_writemsg(resource->ep, &msg, FI_INJECT); + assert_int_equal(ret, -FI_ENOSYS); efa_unit_test_buff_destruct(&local_buff); } diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 3e3ba43ef04..293e080c0dd 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -229,6 +229,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rma_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_inject_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_inject_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writemsg_with_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_cq_read_send_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_cq_read_send_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 86bef64edab..689fd4fa3a8 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -251,6 +251,7 @@ void test_efa_rma_writemsg(); void test_efa_rma_writedata(); void test_efa_rma_inject_write(); void test_efa_rma_inject_writedata(); +void test_efa_rma_writemsg_with_inject(); void test_efa_cq_read_send_success(); void test_efa_cq_read_recv_success(); void test_efa_cq_read_send_failure(); From 
0c66ae04ba66686f850ab6afb7df60b470a6dc00 Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Tue, 7 Jan 2025 19:39:20 -0500 Subject: [PATCH 347/393] Revert "prov/efa: Decouple AV entry from endpoint" This reverts commit c3f9e2134bfc8524ac80a1306835bbfdc8b4dcb3. Signed-off-by: Sai Sunku --- fabtests/pytest/efa/test_multi_ep.py | 9 +----- prov/efa/src/efa_av.c | 42 +++++++++++++++++----------- prov/efa/src/efa_av.h | 2 +- prov/efa/src/efa_base_ep.c | 10 +++++++ prov/efa/src/efa_errno.h | 3 +- prov/efa/src/rdm/efa_rdm_ep.h | 7 ----- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 17 ----------- prov/efa/src/rdm/efa_rdm_ep_utils.c | 14 +--------- prov/efa/src/rdm/efa_rdm_peer.c | 40 -------------------------- prov/efa/src/rdm/efa_rdm_peer.h | 18 ------------ prov/efa/test/efa_unit_test_av.c | 36 ------------------------ prov/efa/test/efa_unit_tests.c | 1 - prov/efa/test/efa_unit_tests.h | 3 -- 13 files changed, 39 insertions(+), 163 deletions(-) diff --git a/fabtests/pytest/efa/test_multi_ep.py b/fabtests/pytest/efa/test_multi_ep.py index bf34a5cca28..634529f0067 100644 --- a/fabtests/pytest/efa/test_multi_ep.py +++ b/fabtests/pytest/efa/test_multi_ep.py @@ -2,17 +2,10 @@ @pytest.mark.functional @pytest.mark.parametrize("shared_cq", [True, False]) -def test_multi_ep_cq(cmdline_args, shared_cq): +def test_multi_ep(cmdline_args, shared_cq): from common import ClientServerTest cmd = "fi_multi_ep -e rdm" if shared_cq: cmd += " -Q" test = ClientServerTest(cmdline_args, cmd) test.run() - -@pytest.mark.functional -def test_multi_ep_av(cmdline_args): - from common import ClientServerTest - cmd = "fi_multi_ep -e rdm -A" - test = ClientServerTest(cmdline_args, cmd) - test.run() diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 4b1d2f70442..7fef9d5b41c 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -243,6 +243,9 @@ void efa_ah_release(struct efa_av *av, struct efa_ah *ah) } } +static +void efa_conn_release(struct efa_av *av, struct efa_conn *conn); + /** * 
@brief initialize the rdm related resources of an efa_conn object * @@ -263,11 +266,18 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn, bool insert_shm_ int err, ret; char smr_name[EFA_SHM_NAME_MAX]; size_t smr_name_len; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer; assert(av->ep_type == FI_EP_RDM); assert(conn->ep_addr); - conn->shm_fi_addr = FI_ADDR_NOTAVAIL; + /* currently multiple EP bind to same av is not supported */ + assert(!dlist_empty(&av->util_av.ep_list)); + efa_rdm_ep = container_of(av->util_av.ep_list.next, struct efa_rdm_ep, base_ep.util_ep.av_entry); + + peer = &conn->rdm_peer; + efa_rdm_peer_construct(peer, efa_rdm_ep, conn); /* * The efa_conn_rdm_init() call can be made in two situations: @@ -305,8 +315,8 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn, bool insert_shm_ * av. The efa provider should still use peer->shm_fiaddr for transmissions * through shm ep. */ - conn->shm_fi_addr = conn->fi_addr; - ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &conn->shm_fi_addr, FI_AV_USER_ID, NULL); + peer->shm_fiaddr = conn->fi_addr; + ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &peer->shm_fiaddr, FI_AV_USER_ID, NULL); if (OFI_UNLIKELY(ret != 1)) { EFA_WARN(FI_LOG_AV, "Failed to insert address to shm provider's av: %s\n", @@ -316,10 +326,11 @@ int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn, bool insert_shm_ EFA_INFO(FI_LOG_AV, "Successfully inserted %s to shm provider's av. 
efa_fiaddr: %ld shm_fiaddr = %ld\n", - smr_name, conn->fi_addr, conn->shm_fi_addr); + smr_name, conn->fi_addr, peer->shm_fiaddr); - assert(conn->shm_fi_addr < efa_env.shm_av_size); + assert(peer->shm_fiaddr < efa_env.shm_av_size); av->shm_used++; + peer->is_local = 1; } return 0; @@ -339,29 +350,26 @@ void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn) int err; struct efa_rdm_peer *peer; struct efa_rdm_ep *ep; - struct dlist_entry *entry, *tmp; assert(av->ep_type == FI_EP_RDM); peer = &conn->rdm_peer; - if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) { - err = fi_av_remove(av->shm_rdm_av, &conn->shm_fi_addr, 1, 0); + if (peer->is_local && av->shm_rdm_av) { + err = fi_av_remove(av->shm_rdm_av, &peer->shm_fiaddr, 1, 0); if (err) { EFA_WARN(FI_LOG_AV, "remove address from shm av failed! err=%d\n", err); } else { av->shm_used--; - assert(conn->shm_fi_addr < efa_env.shm_av_size); + assert(peer->shm_fiaddr < efa_env.shm_av_size); } } - dlist_foreach_safe(&av->util_av.ep_list, entry, tmp) { - ep = container_of(entry, struct efa_rdm_ep, base_ep.util_ep.av_entry); - peer = efa_rdm_peer_map_lookup(&ep->fi_addr_to_peer_map, conn->fi_addr); - if (peer) { - efa_rdm_peer_destruct(peer, ep); - efa_rdm_peer_map_remove(&ep->fi_addr_to_peer_map, conn->fi_addr, peer); - } - } + /* + * We need peer->shm_fiaddr to remove shm address from shm av table, + * so efa_rdm_peer_clear must be after removing shm av table. + */ + ep = dlist_empty(&av->util_av.ep_list) ? 
NULL : container_of(av->util_av.ep_list.next, struct efa_rdm_ep, base_ep.util_ep.av_entry); + efa_rdm_peer_destruct(peer, ep); } /* diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index acf7e58e320..5d885adbdca 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -27,7 +27,6 @@ struct efa_conn { fi_addr_t fi_addr; fi_addr_t util_av_fi_addr; struct efa_rdm_peer rdm_peer; - fi_addr_t shm_fi_addr; }; struct efa_av_entry { @@ -61,6 +60,7 @@ struct efa_prv_reverse_av { struct efa_av { struct fid_av *shm_rdm_av; struct efa_domain *domain; + struct efa_base_ep *base_ep; size_t used; size_t shm_used; enum fi_av_type type; diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 56bd82bd87e..2abdee189dc 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -8,6 +8,15 @@ int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) { + /* + * Binding multiple endpoints to a single AV is currently not + * supported. + */ + if (av->base_ep) { + EFA_WARN(FI_LOG_EP_CTRL, + "Address vector already has endpoint bound to it.\n"); + return -FI_ENOSYS; + } if (base_ep->domain != av->domain) { EFA_WARN(FI_LOG_EP_CTRL, "Address vector doesn't belong to same domain as EP.\n"); @@ -20,6 +29,7 @@ int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) } base_ep->av = av; + base_ep->av->base_ep = base_ep; return 0; } diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 5d3769d32d6..029c35d4a07 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -107,8 +107,7 @@ _(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \ _(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \ _(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \ - _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) \ - _(4127, 
PEER_MAP_ENTRY_POOL_EXHAUSTED, Peer map entry pool exhausted) + _(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed)) /** @} */ diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 1b888e182a4..fc298010249 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -40,10 +40,6 @@ struct efa_rdm_ep_queued_copy { #define EFA_RDM_EP_MAX_WR_PER_IBV_POST_SEND (4096) #define EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV (8192) -struct efa_rdm_peer_map { - struct efa_rdm_peer_map_entry *head; -}; - struct efa_rdm_ep { struct efa_base_ep base_ep; @@ -189,9 +185,6 @@ struct efa_rdm_ep { struct dlist_entry entry; /* the count of opes queued before handshake is made with their peers */ size_t ope_queued_before_handshake_cnt; - - struct ofi_bufpool *peer_map_entry_pool; /* bufpool to hold fi_addr->efa_rdm_peer key-value pairs */ - struct efa_rdm_peer_map fi_addr_to_peer_map; /* Hashmap to find efa_rdm_peer given fi_addr */ }; int efa_rdm_ep_flush_queued_blocking_copy_to_hmem(struct efa_rdm_ep *ep); diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index fbebfd93455..3508fe7ba75 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -307,18 +307,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) if (ret) goto err_free; - ret = ofi_bufpool_create(&ep->peer_map_entry_pool, - sizeof(struct efa_rdm_peer_map_entry), - EFA_RDM_BUFPOOL_ALIGNMENT, - 0, /* no limit to max cnt */ - /* Don't track usage, because endpoint can be closed without removing entries from AV */ - EFA_MIN_AV_SIZE, OFI_BUFPOOL_NO_TRACK); - if (ret) - goto err_free; - efa_rdm_rxe_map_construct(&ep->rxe_map); - efa_rdm_peer_map_construct(&ep->fi_addr_to_peer_map); - return 0; err_free: @@ -352,9 +341,6 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) if (ep->efa_tx_pkt_pool) ofi_bufpool_destroy(ep->efa_tx_pkt_pool); - if 
(ep->peer_map_entry_pool) - ofi_bufpool_destroy(ep->peer_map_entry_pool); - return ret; } @@ -842,9 +828,6 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) if (efa_rdm_ep->rx_atomrsp_pool) ofi_bufpool_destroy(efa_rdm_ep->rx_atomrsp_pool); - - if (efa_rdm_ep->peer_map_entry_pool) - ofi_bufpool_destroy(efa_rdm_ep->peer_map_entry_pool); } /* diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 2d87b48911d..9cf297acfc2 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -56,26 +56,14 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr) { struct util_av_entry *util_av_entry; struct efa_av_entry *av_entry; - struct efa_rdm_peer *peer; if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL)) return NULL; - peer = efa_rdm_peer_map_lookup(&ep->fi_addr_to_peer_map, addr); - if (peer) - return peer; - util_av_entry = ofi_bufpool_get_ibuf(ep->base_ep.util_ep.av->av_entry_pool, addr); av_entry = (struct efa_av_entry *)util_av_entry->data; - - if (av_entry->conn.ep_addr) { - peer = efa_rdm_peer_map_insert(&ep->fi_addr_to_peer_map, addr, ep); - efa_rdm_peer_construct(peer, ep, &av_entry->conn); - return peer; - } - - return NULL; + return av_entry->conn.ep_addr ? 
&av_entry->conn.rdm_peer : NULL; } /** diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 7c82c943835..3e8e3dff774 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -31,11 +31,6 @@ void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, st dlist_init(&peer->txe_list); dlist_init(&peer->rxe_list); dlist_init(&peer->overflow_pke_list); - - if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL) { - peer->shm_fiaddr = conn->shm_fi_addr; - peer->is_local = 1; - } } /** @@ -116,41 +111,6 @@ void efa_rdm_peer_destruct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep) #endif } -struct efa_rdm_peer *efa_rdm_peer_map_insert(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_ep *ep) { - struct efa_rdm_peer_map_entry *map_entry; - struct efa_rdm_peer *peer; - - map_entry = ofi_buf_alloc(ep->peer_map_entry_pool); - if (OFI_UNLIKELY(!map_entry)) { - EFA_WARN(FI_LOG_CQ, - "Map entries for EFA AV to peer mapping exhausted.\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_PEER_MAP_ENTRY_POOL_EXHAUSTED); - return NULL; - } - - map_entry->key = addr; - peer = &map_entry->efa_rdm_peer; - - HASH_ADD(hh, peer_map->head, key, sizeof(addr), map_entry); - - return peer; -} - -struct efa_rdm_peer *efa_rdm_peer_map_lookup(struct efa_rdm_peer_map *peer_map, fi_addr_t addr) { - struct efa_rdm_peer_map_entry *map_entry; - - HASH_FIND(hh, peer_map->head, &addr, sizeof(addr), map_entry); - return map_entry ? &map_entry->efa_rdm_peer : NULL; -} - -void efa_rdm_peer_map_remove(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_peer *peer) { - struct efa_rdm_peer_map_entry *map_entry; - - HASH_FIND(hh, peer_map->head, &addr, sizeof(addr), map_entry); - HASH_DEL(peer_map->head, map_entry); - ofi_buf_free(map_entry); -} - /** * @brief run incoming packet_entry through reorder buffer * queue the packet entry if msg_id is larger than expected. 
diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index 21585051921..fe2f79ead61 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -75,12 +75,6 @@ struct efa_rdm_peer { struct efa_rdm_peer_user_recv_qp user_recv_qp; }; -struct efa_rdm_peer_map_entry { - uint64_t key; - struct efa_rdm_peer efa_rdm_peer; - UT_hash_handle hh; -}; - /** * @brief check for peer's RDMA_READ support, assuming HANDSHAKE has already occurred * @@ -292,12 +286,6 @@ bool efa_both_support_zero_hdr_data_transfer(struct efa_rdm_ep *ep, struct efa_r (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP)); } -static inline -void efa_rdm_peer_map_construct(struct efa_rdm_peer_map *peer_map) -{ - peer_map->head = NULL; -} - struct efa_conn; void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn); @@ -316,10 +304,4 @@ size_t efa_rdm_peer_get_runt_size(struct efa_rdm_peer *peer, struct efa_rdm_ep * int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_rdm_ope *ope); -struct efa_rdm_peer *efa_rdm_peer_map_insert(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_ep *ep); - -struct efa_rdm_peer *efa_rdm_peer_map_lookup(struct efa_rdm_peer_map *peer_map, fi_addr_t addr); - -void efa_rdm_peer_map_remove(struct efa_rdm_peer_map *peer_map, fi_addr_t addr, struct efa_rdm_peer *peer); - #endif /* EFA_RDM_PEER_H */ diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 6e11ee5c177..9ca730d0b6e 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -74,39 +74,3 @@ void test_av_insert_duplicate_gid(struct efa_resource **state) assert_int_equal(num_addr, 1); assert_int_not_equal(addr1, addr2); } - -/** - * @brief This test verifies that multiple endpoints can bind to the same AV - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void 
test_av_multiple_ep(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct fid_ep *ep2, *ep3; - int ret; - - /* Resource construct function creates and binds 1 EP to the AV */ - efa_unit_test_resource_construct(resource, FI_EP_RDM); - - /* Create and bind two new endpoints to the same AV */ - fi_endpoint(resource->domain, resource->info, &ep2, NULL); - ret = fi_ep_bind(ep2, &resource->av->fid, 0); - assert_int_equal(ret, 0); - - fi_endpoint(resource->domain, resource->info, &ep3, NULL); - ret = fi_ep_bind(ep3, &resource->av->fid, 0); - assert_int_equal(ret, 0); - - /* Bind the two new endpoints to the same CQ and enable them */ - fi_ep_bind(ep2, &resource->cq->fid, FI_SEND | FI_RECV); - ret = fi_enable(ep2); - assert_int_equal(ret, 0); - - fi_ep_bind(ep3, &resource->cq->fid, FI_SEND | FI_RECV); - ret = fi_enable(ep3); - assert_int_equal(ret, 0); - - fi_close(&ep2->fid); - fi_close(&ep3->fid); -} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 293e080c0dd..63316838a21 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -80,7 +80,6 @@ int main(void) const struct CMUnitTest efa_unit_tests[] = { cmocka_unit_test_setup_teardown(test_av_insert_duplicate_raw_addr, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_av_insert_duplicate_gid, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_av_multiple_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_device_construct_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_ignore_missing_host_id_file, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_has_valid_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h 
b/prov/efa/test/efa_unit_tests.h index 689fd4fa3a8..a13033e6f8b 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -40,8 +40,6 @@ void efa_unit_test_resource_construct_ep_not_enabled( struct efa_resource *resource, enum fi_ep_type ep_type); void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled( struct efa_resource *resource, enum fi_ep_type ep_type); -void efa_unit_test_resource_construct_no_av_no_cq_and_ep_not_enabled( - struct efa_resource *resource, enum fi_ep_type ep_type); void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, uint32_t fi_version, struct fi_info *hints, @@ -102,7 +100,6 @@ void efa_unit_test_handshake_pkt_construct(struct efa_rdm_pke *pkt_entry, struct /* test cases */ void test_av_insert_duplicate_raw_addr(); void test_av_insert_duplicate_gid(); -void test_av_multiple_ep(); void test_efa_device_construct_error_handling(); void test_efa_rdm_ep_ignore_missing_host_id_file(); void test_efa_rdm_ep_has_valid_host_id(); From 37da1647f2ffb75bc0f0f13a879b7f92b8ac6f20 Mon Sep 17 00:00:00 2001 From: Steve Welch Date: Wed, 8 Jan 2025 14:37:25 -0600 Subject: [PATCH 348/393] prov/cxi: cxi EQ do not support wait objects Make sure wait objects are not allowed for cxi EQs. NETCASSINI-6964 Signed-off-by: Steve Welch --- prov/cxi/src/cxip_eq.c | 12 +++++++++++- prov/cxi/test/eq.c | 27 ++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/prov/cxi/src/cxip_eq.c b/prov/cxi/src/cxip_eq.c index 010fdcc7183..6c1dec45319 100644 --- a/prov/cxi/src/cxip_eq.c +++ b/prov/cxi/src/cxip_eq.c @@ -29,6 +29,8 @@ #include "cxip.h" +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EQ, __VA_ARGS__) + static int cxip_eq_close(struct fid *fid) { struct cxip_eq *cxi_eq; @@ -104,7 +106,7 @@ static struct fi_ops cxi_eq_fi_ops = { static struct fi_eq_attr cxip_eq_def_attr = { .size = CXIP_EQ_DEF_SZ, .flags = 0, - .wait_obj = FI_WAIT_FD, + .wait_obj = FI_WAIT_NONE, .signaling_vector = 0, .wait_set = NULL }; @@ -124,6 +126,14 @@ int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, else cxi_eq->attr = *attr; + if (cxi_eq->attr.wait_obj != FI_WAIT_NONE) { + CXIP_WARN("Unsupported EQ attribute wait obj %d\n", + cxi_eq->attr.wait_obj); + ret = -FI_ENOSYS; + + goto err0; + } + ret = ofi_eq_init(fabric, &cxi_eq->attr, &cxi_eq->util_eq.eq_fid, context); if (ret != FI_SUCCESS) diff --git a/prov/cxi/test/eq.c b/prov/cxi/test/eq.c index 00730982b22..1d31bd4bf9e 100644 --- a/prov/cxi/test/eq.c +++ b/prov/cxi/test/eq.c @@ -27,7 +27,7 @@ TestSuite(eq, .init = cxit_setup_eq, .fini = cxit_teardown_eq, .timeout = CXIT_DEFAULT_TIMEOUT); -/* Test basic CQ creation */ +/* Test basic EQ creation */ Test(eq, simple) { cxit_create_eq(); @@ -35,3 +35,28 @@ Test(eq, simple) cxit_destroy_eq(); } +void eq_bad_wait_obj(enum fi_wait_obj wait_obj) + +{ + struct fi_eq_attr attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = wait_obj, + }; + int ret; + + ret = fi_eq_open(cxit_fabric, &attr, &cxit_eq, NULL); + cr_assert(ret == -FI_ENOSYS, "fi_eq_open unexpected success"); + cr_assert(cxit_eq == NULL, "cxit_eq not NULL on bad wait_obj"); +} + +Test(eq, bad_wait_obj_unspec) +{ + eq_bad_wait_obj(FI_WAIT_UNSPEC); +} + +Test(eq, bad_wait_obj_wait_fd) +{ + eq_bad_wait_obj(FI_WAIT_UNSPEC); +} + From c5ffaebd0181e33c86d7141be7595fcf8835a386 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 8 Jan 2025 19:45:03 +0000 Subject: [PATCH 349/393] contrib/aws: Fix cluster resource leaks for superseded jobs If a user puts up a PR, and then proceeds to push to that PR while AWS CI is still running, Jenkins will use a feature called milestones to cancel 
the old build. Jenkins will try to nicely abort the stage with sig-term like behavior, wait 10 seconds and do another nice abort with sig-term behavior, and finally wait 10 more seconds before killing the stage with sig-kill like behavior. We have a race condition that sometimes leaks clusters if the cleanup didn't finish before the stage got forcefully terminated. After Jenkins terminates the stages, it always runs the post-build actions. Currently, the post build actions only call cleanup in one region, but the CI creates clusters in multiple regions. If/When resources are leaked, they get automatically cleaned up by the resource reaper in the account after they are alive for a fixed time. This PR allows us to be more frugal by cleaning up correctly, instead of relying on the resource reaper to clean up for us. It also prevent us from hitting our tight AWS account level EC2 limits. This PR fixes our cleanup race by calling the cleanup logic for every region in the ALWAYS post build stage. If the cluster is already cleaned up (successful run), the cleanup call is a no-op. Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 42 ++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index bbb9aa8cd34..0025d9bacae 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -61,6 +61,14 @@ def get_random_string(len) { return s } +def get_cluster_name_prefix(build_tag) { + prefix = sh( + script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\" | tr -d '.\\n'", + returnStdout: true + ) + return prefix.take(28) +} + def get_cluster_name(build_tag, os, instance_type) { /* * Compose the cluster name. Pcluster requires a cluster name under 60 characters. @@ -68,13 +76,10 @@ def get_cluster_name(build_tag, os, instance_type) { * Jenkins does not allow groovy to use the replace() method * of string. 
Therefore we used shell command sed to replace "." with "" */ - build_tag = sh( - script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"", - returnStdout: true - ) + build_tag = get_cluster_name_prefix(build_tag) def cluster_name = sh( - script: "echo '${build_tag.take(28)}-${os.take(10)}-${instance_type.take(10)}-'${get_random_string(8)} | tr -d '.\\n'", + script: "echo '${build_tag}-${os.take(10)}-${instance_type.take(10)}-'${get_random_string(8)} | tr -d '.\\n'", returnStdout: true ) @@ -133,10 +138,6 @@ pipeline { timeout(time: 10, unit: 'HOURS') skipDefaultCheckout() } - environment { - // AWS region where the cluster is created - REGION="us-west-2" - } stages { // Cleanup workspace before job start. stage("Clean up workspace") { @@ -241,17 +242,20 @@ pipeline { sh 'find PortaFiducia/tests/outputs -name "*.xml" | xargs du -shc' junit testResults: 'PortaFiducia/tests/outputs/**/*.xml', keepLongStdio: false archiveArtifacts artifacts: 'PortaFiducia/tests/outputs/**/*.*' + script { + // Try To Cleanup Resources + def regions = ["us-east-1", "eu-north-1", "us-west-2"] + cluster_name_prefix = get_cluster_name_prefix(env.BUILD_TAG) + regions.each { region -> + sh ". venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name '${cluster_name_prefix}*' --region ${region}" + } + // Windows Cluster, has a different name + sh """ + . venv/bin/activate + ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name WindowsLibfabricCi_${env.CHANGE_ID}_* + """ + } } - failure { - sh ''' - . venv/bin/activate - ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name WindowsLibfabricCi_${env.CHANGE_ID}_* - ''' - } - aborted { - sh '. venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name "$BUILD_TAG"\'*\' --region $REGION' - } - // Cleanup workspace after job completes. 
cleanup { deleteDir() } From 83f406fd8504ad322466b0b73573c441cde0b2d1 Mon Sep 17 00:00:00 2001 From: szegel Date: Wed, 8 Jan 2025 02:15:49 +0000 Subject: [PATCH 350/393] contrib/aws: Remove test config file, and pass in command line arguments instead Signed-off-by: szegel --- contrib/aws/Jenkinsfile | 82 ++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 0025d9bacae..6c6b234419a 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -43,13 +43,13 @@ def install_porta_fiducia() { ''' } -def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, test_config_file, addl_args) { +def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, addl_args) { /* * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments * param@ args: str, the command line arguments */ def cluster_name = get_cluster_name(build_tag, os, instance_type) - def args = "--config configs/${test_config_file} --os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" + def args = "--os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" sh ". 
venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}" } @@ -105,7 +105,7 @@ def get_single_node_windows_test_stage_with_lock(stage_name, lock_label) { } -def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_count, region, test_config, lock_label, addl_args) { +def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_count, region, lock_label, addl_args) { /* * Generate a single test stage that run test_orchestrator.py with the given parameters. * param@ stage_name: the name of the stage @@ -114,14 +114,13 @@ def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_ * param@ instance_type: the instance type for the test stage. * param@ instance_count: number of intances to use * param@ region: the (default) aws region where the tests are run. - * param@ test_config: the name of test config file in PortaFiducia/tests/configs/ * param@ addl_args: additional arguments passed to test_orchestrator.py * return@: the test stage. 
*/ return { stage("${stage_name}") { lock(label: lock_label, quantity: instance_count) { - this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) + this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, addl_args) } } } @@ -169,9 +168,24 @@ pipeline { steps { script { def stages = [:] - // This needs the extra space at the end - // Set 12 hour timeout for all clusters - def addl_args_pr = "--timeout 720 --test-libfabric-pr $env.CHANGE_ID " + def timeout = "--timeout 720" + def generic_pf = "--cluster-type manual_cluster --test-target libfabric --test-type pr --test-libfabric-pr $env.CHANGE_ID" + // onesided tests are covered by imb + // collective tests are covered by omb + def test_list = "--test-list test_efa_ut 'test_omb and not onesided' test_fabtests_functional test_fork_support test_backward_compatibility 'test_imb and not collective'" + + def efa_provider = "--test-libfabric-provider efa" + def addl_args_efa = "${timeout} ${generic_pf} ${efa_provider} ${test_list}" + + def shm_provider = "--test-libfabric-provider shm" + def addl_args_shm = "${timeout} ${generic_pf} ${shm_provider} ${test_list}" + + def tcp_provider = "--test-libfabric-provider tcp --enable-efa false" + def addl_args_tcp = "${timeout} ${generic_pf} ${tcp_provider} ${test_list}" + + def sockets_provider = "--test-libfabric-provider sockets --enable-efa false" + def addl_args_sockets = "${timeout} ${generic_pf} ${sockets_provider} ${test_list}" + // Use lockable resources to limit the number of jobs that can get executed in parallel def g4dn8x_lock_label = "g4dn8x" def g4dn12x_lock_label = "g4dn12x" @@ -182,43 +196,43 @@ pipeline { def c6g2x_lock_label = "c6g2x" // Single Node Tests - EFA - stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) - 
stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) - stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) - stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr) + stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) + stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) + stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) + stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) // Single Node Tests - SHM - stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm") - stages["1_g4dn_alinux2023_shm"] = get_test_stage_with_lock("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, addl_args_pr + "--test-libfabric-provider shm") - stages["1_g4dn_ubuntu2004_shm"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", g4dn8x_lock_label, 
addl_args_pr + "--test-libfabric-provider shm") - stages["1_c5_rhel8_shm"] = get_test_stage_with_lock("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", c52x_lock_label, addl_args_pr + "--test-libfabric-provider shm --enable-efa false") - stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage_with_lock("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", c52x_lock_label, addl_args_pr + "--test-libfabric-provider shm --enable-cma false --enable-efa false") + stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) + stages["1_g4dn_alinux2023_shm"] = get_test_stage_with_lock("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) + stages["1_g4dn_ubuntu2004_shm"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) + stages["1_c5_rhel8_shm"] = get_test_stage_with_lock("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", c52x_lock_label, addl_args_shm + " --enable-efa false") + stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage_with_lock("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", c52x_lock_label, addl_args_shm + " --enable-cma false --enable-efa false") // Single Node Windows Test stages["EFA_Windows_Test"] = get_single_node_windows_test_stage_with_lock("EFA_Windows_Test", c5n18x_lock_label) // Multi Node Tests - EFA - stages["2_hpc6a_alinux2_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) - stages["2_hpc6a_alinux2023_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa", 
env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) - stages["2_c6gn_alinux2_efa"] = get_test_stage_with_lock("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6gn16x_lock_label, addl_args_pr) - stages["2_c6gn_alinux2023_efa"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6gn16x_lock_label, addl_args_pr) - stages["2_c5n_alinux2_efa"] = get_test_stage_with_lock("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", c5n18x_lock_label, addl_args_pr) - stages["2_c5n_alinux2023_efa"] = get_test_stage_with_lock("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", "libfabric_pr_test.yaml", c5n18x_lock_label, addl_args_pr) - stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) - stages["2_hpc6a_rhel8_efa"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", hpc6a48x_lock_label, addl_args_pr) + stages["2_hpc6a_alinux2_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) + stages["2_hpc6a_alinux2023_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) + stages["2_c6gn_alinux2_efa"] = get_test_stage_with_lock("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa) + stages["2_c6gn_alinux2023_efa"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", 
"c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa) + stages["2_c5n_alinux2_efa"] = get_test_stage_with_lock("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa) + stages["2_c5n_alinux2023_efa"] = get_test_stage_with_lock("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa) + stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) + stages["2_hpc6a_rhel8_efa"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) // Multi Node Tests - TCP - stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider tcp --enable-efa false") - stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", "libfabric_pr_test.yaml", g4dn12x_lock_label, addl_args_pr + 
"--test-libfabric-provider tcp --enable-efa false --test-list test_nccl_tests") + stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) + stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", g4dn12x_lock_label, addl_args_tcp + " --test-list test_nccl_tests") // Multi Node Tests - SOCKETS - stages["2_c6g_alinux2_sockets"] = get_test_stage_with_lock("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_alinux2023_sockets"] = get_test_stage_with_lock("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_ubuntu2004_sockets"] = get_test_stage_with_lock("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + "--test-libfabric-provider sockets --enable-efa false") - stages["2_c6g_rhel8_sockets"] = get_test_stage_with_lock("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", "libfabric_pr_test.yaml", c6g2x_lock_label, addl_args_pr + 
"--test-libfabric-provider sockets --enable-efa false") + stages["2_c6g_alinux2_sockets"] = get_test_stage_with_lock("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) + stages["2_c6g_alinux2023_sockets"] = get_test_stage_with_lock("2_c6g_alinux2023_sockets", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) + stages["2_c6g_ubuntu2004_sockets"] = get_test_stage_with_lock("2_c6g_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) + stages["2_c6g_rhel8_sockets"] = get_test_stage_with_lock("2_c6g_rhel8_sockets", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) parallel stages } From 9f5b600f7df22eda7b6a0347e85cf748bc979a1a Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 8 Jan 2025 18:23:49 +0000 Subject: [PATCH 351/393] contrib/aws: EFA 2 node MPI/Libfabric tests in parallel Reduce the time it takes to run AWS PR CI to 2.5 hours by creating new clusters to run Libfabric/MPI tests in parallel. AWS's Jenkins uses lockable resources to limit the max number of instances used at any one time. This patch will cause Jenkins to scale up to our max number of allowed instances faster, which effectively reduces the number of jobs that can be run in parallel (jobs will queue). The queue time will be shorter because the jobs using the resource under contention will run faster. When there are few jobs running on the server, jobs time will go from 4.5 hours to 2.5 hours. When the server is under heavy load, job completion time should not change much. 
Signed-off-by: Seth Zegelstein --- contrib/aws/Jenkinsfile | 58 +++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 6c6b234419a..39a54e9219f 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -168,23 +168,27 @@ pipeline { steps { script { def stages = [:] - def timeout = "--timeout 720" + def timeout = "--timeout 210" def generic_pf = "--cluster-type manual_cluster --test-target libfabric --test-type pr --test-libfabric-pr $env.CHANGE_ID" - // onesided tests are covered by imb - // collective tests are covered by omb - def test_list = "--test-list test_efa_ut 'test_omb and not onesided' test_fabtests_functional test_fork_support test_backward_compatibility 'test_imb and not collective'" + // onesided tests are covered by imb, collective tests are covered by omb + def mpi_collective_tests = "'test_omb and not onesided'" + def libfabric_tests = "test_efa_ut test_fabtests_functional test_fork_support test_backward_compatibility" + def one_sided_tests = "'test_imb and not collective'" + def libfabric_and_onesided_tests = "${libfabric_tests} ${one_sided_tests}" def efa_provider = "--test-libfabric-provider efa" - def addl_args_efa = "${timeout} ${generic_pf} ${efa_provider} ${test_list}" + def addl_args_efa_libfabric_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" + def addl_args_efa_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${mpi_collective_tests}" + def addl_args_efa_libfabric_and_onesided_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${libfabric_and_onesided_tests}" def shm_provider = "--test-libfabric-provider shm" - def addl_args_shm = "${timeout} ${generic_pf} ${shm_provider} ${test_list}" + def addl_args_shm = "${timeout} ${generic_pf} ${shm_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" def tcp_provider = 
"--test-libfabric-provider tcp --enable-efa false" - def addl_args_tcp = "${timeout} ${generic_pf} ${tcp_provider} ${test_list}" + def addl_args_tcp = "${timeout} ${generic_pf} ${tcp_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" def sockets_provider = "--test-libfabric-provider sockets --enable-efa false" - def addl_args_sockets = "${timeout} ${generic_pf} ${sockets_provider} ${test_list}" + def addl_args_sockets = "${timeout} ${generic_pf} ${sockets_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}" // Use lockable resources to limit the number of jobs that can get executed in parallel def g4dn8x_lock_label = "g4dn8x" @@ -196,10 +200,10 @@ pipeline { def c6g2x_lock_label = "c6g2x" // Single Node Tests - EFA - stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) - stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) - stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) - stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa) + stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) + stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) + stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, 
"us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) + stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi) // Single Node Tests - SHM stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm) @@ -212,14 +216,28 @@ pipeline { stages["EFA_Windows_Test"] = get_single_node_windows_test_stage_with_lock("EFA_Windows_Test", c5n18x_lock_label) // Multi Node Tests - EFA - stages["2_hpc6a_alinux2_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) - stages["2_hpc6a_alinux2023_efa"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) - stages["2_c6gn_alinux2_efa"] = get_test_stage_with_lock("2_c6gn_alinux2_efa", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa) - stages["2_c6gn_alinux2023_efa"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa) - stages["2_c5n_alinux2_efa"] = get_test_stage_with_lock("2_c5n_alinux2_efa", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa) - stages["2_c5n_alinux2023_efa"] = get_test_stage_with_lock("2_c5n_alinux2023_efa", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa) - stages["2_hpc6a_ubuntu2004_efa"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa) - stages["2_hpc6a_rhel8_efa"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, 
"eu-north-1", hpc6a48x_lock_label, addl_args_efa) + stages["2_hpc6a_alinux2_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_alinux2_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_hpc6a_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_c6gn_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_mpi) + stages["2_c6gn_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_c5n_alinux2_efa_mpi"] = get_test_stage_with_lock("2_c5n_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_mpi) + stages["2_c5n_alinux2_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c5n_alinux2_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_c5n_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_c5n_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", 
c5n18x_lock_label, addl_args_efa_mpi) + stages["2_c5n_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c5n_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_hpc6a_ubuntu2004_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa_mpi", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_ubuntu2004_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa_libfabric_and_one_sided", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + stages["2_hpc6a_rhel8_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa_mpi", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi) + stages["2_hpc6a_rhel8_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa_libfabric_and_one_sided", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi) + + // cg6n AL2 builds are the slowest b/c they have asan turned on with debug, and have slower memcpy speeds + // split "libfabric tests" into "fabtests", and imb + def addl_args_efa_one_sided_only = "${timeout} ${generic_pf} ${efa_provider} --test-list ${one_sided_tests}" + def addl_args_efa_libfabric_only = "${timeout} ${generic_pf} ${efa_provider} --test-list ${libfabric_tests}" + stages["2_c6gn_alinux2_efa_mpi"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_mpi) + stages["2_c6gn_alinux2_efa_one_sided"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_one_sided", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_one_sided_only) + 
stages["2_c6gn_alinux2_efa_libfabric"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_libfabric", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_libfabric_only) // Multi Node Tests - TCP stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) From 2a320f1ee698e489b2f96a462f5ae8ee11f787d7 Mon Sep 17 00:00:00 2001 From: PukNgae Cryolitia Date: Thu, 9 Jan 2025 20:46:47 +0800 Subject: [PATCH 352/393] prov/opx: use `page_sizes[OFI_PAGE_SIZE]-1` instead of `PAGE_MASK` Fix: https://github.com/ofiwg/libfabric/issues/10661 Signed-off-by: PukNgae Cryolitia --- prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h b/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h index 2b57bc16115..c83eb5b75da 100644 --- a/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h +++ b/prov/opx/include/rdma/opx/opx_hfi1_pre_cn5000.h @@ -38,6 +38,7 @@ #include #include "fi_opx_hfi1.h" +#include "ofi_mem.h" /* Implementation PRE-CN5000 */ #ifdef OPX_PRE_CN5000 @@ -116,7 +117,7 @@ int opx_get_port(struct hfi1_user_info_dep *uinfo) #define OPX_HFI1_MMAP_MAGIC 0xdabbad00 -#define opx_offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +#define opx_offset_in_page(p) ((unsigned long)(p) & (page_sizes[OFI_PAGE_SIZE]-1)) #define OPX_HFI1_MMAP_TOKEN_SET(field, val) \ (((val) & OPX_HFI1_MMAP_##field##_MASK) << OPX_HFI1_MMAP_##field##_SHIFT) From 294963e80c30c21a4a81bfd18c3ad29d2356cda6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:03:14 +0000 Subject: [PATCH 353/393] build(deps): bump actions/upload-artifact from 4.4.3 to 4.6.0 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.4.3 to 4.6.0. 
- [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882...65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/coverity.yml | 2 +- .github/workflows/pr-ci.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 4cad165bc2b..a448a906d11 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -94,7 +94,7 @@ jobs: --form description="`$PWD/install/bin/fi_info -l`" \ https://scan.coverity.com/builds?project=ofiwg%2Flibfabric - name: Upload build logs - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: coverity-build-log.txt path: cov-int/build-log.txt diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 9d595f5f844..bf325eee07c 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -70,7 +70,7 @@ jobs: $PWD/install/bin/fi_info -l - name: Upload build logs if: failure() - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: ${{ matrix.os }}-${{ matrix.cc }}-config.log path: config.log @@ -115,7 +115,7 @@ jobs: $PWD/install/bin/fi_info -c FI_HMEM - name: Upload build logs if: failure() - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: hmem-config.log path: config.log @@ -139,7 +139,7 @@ jobs: make -j2 
- name: Upload build logs if: failure() - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: macos-config.log path: config.log diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index e5925ff9bd4..f3a1a3ebe17 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -60,7 +60,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: SARIF file path: results.sarif From e6b3dc0e5ef1151aa4b11d583f4283bd857a6da9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:03:31 +0000 Subject: [PATCH 354/393] build(deps): bump github/codeql-action from 3.27.9 to 3.28.1 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.27.9 to 3.28.1. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/df409f7d9260372bd5f19e5b04e83cb3c43714ae...b6a472f63d85b9c78a3ac5e89422239fc15e9b3c) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 771ad835f61..426f0c4f60b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 + uses: github/codeql-action/init@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 + uses: github/codeql-action/autobuild@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 # â„šī¸ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 + uses: github/codeql-action/analyze@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index f3a1a3ebe17..9450f4ca1c1 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 + uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: sarif_file: results.sarif From 71dd1a12818a7629974db375b403798749c3e394 Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Mon, 13 Jan 2025 19:47:55 +0000 Subject: [PATCH 355/393] prov/efa: Deprecate FI_AV_MAP FI_AV_MAP is deprecated in Libfabric 2.x. EFA provider was overriding AV type to FI_AV_TABLE even before this change. This change removes all references to FI_AV_MAP in the EFA provider. It will print a warning and switch to FI_AV_TABLE if the application requests FI_AV_MAP. Signed-off-by: Sai Sunku --- man/fi_efa.7.md | 4 +++- prov/efa/src/efa_av.c | 16 ++++++++-------- prov/efa/src/efa_av.h | 2 -- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index 077f93c5515..cdfcfff3350 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -39,7 +39,9 @@ The following features are supported: message size of the MTU of the underlying hardware (approximately 8 KiB). *Address vectors* -: The provider supports *FI_AV_TABLE* and *FI_AV_MAP* address vector types. +: The provider supports *FI_AV_TABLE*. *FI_AV_MAP* was deprecated in Libfabric 2.x. + Applications can still use *FI_AV_MAP* to create an address vector. But the EFA + provider implementation will print a warning and switch to *FI_AV_TABLE*. *FI_EVENT* is unsupported. 
*Completion events* diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 7fef9d5b41c..6e8d0fcaa7c 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -56,10 +56,6 @@ struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) return NULL; - if (av->type == FI_AV_MAP) { - return (struct efa_conn *)fi_addr; - } - assert(av->type == FI_AV_TABLE); util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr); if (!util_av_entry) @@ -475,8 +471,8 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, conn = &efa_av_entry->conn; memset(conn, 0, sizeof(*conn)); conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr; - assert(av->type == FI_AV_MAP || av->type == FI_AV_TABLE); - conn->fi_addr = (av->type == FI_AV_MAP) ? (uintptr_t)(void *)conn : util_av_fi_addr; + assert(av->type == FI_AV_TABLE); + conn->fi_addr = util_av_fi_addr; conn->util_av_fi_addr = util_av_fi_addr; conn->ah = efa_ah_alloc(av, raw_addr->raw); @@ -691,7 +687,7 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); struct efa_conn *conn = NULL; - if (av->type != FI_AV_MAP && av->type != FI_AV_TABLE) + if (av->type != FI_AV_TABLE) return -FI_EINVAL; if (fi_addr == FI_ADDR_NOTAVAIL) @@ -744,7 +740,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, return -FI_EINVAL; av = container_of(av_fid, struct efa_av, util_av.av_fid); - if (av->type != FI_AV_MAP && av->type != FI_AV_TABLE) + if (av->type != FI_AV_TABLE) return -FI_EINVAL; ofi_mutex_lock(&av->util_av.lock); @@ -897,6 +893,10 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, if (!av) return -FI_ENOMEM; + if (attr->type == FI_AV_MAP) { + EFA_WARN(FI_LOG_AV, "FI_AV_MAP is deprecated in Libfabric 2.x. Please use FI_AV_TABLE. 
" + "EFA provider will now switch to using FI_AV_TABLE.\n"); + } attr->type = FI_AV_TABLE; efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid); diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 5d885adbdca..2ee14eda6e4 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -22,8 +22,6 @@ struct efa_ah { struct efa_conn { struct efa_ah *ah; struct efa_ep_addr *ep_addr; - /* for FI_AV_TABLE, fi_addr is same as util_av_fi_addr, - * for FI_AV_MAP, fi_addr is pointer to efa_conn; */ fi_addr_t fi_addr; fi_addr_t util_av_fi_addr; struct efa_rdm_peer rdm_peer; From 628c65ab1d0e29fa7c696c6323486aebe79df6f1 Mon Sep 17 00:00:00 2001 From: OFIWG Bot Date: Mon, 13 Jan 2025 23:25:29 +0000 Subject: [PATCH 356/393] Updated nroff-generated man pages Signed-off-by: OFIWG Bot --- man/man7/fi_efa.7 | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index 79f2e7f0d92..6d6f780b3c7 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -14,7 +14,7 @@ . ftr VB CB . ftr VBI CBI .\} -.TH "fi_efa" "7" "2024\-12\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_efa" "7" "2025\-01\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -56,8 +56,12 @@ maximum message size of the MTU of the underlying hardware (approximately 8 KiB). .TP \f[I]Address vectors\f[R] -The provider supports \f[I]FI_AV_TABLE\f[R] and \f[I]FI_AV_MAP\f[R] -address vector types. +The provider supports \f[I]FI_AV_TABLE\f[R]. +\f[I]FI_AV_MAP\f[R] was deprecated in Libfabric 2.x. +Applications can still use \f[I]FI_AV_MAP\f[R] to create an address +vector. +But the EFA provider implementation will print a warning and switch to +\f[I]FI_AV_TABLE\f[R]. \f[I]FI_EVENT\f[R] is unsupported. 
.TP \f[I]Completion events\f[R] From 815a1662c83f8e399ed34ce4784bb31e8930aed6 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Mon, 13 Jan 2025 18:53:15 +0000 Subject: [PATCH 357/393] prov/efa: Make efa_rdm_cq use efa_cq The structs efa_rdm_cq and efa_cq share the same members util_cq and ibv_cq. This patch makes efa_rdm_cq use efa_cq as a subset so we can convert between efa_rdm_cq and efa_cq via container_of, like efa_rdm_ep and efa_base_ep. Signed-off-by: Shi Jin --- prov/efa/src/efa_base_ep.c | 11 +++ prov/efa/src/efa_base_ep.h | 5 ++ prov/efa/src/rdm/efa_rdm_cq.c | 42 +++++------ prov/efa/src/rdm/efa_rdm_cq.h | 3 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 74 ++++++++++---------- prov/efa/test/efa_unit_test_cq.c | 104 ++++++++++++++-------------- prov/efa/test/efa_unit_test_ep.c | 32 +++++---- 7 files changed, 144 insertions(+), 127 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 2abdee189dc..85068fa91c6 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -4,6 +4,7 @@ #include #include "efa.h" #include "efa_av.h" +#include "efa_cq.h" #include "rdm/efa_rdm_protocol.h" int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) @@ -520,3 +521,13 @@ const char *efa_base_ep_get_peer_raw_addr_str(struct efa_base_ep *base_ep, fi_ad { return ofi_straddr(buf, buflen, FI_ADDR_EFA, efa_base_ep_get_peer_raw_addr(base_ep, addr)); } + +struct efa_cq *efa_base_ep_get_tx_cq(struct efa_base_ep *ep) +{ + return ep->util_ep.tx_cq ? container_of(ep->util_ep.tx_cq, struct efa_cq, util_cq) : NULL; +} + +struct efa_cq *efa_base_ep_get_rx_cq(struct efa_base_ep *ep) +{ + return ep->util_ep.rx_cq ? 
container_of(ep->util_ep.rx_cq, struct efa_cq, util_cq) : NULL; +} diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 86657c5dc12..a7d1526919e 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -109,4 +109,9 @@ struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, const char *efa_base_ep_get_peer_raw_addr_str(struct efa_base_ep *base_ep, fi_addr_t addr, char *buf, size_t *buflen); + +struct efa_cq *efa_base_ep_get_tx_cq(struct efa_base_ep *ep); + +struct efa_cq *efa_base_ep_get_rx_cq(struct efa_base_ep *ep); + #endif diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 5a18ef17003..24051cc2e8a 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -36,16 +36,16 @@ int efa_rdm_cq_close(struct fid *fid) retv = 0; - cq = container_of(fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(fid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); - if (cq->ibv_cq.ibv_cq_ex) { - ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + if (cq->efa_cq.ibv_cq.ibv_cq_ex) { + ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->efa_cq.ibv_cq.ibv_cq_ex)); if (ret) { EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", fi_strerror(-ret)); return ret; } - cq->ibv_cq.ibv_cq_ex = NULL; + cq->efa_cq.ibv_cq.ibv_cq_ex = NULL; } if (cq->shm_cq) { @@ -56,7 +56,7 @@ int efa_rdm_cq_close(struct fid *fid) } } - ret = ofi_cq_cleanup(&cq->util_cq); + ret = ofi_cq_cleanup(&cq->efa_cq.util_cq); if (ret) return ret; free(cq); @@ -435,13 +435,13 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) int prov_errno; struct efa_rdm_ep *ep = NULL; struct fi_cq_err_entry err_entry; - struct efa_rdm_cq *efa_rdm_cq; + struct efa_cq *efa_cq; struct efa_domain *efa_domain; struct efa_qp *qp; struct dlist_entry rx_progressed_ep_list, *tmp; - efa_rdm_cq = container_of(ibv_cq, struct efa_rdm_cq, ibv_cq); - efa_domain = 
container_of(efa_rdm_cq->util_cq.domain, struct efa_domain, util_domain); + efa_cq = container_of(ibv_cq, struct efa_cq, ibv_cq); + efa_domain = container_of(efa_cq->util_cq.domain, struct efa_domain, util_domain); dlist_init(&rx_progressed_ep_list); /* Call ibv_start_poll only once */ @@ -538,7 +538,7 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) .prov_errno = prov_errno, .op_context = NULL }; - ofi_cq_write_error(&efa_rdm_cq->util_cq, &err_entry); + ofi_cq_write_error(&efa_cq->util_cq, &err_entry); } if (should_end_poll) @@ -559,9 +559,9 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun ssize_t ret; struct efa_domain *domain; - cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(cq_fid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); - domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); + domain = container_of(cq->efa_cq.util_cq.domain, struct efa_domain, util_domain); ofi_genlock_lock(&domain->srx_lock); @@ -573,13 +573,13 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun * completion to efa. Use ofi_cq_read_entries to get the number of * shm completions without progressing efa ep again. 
*/ - ret = ofi_cq_read_entries(&cq->util_cq, buf, count, src_addr); + ret = ofi_cq_read_entries(&cq->efa_cq.util_cq, buf, count, src_addr); if (ret > 0) goto out; } - ret = ofi_cq_readfrom(&cq->util_cq.cq_fid, buf, count, src_addr); + ret = ofi_cq_readfrom(&cq->efa_cq.util_cq.cq_fid, buf, count, src_addr); out: ofi_genlock_unlock(&domain->srx_lock); @@ -608,8 +608,8 @@ static void efa_rdm_cq_progress(struct util_cq *cq) struct fid_list_entry *fid_entry; ofi_genlock_lock(&cq->ep_list_lock); - efa_rdm_cq = container_of(cq, struct efa_rdm_cq, util_cq); - efa_domain = container_of(efa_rdm_cq->util_cq.domain, struct efa_domain, util_domain); + efa_rdm_cq = container_of(cq, struct efa_rdm_cq, efa_cq.util_cq); + efa_domain = container_of(efa_rdm_cq->efa_cq.util_cq.domain, struct efa_domain, util_domain); /** * TODO: It's better to just post the initial batch of internal rx pkts during ep enable @@ -671,19 +671,19 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, dlist_init(&cq->ibv_cq_poll_list); cq->need_to_scan_ep_list = false; - ret = ofi_cq_init(&efa_prov, domain, attr, &cq->util_cq, + ret = ofi_cq_init(&efa_prov, domain, attr, &cq->efa_cq.util_cq, &efa_rdm_cq_progress, context); if (ret) goto free; - ret = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, &cq->ibv_cq.ibv_cq_ex, &cq->ibv_cq.ibv_cq_ex_type); + ret = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, &cq->efa_cq.ibv_cq.ibv_cq_ex, &cq->efa_cq.ibv_cq.ibv_cq_ex_type); if (ret) { EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", fi_strerror(ret)); goto close_util_cq; } - *cq_fid = &cq->util_cq.cq_fid; + *cq_fid = &cq->efa_cq.util_cq.cq_fid; (*cq_fid)->fid.ops = &efa_rdm_cq_fi_ops; (*cq_fid)->ops = &efa_rdm_cq_ops; @@ -693,7 +693,7 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, /* Bind ep with shm provider's cq */ shm_cq_attr.flags |= FI_PEER; peer_cq_context.size = sizeof(peer_cq_context); - peer_cq_context.cq = cq->util_cq.peer_cq; + 
peer_cq_context.cq = cq->efa_cq.util_cq.peer_cq; ret = fi_cq_open(efa_domain->shm_domain, &shm_cq_attr, &cq->shm_cq, &peer_cq_context); if (ret) { @@ -704,12 +704,12 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, return 0; destroy_ibv_cq: - retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->efa_cq.ibv_cq.ibv_cq_ex)); if (retv) EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", fi_strerror(-retv)); close_util_cq: - retv = ofi_cq_cleanup(&cq->util_cq); + retv = ofi_cq_cleanup(&cq->efa_cq.util_cq); if (retv) EFA_WARN(FI_LOG_CQ, "Unable to close util cq: %s\n", fi_strerror(-retv)); diff --git a/prov/efa/src/rdm/efa_rdm_cq.h b/prov/efa/src/rdm/efa_rdm_cq.h index a56d62dac40..e1a865ee127 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.h +++ b/prov/efa/src/rdm/efa_rdm_cq.h @@ -8,8 +8,7 @@ #include struct efa_rdm_cq { - struct util_cq util_cq; - struct efa_ibv_cq ibv_cq; + struct efa_cq efa_cq; struct fid_cq *shm_cq; struct dlist_entry ibv_cq_poll_list; bool need_to_scan_ep_list; diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 3508fe7ba75..1981ed9825f 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -37,13 +37,13 @@ void efa_rdm_ep_construct_ibv_qp_init_attr_ex(struct efa_rdm_ep *ep, static inline struct efa_rdm_cq *efa_rdm_ep_get_tx_rdm_cq(struct efa_rdm_ep *ep) { - return ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, util_cq) : NULL; + return ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq) : NULL; } static inline struct efa_rdm_cq *efa_rdm_ep_get_rx_rdm_cq(struct efa_rdm_ep *ep) { - return ep->base_ep.util_ep.rx_cq ? container_of(ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq) : NULL; + return ep->base_ep.util_ep.rx_cq ? 
container_of(ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, efa_cq.util_cq) : NULL; } /** @@ -58,33 +58,33 @@ static int efa_rdm_ep_create_base_ep_ibv_qp(struct efa_rdm_ep *ep) { struct ibv_qp_init_attr_ex attr_ex = { 0 }; - struct efa_rdm_cq *tx_rdm_cq, *rx_rdm_cq; + struct efa_cq *tx_cq, *rx_cq; struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq; int ret; - tx_rdm_cq = efa_rdm_ep_get_tx_rdm_cq(ep); - rx_rdm_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + tx_cq = efa_base_ep_get_tx_cq(&ep->base_ep); + rx_cq = efa_base_ep_get_rx_cq(&ep->base_ep); - if (!tx_rdm_cq && !rx_rdm_cq) { + if (!tx_cq && !rx_cq) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a send or receive completion queue\n"); return -FI_ENOCQ; } - if (!tx_rdm_cq && ofi_needs_tx(ep->base_ep.info->caps)) { + if (!tx_cq && ofi_needs_tx(ep->base_ep.info->caps)) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); return -FI_ENOCQ; } - if (!rx_rdm_cq && ofi_needs_rx(ep->base_ep.info->caps)) { + if (!rx_cq && ofi_needs_rx(ep->base_ep.info->caps)) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled (FI_RECV).\n"); return -FI_ENOCQ; } - tx_ibv_cq = tx_rdm_cq ? tx_rdm_cq->ibv_cq.ibv_cq_ex : rx_rdm_cq->ibv_cq.ibv_cq_ex; - rx_ibv_cq = rx_rdm_cq ? rx_rdm_cq->ibv_cq.ibv_cq_ex : tx_rdm_cq->ibv_cq.ibv_cq_ex; + tx_ibv_cq = tx_cq ? tx_cq->ibv_cq.ibv_cq_ex : rx_cq->ibv_cq.ibv_cq_ex; + rx_ibv_cq = rx_cq ? 
rx_cq->ibv_cq.ibv_cq_ex : tx_cq->ibv_cq.ibv_cq_ex; efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq); @@ -699,9 +699,9 @@ static int efa_rdm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) } break; case FI_CLASS_CQ: - cq = container_of(bfid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(bfid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); - ret = ofi_ep_bind_cq(&efa_rdm_ep->base_ep.util_ep, &cq->util_cq, flags); + ret = ofi_ep_bind_cq(&efa_rdm_ep->base_ep.util_ep, &cq->efa_cq.util_cq, flags); if (ret) return ret; @@ -873,12 +873,12 @@ bool efa_rdm_ep_has_unfinished_send(struct efa_rdm_ep *efa_rdm_ep) static inline void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep) { - struct efa_rdm_cq *tx_cq, *rx_cq; + struct efa_cq *tx_cq, *rx_cq; ofi_genlock_lock(&efa_rdm_ep_domain(efa_rdm_ep)->srx_lock); - tx_cq = efa_rdm_ep_get_tx_rdm_cq(efa_rdm_ep); - rx_cq = efa_rdm_ep_get_rx_rdm_cq(efa_rdm_ep); + tx_cq = efa_base_ep_get_tx_cq(&efa_rdm_ep->base_ep); + rx_cq = efa_base_ep_get_rx_cq(&efa_rdm_ep->base_ep); while (efa_rdm_ep_has_unfinished_send(efa_rdm_ep)) { /* poll cq until empty */ @@ -898,10 +898,10 @@ void efa_rdm_ep_remove_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) int i; struct efa_cntr *efa_cntr; struct util_cntr *util_cntr; - struct efa_rdm_cq *tx_cq, *rx_cq; + struct efa_cq *tx_cq, *rx_cq; - tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); - rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + tx_cq = efa_base_ep_get_tx_cq(&ep->base_ep); + rx_cq = efa_base_ep_get_rx_cq(&ep->base_ep); for (i = 0; i< CNTR_CNT; i++) { util_cntr = ep->base_ep.util_ep.cntrs[i]; @@ -928,16 +928,16 @@ void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) * It must happen after ofi_endpoint_close * so we have cq's reference counters updated. 
*/ - if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) { - efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + if (tx_cq && !ofi_atomic_get32(&tx_cq->efa_cq.util_cq.ref)) { + efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); if (rx_cq) - efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); } - if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) { - efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + if (rx_cq && !ofi_atomic_get32(&rx_cq->efa_cq.util_cq.ref)) { + efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); if (tx_cq) - efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); } } @@ -1099,7 +1099,7 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) efa_av->shm_rdm_av = NULL; } - efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, util_cq); + efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq); if (efa_rdm_cq->shm_cq) { ret = fi_close(&efa_rdm_cq->shm_cq->fid); if (ret) @@ -1107,7 +1107,7 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) efa_rdm_cq->shm_cq = NULL; } - efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq); + efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, efa_cq.util_cq); if (efa_rdm_cq->shm_cq) { ret = fi_close(&efa_rdm_cq->shm_cq->fid); if (ret) @@ -1187,9 +1187,9 @@ int 
efa_rdm_ep_insert_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) int i, ret; struct efa_cntr *efa_cntr; struct util_cntr *util_cntr; - struct efa_rdm_cq *tx_cq, *rx_cq; - tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); - rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + struct efa_cq *tx_cq, *rx_cq; + tx_cq = efa_base_ep_get_tx_cq(&ep->base_ep); + rx_cq = efa_base_ep_get_rx_cq(&ep->base_ep); for (i = 0; i < CNTR_CNT; i++) { util_cntr = ep->base_ep.util_ep.cntrs[i]; @@ -1224,33 +1224,33 @@ int efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); if (tx_cq) { - ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); if (ret) return ret; if (rx_cq) { - ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); if (ret) return ret; } - ofi_genlock_lock(&tx_cq->util_cq.ep_list_lock); + ofi_genlock_lock(&tx_cq->efa_cq.util_cq.ep_list_lock); tx_cq->need_to_scan_ep_list = true; - ofi_genlock_unlock(&tx_cq->util_cq.ep_list_lock); + ofi_genlock_unlock(&tx_cq->efa_cq.util_cq.ep_list_lock); } if (rx_cq) { - ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &rx_cq->efa_cq.ibv_cq); if (ret) return ret; if (tx_cq) { - ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->efa_cq.util_cq.ep_list_lock, &tx_cq->efa_cq.ibv_cq); if (ret) return ret; } - ofi_genlock_lock(&rx_cq->util_cq.ep_list_lock); + 
ofi_genlock_lock(&rx_cq->efa_cq.util_cq.ep_list_lock); rx_cq->need_to_scan_ep_list = true; - ofi_genlock_unlock(&rx_cq->util_cq.ep_list_lock); + ofi_genlock_unlock(&rx_cq->efa_cq.util_cq.ep_list_lock); } return FI_SUCCESS; diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index c5b93cd5e66..e69fb8b432e 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -20,22 +20,13 @@ void test_impl_cq_read_empty_cq(struct efa_resource *resource, enum fi_ep_type e struct ibv_cq_ex *ibv_cqx; struct fi_cq_data_entry cq_entry; int ret; + struct efa_base_ep *efa_base_ep; efa_unit_test_resource_construct(resource, ep_type); - if (ep_type == FI_EP_DGRAM) { - struct efa_dgram_ep *efa_dgram_ep; - - efa_dgram_ep = container_of(resource->ep, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - ibv_cqx = container_of(efa_dgram_ep->base_ep.util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; - } else { - struct efa_rdm_ep *efa_rdm_ep; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert(efa_rdm_ep->base_ep.util_ep.rx_cq); - ibv_cqx = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq)->ibv_cq.ibv_cq_ex; - } + efa_base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + ibv_cqx = container_of(efa_base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; /* ibv_start_poll to return ENOENT means device CQ is empty */ @@ -110,8 +101,8 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, efa_rdm_ep->host_id = local_host_id; ibv_qpx = efa_rdm_ep->base_ep.qp->ibv_qp_ex; - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); - ibv_cqx = efa_rdm_cq->ibv_cq.ibv_cq_ex; + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; ret = 
fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); @@ -296,6 +287,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) struct fi_eq_err_entry eq_err_entry; int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; efa_unit_test_resource_construct(resource, FI_EP_RDM); @@ -314,13 +306,14 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) assert_non_null(pkt_entry); efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, 0); will_return(efa_mock_ibv_end_poll_check_mock, NULL); @@ -334,12 +327,12 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) /* the recv error will not populate to application cq because it's an EFA internal error and * and not related to any application recv. Currently we can only read the error from eq. 
*/ - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + ibv_cqx->wr_id = (uintptr_t)pkt_entry; + ibv_cqx->status = IBV_WC_GENERAL_ERR; #if HAVE_CAPS_UNSOLICITED_WRITE_RECV if (efa_use_unsolicited_write_recv()) { - efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + efadv_cq_from_ibv_cq_ex(ibv_cqx)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; will_return(efa_mock_efadv_wc_is_unsolicited, false); } #endif @@ -372,18 +365,20 @@ void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource struct fi_eq_err_entry eq_err_entry; int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, 0); will_return(efa_mock_ibv_end_poll_check_mock, NULL); @@ -399,10 +394,10 @@ void 
test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource #if HAVE_CAPS_UNSOLICITED_WRITE_RECV if (use_unsolicited_recv) { - efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; + efadv_cq_from_ibv_cq_ex(ibv_cqx)->wc_is_unsolicited = &efa_mock_efadv_wc_is_unsolicited; will_return(efa_mock_efa_device_support_unsolicited_write_recv, true); will_return(efa_mock_efadv_wc_is_unsolicited, true); - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = 0; + ibv_cqx->wr_id = 0; } else { /* * For solicited write recv, it will consume an internal rx pkt @@ -411,7 +406,7 @@ void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource struct efa_rdm_pke *pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); assert_non_null(pkt_entry); efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + ibv_cqx->wr_id = (uintptr_t)pkt_entry; } #else /* @@ -421,12 +416,12 @@ void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource struct efa_rdm_pke *pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); assert_non_null(pkt_entry); efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + ibv_cqx->wr_id = (uintptr_t)pkt_entry; #endif /* the recv rdma with imm will not populate to application cq because it's an EFA internal error and * and not related to any application operations. Currently we can only read the error from eq. 
*/ - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + ibv_cqx->status = IBV_WC_GENERAL_ERR; ret = fi_cq_read(resource->cq, &cq_entry, 1); assert_int_equal(ret, -FI_EAGAIN); @@ -460,13 +455,16 @@ void test_ibv_cq_ex_read_failed_poll(struct efa_resource **state) struct fi_cq_err_entry cq_err_entry; int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; efa_unit_test_resource_construct(resource, FI_EP_RDM); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; + + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, EFAULT); will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); @@ -536,7 +534,7 @@ int test_efa_rdm_cq_get_ibv_cq_poll_list_length(struct fid_cq *cq_fid) { struct efa_rdm_cq *cq; - cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + cq = container_of(cq_fid, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); return efa_unit_test_get_dlist_length(&cq->ibv_cq_poll_list); } @@ -598,7 +596,7 @@ void test_efa_rdm_cq_post_initial_rx_pkts(struct efa_resource **state) efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); /* At this 
time, rx pkts are not growed and posted */ assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); @@ -641,6 +639,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc struct efa_unit_test_buff recv_buff; int ret; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; /* * Always use mocked efadv_create_cq instead of the real one. @@ -659,7 +658,8 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; /* Construct a minimal recv buffer */ efa_unit_test_buff_construct(&recv_buff, resource, efa_rdm_ep->min_multi_recv_size); @@ -698,19 +698,19 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc efa_unit_test_eager_msgrtm_pkt_construct(pkt_entry, &pkt_attr); /* Setup CQ */ - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; + ibv_cqx->wr_id = 
(uintptr_t)pkt_entry; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->read_slid = &efa_mock_ibv_read_slid_return_mock; + ibv_cqx->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; + ibv_cqx->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; if (support_efadv_cq) { - efadv_cq = efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex); + efadv_cq = efadv_cq_from_ibv_cq_ex(ibv_cqx); assert_non_null(efadv_cq); efadv_cq->wc_read_sgid = &efa_mock_efadv_wc_read_sgid_return_zero_code_and_expect_next_poll_and_set_gid; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index d64139b986c..f2d1d1f0e7a 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -110,6 +110,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin struct efa_rdm_pke *pkt_entry; uint64_t actual_peer_host_id = UINT64_MAX; struct efa_rdm_cq *efa_rdm_cq; + struct ibv_cq_ex *ibv_cqx; g_efa_unit_test_mocks.local_host_id = local_host_id; g_efa_unit_test_mocks.peer_host_id = peer_host_id; @@ -120,7 +121,8 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin efa_unit_test_resource_construct_rdm_shm_disabled(resource); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; efa_rdm_ep->host_id = g_efa_unit_test_mocks.local_host_id; @@ -166,18 +168,18 @@ void 
test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin expect_function_call(efa_mock_ibv_wr_send_verify_handshake_pkt_local_host_id_and_save_wr); /* Setup CQ */ - efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_SUCCESS; - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; + ibv_cqx->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; + ibv_cqx->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + ibv_cqx->read_slid = &efa_mock_ibv_read_slid_return_mock; + ibv_cqx->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + ibv_cqx->read_wc_flags = &efa_mock_ibv_read_wc_flags_return_mock; + ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; + ibv_cqx->status = IBV_WC_SUCCESS; + ibv_cqx->wr_id = (uintptr_t)pkt_entry; 
expect_function_call(efa_mock_ibv_next_poll_check_function_called_and_return_mock); /* Receive handshake packet */ @@ -210,8 +212,8 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin * We need to poll the CQ twice explicitly to point the CQE * to the saved send wr in handshake */ - efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; - efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)g_ibv_submitted_wr_id_vec[0]; + ibv_cqx->status = IBV_WC_GENERAL_ERR; + ibv_cqx->wr_id = (uintptr_t)g_ibv_submitted_wr_id_vec[0]; /* Progress the send wr to clean up outstanding tx ops */ cq_read_send_ret = fi_cq_read(resource->cq, &cq_entry, 1); From a93becab3e2722137b790b8cde224691197988c0 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Tue, 14 Jan 2025 16:51:15 +0000 Subject: [PATCH 358/393] prov/efa: Remove util_av_fi_addr from efa_conn 71dd1a12 deprecates FI_AV_MAP support for the EFA provider. With this deprecation, we no longer need to maintain util_av_fi_addr and fi_addr in the efa_conn struct b/c they will always be equal. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_av.c | 15 +++++++-------- prov/efa/src/efa_av.h | 1 - 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 6e8d0fcaa7c..9c574c54121 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -444,7 +444,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, struct util_av_entry *util_av_entry = NULL; struct efa_av_entry *efa_av_entry = NULL; struct efa_conn *conn; - fi_addr_t util_av_fi_addr; + fi_addr_t fi_addr; int err; if (flags & FI_SYNC_ERR) @@ -456,7 +456,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, return NULL; } - err = ofi_av_insert_addr(&av->util_av, raw_addr, &util_av_fi_addr); + err = ofi_av_insert_addr(&av->util_av, raw_addr, &fi_addr); if (err) { EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! 
Error message: %s\n", fi_strerror(err)); @@ -464,7 +464,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, } util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, - util_av_fi_addr); + fi_addr); efa_av_entry = (struct efa_av_entry *)util_av_entry->data; assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)efa_av_entry->ep_addr)); @@ -472,8 +472,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, memset(conn, 0, sizeof(*conn)); conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr; assert(av->type == FI_AV_TABLE); - conn->fi_addr = util_av_fi_addr; - conn->util_av_fi_addr = util_av_fi_addr; + conn->fi_addr = fi_addr; conn->ah = efa_ah_alloc(av, raw_addr->raw); if (!conn->ah) @@ -502,7 +501,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, efa_ah_release(av, conn->ah); conn->ep_addr = NULL; - err = ofi_av_remove_addr(&av->util_av, util_av_fi_addr); + err = ofi_av_remove_addr(&av->util_av, fi_addr); if (err) EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", err); @@ -552,11 +551,11 @@ void efa_conn_release(struct efa_av *av, struct efa_conn *conn) efa_ah_release(av, conn->ah); - util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, conn->util_av_fi_addr); + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, conn->fi_addr); assert(util_av_entry); efa_av_entry = (struct efa_av_entry *)util_av_entry->data; - err = ofi_av_remove_addr(&av->util_av, conn->util_av_fi_addr); + err = ofi_av_remove_addr(&av->util_av, conn->fi_addr); if (err) { EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! 
err=%d\n", err); } diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 2ee14eda6e4..bd4d4a2d74e 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -23,7 +23,6 @@ struct efa_conn { struct efa_ah *ah; struct efa_ep_addr *ep_addr; fi_addr_t fi_addr; - fi_addr_t util_av_fi_addr; struct efa_rdm_peer rdm_peer; }; From 091b20b82e06c90ad12a2ba8de58fdc4c521b27b Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 15 Jan 2025 15:51:47 -0600 Subject: [PATCH 359/393] prov/cxi: Fix fi_cq_strerror ofi_cq_strerror() was being called instead of cxip_cq_strerror(). Signed-off-by: Ian Ziemba --- prov/cxi/src/cxip_cq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c index f55eb27141f..a4613c66149 100644 --- a/prov/cxi/src/cxip_cq.c +++ b/prov/cxi/src/cxip_cq.c @@ -375,7 +375,7 @@ static struct fi_ops_cq cxip_cq_ops = { .sread = cxip_cq_sread, .sreadfrom = cxip_cq_sreadfrom, .signal = cxip_cq_signal, - .strerror = ofi_cq_strerror, + .strerror = cxip_cq_strerror, }; static struct fi_cq_attr cxip_cq_def_attr = { @@ -510,7 +510,6 @@ int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, goto err_util_cq; } - cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; cxi_cq->util_cq.cq_fid.fid.ops = &cxip_cq_fi_ops; cxi_cq->util_cq.cq_fid.ops = &cxip_cq_ops; cxi_cq->domain = cxi_dom; From 77afcafb46643b22285a7c38cb4a654e2709ff14 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 15 Jan 2025 01:53:13 +0000 Subject: [PATCH 360/393] prov/efa: Move struct efa_ep_addr to efa_base_ep struct efa_ep_addr is used in both the DGRAM and RDM providers, so the structure definition should be in a common file, and not an RDM only file. 
Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_base_ep.h | 12 ++++++++++++ prov/efa/src/rdm/efa_rdm_protocol.h | 11 ----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index a7d1526919e..37a9a5924a8 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -27,6 +27,18 @@ struct efa_qp { uint32_t qkey; }; +#define EFA_GID_LEN 16 + +struct efa_ep_addr { + uint8_t raw[EFA_GID_LEN]; + uint16_t qpn; + uint16_t pad; + uint32_t qkey; + struct efa_ep_addr *next; +}; + +#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) + struct efa_av; struct efa_recv_wr { diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index 8840ce5f401..05fe40fd36a 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -16,18 +16,7 @@ #define EFA_RDM_PROTOCOL_VERSION (4) -/* raw address format. (section 1.4) */ -#define EFA_GID_LEN 16 -struct efa_ep_addr { - uint8_t raw[EFA_GID_LEN]; - uint16_t qpn; - uint16_t pad; - uint32_t qkey; - struct efa_ep_addr *next; -}; - -#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) /* * Extra Feature/Request Flags (section 2.1) From ca0758b66d316334f2ceded6912424a8ff712049 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 15 Jan 2025 16:30:14 +0000 Subject: [PATCH 361/393] prov/efa: Regulate the usage of optnames FI_OPT_EFA_USE_DEVICE_RDMA, FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES, and FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES should only apply to the fi_setopt call as they are used to enforce the endpoint to perform required operations. If they are not set, provider will do the traffic with supported capabilities by default. In that regard, it causes confusions to support these optnames in the getopt calls. 
Signed-off-by: Shi Jin --- man/fi_efa.7.md | 9 ++++++--- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 18 ------------------ prov/efa/test/efa_unit_test_ep.c | 3 --- 3 files changed, 6 insertions(+), 24 deletions(-) diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index cdfcfff3350..fe62d820b1e 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -115,7 +115,8 @@ provider for AWS Neuron or Habana SynapseAI. these operations are assisted by hardware support (return value is false). *FI_OPT_EFA_USE_DEVICE_RDMA - bool* -: Only available if the application selects a libfabric API version >= 1.18. +: This option only applies to the fi_setopt() call. + Only available if the application selects a libfabric API version >= 1.18. This option allows an application to change libfabric's behavior with respect to RDMA transfers. Note that there is also an environment variable FI_EFA_USE_DEVICE_RDMA which the user may set as well. If the @@ -131,7 +132,8 @@ provider for AWS Neuron or Habana SynapseAI. revisions. *FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES - bool* -: It is used to force the endpoint to use in-order send/recv operation for each 128 bytes +: This option only applies to the fi_setopt() call. + It is used to force the endpoint to use in-order send/recv operation for each 128 bytes aligned block. Enabling the option will guarantee data inside each 128 bytes aligned block being sent and received in order, it will also guarantee data to be delivered to the receive buffer only once. If endpoint is not able to @@ -139,7 +141,8 @@ provider for AWS Neuron or Habana SynapseAI. *FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES - bool* -: It is used to set the endpoint to use in-order RDMA write operation for each 128 bytes +: This option only applies to the fi_setopt() call. + It is used to set the endpoint to use in-order RDMA write operation for each 128 bytes aligned block. 
Enabling the option will guarantee data inside each 128 bytes aligned block being written in order, it will also guarantee data to be delivered to the target buffer only once. If endpoint is not able to support diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 1981ed9825f..468a7c65a57 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1883,24 +1883,6 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, *(bool *)optval = true; *optlen = sizeof(bool); break; - case FI_OPT_EFA_USE_DEVICE_RDMA: - if (*optlen < sizeof(bool)) - return -FI_ETOOSMALL; - *(bool *)optval = efa_rdm_ep->use_device_rdma; - *optlen = sizeof(bool); - break; - case FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES: - if (*optlen < sizeof(bool)) - return -FI_ETOOSMALL; - *(bool *)optval = efa_rdm_ep->sendrecv_in_order_aligned_128_bytes; - *optlen = sizeof(bool); - break; - case FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES: - if (*optlen < sizeof(bool)) - return -FI_ETOOSMALL; - *(bool *)optval = efa_rdm_ep->write_in_order_aligned_128_bytes; - *optlen = sizeof(bool); - break; default: EFA_INFO(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index f2d1d1f0e7a..c67902bc609 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -892,9 +892,6 @@ void test_efa_rdm_ep_getopt(struct efa_resource **state, size_t opt_len, int exp FI_OPT_EFA_EMULATED_READ, FI_OPT_EFA_EMULATED_WRITE, FI_OPT_EFA_EMULATED_ATOMICS, - FI_OPT_EFA_USE_DEVICE_RDMA, - FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES, - FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES }; size_t num_opt_names = sizeof(opt_names) / sizeof(int); From 950fb1b4f015044bbffde1ea7bf1d3feb83f35c9 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 15 Jan 2025 16:36:25 +0000 Subject: [PATCH 362/393] man/fi_endpoint: regulate the 
usage of optname FI_OPT_SHARED_MEMORY_PERMITTED is used to enforce an endpoint to NOT use shm in the data transfer. It should only apply to the fi_setopt call, in the same model of FI_OPT_CUDA_API_PERMITTED. Signed-off-by: Shi Jin --- man/fi_endpoint.3.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/fi_endpoint.3.md b/man/fi_endpoint.3.md index dfb51d69d00..e74049c2857 100644 --- a/man/fi_endpoint.3.md +++ b/man/fi_endpoint.3.md @@ -537,7 +537,8 @@ The following option levels and option names and parameters are defined. All providers that support FI_HMEM capability implement this option. - *FI_OPT_SHARED_MEMORY_PERMITTED - bool* -: This option controls the use of shared memory for intra-node communication. +: This option only applies to the fi_setopt call. + This option controls the use of shared memory for intra-node communication. Setting it to true will allow the use of shared memory. When set to false, shared memory will not be used and the implementation of intra-node communication is provider dependent. From 5efe991307be517109a0dd519d3aa475d5c84475 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 21 Jan 2025 19:55:26 +0000 Subject: [PATCH 363/393] prov/efa: Adjust the logging level for unreleased rxe Closing an ep with outstanding receives is a legal use case. There shouldn't be warning for this. 
Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 468a7c65a57..fb7213ee650 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -779,7 +779,7 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) dlist_foreach_safe(&efa_rdm_ep->rxe_list, entry, tmp) { rxe = container_of(entry, struct efa_rdm_ope, ep_entry); - EFA_WARN(FI_LOG_EP_CTRL, + EFA_INFO(FI_LOG_EP_CTRL, "Closing ep with unreleased rxe\n"); efa_rdm_rxe_release(rxe); } From a61db60b4fb10c9dd4ca09baffcf326ff2d62b82 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 3 Jan 2025 22:21:47 +0000 Subject: [PATCH 364/393] prov/efa: Migrate efa_dgram_ep to efa_ep Migrate dgram/efa_dgram_ep.c to efa_ep.c as a common ep interface for both dgram and rdm ep type. dgram repo now has nothing and is removed. Signed-off-by: Shi Jin --- libfabric.vcxproj | 3 +- prov/efa/Makefile.include | 4 +- prov/efa/src/dgram/efa_dgram_ep.h | 18 --- prov/efa/src/efa_base_ep.h | 3 + prov/efa/src/efa_cq.c | 1 - prov/efa/src/efa_domain.c | 3 +- .../src/{dgram/efa_dgram_ep.c => efa_ep.c} | 134 +++++++++--------- prov/efa/test/efa_unit_test_cq.c | 2 - 8 files changed, 74 insertions(+), 94 deletions(-) delete mode 100644 prov/efa/src/dgram/efa_dgram_ep.h rename prov/efa/src/{dgram/efa_dgram_ep.c => efa_ep.c} (65%) diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 9acba798776..f3f3c5e5dc9 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -887,7 +887,7 @@ - + @@ -1011,7 +1011,6 @@ - diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index db5e44df1f0..a5c2842d389 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -50,7 +50,7 @@ _efa_files = \ prov/efa/src/efa_msg.c \ prov/efa/src/efa_rma.c \ prov/efa/src/efa_cq.c \ - prov/efa/src/dgram/efa_dgram_ep.c \ + prov/efa/src/efa_ep.c \ 
prov/efa/src/rdm/efa_rdm_peer.c \ prov/efa/src/rdm/efa_rdm_cq.c \ prov/efa/src/rdm/efa_rdm_ep_utils.c \ @@ -94,7 +94,6 @@ _efa_headers = \ prov/efa/src/efa_prov.h \ prov/efa/src/efa_env.h \ prov/efa/src/fi_ext_efa.h \ - prov/efa/src/dgram/efa_dgram_ep.h \ prov/efa/src/rdm/efa_rdm_peer.h \ prov/efa/src/rdm/efa_rdm_cq.h \ prov/efa/src/rdm/efa_rdm_ep.h \ @@ -187,7 +186,6 @@ endif ENABLE_EFA_UNIT_TEST efa_CPPFLAGS += \ -I$(top_srcdir)/prov/efa/src/ \ - -I$(top_srcdir)/prov/efa/src/dgram/ \ -I$(top_srcdir)/prov/efa/src/rdm/ rdmainclude_HEADERS += \ diff --git a/prov/efa/src/dgram/efa_dgram_ep.h b/prov/efa/src/dgram/efa_dgram_ep.h deleted file mode 100644 index 18ab0dc8703..00000000000 --- a/prov/efa/src/dgram/efa_dgram_ep.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include "efa_base_ep.h" - -#ifndef EFA_DGRAM_H -#define EFA_DGRAM_H - -struct efa_dgram_ep { - struct efa_base_ep base_ep; -}; - -int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *info, - struct fid_ep **ep_fid, void *context); - -extern struct fi_ops_msg efa_dgram_ep_msg_ops; -extern struct fi_ops_rma efa_dgram_ep_rma_ops; -#endif diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 37a9a5924a8..dac538f32ab 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -94,6 +94,9 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, int efa_base_ep_getname(fid_t fid, void *addr, size_t *addrlen); +int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, + struct fid_ep **ep_fid, void *context); + int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, uint32_t tclass); void efa_qp_destruct(struct efa_qp *qp); diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c index a5b737d89ac..ea9f13c365e 100644 --- a/prov/efa/src/efa_cq.c +++ b/prov/efa/src/efa_cq.c 
@@ -6,7 +6,6 @@ #include #include "config.h" #include -#include "dgram/efa_dgram_ep.h" #include "efa.h" #include "efa_av.h" #include "efa_cntr.h" diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 17e948c7eef..34de62cebac 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -11,7 +11,6 @@ #include "efa_cntr.h" #include "rdm/efa_rdm_cq.h" #include "rdm/efa_rdm_atomic.h" -#include "dgram/efa_dgram_ep.h" struct dlist_entry g_efa_domain_list; @@ -33,7 +32,7 @@ static struct fi_ops_domain efa_ops_domain_dgram = { .size = sizeof(struct fi_ops_domain), .av_open = efa_av_open, .cq_open = efa_cq_open, - .endpoint = efa_dgram_ep_open, + .endpoint = efa_ep_open, .scalable_ep = fi_no_scalable_ep, .cntr_open = efa_cntr_open, .poll_open = fi_no_poll_open, diff --git a/prov/efa/src/dgram/efa_dgram_ep.c b/prov/efa/src/efa_ep.c similarity index 65% rename from prov/efa/src/dgram/efa_dgram_ep.c rename to prov/efa/src/efa_ep.c index 3119b8bee72..3b8b9190629 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.c +++ b/prov/efa/src/efa_ep.c @@ -3,14 +3,16 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "config.h" -#include "efa_dgram_ep.h" #include "efa.h" #include "efa_av.h" #include "efa_cq.h" #include -static int efa_dgram_ep_getopt(fid_t fid, int level, int optname, +extern struct fi_ops_msg efa_msg_ops; +extern struct fi_ops_rma efa_rma_ops; + +static int efa_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) { switch (level) { @@ -22,7 +24,7 @@ static int efa_dgram_ep_getopt(fid_t fid, int level, int optname, return 0; } -static int efa_dgram_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) +static int efa_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) { switch (level) { case FI_OPT_ENDPOINT: @@ -33,22 +35,22 @@ static int efa_dgram_ep_setopt(fid_t fid, int level, int optname, const void *op return 0; } -static struct fi_ops_ep efa_dgram_ep_base_ops = { +static struct fi_ops_ep efa_ep_base_ops = { .size = sizeof(struct fi_ops_ep), .cancel = fi_no_cancel, - .getopt = efa_dgram_ep_getopt, - .setopt = efa_dgram_ep_setopt, + .getopt = efa_ep_getopt, + .setopt = efa_ep_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, .rx_size_left = fi_no_rx_size_left, .tx_size_left = fi_no_tx_size_left, }; -static void efa_dgram_ep_destroy(struct efa_dgram_ep *ep) +static void efa_ep_destroy(struct efa_base_ep *ep) { int ret; - ret = efa_base_ep_destruct(&ep->base_ep); + ret = efa_base_ep_destruct(ep); if (ret) { EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n"); } @@ -56,20 +58,20 @@ static void efa_dgram_ep_destroy(struct efa_dgram_ep *ep) free(ep); } -static int efa_dgram_ep_close(fid_t fid) +static int efa_ep_close(fid_t fid) { - struct efa_dgram_ep *ep; + struct efa_base_ep *ep; - ep = container_of(fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid.fid); + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); - efa_dgram_ep_destroy(ep); + efa_ep_destroy(ep); return 0; } -static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) 
+static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { - struct efa_dgram_ep *ep; + struct efa_base_ep *ep; struct efa_cq *cq; struct efa_av *av; struct efa_domain *efa_domain; @@ -77,7 +79,7 @@ static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) struct util_cntr *cntr; int ret; - ep = container_of(fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid.fid); + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); ret = ofi_ep_bind_valid(&efa_prov, bfid, flags); if (ret) return ret; @@ -96,31 +98,31 @@ static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) cq = container_of(bfid, struct efa_cq, util_cq.cq_fid); efa_domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); - if (ep->base_ep.domain != efa_domain) + if (ep->domain != efa_domain) return -FI_EINVAL; - ret = ofi_ep_bind_cq(&ep->base_ep.util_ep, &cq->util_cq, flags); + ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags); if (ret) return ret; break; case FI_CLASS_AV: av = container_of(bfid, struct efa_av, util_av.av_fid.fid); - ret = efa_base_ep_bind_av(&ep->base_ep, av); + ret = efa_base_ep_bind_av(ep, av); if (ret) return ret; break; case FI_CLASS_CNTR: cntr = container_of(bfid, struct util_cntr, cntr_fid.fid); - ret = ofi_ep_bind_cntr(&ep->base_ep.util_ep, cntr, flags); + ret = ofi_ep_bind_cntr(&ep->util_ep, cntr, flags); if (ret) return ret; break; case FI_CLASS_EQ: eq = container_of(bfid, struct util_eq, eq_fid.fid); - ret = ofi_ep_bind_eq(&ep->base_ep.util_ep, eq); + ret = ofi_ep_bind_eq(&ep->util_ep, eq); if (ret) return ret; break; @@ -131,11 +133,11 @@ static int efa_dgram_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return 0; } -static int efa_dgram_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) +static int efa_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) { - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_tx_attr 
*tx_attr = ep->base_ep.info->tx_attr; - struct fi_rx_attr *rx_attr = ep->base_ep.info->rx_attr; + struct efa_base_ep *ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_tx_attr *tx_attr = ep->info->tx_attr; + struct fi_rx_attr *rx_attr = ep->info->rx_attr; if ((*flags & FI_TRANSMIT) && (*flags & FI_RECV)) { EFA_WARN(FI_LOG_EP_CTRL, "Both Tx/Rx flags cannot be specified\n"); @@ -151,11 +153,11 @@ static int efa_dgram_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) return 0; } -static int efa_dgram_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) +static int efa_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) { - struct efa_dgram_ep *ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); - struct fi_tx_attr *tx_attr = ep->base_ep.info->tx_attr; - struct fi_rx_attr *rx_attr = ep->base_ep.info->rx_attr; + struct efa_base_ep *ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); + struct fi_tx_attr *tx_attr = ep->info->tx_attr; + struct fi_rx_attr *rx_attr = ep->info->rx_attr; if ((flags & FI_TRANSMIT) && (flags & FI_RECV)) { EFA_WARN(FI_LOG_EP_CTRL, "Both Tx/Rx flags cannot be specified.\n"); @@ -174,17 +176,17 @@ static int efa_dgram_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) return 0; } -static int efa_dgram_ep_enable(struct fid_ep *ep_fid) +static int efa_ep_enable(struct fid_ep *ep_fid) { struct ibv_qp_init_attr_ex attr_ex = { 0 }; - struct efa_dgram_ep *ep; + struct efa_base_ep *ep; struct efa_cq *scq, *rcq; int err; - ep = container_of(ep_fid, struct efa_dgram_ep, base_ep.util_ep.ep_fid); + ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - scq = ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_cq, util_cq) : NULL; - rcq = ep->base_ep.util_ep.rx_cq ? container_of(ep->base_ep.util_ep.rx_cq, struct efa_cq, util_cq) : NULL; + scq = ep->util_ep.tx_cq ? container_of(ep->util_ep.tx_cq, struct efa_cq, util_cq) : NULL; + rcq = ep->util_ep.rx_cq ? 
container_of(ep->util_ep.rx_cq, struct efa_cq, util_cq) : NULL; if (!scq && !rcq) { EFA_WARN(FI_LOG_EP_CTRL, @@ -192,53 +194,53 @@ static int efa_dgram_ep_enable(struct fid_ep *ep_fid) return -FI_ENOCQ; } - if (!scq && ofi_needs_tx(ep->base_ep.info->caps)) { + if (!scq && ofi_needs_tx(ep->info->caps)) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); return -FI_ENOCQ; } - if (!rcq && ofi_needs_rx(ep->base_ep.info->caps)) { + if (!rcq && ofi_needs_rx(ep->info->caps)) { EFA_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. (FI_RECV)\n"); return -FI_ENOCQ; } if (scq) { - attr_ex.cap.max_send_wr = ep->base_ep.info->tx_attr->size; - attr_ex.cap.max_send_sge = ep->base_ep.info->tx_attr->iov_limit; + attr_ex.cap.max_send_wr = ep->info->tx_attr->size; + attr_ex.cap.max_send_sge = ep->info->tx_attr->iov_limit; attr_ex.send_cq = ibv_cq_ex_to_cq(scq->ibv_cq.ibv_cq_ex); } else { attr_ex.send_cq = ibv_cq_ex_to_cq(rcq->ibv_cq.ibv_cq_ex); } if (rcq) { - attr_ex.cap.max_recv_wr = ep->base_ep.info->rx_attr->size; - attr_ex.cap.max_recv_sge = ep->base_ep.info->rx_attr->iov_limit; + attr_ex.cap.max_recv_wr = ep->info->rx_attr->size; + attr_ex.cap.max_recv_sge = ep->info->rx_attr->iov_limit; attr_ex.recv_cq = ibv_cq_ex_to_cq(rcq->ibv_cq.ibv_cq_ex); } else { attr_ex.recv_cq = ibv_cq_ex_to_cq(scq->ibv_cq.ibv_cq_ex); } attr_ex.cap.max_inline_data = - ep->base_ep.domain->device->efa_attr.inline_buf_size; + ep->domain->device->efa_attr.inline_buf_size; - assert(EFA_EP_TYPE_IS_DGRAM(ep->base_ep.domain->info)); + assert(EFA_EP_TYPE_IS_DGRAM(ep->domain->info)); attr_ex.qp_type = IBV_QPT_UD; attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; - attr_ex.pd = container_of(ep->base_ep.util_ep.domain, struct efa_domain, util_domain)->ibv_pd; + attr_ex.pd = container_of(ep->util_ep.domain, struct efa_domain, util_domain)->ibv_pd; attr_ex.qp_context = ep; 
attr_ex.sq_sig_all = 1; - err = efa_base_ep_create_qp(&ep->base_ep, &attr_ex); + err = efa_base_ep_create_qp(ep, &attr_ex); if (err) return err; - return efa_base_ep_enable(&ep->base_ep); + return efa_base_ep_enable(ep); } -static int efa_dgram_ep_control(struct fid *fid, int command, void *arg) +static int efa_ep_control(struct fid *fid, int command, void *arg) { struct fid_ep *ep_fid; @@ -247,11 +249,11 @@ static int efa_dgram_ep_control(struct fid *fid, int command, void *arg) ep_fid = container_of(fid, struct fid_ep, fid); switch (command) { case FI_GETOPSFLAG: - return efa_dgram_ep_getflags(ep_fid, (uint64_t *)arg); + return efa_ep_getflags(ep_fid, (uint64_t *)arg); case FI_SETOPSFLAG: - return efa_dgram_ep_setflags(ep_fid, *(uint64_t *)arg); + return efa_ep_setflags(ep_fid, *(uint64_t *)arg); case FI_ENABLE: - return efa_dgram_ep_enable(ep_fid); + return efa_ep_enable(ep_fid); default: return -FI_ENOSYS; } @@ -261,11 +263,11 @@ static int efa_dgram_ep_control(struct fid *fid, int command, void *arg) } } -static struct fi_ops efa_dgram_ep_ops = { +static struct fi_ops efa_ep_ops = { .size = sizeof(struct fi_ops), - .close = efa_dgram_ep_close, - .bind = efa_dgram_ep_bind, - .control = efa_dgram_ep_control, + .close = efa_ep_close, + .bind = efa_ep_bind, + .control = efa_ep_control, .ops_open = fi_no_ops_open, }; @@ -282,7 +284,7 @@ void efa_ep_progress_no_op(struct util_ep *util_ep) return; } -static struct fi_ops_atomic efa_dgram_ep_atomic_ops = { +static struct fi_ops_atomic efa_atomic_ops = { .size = sizeof(struct fi_ops_atomic), .write = fi_no_atomic_write, .writev = fi_no_atomic_writev, @@ -299,7 +301,7 @@ static struct fi_ops_atomic efa_dgram_ep_atomic_ops = { .compwritevalid = fi_no_atomic_compwritevalid, }; -struct fi_ops_cm efa_dgram_ep_cm_ops = { +struct fi_ops_cm efa_ep_cm_ops = { .size = sizeof(struct fi_ops_cm), .setname = fi_no_setname, .getname = efa_base_ep_getname, @@ -312,12 +314,12 @@ struct fi_ops_cm efa_dgram_ep_cm_ops = { .join = 
fi_no_join, }; -int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, +int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, struct fid_ep **ep_fid, void *context) { struct efa_domain *domain; const struct fi_info *prov_info; - struct efa_dgram_ep *ep; + struct efa_base_ep *ep; int ret; domain = container_of(domain_fid, struct efa_domain, @@ -355,7 +357,7 @@ int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, if (!ep) return -FI_ENOMEM; - ret = efa_base_ep_construct(&ep->base_ep, domain_fid, user_info, efa_ep_progress_no_op, context); + ret = efa_base_ep_construct(ep, domain_fid, user_info, efa_ep_progress_no_op, context); if (ret) goto err_ep_destroy; @@ -364,21 +366,21 @@ int efa_dgram_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, */ assert(user_info->tx_attr->iov_limit <= 2); - ep->base_ep.domain = domain; + ep->domain = domain; - *ep_fid = &ep->base_ep.util_ep.ep_fid; + *ep_fid = &ep->util_ep.ep_fid; (*ep_fid)->fid.fclass = FI_CLASS_EP; (*ep_fid)->fid.context = context; - (*ep_fid)->fid.ops = &efa_dgram_ep_ops; - (*ep_fid)->ops = &efa_dgram_ep_base_ops; - (*ep_fid)->msg = &efa_dgram_ep_msg_ops; - (*ep_fid)->cm = &efa_dgram_ep_cm_ops; - (*ep_fid)->rma = &efa_dgram_ep_rma_ops; - (*ep_fid)->atomic = &efa_dgram_ep_atomic_ops; + (*ep_fid)->fid.ops = &efa_ep_ops; + (*ep_fid)->ops = &efa_ep_base_ops; + (*ep_fid)->msg = &efa_msg_ops; + (*ep_fid)->cm = &efa_ep_cm_ops; + (*ep_fid)->rma = &efa_rma_ops; + (*ep_fid)->atomic = &efa_atomic_ops; return 0; err_ep_destroy: - efa_dgram_ep_destroy(ep); + efa_ep_destroy(ep); return ret; } diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index e69fb8b432e..e939d182b60 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -2,7 +2,6 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_unit_tests.h" -#include "dgram/efa_dgram_ep.h" #include "rdm/efa_rdm_cq.h" #include "efa_av.h" @@ -25,7 +24,6 @@ void test_impl_cq_read_empty_cq(struct efa_resource *resource, enum fi_ep_type e efa_unit_test_resource_construct(resource, ep_type); efa_base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); - ibv_cqx = container_of(efa_base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; From 1781cd9ce179699756cb6388490fa88b4eddb801 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 8 Jan 2025 00:24:26 +0000 Subject: [PATCH 365/393] prov/efa: Extend efa_ep interface Extend efa ep interface to make it cover all the applied features that efa-rdm ep interface supports today. It also refactors and moves several internal efa_rdm_ep functions to efa_base_ep.c to cover both efa_direct and efa_rdm ep. Signed-off-by: Shi Jin --- prov/efa/src/efa.h | 17 ++ prov/efa/src/efa_base_ep.c | 218 ++++++++++++++++- prov/efa/src/efa_base_ep.h | 9 + prov/efa/src/efa_cntr.c | 16 +- prov/efa/src/efa_cq.c | 8 +- prov/efa/src/efa_cq.h | 7 + prov/efa/src/efa_ep.c | 281 +++++++++++++--------- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 254 +++----------------- prov/efa/test/efa_unit_test_av.c | 4 +- prov/efa/test/efa_unit_test_cntr.c | 54 ++++- prov/efa/test/efa_unit_test_common.c | 69 ++++-- prov/efa/test/efa_unit_test_cq.c | 20 +- prov/efa/test/efa_unit_test_domain.c | 6 +- prov/efa/test/efa_unit_test_ep.c | 314 ++++++++++++++++++++++--- prov/efa/test/efa_unit_test_hmem.c | 6 +- prov/efa/test/efa_unit_test_info.c | 38 +-- prov/efa/test/efa_unit_test_mr.c | 2 +- prov/efa/test/efa_unit_test_msg.c | 7 +- prov/efa/test/efa_unit_test_ope.c | 14 +- prov/efa/test/efa_unit_test_pke.c | 2 +- prov/efa/test/efa_unit_test_rdm_peer.c | 14 +- prov/efa/test/efa_unit_test_rma.c | 5 +- prov/efa/test/efa_unit_test_runt.c | 26 +- prov/efa/test/efa_unit_test_send.c | 2 +- prov/efa/test/efa_unit_test_srx.c | 6 
+- prov/efa/test/efa_unit_tests.c | 13 +- prov/efa/test/efa_unit_tests.h | 26 +- 27 files changed, 948 insertions(+), 490 deletions(-) diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index 4d8e982355c..aef070fdc5f 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -227,4 +227,21 @@ bool efa_use_unsolicited_write_recv() return efa_env.use_unsolicited_write_recv && efa_device_support_unsolicited_write_recv(); } +/** + * Convenience macro for setopt with an enforced threshold + */ +#define EFA_EP_SETOPT_THRESHOLD(opt, field, threshold) { \ + size_t _val = *(size_t *) optval; \ + if (optlen != sizeof field) \ + return -FI_EINVAL; \ + if (_val > threshold) { \ + EFA_WARN(FI_LOG_EP_CTRL, \ + "Requested size of %zu for FI_OPT_" #opt " " \ + "exceeds the maximum (%zu)\n", \ + _val, threshold); \ + return -FI_EINVAL; \ + } \ + field = _val; \ +} + #endif /* EFA_H */ diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 85068fa91c6..11cbe558454 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -5,6 +5,7 @@ #include "efa.h" #include "efa_av.h" #include "efa_cq.h" +#include "efa_cntr.h" #include "rdm/efa_rdm_protocol.h" int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) @@ -366,9 +367,10 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, base_ep->qp = NULL; base_ep->user_recv_qp = NULL; - base_ep->max_msg_size = info->ep_attr->max_msg_size; - base_ep->max_rma_size = info->ep_attr->max_msg_size; - base_ep->inject_msg_size = info->tx_attr->inject_size; + /* Use device's native limit as the default value of base ep*/ + base_ep->max_msg_size = (size_t) base_ep->domain->device->ibv_port_attr.max_msg_sz; + base_ep->max_rma_size = (size_t) base_ep->domain->device->max_rdma_size; + base_ep->inject_msg_size = (size_t) base_ep->domain->device->efa_attr.inline_buf_size; /* TODO: update inject_rma_size to inline size after firmware * supports inline rdma write */ base_ep->inject_rma_size = 0; @@ 
-531,3 +533,213 @@ struct efa_cq *efa_base_ep_get_rx_cq(struct efa_base_ep *ep) { return ep->util_ep.rx_cq ? container_of(ep->util_ep.rx_cq, struct efa_cq, util_cq) : NULL; } + +/** + * @brief Construct the ibv qp init attr for given ep and cq + * + * @param ep a ptr to the efa_base_ep + * @param attr_ex the constructed qp attr + * @param tx_cq tx cq + * @param rx_cq rx cq + */ +static inline +void efa_base_ep_construct_ibv_qp_init_attr_ex(struct efa_base_ep *ep, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_cq_ex *tx_cq, + struct ibv_cq_ex *rx_cq) +{ + struct fi_info *info; + + if (ep->info->ep_attr->type == FI_EP_RDM) { + attr_ex->qp_type = IBV_QPT_DRIVER; + info = ep->domain->device->rdm_info; + } else { + assert(ep->info->ep_attr->type == FI_EP_DGRAM); + attr_ex->qp_type = IBV_QPT_UD; + info = ep->domain->device->dgram_info; + } + attr_ex->cap.max_send_wr = info->tx_attr->size; + attr_ex->cap.max_send_sge = info->tx_attr->iov_limit; + attr_ex->cap.max_recv_wr = info->rx_attr->size; + attr_ex->cap.max_recv_sge = info->rx_attr->iov_limit; + attr_ex->cap.max_inline_data = ep->domain->device->efa_attr.inline_buf_size; + attr_ex->pd = ep->domain->ibv_pd; + attr_ex->qp_context = ep; + attr_ex->sq_sig_all = 1; + + attr_ex->send_cq = ibv_cq_ex_to_cq(tx_cq); + attr_ex->recv_cq = ibv_cq_ex_to_cq(rx_cq); +} + +/** + * @brief check the in order aligned 128 bytes support for a given ibv_wr_op code + * + * @param ep efa_base_ep + * @param op_code ibv wr op code + * @return int 0 if in order aligned 128 bytes is supported, -FI_EOPNOTSUPP if + * it is not supported. Other negative integer for other errors. 
+ */ +int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *ep, + enum ibv_wr_opcode op_code) +{ + struct efa_qp *qp = NULL; + struct ibv_qp_init_attr_ex attr_ex = {0}; + int ret, retv; + struct ibv_cq_ex *ibv_cq_ex = NULL; + enum ibv_cq_ex_type ibv_cq_ex_type; + struct fi_cq_attr cq_attr = {0}; + + ret = efa_cq_ibv_cq_ex_open(&cq_attr, ep->domain->device->ibv_ctx, &ibv_cq_ex, &ibv_cq_ex_type); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %d\n", ret); + ret = -FI_EINVAL; + goto out; + } + + /* Create a dummy qp for query only */ + efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq_ex, ibv_cq_ex); + + ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC); + if (ret) + goto out; + + if (!efa_qp_support_op_in_order_aligned_128_bytes(qp, op_code)) + ret = -FI_EOPNOTSUPP; + +out: + if (qp) + efa_qp_destruct(qp); + + if (ibv_cq_ex) { + retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(ibv_cq_ex)); + if (retv) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close ibv cq: %s\n", + fi_strerror(-retv)); + } + return ret; +} + +/** + * @brief Insert tx/rx cq into the cntrs the ep is bound to + * + * @param ep efa_base_ep + * @return int 0 on success, negative integer on failure + */ +int efa_base_ep_insert_cntr_ibv_cq_poll_list(struct efa_base_ep *ep) +{ + int i, ret; + struct efa_cntr *efa_cntr; + struct util_cntr *util_cntr; + struct efa_cq *tx_cq, *rx_cq; + + tx_cq = efa_base_ep_get_tx_cq(ep); + rx_cq = efa_base_ep_get_rx_cq(ep); + + for (i = 0; i < CNTR_CNT; i++) { + util_cntr = ep->util_ep.cntrs[i]; + if (util_cntr) { + efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); + if (tx_cq) { + ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); + if (ret) + return ret; + } + if (rx_cq) { + ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); + if (ret) + return ret; + } + 
ofi_genlock_lock(&efa_cntr->util_cntr.ep_list_lock); + efa_cntr->need_to_scan_ep_list = true; + ofi_genlock_unlock(&efa_cntr->util_cntr.ep_list_lock); + } + } + + return FI_SUCCESS; +} + +/** + * @brief Remove tx/rx cq from the cntr that ep is bound to + * + * @param ep efa_base_ep + */ +void efa_base_ep_remove_cntr_ibv_cq_poll_list(struct efa_base_ep *ep) +{ + int i; + struct efa_cntr *efa_cntr; + struct util_cntr *util_cntr; + struct efa_cq *tx_cq, *rx_cq; + + tx_cq = efa_base_ep_get_tx_cq(ep); + rx_cq = efa_base_ep_get_rx_cq(ep); + + for (i = 0; i< CNTR_CNT; i++) { + util_cntr = ep->util_ep.cntrs[i]; + if (util_cntr) { + efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); + if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) + efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); + + if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) + efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); + } + } +} + +/** + * @brief Create and enable the IBV QP that backs the EP + * + * @param ep efa_base_ep + * @param create_user_recv_qp whether to create the user_recv_qp. 
This boolean + * is only true for the zero copy recv mode in the efa-rdm endpoint + * + * @return int 0 on success, negative integer on failure + */ +int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_recv_qp) +{ + struct ibv_qp_init_attr_ex attr_ex = { 0 }; + struct efa_cq *scq, *rcq; + struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq; + int err; + + scq = efa_base_ep_get_tx_cq(ep); + rcq = efa_base_ep_get_rx_cq(ep); + + if (!scq && !rcq) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a send or receive completion queue\n"); + return -FI_ENOCQ; + } + + if (!scq && ofi_needs_tx(ep->info->caps)) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); + return -FI_ENOCQ; + } + + if (!rcq && ofi_needs_rx(ep->info->caps)) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. (FI_RECV)\n"); + return -FI_ENOCQ; + } + + tx_ibv_cq = scq ? scq->ibv_cq.ibv_cq_ex : rcq->ibv_cq.ibv_cq_ex; + rx_ibv_cq = rcq ? 
rcq->ibv_cq.ibv_cq_ex : scq->ibv_cq.ibv_cq_ex; + + efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq); + + err = efa_base_ep_create_qp(ep, &attr_ex); + if (err) + return err; + + if (create_user_recv_qp) { + err = efa_qp_create(&ep->user_recv_qp, &attr_ex, ep->info->tx_attr->tclass); + if (err) { + efa_base_ep_destruct_qp(ep); + return err; + } + ep->user_recv_qp->base_ep = ep; + } + + return efa_base_ep_enable(ep); +} diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index dac538f32ab..52901fcc9ec 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -129,4 +129,13 @@ struct efa_cq *efa_base_ep_get_tx_cq(struct efa_base_ep *ep); struct efa_cq *efa_base_ep_get_rx_cq(struct efa_base_ep *ep); +int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *base_ep, + enum ibv_wr_opcode op_code); + +int efa_base_ep_insert_cntr_ibv_cq_poll_list(struct efa_base_ep *ep); + +void efa_base_ep_remove_cntr_ibv_cq_poll_list(struct efa_base_ep *ep); + +int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_recv_qp); + #endif diff --git a/prov/efa/src/efa_cntr.c b/prov/efa/src/efa_cntr.c index 8082ae76fd1..c30a3d862d4 100644 --- a/prov/efa/src/efa_cntr.c +++ b/prov/efa/src/efa_cntr.c @@ -180,18 +180,16 @@ static void efa_rdm_cntr_progress(struct util_cntr *cntr) static void efa_cntr_progress(struct util_cntr *cntr) { - struct util_ep *ep; - struct fid_list_entry *fid_entry; struct dlist_entry *item; + struct efa_ibv_cq_poll_list_entry *poll_list_entry; + struct efa_cntr *efa_cntr; + + efa_cntr = container_of(cntr, struct efa_cntr, util_cntr); ofi_genlock_lock(&cntr->ep_list_lock); - dlist_foreach(&cntr->ep_list, item) { - fid_entry = container_of(item, struct fid_list_entry, entry); - ep = container_of(fid_entry->fid, struct util_ep, ep_fid.fid); - if (ep->tx_cq) - efa_cq_progress(ep->tx_cq); - if (ep->rx_cq && ep->rx_cq != ep->tx_cq) - efa_cq_progress(ep->rx_cq); + 
dlist_foreach(&efa_cntr->ibv_cq_poll_list, item) { + poll_list_entry = container_of(item, struct efa_ibv_cq_poll_list_entry, entry); + efa_cq_poll_ibv_cq(efa_env.efa_cq_read_size, poll_list_entry->cq); } ofi_genlock_unlock(&cntr->ep_list_lock); } diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c index ea9f13c365e..1ca9416b618 100644 --- a/prov/efa/src/efa_cq.c +++ b/prov/efa/src/efa_cq.c @@ -243,7 +243,7 @@ efa_cq_proc_ibv_recv_rdma_with_imm_completion(struct efa_base_ep *base_ep, * A negative number means to poll until cq empty. * @param[in] util_cq util_cq */ -void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct util_cq *util_cq) +void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq) { bool should_end_poll = false; struct efa_base_ep *base_ep; @@ -260,7 +260,7 @@ void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct util_cq *util_cq) */ struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; - cq = container_of(util_cq, struct efa_cq, util_cq); + cq = container_of(ibv_cq, struct efa_cq, ibv_cq); efa_domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); /* Call ibv_start_poll only once */ @@ -381,7 +381,9 @@ static struct fi_ops_cq efa_cq_ops = { void efa_cq_progress(struct util_cq *cq) { - efa_cq_poll_ibv_cq(efa_env.efa_cq_read_size, cq); + struct efa_cq *efa_cq = container_of(cq, struct efa_cq, util_cq); + + efa_cq_poll_ibv_cq(efa_env.efa_cq_read_size, &efa_cq->ibv_cq); } static int efa_cq_close(fid_t fid) diff --git a/prov/efa/src/efa_cq.h b/prov/efa/src/efa_cq.h index 8d328d8e7fd..efdf2cb15db 100644 --- a/prov/efa/src/efa_cq.h +++ b/prov/efa/src/efa_cq.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ +#ifndef _EFA_CQ_H +#define _EFA_CQ_H + #include "efa.h" enum ibv_cq_ex_type { @@ -269,3 +272,7 @@ static inline int efa_write_error_msg(struct efa_base_ep *ep, fi_addr_t addr, return 0; } + +void efa_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq); + +#endif /* end of _EFA_CQ_H*/ \ No newline at end of file diff --git a/prov/efa/src/efa_ep.c b/prov/efa/src/efa_ep.c index 3b8b9190629..8aa3268adf2 100644 --- a/prov/efa/src/efa_ep.c +++ b/prov/efa/src/efa_ep.c @@ -15,24 +15,165 @@ extern struct fi_ops_rma efa_rma_ops; static int efa_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) { - switch (level) { - case FI_OPT_ENDPOINT: + struct efa_base_ep *ep; + + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + + if (level != FI_OPT_ENDPOINT) return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_EFA_RNR_RETRY: + if (*optlen < sizeof(size_t)) + return -FI_ETOOSMALL; + *(size_t *)optval = ep->rnr_retry; + *optlen = sizeof(size_t); + break; + /* p2p is required for efa direct ep */ + case FI_OPT_FI_HMEM_P2P: + if (*optlen < sizeof(int)) + return -FI_ETOOSMALL; + *(int *)optval = FI_HMEM_P2P_REQUIRED; + *optlen = sizeof(int); + break; + case FI_OPT_MAX_MSG_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->max_msg_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_MAX_RMA_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->max_rma_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_INJECT_MSG_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->inject_msg_size; + *optlen = sizeof (size_t); + break; + case FI_OPT_INJECT_RMA_SIZE: + if (*optlen < sizeof (size_t)) + return -FI_ETOOSMALL; + *(size_t *) optval = ep->inject_rma_size; + *optlen = sizeof (size_t); + break; + /* Emulated read/write is NOT used for efa direct ep */ + case FI_OPT_EFA_EMULATED_READ: /* fall through */ + case 
FI_OPT_EFA_EMULATED_WRITE: + if (*optlen < sizeof(bool)) + return -FI_ETOOSMALL; + *(bool *)optval = false; + *optlen = sizeof(bool); + break; default: + EFA_INFO(FI_LOG_EP_CTRL, "Unknown / unsupported endpoint option\n"); return -FI_ENOPROTOOPT; } - return 0; + + return FI_SUCCESS; } static int efa_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) { - switch (level) { - case FI_OPT_ENDPOINT: + int ret, intval; + struct efa_base_ep *ep; + + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + + if (level != FI_OPT_ENDPOINT) return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_EFA_RNR_RETRY: + if (optlen != sizeof(size_t)) + return -FI_EINVAL; + + /* + * Application is required to call to fi_setopt before EP + * enabled. If it's calling to fi_setopt after EP enabled, + * fail the call. + * + * efa_ep->qp will be NULL before EP enabled, use it to check + * if the call to fi_setopt is before or after EP enabled for + * convenience, instead of calling to ibv_query_qp + */ + if (ep->efa_qp_enabled) { + EFA_WARN(FI_LOG_EP_CTRL, + "The option FI_OPT_EFA_RNR_RETRY is required " + "to be set before EP enabled\n"); + return -FI_EINVAL; + } + + if (!efa_domain_support_rnr_retry_modify(ep->domain)) { + EFA_WARN(FI_LOG_EP_CTRL, + "RNR capability is not supported\n"); + return -FI_ENOSYS; + } + ep->rnr_retry = *(size_t *)optval; + break; + case FI_OPT_FI_HMEM_P2P: + if (optlen != sizeof(int)) + return -FI_EINVAL; + + intval = *(int *)optval; + + if (intval == FI_HMEM_P2P_DISABLED) { + EFA_WARN(FI_LOG_EP_CTRL, "p2p is required by implementation\n"); + return -FI_EOPNOTSUPP; + } + break; + case FI_OPT_MAX_MSG_SIZE: + EFA_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, ep->max_msg_size, (size_t) ep->domain->device->ibv_port_attr.max_msg_sz) + break; + case FI_OPT_MAX_RMA_SIZE: + EFA_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, ep->max_rma_size, (size_t) ep->domain->device->max_rdma_size) + break; + case FI_OPT_INJECT_MSG_SIZE: + 
EFA_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, ep->inject_msg_size, (size_t) ep->domain->device->efa_attr.inline_buf_size) + break; + case FI_OPT_INJECT_RMA_SIZE: + EFA_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, ep->inject_rma_size, (size_t) 0) + break; + /* no op as efa direct ep will not use cuda api and shm in data transfer */ + case FI_OPT_CUDA_API_PERMITTED: /* fall through */ + case FI_OPT_SHARED_MEMORY_PERMITTED: + break; + /* no op as efa direct ep will always use rdma for rma operations in data transfer */ + case FI_OPT_EFA_USE_DEVICE_RDMA: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + if (!(*(bool *)optval) && (ep->info->caps & FI_RMA)) { + EFA_WARN(FI_LOG_EP_CTRL, "Device rdma is required for rma operations\n"); + return -FI_EOPNOTSUPP; + } + break; + case FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + if (*(bool *)optval) { + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(ep, IBV_WR_SEND); + if (ret) + return ret; + } + break; + case FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + if (*(bool *)optval) { + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(ep, IBV_WR_RDMA_WRITE); + if (ret) + return ret; + } + break; default: + EFA_INFO(FI_LOG_EP_CTRL, "Unknown / unsupported endpoint option\n"); return -FI_ENOPROTOOPT; } - return 0; + + return FI_SUCCESS; } static struct fi_ops_ep efa_ep_base_ops = { @@ -46,25 +187,25 @@ static struct fi_ops_ep efa_ep_base_ops = { .tx_size_left = fi_no_tx_size_left, }; -static void efa_ep_destroy(struct efa_base_ep *ep) +static int efa_ep_close(fid_t fid) { + struct efa_base_ep *ep; int ret; + ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); + + /* We need to free the util_ep first to avoid race conditions + * with other threads progressing the cntr. 
*/ + efa_base_ep_close_util_ep(ep); + + efa_base_ep_remove_cntr_ibv_cq_poll_list(ep); + ret = efa_base_ep_destruct(ep); if (ret) { EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n"); } free(ep); -} - -static int efa_ep_close(fid_t fid) -{ - struct efa_base_ep *ep; - - ep = container_of(fid, struct efa_base_ep, util_ep.ep_fid.fid); - - efa_ep_destroy(ep); return 0; } @@ -108,6 +249,11 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) break; case FI_CLASS_AV: av = container_of(bfid, struct efa_av, util_av.av_fid.fid); + /* Bind util provider endpoint and av */ + ret = ofi_ep_bind_av(&ep->util_ep, &av->util_av); + if (ret) + return ret; + ret = efa_base_ep_bind_av(ep, av); if (ret) return ret; @@ -127,6 +273,7 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return ret; break; default: + EFA_WARN(FI_LOG_EP_CTRL, "invalid fid class\n"); return -EINVAL; } @@ -178,66 +325,20 @@ static int efa_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) static int efa_ep_enable(struct fid_ep *ep_fid) { - struct ibv_qp_init_attr_ex attr_ex = { 0 }; struct efa_base_ep *ep; - struct efa_cq *scq, *rcq; int err; ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - scq = ep->util_ep.tx_cq ? container_of(ep->util_ep.tx_cq, struct efa_cq, util_cq) : NULL; - rcq = ep->util_ep.rx_cq ? container_of(ep->util_ep.rx_cq, struct efa_cq, util_cq) : NULL; - - if (!scq && !rcq) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send or receive completion queue\n"); - return -FI_ENOCQ; - } - - if (!scq && ofi_needs_tx(ep->info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); - return -FI_ENOCQ; - } - - if (!rcq && ofi_needs_rx(ep->info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. 
(FI_RECV)\n"); - return -FI_ENOCQ; - } - - if (scq) { - attr_ex.cap.max_send_wr = ep->info->tx_attr->size; - attr_ex.cap.max_send_sge = ep->info->tx_attr->iov_limit; - attr_ex.send_cq = ibv_cq_ex_to_cq(scq->ibv_cq.ibv_cq_ex); - } else { - attr_ex.send_cq = ibv_cq_ex_to_cq(rcq->ibv_cq.ibv_cq_ex); - } - - if (rcq) { - attr_ex.cap.max_recv_wr = ep->info->rx_attr->size; - attr_ex.cap.max_recv_sge = ep->info->rx_attr->iov_limit; - attr_ex.recv_cq = ibv_cq_ex_to_cq(rcq->ibv_cq.ibv_cq_ex); - } else { - attr_ex.recv_cq = ibv_cq_ex_to_cq(scq->ibv_cq.ibv_cq_ex); - } - - attr_ex.cap.max_inline_data = - ep->domain->device->efa_attr.inline_buf_size; - - assert(EFA_EP_TYPE_IS_DGRAM(ep->domain->info)); - attr_ex.qp_type = IBV_QPT_UD; - attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; - attr_ex.pd = container_of(ep->util_ep.domain, struct efa_domain, util_domain)->ibv_pd; - - attr_ex.qp_context = ep; - attr_ex.sq_sig_all = 1; - - err = efa_base_ep_create_qp(ep, &attr_ex); + err = efa_base_ep_create_and_enable_qp(ep, false); if (err) return err; - return efa_base_ep_enable(ep); + err = efa_base_ep_insert_cntr_ibv_cq_poll_list(ep); + if (err) + efa_base_ep_destruct_qp(ep); + + return err; } static int efa_ep_control(struct fid *fid, int command, void *arg) @@ -317,42 +418,9 @@ struct fi_ops_cm efa_ep_cm_ops = { int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, struct fid_ep **ep_fid, void *context) { - struct efa_domain *domain; - const struct fi_info *prov_info; struct efa_base_ep *ep; int ret; - domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); - - if (!user_info || !user_info->ep_attr || !user_info->domain_attr || - strncmp(domain->device->ibv_ctx->device->name, user_info->domain_attr->name, - strlen(domain->device->ibv_ctx->device->name))) { - EFA_INFO(FI_LOG_DOMAIN, "Invalid info->domain_attr->name\n"); - return -FI_EINVAL; - } - - prov_info = efa_domain_get_prov_info(domain, user_info->ep_attr->type); - assert(prov_info); - - 
assert(user_info->ep_attr); - ret = ofi_check_ep_attr(&efa_util_prov, user_info->fabric_attr->api_version, prov_info, user_info); - if (ret) - return ret; - - if (user_info->tx_attr) { - ret = ofi_check_tx_attr(&efa_prov, prov_info->tx_attr, - user_info->tx_attr, user_info->mode); - if (ret) - return ret; - } - - if (user_info->rx_attr) { - ret = ofi_check_rx_attr(&efa_prov, prov_info, user_info->rx_attr, user_info->mode); - if (ret) - return ret; - } - ep = calloc(1, sizeof(*ep)); if (!ep) return -FI_ENOMEM; @@ -361,13 +429,6 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, if (ret) goto err_ep_destroy; - /* struct efa_send_wr and efa_recv_wr allocates memory for 2 IOV - * So check with an assert statement that iov_limit is 2 or less - */ - assert(user_info->tx_attr->iov_limit <= 2); - - ep->domain = domain; - *ep_fid = &ep->util_ep.ep_fid; (*ep_fid)->fid.fclass = FI_CLASS_EP; (*ep_fid)->fid.context = context; @@ -381,6 +442,8 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, return 0; err_ep_destroy: - efa_ep_destroy(ep); + efa_base_ep_destruct(ep); + if (ep) + free(ep); return ret; } diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index fb7213ee650..f619ea21e49 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -14,25 +14,6 @@ #include "efa_rdm_pke_req.h" #include "efa_cntr.h" -static -void efa_rdm_ep_construct_ibv_qp_init_attr_ex(struct efa_rdm_ep *ep, - struct ibv_qp_init_attr_ex *attr_ex, - struct ibv_cq_ex *tx_cq, - struct ibv_cq_ex *rx_cq) -{ - attr_ex->cap.max_send_wr = ep->base_ep.domain->device->rdm_info->tx_attr->size; - attr_ex->cap.max_send_sge = ep->base_ep.domain->device->rdm_info->tx_attr->iov_limit; - attr_ex->cap.max_recv_wr = ep->base_ep.domain->device->rdm_info->rx_attr->size; - attr_ex->cap.max_recv_sge = ep->base_ep.domain->device->rdm_info->rx_attr->iov_limit; - attr_ex->cap.max_inline_data = 
ep->base_ep.domain->device->efa_attr.inline_buf_size; - attr_ex->qp_type = IBV_QPT_DRIVER; - attr_ex->pd = efa_rdm_ep_domain(ep)->ibv_pd; - attr_ex->qp_context = ep; - attr_ex->sq_sig_all = 1; - - attr_ex->send_cq = ibv_cq_ex_to_cq(tx_cq); - attr_ex->recv_cq = ibv_cq_ex_to_cq(rx_cq); -} static inline struct efa_rdm_cq *efa_rdm_ep_get_tx_rdm_cq(struct efa_rdm_ep *ep) @@ -46,68 +27,6 @@ struct efa_rdm_cq *efa_rdm_ep_get_rx_rdm_cq(struct efa_rdm_ep *ep) return ep->base_ep.util_ep.rx_cq ? container_of(ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, efa_cq.util_cq) : NULL; } -/** - * @brief set the "efa_qp" field in the efa_rdm_ep->efa_base_ep - * called by efa_rdm_ep_open() - * - * @param[in,out] ep The EFA RDM endpoint to set the qp in - * @return int 0 on success, negative libfabric error code otherwise - * @todo merge this function with #efa_base_ep_construct - */ -static -int efa_rdm_ep_create_base_ep_ibv_qp(struct efa_rdm_ep *ep) -{ - struct ibv_qp_init_attr_ex attr_ex = { 0 }; - struct efa_cq *tx_cq, *rx_cq; - struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq; - int ret; - - tx_cq = efa_base_ep_get_tx_cq(&ep->base_ep); - rx_cq = efa_base_ep_get_rx_cq(&ep->base_ep); - - if (!tx_cq && !rx_cq) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send or receive completion queue\n"); - return -FI_ENOCQ; - } - - if (!tx_cq && ofi_needs_tx(ep->base_ep.info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); - return -FI_ENOCQ; - } - - if (!rx_cq && ofi_needs_rx(ep->base_ep.info->caps)) { - EFA_WARN(FI_LOG_EP_CTRL, - "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled (FI_RECV).\n"); - return -FI_ENOCQ; - } - - tx_ibv_cq = tx_cq ? tx_cq->ibv_cq.ibv_cq_ex : rx_cq->ibv_cq.ibv_cq_ex; - rx_ibv_cq = rx_cq ? 
rx_cq->ibv_cq.ibv_cq_ex : tx_cq->ibv_cq.ibv_cq_ex; - - efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq); - - ret = efa_base_ep_create_qp(&ep->base_ep, &attr_ex); - if (ret) - return ret; - - /** - * Create separate user_recv_qp to receive pkts that carries user data - * without any headers. - */ - if (ep->use_zcpy_rx) { - ret = efa_qp_create(&ep->base_ep.user_recv_qp, &attr_ex, ep->base_ep.info->tx_attr->tclass); - if (ret) { - efa_base_ep_destruct_qp(&ep->base_ep); - return ret; - } - ep->base_ep.user_recv_qp->base_ep = &ep->base_ep; - } - - return FI_SUCCESS; -} - static int efa_rdm_pke_pool_mr_reg_handler(struct ofi_bufpool_region *region) { @@ -554,11 +473,26 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id); } + /** + * These fields are set as efa device's default limit in base_ep + * Override them to the values that are supported by efa-rdm. + * The info->ep_attr->max_msg_size is UINT64_MAX for efa-rdm because + * it supports segmentation of a large message into small pieces that + * fit into the device limit. The info->tx_attr->inject_size is currently + * the MIN(efa_mtu_size - max_hdr_size, shm_inject_size) + * as it supports emulated injection by copying user tx buffer into + * internal bounce buffer. 
+ */ + efa_rdm_ep->base_ep.max_msg_size = info->ep_attr->max_msg_size; + efa_rdm_ep->base_ep.max_rma_size = info->ep_attr->max_msg_size; + efa_rdm_ep->base_ep.inject_msg_size = info->tx_attr->inject_size; + efa_rdm_ep->base_ep.inject_rma_size = info->tx_attr->inject_size; + + /* efa_rdm_ep's own fields */ efa_rdm_ep->max_tagged_size = info->ep_attr->max_msg_size; efa_rdm_ep->max_atomic_size = info->ep_attr->max_msg_size; efa_rdm_ep->inject_tagged_size = info->tx_attr->inject_size; efa_rdm_ep->inject_atomic_size = info->tx_attr->inject_size; - efa_rdm_ep->base_ep.inject_rma_size = info->tx_attr->inject_size; efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size; efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); @@ -892,30 +826,6 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep) ofi_genlock_unlock(&efa_rdm_ep_domain(efa_rdm_ep)->srx_lock); } -static inline -void efa_rdm_ep_remove_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) -{ - int i; - struct efa_cntr *efa_cntr; - struct util_cntr *util_cntr; - struct efa_cq *tx_cq, *rx_cq; - - tx_cq = efa_base_ep_get_tx_cq(&ep->base_ep); - rx_cq = efa_base_ep_get_rx_cq(&ep->base_ep); - - for (i = 0; i< CNTR_CNT; i++) { - util_cntr = ep->base_ep.util_ep.cntrs[i]; - if (util_cntr) { - efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); - if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) - efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); - - if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) - efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); - } - } -} - static inline void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) { @@ -1007,7 +917,7 @@ static int efa_rdm_ep_close(struct fid *fid) * with other threads progressing 
the cq. */ efa_base_ep_close_util_ep(&efa_rdm_ep->base_ep); - efa_rdm_ep_remove_cntr_ibv_cq_poll_list(efa_rdm_ep); + efa_base_ep_remove_cntr_ibv_cq_poll_list(&efa_rdm_ep->base_ep); efa_rdm_ep_remove_cq_ibv_cq_poll_list(efa_rdm_ep); @@ -1181,39 +1091,6 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) efa_rdm_ep_close_shm_resources(ep); } -static inline -int efa_rdm_ep_insert_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) -{ - int i, ret; - struct efa_cntr *efa_cntr; - struct util_cntr *util_cntr; - struct efa_cq *tx_cq, *rx_cq; - tx_cq = efa_base_ep_get_tx_cq(&ep->base_ep); - rx_cq = efa_base_ep_get_rx_cq(&ep->base_ep); - - for (i = 0; i < CNTR_CNT; i++) { - util_cntr = ep->base_ep.util_ep.cntrs[i]; - if (util_cntr) { - efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); - if (tx_cq) { - ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); - if (ret) - return ret; - } - if (rx_cq) { - ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); - if (ret) - return ret; - } - ofi_genlock_lock(&efa_cntr->util_cntr.ep_list_lock); - efa_cntr->need_to_scan_ep_list = true; - ofi_genlock_unlock(&efa_cntr->util_cntr.ep_list_lock); - } - } - - return FI_SUCCESS; -} - static inline int efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) { @@ -1271,6 +1148,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) struct fi_peer_srx_context peer_srx_context = {0}; struct fi_rx_attr peer_srx_attr = {0}; struct util_srx_ctx *srx_ctx; + bool create_user_recv_qp = false; switch (command) { case FI_ENABLE: @@ -1301,14 +1179,10 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) ep->base_ep.inject_rma_size = MIN(ep->base_ep.inject_rma_size, efa_rdm_ep_domain(ep)->device->efa_attr.inline_buf_size); + create_user_recv_qp = true; } - ret = efa_rdm_ep_create_base_ep_ibv_qp(ep); - if (ret) - return ret; - - /* 
efa_base_ep_enable destroys qp in the error path */ - ret = efa_base_ep_enable(&ep->base_ep); + ret = efa_base_ep_create_and_enable_qp(&ep->base_ep, create_user_recv_qp); if (ret) return ret; @@ -1316,7 +1190,7 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) if (ret) goto err_destroy_qp; - ret = efa_rdm_ep_insert_cntr_ibv_cq_poll_list(ep); + ret = efa_base_ep_insert_cntr_ibv_cq_poll_list(&ep->base_ep); if (ret) goto err_destroy_qp; @@ -1572,72 +1446,6 @@ static int efa_rdm_ep_set_use_device_rdma(struct efa_rdm_ep *ep, bool use_device return 0; } -/** - * @brief check the in order aligned 128 bytes support for a given ibv_wr_op code - * - * @param ep efa_rdm_ep - * @param op_code ibv wr op code - * @return int 0 if in order aligned 128 bytes is supported, -FI_EOPNOTSUPP if - * it is not supported. Other negative integer for other errors. - */ -static -int efa_rdm_ep_check_qp_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, - enum ibv_wr_opcode op_code) -{ - struct efa_qp *qp = NULL; - struct ibv_qp_init_attr_ex attr_ex = {0}; - int ret, retv; - struct ibv_cq_ex *ibv_cq_ex = NULL; - enum ibv_cq_ex_type ibv_cq_ex_type; - struct fi_cq_attr cq_attr = {0}; - - ret = efa_cq_ibv_cq_ex_open(&cq_attr, efa_rdm_ep_domain(ep)->device->ibv_ctx, &ibv_cq_ex, &ibv_cq_ex_type); - if (ret) { - EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %d\n", ret); - ret = -FI_EINVAL; - goto out; - } - - /* Create a dummy qp for query only */ - efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq_ex, ibv_cq_ex); - - ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC); - if (ret) - goto out; - - if (!efa_qp_support_op_in_order_aligned_128_bytes(qp, op_code)) - ret = -FI_EOPNOTSUPP; - -out: - if (qp) - efa_qp_destruct(qp); - - if (ibv_cq_ex) { - retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(ibv_cq_ex)); - if (retv) - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close ibv cq: %s\n", - fi_strerror(-retv)); - } - return ret; -} - -/** - * Convenience macro for setopt with an enforced 
threshold - */ -#define EFA_RDM_EP_SETOPT_THRESHOLD(opt, field, threshold) { \ - size_t _val = *(size_t *) optval; \ - if (optlen != sizeof field) \ - return -FI_EINVAL; \ - if (_val > threshold) { \ - EFA_WARN(FI_LOG_EP_CTRL, \ - "Requested size of %zu for FI_OPT_" #opt " " \ - "exceeds the maximum (%zu)\n", \ - _val, threshold); \ - return -FI_EINVAL; \ - } \ - field = _val; \ -} - /** * @brief implement the fi_setopt() API for EFA RDM endpoint * @param[in] fid fid to endpoint @@ -1718,28 +1526,28 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return ret; break; case FI_OPT_MAX_MSG_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->base_ep.max_msg_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + EFA_EP_SETOPT_THRESHOLD(MAX_MSG_SIZE, efa_rdm_ep->base_ep.max_msg_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_TAGGED_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(MAX_TAGGED_SIZE, efa_rdm_ep->max_tagged_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + EFA_EP_SETOPT_THRESHOLD(MAX_TAGGED_SIZE, efa_rdm_ep->max_tagged_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_RMA_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->base_ep.max_rma_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + EFA_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, efa_rdm_ep->base_ep.max_rma_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_MAX_ATOMIC_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(MAX_ATOMIC_SIZE, efa_rdm_ep->max_atomic_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) + EFA_EP_SETOPT_THRESHOLD(MAX_ATOMIC_SIZE, efa_rdm_ep->max_atomic_size, efa_rdm_ep->base_ep.info->ep_attr->max_msg_size) break; case FI_OPT_INJECT_MSG_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->base_ep.inject_msg_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + EFA_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, efa_rdm_ep->base_ep.inject_msg_size, 
efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_TAGGED_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_TAGGED_SIZE, efa_rdm_ep->inject_tagged_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + EFA_EP_SETOPT_THRESHOLD(INJECT_TAGGED_SIZE, efa_rdm_ep->inject_tagged_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_RMA_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->base_ep.inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + EFA_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, efa_rdm_ep->base_ep.inject_rma_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_INJECT_ATOMIC_SIZE: - EFA_RDM_EP_SETOPT_THRESHOLD(INJECT_ATOMIC_SIZE, efa_rdm_ep->inject_atomic_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) + EFA_EP_SETOPT_THRESHOLD(INJECT_ATOMIC_SIZE, efa_rdm_ep->inject_atomic_size, efa_rdm_ep->base_ep.info->tx_attr->inject_size) break; case FI_OPT_EFA_USE_DEVICE_RDMA: if (optlen != sizeof(bool)) @@ -1756,7 +1564,7 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, * application buffer on device */ if (*(bool *)optval) { - ret = efa_rdm_ep_check_qp_in_order_aligned_128_bytes(efa_rdm_ep, IBV_WR_RDMA_READ); + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(&efa_rdm_ep->base_ep, IBV_WR_RDMA_READ); if (ret) return ret; } @@ -1766,7 +1574,7 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, if (optlen != sizeof(bool)) return -FI_EINVAL; if (*(bool *)optval) { - ret = efa_rdm_ep_check_qp_in_order_aligned_128_bytes(efa_rdm_ep, IBV_WR_RDMA_WRITE); + ret = efa_base_ep_check_qp_in_order_aligned_128_bytes(&efa_rdm_ep->base_ep, IBV_WR_RDMA_WRITE); if (ret) return ret; } diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 9ca730d0b6e..dd6f813a059 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -19,7 +19,7 @@ void test_av_insert_duplicate_raw_addr(struct efa_resource **state) 
fi_addr_t addr1, addr2; int err, num_addr; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_check_mock; err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -54,7 +54,7 @@ void test_av_insert_duplicate_gid(struct efa_resource **state) fi_addr_t addr1, addr2; int err, num_addr; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_check_mock; err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); diff --git a/prov/efa/test/efa_unit_test_cntr.c b/prov/efa/test/efa_unit_test_cntr.c index 2aa2ea60927..d9d4852d2f2 100644 --- a/prov/efa/test/efa_unit_test_cntr.c +++ b/prov/efa/test/efa_unit_test_cntr.c @@ -10,7 +10,7 @@ * @return int the length of the ibv_cq_poll_list */ static -int test_efa_rdm_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) +int test_efa_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) { int i = 0; struct dlist_entry *item; @@ -30,14 +30,12 @@ int test_efa_rdm_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) * * @param state struct efa_resource that is managed by the framework */ -void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +static +void test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep_impl(struct efa_resource *resource) { - struct efa_resource *resource = *state; struct fid_cntr *cntr; struct fi_cntr_attr cntr_attr = {0}; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); - assert_int_equal(fi_cntr_open(resource->domain, &cntr_attr, &cntr, NULL), 0); /* TODO: expand this test to all flags */ @@ -46,7 +44,7 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resou assert_int_equal(fi_enable(resource->ep), 0); /* 
efa_unit_test_resource_construct binds single OFI CQ as both tx/rx cq of ep */ - assert_int_equal(test_efa_rdm_cntr_get_ibv_cq_poll_list_length(cntr), 1); + assert_int_equal(test_efa_cntr_get_ibv_cq_poll_list_length(cntr), 1); /* ep must be closed before cq/av/eq... */ fi_close(&resource->ep->fid); @@ -55,21 +53,35 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resou fi_close(&cntr->fid); } +void test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep_impl(resource); +} + +void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep_impl(resource); +} + /** * @brief Check the length of ibv_cq_poll_list in cntr when separate tx/rx cq is bind to 1 ep. 
* * @param state struct efa_resource that is managed by the framework */ -void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +static +void test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep_impl(struct efa_resource *resource) { - struct efa_resource *resource = *state; struct fid_cq *txcq, *rxcq; struct fi_cq_attr cq_attr = {0}; struct fid_cntr *cntr; struct fi_cntr_attr cntr_attr = {0}; - efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM); - assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &txcq, NULL), 0); assert_int_equal(fi_ep_bind(resource->ep, &txcq->fid, FI_SEND), 0); @@ -85,7 +97,7 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_r assert_int_equal(fi_enable(resource->ep), 0); - assert_int_equal(test_efa_rdm_cntr_get_ibv_cq_poll_list_length(cntr), 2); + assert_int_equal(test_efa_cntr_get_ibv_cq_poll_list_length(cntr), 2); /* ep must be closed before cq/av/eq... 
*/ fi_close(&resource->ep->fid); @@ -95,7 +107,23 @@ void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_r fi_close(&cntr->fid); } -void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) +void test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep_impl(resource); +} + +void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); + test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep_impl(resource); +} + +void test_efa_rdm_cntr_post_initial_rx_pkts(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_rdm_ep *efa_rdm_ep; @@ -104,7 +132,7 @@ void test_efa_cntr_post_initial_rx_pkts(struct efa_resource **state) struct efa_cntr *efa_cntr; uint64_t cnt; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* At this time, rx pkts are not growed and posted */ diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index 47cae69f20b..13bb1882465 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -2,6 +2,7 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_unit_tests.h" +#include "efa_cq.h" #include "efa_rdm_pke_utils.h" #include "efa_rdm_pke_nonreq.h" #include "efa_rdm_pke_req.h" @@ -51,7 +52,7 @@ void efa_unit_test_construct_msg_rma(struct fi_msg_rma *msg, struct iovec *iov, msg->data = data; } -struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) +struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type, char *prov_name) { struct fi_info *hints; @@ -59,10 +60,11 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) if (!hints) return NULL; - hints->fabric_attr->prov_name = strdup("efa"); + hints->fabric_attr->prov_name = strdup(prov_name); hints->ep_attr->type = ep_type; - hints->domain_attr->mr_mode |= FI_MR_LOCAL | FI_MR_ALLOCATED; + /* Use a minimal caps that efa / efa-direct should always support */ + hints->domain_attr->mr_mode = MR_MODE_BITS; if (ep_type == FI_EP_DGRAM) { hints->mode |= FI_MSG_PREFIX; } @@ -70,15 +72,17 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) return hints; } +/* TODO: remove use_efa_direct after we have efa_direct implemented in fi_info */ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, uint32_t fi_version, struct fi_info *hints, - bool enable_ep, bool open_cq) + bool enable_ep, bool open_cq, char* prov_name) { int ret = 0; struct fi_av_attr av_attr = {0}; struct fi_cq_attr cq_attr = {0}; struct fi_eq_attr eq_attr = {0}; + struct efa_domain *efa_domain; ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &resource->info); if (ret) @@ -92,6 +96,17 @@ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, if (ret) goto err; + /* + * TODO: Remove this function pointer override when we have it assigned + * for efa-direct correctly. 
+ */ + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) { + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + + efa_domain->util_domain.domain_fid.ops->endpoint = efa_ep_open; + efa_domain->util_domain.domain_fid.ops->cq_open = efa_cq_open; + } + ret = fi_endpoint(resource->domain, resource->info, &resource->ep, NULL); if (ret) goto err; @@ -131,13 +146,19 @@ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, assert_int_equal(ret, 0); } -void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type) +void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name) { - resource->hints = efa_unit_test_alloc_hints(ep_type); + + /* TODO use prov_name here when efa-direct fi_info is implemented */ + resource->hints = efa_unit_test_alloc_hints(ep_type, EFA_PROV_NAME); if (!resource->hints) goto err; - efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), - resource->hints, true, true); + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(2, 0), + resource->hints, true, true, prov_name); + else + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), + resource->hints, true, true, prov_name); return; err: @@ -148,13 +169,19 @@ void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_ } void efa_unit_test_resource_construct_ep_not_enabled(struct efa_resource *resource, - enum fi_ep_type ep_type) + enum fi_ep_type ep_type, char *prov_name) { - resource->hints = efa_unit_test_alloc_hints(ep_type); + /* TODO use prov_name here when efa-direct fi_info is implemented */ + resource->hints = efa_unit_test_alloc_hints(ep_type, EFA_PROV_NAME); if (!resource->hints) goto err; - efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), - resource->hints, false, true); + + 
if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(2, 0), + resource->hints, false, true, prov_name); + else + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), + resource->hints, false, true, prov_name); return; err: @@ -165,13 +192,19 @@ void efa_unit_test_resource_construct_ep_not_enabled(struct efa_resource *resour } void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(struct efa_resource *resource, - enum fi_ep_type ep_type) + enum fi_ep_type ep_type, char *prov_name) { - resource->hints = efa_unit_test_alloc_hints(ep_type); + /* TODO use prov_name here when efa-direct fi_info is implemented */ + resource->hints = efa_unit_test_alloc_hints(ep_type, EFA_PROV_NAME); if (!resource->hints) goto err; - efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), - resource->hints, false, false); + + if (!strcmp(EFA_DIRECT_PROV_NAME, prov_name)) + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(2, 0), + resource->hints, false, false, prov_name); + else + efa_unit_test_resource_construct_with_hints(resource, ep_type, FI_VERSION(1, 14), + resource->hints, false, false, prov_name); return; err: @@ -189,12 +222,12 @@ void efa_unit_test_resource_construct_rdm_shm_disabled(struct efa_resource *reso int ret; bool shm_permitted = false; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); if (!resource->hints) goto err; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, false, true); + resource->hints, false, true, EFA_PROV_NAME); ret = fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_SHARED_MEMORY_PERMITTED, &shm_permitted, diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index e939d182b60..795aa7b8066 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ 
b/prov/efa/test/efa_unit_test_cq.c @@ -21,7 +21,7 @@ void test_impl_cq_read_empty_cq(struct efa_resource *resource, enum fi_ep_type e int ret; struct efa_base_ep *efa_base_ep; - efa_unit_test_resource_construct(resource, ep_type); + efa_unit_test_resource_construct(resource, ep_type, EFA_PROV_NAME); efa_base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); ibv_cqx = container_of(efa_base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; @@ -288,7 +288,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) struct ibv_cq_ex *ibv_cqx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* @@ -366,7 +366,7 @@ void test_ibv_cq_ex_read_bad_recv_rdma_with_imm_status_impl(struct efa_resource struct ibv_cq_ex *ibv_cqx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); @@ -455,7 +455,7 @@ void test_ibv_cq_ex_read_failed_poll(struct efa_resource **state) struct efa_rdm_cq *efa_rdm_cq; struct ibv_cq_ex *ibv_cqx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); ibv_cqx = efa_rdm_cq->efa_cq.ibv_cq.ibv_cq_ex; @@ -498,7 +498,7 @@ void test_rdm_cq_create_error_handling(struct efa_resource **state) } efa_device_construct(&efa_device, 0, ibv_device_list[0]); - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); assert_int_equal(fi_getinfo(FI_VERSION(1, 14), 
NULL, NULL, 0ULL, resource->hints, &resource->info), 0); assert_int_equal(fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL), 0); @@ -546,7 +546,7 @@ void test_efa_rdm_cq_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resourc { struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* efa_unit_test_resource_construct binds single OFI CQ as both tx/rx cq of ep */ assert_int_equal(test_efa_rdm_cq_get_ibv_cq_poll_list_length(resource->cq), 1); @@ -563,7 +563,7 @@ void test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_res struct fid_cq *txcq, *rxcq; struct fi_cq_attr cq_attr = {0}; - efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &txcq, NULL), 0); @@ -592,7 +592,7 @@ void test_efa_rdm_cq_post_initial_rx_pkts(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_cq *efa_rdm_cq; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); @@ -653,7 +653,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc expect_function_call(efa_mock_efadv_create_cq_set_eopnotsupp_and_return_null); } - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, efa_cq.util_cq.cq_fid.fid); @@ -820,7 +820,7 @@ static void test_efa_cq_read(struct efa_resource 
*resource, fi_addr_t *addr, struct ibv_qp_ex *ibv_qpx; struct efa_base_ep *base_ep; - efa_unit_test_resource_construct(resource, FI_EP_DGRAM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); ibv_qpx = base_ep->qp->ibv_qp_ex; diff --git a/prov/efa/test/efa_unit_test_domain.c b/prov/efa/test/efa_unit_test_domain.c index ccfa1c53149..29a21d29fb9 100644 --- a/prov/efa/test/efa_unit_test_domain.c +++ b/prov/efa/test/efa_unit_test_domain.c @@ -10,7 +10,7 @@ void test_efa_domain_open_ops_wrong_name(struct efa_resource **state) int ret; struct fi_efa_ops_domain *efa_domain_ops; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); ret = fi_open_ops(&resource->domain->fid, "arbitrary name", 0, (void **)&efa_domain_ops, NULL); assert_int_equal(ret, -FI_EINVAL); @@ -61,7 +61,7 @@ void test_efa_domain_open_ops_mr_query(struct efa_resource **state) { struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* set recv_ic_id as 0 */ g_efa_unit_test_mocks.efadv_query_mr = &efa_mock_efadv_query_mr_recv_ic_id_0; @@ -114,7 +114,7 @@ void test_efa_domain_open_ops_mr_query(struct efa_resource **state) { struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); test_efa_domain_open_ops_mr_query_common( resource, diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index c67902bc609..5926e38e267 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -37,7 +37,7 @@ void test_efa_rdm_ep_host_id(struct efa_resource **state, bool file_exists, char efa_env.host_id_file = host_id_file; } - efa_unit_test_resource_construct(resource, 
FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -272,7 +272,7 @@ void test_efa_rdm_ep_pkt_pool_flags(struct efa_resource **state) { struct efa_resource *resource = *state; efa_env.huge_page_setting = EFA_ENV_HUGE_PAGE_DISABLED; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); check_ep_pkt_pool_flags(resource->ep, OFI_BUFPOOL_NONSHARED); } @@ -290,7 +290,7 @@ void test_efa_rdm_ep_pkt_pool_page_alignment(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_env.huge_page_setting = EFA_ENV_HUGE_PAGE_DISABLED; ret = fi_endpoint(resource->domain, resource->info, &ep, NULL); @@ -321,7 +321,7 @@ void test_efa_rdm_read_copy_pkt_pool_128_alignment(struct efa_resource **state) struct efa_resource *resource = *state; struct efa_domain *efa_domain = NULL; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* rx_readcopy_pkt_pool is only created when application requested FI_HMEM */ efa_domain = container_of(resource->domain, struct efa_domain, @@ -358,7 +358,7 @@ void test_efa_rdm_pke_get_available_copy_methods_align128(struct efa_resource ** struct efa_resource *resource = *state; bool local_read_available, gdrcopy_available, cuda_memcpy_available; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_mr.peer.iface = FI_HMEM_CUDA; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -593,11 +593,11 @@ void test_efa_rdm_ep_rma_queue_before_handshake(struct efa_resource **state, int struct efa_rdm_ope *txe; struct 
efa_rdm_peer *peer; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); /* ensure we don't have RMA capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -681,11 +681,11 @@ void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(struct efa_resource uint64_t rma_addr, rma_key; struct efa_rdm_peer *peer; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 22), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -750,7 +750,7 @@ void test_efa_rdm_ep_send_with_shm_no_copy(struct efa_resource **state) char buff[8] = {0}; int err; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* create a fake peer */ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -789,12 +789,12 @@ void test_efa_rdm_ep_rma_without_caps(struct efa_resource **state) int err; uint64_t rma_addr, rma_key; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED; resource->hints->caps &= ~FI_RMA; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, 
FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); /* ensure we don't have RMA capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -840,12 +840,12 @@ void test_efa_rdm_ep_atomic_without_caps(struct efa_resource **state) int err; uint64_t rma_addr, rma_key; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); resource->hints->caps |= FI_MSG | FI_TAGGED; resource->hints->caps &= ~FI_ATOMIC; resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); /* ensure we don't have ATOMIC capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -895,7 +895,7 @@ void test_efa_rdm_ep_getopt(struct efa_resource **state, size_t opt_len, int exp }; size_t num_opt_names = sizeof(opt_names) / sizeof(int); - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); for (i = 0; i < num_opt_names; i++) { opt_len_temp = opt_len; @@ -941,7 +941,7 @@ void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(struct efa_reso { struct efa_resource *resource = *state; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); /* fi_setopt should always succeed */ assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, @@ -1001,7 +1001,7 @@ static void test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, ofi_hmem_disable_p2p = cuda_p2p_disabled; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, false, true); + resource->hints, false, true, EFA_PROV_NAME); /* System memory P2P 
should always be enabled */ assert_true(g_efa_hmem_info[FI_HMEM_SYSTEM].initialized); @@ -1068,7 +1068,7 @@ void test_efa_rdm_ep_user_zcpy_rx_disabled(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->mode = FI_MSG_PREFIX; @@ -1084,7 +1084,7 @@ void test_efa_rdm_ep_user_disable_p2p_zcpy_rx_disabled(struct efa_resource **sta { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->mode = FI_MSG_PREFIX; @@ -1100,7 +1100,7 @@ void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(struct efa_resource **state { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->msg_order = FI_ORDER_SAS; @@ -1118,7 +1118,7 @@ void test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy(struct efa_resource ** { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->mode = FI_MSG_PREFIX; @@ -1134,7 +1134,7 @@ void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local(struct efa_resource { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->caps = FI_MSG; @@ -1148,7 +1148,7 @@ void test_efa_rdm_ep_close_discard_posted_recv(struct efa_resource **state) struct efa_resource *resource = *state; char buf[16]; - 
efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* Post recv and then close ep */ assert_int_equal(fi_recv(resource->ep, (void *) buf, 16, NULL, FI_ADDR_UNSPEC, NULL), 0); @@ -1168,7 +1168,7 @@ void test_efa_rdm_ep_zcpy_recv_cancel(struct efa_resource **state) struct fi_context cancel_context = {0}; struct efa_unit_test_buff recv_buff; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->caps = FI_MSG; @@ -1202,7 +1202,7 @@ void test_efa_rdm_ep_zcpy_recv_eagain(struct efa_resource **state) int i; struct efa_rdm_ep *efa_rdm_ep; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->caps = FI_MSG; @@ -1310,11 +1310,11 @@ void test_efa_rdm_ep_rx_refill_impl(struct efa_resource **state, int threshold, efa_env.internal_rx_refill_threshold = threshold; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->rx_attr->size = rx_size; efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), - resource->hints, true, true); + resource->hints, true, true, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); assert_int_equal(efa_rdm_ep_get_rx_pool_size(efa_rdm_ep), rx_size); @@ -1385,10 +1385,268 @@ void test_efa_rdm_ep_support_unsolicited_write_recv(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct efa_resource *resource = *state; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, 
base_ep.util_ep.ep_fid); assert_int_equal(efa_use_unsolicited_write_recv(), efa_rdm_ep_support_unsolicited_write_recv(efa_rdm_ep)); } + +/** + * @brief Test the default operational sizes for efa_rdm_ep + * + * @param state + */ +void test_efa_rdm_ep_default_sizes(struct efa_resource **state) +{ + struct efa_rdm_ep *efa_rdm_ep; + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* sizes shared with base_ep */ + assert_int_equal(efa_rdm_ep->base_ep.max_msg_size, resource->info->ep_attr->max_msg_size); + assert_int_equal(efa_rdm_ep->base_ep.max_rma_size, resource->info->ep_attr->max_msg_size); + assert_int_equal(efa_rdm_ep->base_ep.inject_msg_size, resource->info->tx_attr->inject_size); + assert_int_equal(efa_rdm_ep->base_ep.inject_rma_size, resource->info->tx_attr->inject_size); + + /* efa_rdm_ep's own fields */ + assert_int_equal(efa_rdm_ep->max_tagged_size, resource->info->ep_attr->max_msg_size); + assert_int_equal(efa_rdm_ep->max_atomic_size, resource->info->ep_attr->max_msg_size); + assert_int_equal(efa_rdm_ep->inject_tagged_size, resource->info->tx_attr->inject_size); + assert_int_equal(efa_rdm_ep->inject_atomic_size, resource->info->tx_attr->inject_size); +} + +/** + * @brief Test the fi_endpoint API for efa_ep + * for rdm ep type (because the dgram ep type should + * have the same logic) + * @param state + */ +void test_efa_ep_open(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_base_ep *efa_ep; + struct efa_domain *efa_domain; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + efa_domain = container_of(resource->domain, struct efa_domain, + util_domain.domain_fid); + + /* Check various size limits defaults */ + assert_true(efa_ep->max_msg_size 
== efa_domain->device->ibv_port_attr.max_msg_sz); + assert_true(efa_ep->max_rma_size == efa_domain->device->max_rdma_size); + assert_true(efa_ep->inject_msg_size == efa_domain->device->efa_attr.inline_buf_size); + /* TODO: update inject_rma_size to inline size after firmware + * supports inline rdma write */ + assert_true(efa_ep->inject_rma_size == 0); +} + +/** + * @brief Test the fi_cancel API for efa_ep + * (for rdm ep type because dgram logic should be the same) + * It should return -FI_ENOSYS as device doesn't support it; + * @param state + */ +void test_efa_ep_cancel(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + int ret; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + ret = fi_cancel((struct fid *)resource->ep, NULL); + assert_int_equal(ret, -FI_ENOSYS); +} + +/** + * @brief Test the fi_getopt API fo efa_ep + * + * @param state + */ +void test_efa_ep_getopt(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + int optval_int; + bool optval_bool; + size_t optval_size_t; + size_t optlen; + struct efa_base_ep *efa_ep; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + + optlen = sizeof(optval_int); + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P, &optval_int, &optlen), 0); + assert_int_equal(optval_int, FI_HMEM_P2P_REQUIRED); + + optlen = sizeof(optval_bool); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_EMULATED_READ, &optval_bool, &optlen), 0); + assert_false(optval_bool); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_EMULATED_WRITE, &optval_bool, &optlen), 0); + assert_false(optval_bool); + + optlen = sizeof(optval_size_t); + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &optval_size_t, 
&optlen), 0); + assert_int_equal(optval_size_t, efa_ep->rnr_retry); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &optval_size_t, &optlen), 0); + assert_int_equal(optval_size_t, efa_ep->max_msg_size); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_RMA_SIZE, &optval_size_t, &optlen), 0); + assert_int_equal(optval_size_t, efa_ep->max_rma_size); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, &optval_size_t, &optlen), 0); + assert_int_equal(optval_size_t, efa_ep->inject_msg_size); + + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, &optval_size_t, &optlen), 0); + assert_int_equal(optval_size_t, efa_ep->inject_rma_size); +} + +/** + * @brief Test the fi_setopt API for efa_ep + * When RMA is requested, FI_OPT_EFA_USE_DEVICE_RDMA + * cannot be set as false + * @param state + */ +void test_efa_ep_setopt_use_device_rdma(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + bool optval; + struct efa_base_ep *efa_ep; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + + /* Hard code RMA caps in ep->info for local testing purpose */ + efa_ep->info->caps |= FI_RMA; + + /* Disable rdma is not allowed when user requests FI_RMA */ + optval = false; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_USE_DEVICE_RDMA, &optval, sizeof(optval)), -FI_EOPNOTSUPP); +} + +/** + * @brief Test the fi_setopt API for efa_ep + * FI_OPT_FI_HMEM_P2P cannot be set as FI_HMEM_P2P_DISABLED + * @param state + */ +void test_efa_ep_setopt_hmem_p2p(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + int optval; + int optvals[] = { + FI_HMEM_P2P_DISABLED, + FI_HMEM_P2P_ENABLED, + FI_HMEM_P2P_PREFERRED, + FI_HMEM_P2P_REQUIRED, + }; + size_t num_optvals = 
sizeof(optvals) / sizeof(int); + int i, expected_return; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + /* FI_HMEM_P2P_DISABLED is not allowed */ + for (i = 0; i < num_optvals; i++) { + optval = optvals[i]; + expected_return = (optval == FI_HMEM_P2P_DISABLED) ? -FI_EOPNOTSUPP : FI_SUCCESS; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P, &optval, sizeof(optval)), expected_return); + } +} + +/** + * @brief Test the fi_setopt API for efa_ep with FI_OPT_EFA_RNR_RETRY + * @param state + */ +void test_efa_ep_setopt_rnr_retry(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + size_t optval; + struct efa_base_ep *efa_ep; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + assert_false(efa_ep->efa_qp_enabled); + + optval = 7; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &optval, sizeof(optval)), FI_SUCCESS); + assert_int_equal(efa_ep->rnr_retry, optval); + + /* hack qp enabled status to allow local test */ + efa_ep->efa_qp_enabled = true; + /* fi_setopt should fail when it's called after ep enable */ + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &optval, sizeof(optval)), -FI_EINVAL); + /* recover */ + efa_ep->efa_qp_enabled = false; +} + +/** + * @brief Test the fi_setopt API for efa_ep with FI_OPT_*_SIZE + * @param state + */ +void test_efa_ep_setopt_sizes(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + size_t optval; + struct efa_base_ep *efa_ep; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + + size_t size_thresholds[] = { + [FI_OPT_MAX_MSG_SIZE] = (size_t) 
efa_ep->domain->device->ibv_port_attr.max_msg_sz, + [FI_OPT_MAX_RMA_SIZE] = (size_t) efa_ep->domain->device->max_rdma_size, + [FI_OPT_INJECT_MSG_SIZE] = (size_t) efa_ep->domain->device->efa_attr.inline_buf_size, + [FI_OPT_INJECT_RMA_SIZE] = (size_t) 0, + }; + int optnames[] = { + FI_OPT_MAX_MSG_SIZE, + FI_OPT_MAX_RMA_SIZE, + FI_OPT_INJECT_MSG_SIZE, + FI_OPT_INJECT_RMA_SIZE, + }; + size_t num_optnames = sizeof(optnames) / sizeof(int); + int i, optname; + + for (i = 0; i < num_optnames; i++) { + optname = optnames[i]; + + /* set optval <= threshold is allowed */ + optval = 0.5 * size_thresholds[optname]; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, optname, &optval, sizeof(optval)), FI_SUCCESS); + + /* set optval > threshold is NOT allowed */ + optval = size_thresholds[optname] + 10; + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, optname, &optval, sizeof(optval)), -FI_EINVAL); + } +} + +/** + * @brief Test fi_ep_bind and fi_enable API for efa_ep + * + * @param state + */ +void test_efa_ep_bind_and_enable(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_base_ep *efa_ep; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); + + efa_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + + assert_true(efa_ep->efa_qp_enabled); + /* we shouldn't have user recv qp for efa-direct */ + assert_true(efa_ep->user_recv_qp == NULL); +} \ No newline at end of file diff --git a/prov/efa/test/efa_unit_test_hmem.c b/prov/efa/test/efa_unit_test_hmem.c index 90a366f7064..2b278bddfba 100644 --- a/prov/efa/test/efa_unit_test_hmem.c +++ b/prov/efa/test/efa_unit_test_hmem.c @@ -20,7 +20,7 @@ void test_efa_hmem_info_update_neuron(struct efa_resource **state) uint32_t efa_device_caps_orig; bool neuron_initialized_orig; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); 
assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); @@ -58,7 +58,7 @@ void test_efa_hmem_info_disable_p2p_neuron(struct efa_resource **state) ofi_hmem_disable_p2p = 1; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); @@ -109,7 +109,7 @@ void test_efa_hmem_info_disable_p2p_cuda(struct efa_resource **state) ofi_hmem_disable_p2p = 1; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info); diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index 1380e36976c..febb386f4f3 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -15,7 +15,7 @@ void test_info_open_ep_with_wrong_info() struct fid_ep *ep = NULL; int err; - hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); err = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, hints, &info); assert_int_equal(err, 0); @@ -113,7 +113,7 @@ void test_info_tx_rx_msg_order_rdm_order_none(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); test_info_tx_rx_msg_order_from_hints(resource->hints, 0); @@ -123,7 +123,7 @@ void test_info_tx_rx_msg_order_rdm_order_sas(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); 
assert_non_null(resource->hints); resource->hints->tx_attr->msg_order = FI_ORDER_SAS; @@ -135,7 +135,7 @@ void test_info_tx_rx_msg_order_dgram_order_none(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); assert_non_null(resource->hints); test_info_tx_rx_msg_order_from_hints(resource->hints, 0); @@ -149,7 +149,7 @@ void test_info_tx_rx_msg_order_dgram_order_sas(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->msg_order = FI_ORDER_SAS; @@ -191,7 +191,7 @@ void test_info_max_order_size_dgram_with_atomic(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->caps = FI_ATOMIC; @@ -207,7 +207,7 @@ void test_info_max_order_size_rdm_with_atomic_no_order(struct efa_resource **sta { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); @@ -229,7 +229,7 @@ void test_info_max_order_size_rdm_with_atomic_order(struct efa_resource **state) - g_device_list[0].rdm_info->src_addrlen - EFA_RDM_IOV_LIMIT * sizeof(struct fi_rma_iov); - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->caps = FI_ATOMIC; @@ -244,7 +244,7 @@ void test_info_tx_rx_op_flags_rdm(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = 
efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->op_flags = FI_DELIVERY_COMPLETE; @@ -256,7 +256,7 @@ void test_info_tx_rx_size_rdm(struct efa_resource **state) { struct efa_resource *resource = *state; - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); assert_non_null(resource->hints); resource->hints->tx_attr->size = 16; @@ -317,7 +317,7 @@ void test_info_check_shm_info_hmem() { struct fi_info *hints; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->caps |= FI_HMEM; test_info_check_shm_info_from_hints(hints); @@ -330,7 +330,7 @@ void test_info_check_shm_info_op_flags() { struct fi_info *hints; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->tx_attr->op_flags |= FI_COMPLETION; hints->rx_attr->op_flags |= FI_COMPLETION; @@ -345,7 +345,7 @@ void test_info_check_shm_info_threading() { struct fi_info *hints; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->domain_attr->threading = FI_THREAD_DOMAIN; test_info_check_shm_info_from_hints(hints); @@ -363,7 +363,7 @@ void test_info_check_hmem_cuda_support_on_api_lt_1_18() if (!hmem_ops[FI_HMEM_CUDA].initialized) skip(); - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->caps |= FI_HMEM; hints->domain_attr->mr_mode |= FI_MR_HMEM; @@ -402,7 +402,7 @@ void test_info_check_hmem_cuda_support_on_api_ge_1_18() if (!hmem_ops[FI_HMEM_CUDA].initialized) skip(); - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); hints->caps |= FI_HMEM; hints->domain_attr->mr_mode |= FI_MR_HMEM; @@ -429,7 +429,7 @@ void 
test_info_check_no_hmem_support_when_not_requested() struct fi_info *hints, *info = NULL; int err; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); err = fi_getinfo(FI_VERSION(1,6), NULL, NULL, 0, hints, &info); assert_int_equal(err, 0); @@ -467,7 +467,7 @@ void test_use_device_rdma( const int env_val, unsetenv("FI_EFA_USE_DEVICE_RDMA"); } - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); ret = fi_getinfo(api_version, NULL, NULL, 0ULL, hints, &info); assert_int_equal(ret, 0); @@ -531,7 +531,7 @@ static int get_first_nic_name(char **name) { char *nic_name = NULL; struct fi_info *hints, *info; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, hints, &info); fi_freeinfo(hints); if (ret) @@ -566,7 +566,7 @@ static void test_efa_nic_selection(const char *filter, const char *expect_first_ struct fi_info *hints, *info; efa_env.iface = (char *) filter; - hints = efa_unit_test_alloc_hints(FI_EP_RDM); + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_PROV_NAME); ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, hints, &info); fi_freeinfo(hints); if (expect_first_name) { diff --git a/prov/efa/test/efa_unit_test_mr.c b/prov/efa/test/efa_unit_test_mr.c index 71ccb8e7a35..5516d4f325e 100644 --- a/prov/efa/test/efa_unit_test_mr.c +++ b/prov/efa/test/efa_unit_test_mr.c @@ -11,7 +11,7 @@ void test_efa_mr_reg_counters(struct efa_resource **state) char *buf; struct fid_mr *mr; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); assert_true(efa_domain->ibv_mr_reg_ct == 0); diff --git a/prov/efa/test/efa_unit_test_msg.c b/prov/efa/test/efa_unit_test_msg.c index 81781aeb6d6..b0df253fbeb 
100644 --- a/prov/efa/test/efa_unit_test_msg.c +++ b/prov/efa/test/efa_unit_test_msg.c @@ -5,7 +5,6 @@ #include "efa_unit_tests.h" #include "ofi_util.h" -extern struct fi_ops_msg efa_msg_ops; static void test_efa_msg_recv_prep(struct efa_resource *resource, fi_addr_t *addr) @@ -16,8 +15,7 @@ static void test_efa_msg_recv_prep(struct efa_resource *resource, size_t raw_addr_len = sizeof(raw_addr); int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); - resource->ep->msg = &efa_msg_ops; + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); ibv_qp = base_ep->qp->ibv_qp; @@ -108,8 +106,7 @@ static void test_efa_msg_send_prep(struct efa_resource *resource, size_t raw_addr_len = sizeof(raw_addr); int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); - resource->ep->msg = &efa_msg_ops; + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); ibv_qpx = base_ep->qp->ibv_qp_ex; diff --git a/prov/efa/test/efa_unit_test_ope.c b/prov/efa/test/efa_unit_test_ope.c index d5229cbcc18..701e2bb8c68 100644 --- a/prov/efa/test/efa_unit_test_ope.c +++ b/prov/efa/test/efa_unit_test_ope.c @@ -65,7 +65,7 @@ void test_efa_rdm_ope_prepare_to_post_send_with_no_enough_tx_pkts(struct efa_res struct efa_resource *resource = *state; struct efa_rdm_ep *efa_rdm_ep; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->efa_outstanding_tx_ops = efa_rdm_ep->efa_max_outstanding_tx_ops - 1; @@ -88,7 +88,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory(struct efa_resource **sta int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + 
efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* data size should be aligned and evenly distributed. * alignment for host memory is 8 byte by default. @@ -137,7 +137,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory_align128(struct efa_resou int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = true; @@ -186,7 +186,7 @@ void test_efa_rdm_ope_prepare_to_post_send_cuda_memory(struct efa_resource **sta int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* default alignment of cuda memory is 64 bytes */ msg_length = 12000; @@ -211,7 +211,7 @@ void test_efa_rdm_ope_prepare_to_post_send_cuda_memory_align128(struct efa_resou int expected_pkt_entry_cnt; int expected_pkt_entry_data_size_vec[1024]; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = true; @@ -243,7 +243,7 @@ void test_efa_rdm_ope_post_write_0_byte(struct efa_resource **state) fi_addr_t addr; int ret, err; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); @@ -314,7 +314,7 @@ void test_efa_rdm_rxe_post_local_read_or_queue_cleanup_txe(struct efa_resource * */ g_efa_unit_test_mocks.efa_rdm_pke_read = &efa_mock_efa_rdm_pke_read_return_mock; - efa_unit_test_resource_construct(resource, 
FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); diff --git a/prov/efa/test/efa_unit_test_pke.c b/prov/efa/test/efa_unit_test_pke.c index d52ccf76cc3..e7fda0365a1 100644 --- a/prov/efa/test/efa_unit_test_pke.c +++ b/prov/efa/test/efa_unit_test_pke.c @@ -24,7 +24,7 @@ void test_efa_rdm_pke_handle_longcts_rtm_send_completion(struct efa_resource **s int err, numaddr; struct efa_rdm_ope *txe; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); diff --git a/prov/efa/test/efa_unit_test_rdm_peer.c b/prov/efa/test/efa_unit_test_rdm_peer.c index 1170ef9b999..da909ed4905 100644 --- a/prov/efa/test/efa_unit_test_rdm_peer.c +++ b/prov/efa/test/efa_unit_test_rdm_peer.c @@ -81,7 +81,7 @@ void test_efa_rdm_peer_reorder_expected_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 0; exp_msg_id = 0; @@ -96,7 +96,7 @@ void test_efa_rdm_peer_reorder_smaller_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 1; exp_msg_id = 10; @@ -110,7 +110,7 @@ void test_efa_rdm_peer_reorder_larger_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 10; exp_msg_id = 0; @@ -125,7 +125,7 @@ void test_efa_rdm_peer_reorder_overflow_msg_id(struct efa_resource **state) { uint32_t msg_id, exp_msg_id; int expected_ret; - 
efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_id = 16384; exp_msg_id = 0; @@ -192,7 +192,7 @@ void test_efa_rdm_peer_move_overflow_pke_to_recvwin(struct efa_resource **state) struct efa_rdm_peer *peer; struct efa_rdm_pke *pkt_entry; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* overflow_pke_list has a pkt entry with msg_id 18000. * After calling efa_rdm_peer_move_overflow_pke_to_recvwin when exp_msg_id = 16384, @@ -213,7 +213,7 @@ void test_efa_rdm_peer_keep_pke_in_overflow_list(struct efa_resource **state) { struct efa_rdm_peer_overflow_pke_list_entry *overflow_pke_list_entry; struct dlist_entry *tmp; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); /* overflow_pke_list has a pkt entry with msg_id 33000. * After calling efa_rdm_peer_move_overflow_pke_to_recvwin when exp_msg_id = 16384, @@ -269,7 +269,7 @@ void test_efa_rdm_peer_append_overflow_pke_to_recvwin(struct efa_resource **stat struct efa_rdm_ep *efa_rdm_ep; int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); diff --git a/prov/efa/test/efa_unit_test_rma.c b/prov/efa/test/efa_unit_test_rma.c index cb42a8528fd..fd5818657ba 100644 --- a/prov/efa/test/efa_unit_test_rma.c +++ b/prov/efa/test/efa_unit_test_rma.c @@ -15,10 +15,11 @@ static void test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) size_t raw_addr_len = sizeof(raw_addr); int ret; - efa_unit_test_resource_construct(resource, FI_EP_RDM); - resource->ep->rma = &efa_rma_ops; + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_PROV_NAME); base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + 
/* Add rma caps explicitly to ep->info to allow local test */ + base_ep->info->caps |= FI_RMA; ibv_qpx = base_ep->qp->ibv_qp_ex; ibv_qpx->wr_start = &efa_mock_ibv_wr_start_no_op; /* this mock will save the send work request (wr) in a global list */ diff --git a/prov/efa/test/efa_unit_test_runt.c b/prov/efa/test/efa_unit_test_runt.c index ae09f0a1c0e..5a49d0775ac 100644 --- a/prov/efa/test/efa_unit_test_runt.c +++ b/prov/efa/test/efa_unit_test_runt.c @@ -61,7 +61,7 @@ void test_efa_rdm_peer_get_runt_size_no_enough_runt(struct efa_resource **state) size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1001; @@ -79,7 +79,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_smaller_than_alignment(struct e size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; @@ -97,7 +97,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_exceeding_total_len(struct efa_ size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 0; @@ -115,7 +115,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_normal(struct efa_resource **st size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 10000; @@ -135,7 +135,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_128_multiple_alignment(struct e size_t peer_num_runt_bytes_in_flight; size_t 
total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -158,7 +158,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_non_128_multiple_alignment(stru size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -181,7 +181,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_smaller_than_128_alignment(stru size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -202,7 +202,7 @@ void test_efa_rdm_peer_get_runt_size_cuda_memory_exceeding_total_len_128_alignme size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = 1; @@ -222,7 +222,7 @@ void test_efa_rdm_peer_get_runt_size_host_memory_smaller_than_alignment(struct e size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; @@ -240,7 +240,7 @@ void 
test_efa_rdm_peer_get_runt_size_host_memory_exceeding_total_len(struct efa_ size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 1111; peer_num_runt_bytes_in_flight = 0; @@ -258,7 +258,7 @@ void test_efa_rdm_peer_get_runt_size_host_memory_normal(struct efa_resource **st size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 10000; @@ -330,7 +330,7 @@ void test_efa_rdm_peer_select_readbase_rtm_no_runt(struct efa_resource **state) size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; @@ -347,7 +347,7 @@ void test_efa_rdm_peer_select_readbase_rtm_do_runt(struct efa_resource **state) size_t peer_num_runt_bytes_in_flight; size_t total_runt_size; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); msg_length = 12000; peer_num_runt_bytes_in_flight = 1000; diff --git a/prov/efa/test/efa_unit_test_send.c b/prov/efa/test/efa_unit_test_send.c index b3ed1a7873c..3b811e12222 100644 --- a/prov/efa/test/efa_unit_test_send.c +++ b/prov/efa/test/efa_unit_test_send.c @@ -20,7 +20,7 @@ void test_efa_rdm_msg_send_to_local_peer_with_null_desc(struct efa_resource **st struct fi_msg msg = {0}; struct fi_msg_tagged tmsg = {0}; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(ret, 0); diff --git a/prov/efa/test/efa_unit_test_srx.c 
b/prov/efa/test/efa_unit_test_srx.c index e0bff95169b..57ce6402b70 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -18,7 +18,7 @@ void test_efa_srx_min_multi_recv_size(struct efa_resource **state) struct util_srx_ctx *srx_ctx; size_t min_multi_recv_size_new; - efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* Set a new min_multi_recv_size via setopt*/ @@ -42,7 +42,7 @@ void test_efa_srx_cq(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct util_srx_ctx *srx_ctx; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); @@ -57,7 +57,7 @@ void test_efa_srx_lock(struct efa_resource **state) struct util_srx_ctx *srx_ctx; struct efa_domain *efa_domain; - efa_unit_test_resource_construct(resource, FI_EP_RDM); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_PROV_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 63316838a21..93991120fd4 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -118,6 +118,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_support_unsolicited_write_recv, 
efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_default_sizes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -201,7 +202,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_cq_post_initial_rx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_cntr_post_initial_rx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_post_initial_rx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_reorder_expected_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_reorder_smaller_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_reorder_larger_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -233,6 +234,16 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_cq_read_send_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), 
cmocka_unit_test_setup_teardown(test_efa_cq_read_recv_failure, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_open, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_getopt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_use_device_rdma, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_hmem_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_rnr_retry, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_setopt_sizes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_ep_bind_and_enable, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index a13033e6f8b..bfe0b4c0aee 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -22,6 +22,9 @@ extern struct efa_mock_ibv_send_wr_list g_ibv_send_wr_list; extern struct efa_unit_test_mocks g_efa_unit_test_mocks; extern struct efa_env efa_env; +#define EFA_DIRECT_PROV_NAME "efa-direct" +#define EFA_PROV_NAME "efa" + struct efa_resource { struct fi_info *hints; struct fi_info *info; @@ -33,17 +36,17 @@ struct efa_resource { struct fid_cq *cq; }; -struct fi_info 
*efa_unit_test_alloc_hints(enum fi_ep_type ep_type); +struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type, char *prov_name); -void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type); +void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name); void efa_unit_test_resource_construct_ep_not_enabled( - struct efa_resource *resource, enum fi_ep_type ep_type); + struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name); void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled( - struct efa_resource *resource, enum fi_ep_type ep_type); + struct efa_resource *resource, enum fi_ep_type ep_type, char *prov_name); void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, uint32_t fi_version, struct fi_info *hints, - bool enable_ep, bool open_cq); + bool enable_ep, bool open_cq, char *prov_name); void efa_unit_test_resource_construct_rdm_shm_disabled(struct efa_resource *resource); @@ -138,6 +141,7 @@ void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); void test_efa_rdm_ep_support_unsolicited_write_recv(); void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(); +void test_efa_rdm_ep_default_sizes(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll(); @@ -221,7 +225,7 @@ void test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); void test_efa_rdm_cq_post_initial_rx_pkts(); void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(); void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); -void test_efa_cntr_post_initial_rx_pkts(); +void test_efa_rdm_cntr_post_initial_rx_pkts(); void test_efa_rdm_peer_reorder_expected_msg_id(); void test_efa_rdm_peer_reorder_smaller_msg_id(); void test_efa_rdm_peer_reorder_larger_msg_id(); @@ -253,6 
+257,16 @@ void test_efa_cq_read_send_success(); void test_efa_cq_read_recv_success(); void test_efa_cq_read_send_failure(); void test_efa_cq_read_recv_failure(); +void test_efa_ep_open(); +void test_efa_ep_cancel(); +void test_efa_ep_getopt(); +void test_efa_ep_setopt_use_device_rdma(); +void test_efa_ep_setopt_hmem_p2p(); +void test_efa_ep_setopt_rnr_retry(); +void test_efa_ep_setopt_sizes(); +void test_efa_ep_bind_and_enable(); +void test_efa_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(); +void test_efa_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); static inline int efa_unit_test_get_dlist_length(struct dlist_entry *head) From 736d452a563596c2a894a6fbc468963415d0cb09 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 16 Jan 2025 00:40:56 +0000 Subject: [PATCH 366/393] prov/efa: Do infinite rnr retry for base ep by default Currently, efa_base_ep's default rnr_retry is 3 which only does a few retry in the firmware level for RNR. This is due to the efa_rdm_ep supports libfabric level RNR retry. However, the efa-direct ep doesn't support libfabric level RNR retry. Then we should make it do infinite RNR retry (7), which is also the default behavior of SRD QP. 
Signed-off-by: Shi Jin --- prov/efa/src/efa_base_ep.c | 3 ++- prov/efa/src/efa_base_ep.h | 20 ++++++++++++++++++++ prov/efa/src/efa_env.c | 1 - prov/efa/src/efa_env.h | 17 ----------------- prov/efa/src/rdm/efa_rdm_ep.h | 2 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 5 +++++ prov/efa/test/efa_unit_test_ep.c | 2 ++ 7 files changed, 30 insertions(+), 20 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 11cbe558454..52dae8a030d 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -350,7 +350,8 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, return -FI_ENOMEM; } - base_ep->rnr_retry = efa_env.rnr_retry; + /* This is SRD qp's default behavior */ + base_ep->rnr_retry = EFA_RNR_INFINITE_RETRY; base_ep->efa_recv_wr_vec = calloc(sizeof(struct efa_recv_wr), EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV); if (!base_ep->efa_recv_wr_vec) { diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 52901fcc9ec..cb1edea598b 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -16,6 +16,26 @@ #define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8 #define EFA_ERROR_MSG_BUFFER_LENGTH 1024 +/* Default rnr_retry for efa-rdm ep. + * If first attempt to send a packet failed, + * this value controls how many times firmware + * retries the send before it report an RNR error + * (via rdma-core error cq entry). + * The valid number is from + * 0 (no retry) + * to + * EFA_RNR_INFINITY_RETRY (retry infinitely) + */ +#define EFA_RDM_DEFAULT_RNR_RETRY (3) +/** + * Infinite retry. + * NOTICE: this is the default rnr_retry + * mode for SRD qp. 
So modifying qp_attr.rnr_retry + * to this value has the same behavior as + * not modifying qp's rnr_retry attribute + */ +#define EFA_RNR_INFINITE_RETRY (7) + #define efa_rx_flags(efa_base_ep) ((efa_base_ep)->util_ep.rx_op_flags) #define efa_tx_flags(efa_base_ep) ((efa_base_ep)->util_ep.tx_op_flags) diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index ef6eedd57ec..d35c1cc9bde 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -34,7 +34,6 @@ struct efa_env efa_env = { .efa_max_gdrcopy_msg_size = 32768, .efa_read_segment_size = 1073741824, .efa_write_segment_size = 1073741824, /* need to confirm this constant. */ - .rnr_retry = 3, /* Setting this value to EFA_RNR_INFINITE_RETRY makes the firmware retry indefinitey */ .host_id_file = "/sys/devices/virtual/dmi/id/board_asset_tag", /* Available on EC2 instances and containers */ .use_sm2 = false, .huge_page_setting = EFA_ENV_HUGE_PAGE_UNSPEC, diff --git a/prov/efa/src/efa_env.h b/prov/efa/src/efa_env.h index dbff4182292..16286bbd4bc 100644 --- a/prov/efa/src/efa_env.h +++ b/prov/efa/src/efa_env.h @@ -6,12 +6,6 @@ #include "efa_prov.h" -/** - * Setting ibv_qp_attr.rnr_retry to this number when modifying qp - * to cause firmware to retry indefinitely. - */ -#define EFA_RNR_INFINITE_RETRY 7 - enum efa_env_huge_page_setting { EFA_ENV_HUGE_PAGE_UNSPEC, /**< user did not set FI_EFA_USE_HUGE_PAGE, provider will decide whether to use huge page*/ @@ -48,17 +42,6 @@ struct efa_env { size_t efa_max_gdrcopy_msg_size; size_t efa_read_segment_size; size_t efa_write_segment_size; - /* If first attempt to send a packet failed, - * this value controls how many times firmware - * retries the send before it report an RNR error - * (via rdma-core error cq entry). - * - * The valid number is from - * 0 (no retry) - * to - * EFA_RNR_INFINITY_RETRY (retry infinitely) - */ - int rnr_retry; /** * The absolute path to a file that contains an EC2 instance id-like string. 
* If host_id_file is provided, the program will attempt to read the diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index fc298010249..d5f2e76d8ce 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -274,7 +274,7 @@ int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep); static inline bool efa_rdm_ep_should_write_rnr_completion(struct efa_rdm_ep *ep) { - return (efa_env.rnr_retry < EFA_RNR_INFINITE_RETRY) && + return (ep->base_ep.rnr_retry < EFA_RNR_INFINITE_RETRY) && (ep->handle_resource_management == FI_RM_DISABLED); } diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index f619ea21e49..ba0c6940f3d 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -487,6 +487,11 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->base_ep.max_rma_size = info->ep_attr->max_msg_size; efa_rdm_ep->base_ep.inject_msg_size = info->tx_attr->inject_size; efa_rdm_ep->base_ep.inject_rma_size = info->tx_attr->inject_size; + /* + * base ep is configured as infinite retry, use a different default + * for efa_rdm_ep to allow libfabric level retry. 
+ */ + efa_rdm_ep->base_ep.rnr_retry = EFA_RDM_DEFAULT_RNR_RETRY; /* efa_rdm_ep's own fields */ efa_rdm_ep->max_tagged_size = info->ep_attr->max_msg_size; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 5926e38e267..1c12b5913dd 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1412,6 +1412,7 @@ void test_efa_rdm_ep_default_sizes(struct efa_resource **state) assert_int_equal(efa_rdm_ep->base_ep.max_rma_size, resource->info->ep_attr->max_msg_size); assert_int_equal(efa_rdm_ep->base_ep.inject_msg_size, resource->info->tx_attr->inject_size); assert_int_equal(efa_rdm_ep->base_ep.inject_rma_size, resource->info->tx_attr->inject_size); + assert_int_equal(efa_rdm_ep->base_ep.rnr_retry, EFA_RDM_DEFAULT_RNR_RETRY); /* efa_rdm_ep's own fields */ assert_int_equal(efa_rdm_ep->max_tagged_size, resource->info->ep_attr->max_msg_size); @@ -1445,6 +1446,7 @@ void test_efa_ep_open(struct efa_resource **state) /* TODO: update inject_rma_size to inline size after firmware * supports inline rdma write */ assert_true(efa_ep->inject_rma_size == 0); + assert_int_equal(efa_ep->rnr_retry, EFA_RNR_INFINITE_RETRY); } /** From 56254add828811cf974bd46a9075c759a580b34b Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 17 Jan 2025 10:37:46 -0800 Subject: [PATCH 367/393] prov/efa: Remove x86-64 architecture check for static_assert This commit removes the x86-64 architecture check from the static_assert conditional compilation directive. The static_assert feature is not architecture-dependent and should be checked on all platforms that support it. 
Signed-off-by: Jessie Yang --- prov/efa/src/rdm/efa_rdm_pke.h | 2 +- prov/efa/src/rdm/efa_rdm_protocol.h | 2 +- prov/efa/src/rdm/efa_rdm_util.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_pke.h b/prov/efa/src/rdm/efa_rdm_pke.h index 223822ce595..3bd0e51390d 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.h +++ b/prov/efa/src/rdm/efa_rdm_pke.h @@ -195,7 +195,7 @@ struct efa_rdm_pke { _Alignas(EFA_RDM_PKE_ALIGNMENT) char wiredata[0]; }; -#if defined(static_assert) && defined(__x86_64__) +#if defined(static_assert) static_assert(sizeof (struct efa_rdm_pke) % EFA_RDM_PKE_ALIGNMENT == 0, "efa_rdm_pke alignment check"); #endif diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index 05fe40fd36a..975cbd44e94 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -104,7 +104,7 @@ #define EFA_RDM_RUNT_PKT_END 148 #define EFA_RDM_EXTRA_REQ_PKT_END 148 -#if defined(static_assert) && defined(__x86_64__) +#if defined(static_assert) #define EFA_RDM_ENSURE_HEADER_SIZE(hdr, size) \ static_assert(sizeof (struct hdr) == (size), #hdr " size check") #else diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index 7c3daa3432f..123fda9c59f 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -10,7 +10,7 @@ #define EFA_RDM_MSG_PREFIX_SIZE (sizeof(struct efa_rdm_pke) + sizeof(struct efa_rdm_eager_msgrtm_hdr) + EFA_RDM_REQ_OPT_RAW_ADDR_HDR_SIZE) -#if defined(static_assert) && defined(__x86_64__) +#if defined(static_assert) static_assert(EFA_RDM_MSG_PREFIX_SIZE % 8 == 0, "message prefix size alignment check"); #endif From 3d0412748bd8d11d8c7561718a18529f8f82e462 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 15 Jan 2025 13:27:09 -0800 Subject: [PATCH 368/393] prov/efa: Implement FI_CONTEXT2 in EFA Direct Store the completion flags and peer address in FI_CONTEXT2 and retrieve later when writing cq. 
Signed-off-by: Jessie Yang --- prov/efa/src/efa.h | 35 ++++++++++++++++++ prov/efa/src/efa_cq.c | 8 +++-- prov/efa/src/efa_msg.c | 6 ++-- prov/efa/src/efa_rma.c | 8 +++-- prov/efa/test/efa_unit_test_cq.c | 62 ++++++++++++++++++++++++++------ 5 files changed, 102 insertions(+), 17 deletions(-) diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index aef070fdc5f..5f1cf162c2b 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -107,6 +107,41 @@ struct efa_fabric { #endif }; +struct efa_context { + uint64_t completion_flags; + fi_addr_t addr; +}; + +#if defined(static_assert) +static_assert(sizeof(struct efa_context) <= sizeof(struct fi_context2), + "efa_context must not be larger than fi_context2"); +#endif + +/** + * Prepare and return a pointer to an EFA context structure. + * + * @param context Pointer to the msg context. + * @param addr Peer address associated with the operation. + * @param flags Operation flags (e.g., FI_COMPLETION). + * @param completion_flags Completion flags reported in the cq entry. + * @return A pointer to an initialized EFA context structure, + * or NULL if context is invalid or FI_COMPLETION is not set. 
+ */ +static inline struct efa_context *efa_fill_context(const void *context, + fi_addr_t addr, + uint64_t flags, + uint64_t completion_flags) +{ + if (!context || !(flags & FI_COMPLETION)) + return NULL; + + struct efa_context *efa_context = (struct efa_context *) context; + efa_context->completion_flags = completion_flags; + efa_context->addr = addr; + + return efa_context; +} + static inline int efa_str_to_ep_addr(const char *node, const char *service, struct efa_ep_addr *addr) { diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c index 1ca9416b618..d5bfdb2c949 100644 --- a/prov/efa/src/efa_cq.c +++ b/prov/efa/src/efa_cq.c @@ -35,7 +35,10 @@ static void efa_cq_construct_cq_entry(struct ibv_cq_ex *ibv_cqx, struct fi_cq_tagged_entry *entry) { entry->op_context = (void *)ibv_cqx->wr_id; - entry->flags = efa_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); + if (ibv_cqx->wr_id) + entry->flags = ((struct efa_context *) ibv_cqx->wr_id)->completion_flags; + else + entry->flags = efa_cq_opcode_to_fi_flags(ibv_wc_read_opcode(ibv_cqx)); entry->len = ibv_wc_read_byte_len(ibv_cqx); entry->buf = NULL; entry->data = 0; @@ -80,8 +83,7 @@ static void efa_cq_handle_error(struct efa_base_ep *base_ep, err_entry.prov_errno = prov_errno; if (is_tx) - // TODO: get correct peer addr for TX operation - addr = FI_ADDR_NOTAVAIL; + addr = ibv_cq_ex->wr_id ? ((struct efa_context *)ibv_cq_ex->wr_id)->addr : FI_ADDR_NOTAVAIL; else addr = efa_av_reverse_lookup(base_ep->av, ibv_wc_read_slid(ibv_cq_ex), diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index c2af757e112..5d5768c8ff1 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -101,7 +101,8 @@ static inline ssize_t efa_post_recv(struct efa_base_ep *base_ep, const struct fi wr = &base_ep->efa_recv_wr_vec[wr_index].wr; wr->num_sge = msg->iov_count; wr->sg_list = base_ep->efa_recv_wr_vec[wr_index].sge; - wr->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? 
msg->context : NULL); + wr->wr_id = (uintptr_t) efa_fill_context(msg->context, msg->addr, flags, + FI_RECV | FI_MSG); for (i = 0; i < msg->iov_count; i++) { addr = (uintptr_t)msg->msg_iov[i].iov_base; @@ -224,7 +225,8 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi base_ep->is_wr_started = true; } - qp->ibv_qp_ex->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? msg->context : NULL); + qp->ibv_qp_ex->wr_id = (uintptr_t) efa_fill_context( + msg->context, msg->addr, flags, FI_SEND | FI_MSG); if (flags & FI_REMOTE_CQ_DATA) { ibv_wr_send_imm(qp->ibv_qp_ex, msg->data); diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index 8fee3a2021b..da33b44350f 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -90,7 +90,9 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; } - qp->ibv_qp_ex->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? msg->context : NULL); + + qp->ibv_qp_ex->wr_id = (uintptr_t) efa_fill_context( + msg->context, msg->addr, flags, FI_RMA | FI_READ); /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr); @@ -225,7 +227,9 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, ibv_wr_start(qp->ibv_qp_ex); base_ep->is_wr_started = true; } - qp->ibv_qp_ex->wr_id = (uintptr_t) ((flags & FI_COMPLETION) ? 
msg->context : NULL); + + qp->ibv_qp_ex->wr_id = (uintptr_t) efa_fill_context( + msg->context, msg->addr, flags, FI_RMA | FI_WRITE); if (flags & FI_REMOTE_CQ_DATA) { ibv_wr_rdma_write_imm(qp->ibv_qp_ex, msg->rma_iov[0].key, diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 795aa7b8066..82dcd38952f 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -811,7 +811,8 @@ void test_ibv_cq_ex_read_ignore_removed_peer() #endif static void test_efa_cq_read(struct efa_resource *resource, fi_addr_t *addr, - int ibv_wc_opcode, int status, int vendor_error) + int ibv_wc_opcode, int status, int vendor_error, + struct efa_context *ctx) { int ret; size_t raw_addr_len = sizeof(struct efa_ep_addr); @@ -845,7 +846,7 @@ static void test_efa_cq_read(struct efa_resource *resource, fi_addr_t *addr, if (ibv_wc_opcode == IBV_WC_RECV) { ibv_cqx = container_of(base_ep->util_ep.rx_cq, struct efa_cq, util_cq)->ibv_cq.ibv_cq_ex; ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; - ibv_cqx->wr_id = (uintptr_t)12345; + ctx->completion_flags = FI_RECV | FI_MSG; will_return(efa_mock_ibv_start_poll_return_mock, 0); ibv_cqx->status = status; } else { @@ -853,8 +854,11 @@ static void test_efa_cq_read(struct efa_resource *resource, fi_addr_t *addr, /* this mock will set ibv_cq_ex->wr_id to the wr_id of the head of global send_wr, * and set ibv_cq_ex->status to mock value */ ibv_cqx->start_poll = &efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status; + ctx->completion_flags = FI_SEND | FI_MSG; will_return(efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status, status); } + ctx->addr = *addr; + ibv_cqx->wr_id = (uintptr_t) ctx; ibv_cqx->next_poll = &efa_mock_ibv_next_poll_return_mock; ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; @@ -892,19 +896,29 @@ void test_efa_cq_read_send_success(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_unit_test_buff send_buff; + struct 
efa_base_ep *base_ep; + struct efa_context *efa_context; + struct fi_context2 ctx; struct fi_cq_data_entry cq_entry; fi_addr_t addr; int ret; - test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_SUCCESS, 0); + test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_SUCCESS, 0, + (struct efa_context *) &ctx); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_send(resource->ep, send_buff.buff, send_buff.size, - fi_mr_desc(send_buff.mr), addr, (void *) 12345); + fi_mr_desc(send_buff.mr), addr, &ctx); assert_int_equal(ret, 0); assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + efa_context = (struct efa_context *) base_ep->qp->ibv_qp_ex->wr_id; + assert_true(efa_context->completion_flags & FI_SEND); + assert_true(efa_context->completion_flags & FI_MSG); + assert_true(efa_context->addr == addr); + ret = fi_cq_read(resource->cq, &cq_entry, 1); /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */ assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); @@ -921,17 +935,27 @@ void test_efa_cq_read_recv_success(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_unit_test_buff recv_buff; + struct efa_base_ep *base_ep; + struct efa_context *efa_context; struct fi_cq_data_entry cq_entry; + struct fi_context2 ctx; fi_addr_t addr; int ret; - test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_SUCCESS, 0); + test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_SUCCESS, 0, + (struct efa_context *) &ctx); efa_unit_test_buff_construct(&recv_buff, resource, 4096 /* buff_size */); ret = fi_recv(resource->ep, recv_buff.buff, recv_buff.size, - fi_mr_desc(recv_buff.mr), addr, NULL); + fi_mr_desc(recv_buff.mr), addr, &ctx); assert_int_equal(ret, 0); + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + efa_context 
= (struct efa_context *) base_ep->efa_recv_wr_vec[base_ep->recv_wr_index].wr.wr_id; + assert_true(efa_context->completion_flags & FI_RECV); + assert_true(efa_context->completion_flags & FI_MSG); + assert_true(efa_context->addr == addr); + ret = fi_cq_read(resource->cq, &cq_entry, 1); assert_int_equal(ret, 1); @@ -971,20 +995,29 @@ void test_efa_cq_read_send_failure(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_unit_test_buff send_buff; + struct efa_base_ep *base_ep; + struct efa_context *efa_context; struct fi_cq_data_entry cq_entry; + struct fi_context2 ctx; fi_addr_t addr; int ret; test_efa_cq_read(resource, &addr, IBV_WC_SEND, IBV_WC_GENERAL_ERR, - EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE, (struct efa_context *) &ctx); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_send(resource->ep, send_buff.buff, send_buff.size, - fi_mr_desc(send_buff.mr), addr, (void *) 12345); + fi_mr_desc(send_buff.mr), addr, &ctx); assert_int_equal(ret, 0); assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + efa_context = (struct efa_context *) base_ep->qp->ibv_qp_ex->wr_id; + assert_true(efa_context->completion_flags & FI_SEND); + assert_true(efa_context->completion_flags & FI_MSG); + assert_true(efa_context->addr == addr); + ret = fi_cq_read(resource->cq, &cq_entry, 1); /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */ assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); @@ -1008,18 +1041,27 @@ void test_efa_cq_read_recv_failure(struct efa_resource **state) { struct efa_resource *resource = *state; struct efa_unit_test_buff recv_buff; + struct efa_base_ep *base_ep; + struct efa_context *efa_context; struct fi_cq_data_entry cq_entry; + struct fi_context2 ctx; fi_addr_t 
addr; int ret; test_efa_cq_read(resource, &addr, IBV_WC_RECV, IBV_WC_GENERAL_ERR, - EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE, (struct efa_context *) &ctx); efa_unit_test_buff_construct(&recv_buff, resource, 4096 /* buff_size */); ret = fi_recv(resource->ep, recv_buff.buff, recv_buff.size, - fi_mr_desc(recv_buff.mr), addr, NULL); + fi_mr_desc(recv_buff.mr), addr, &ctx); assert_int_equal(ret, 0); + base_ep = container_of(resource->ep, struct efa_base_ep, util_ep.ep_fid); + efa_context = (struct efa_context *) base_ep->efa_recv_wr_vec[base_ep->recv_wr_index].wr.wr_id; + assert_true(efa_context->completion_flags & FI_RECV); + assert_true(efa_context->completion_flags & FI_MSG); + assert_true(efa_context->addr == addr); + ret = fi_cq_read(resource->cq, &cq_entry, 1); assert_int_equal(ret, -FI_EAVAIL); From d1fd795c8ead8806202b460abc6cdcd85137d22a Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Wed, 22 Jan 2025 11:57:41 -0500 Subject: [PATCH 369/393] contrib/aws: Reduce nccl test iteration count Signed-off-by: Sai Sunku --- contrib/aws/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile index 39a54e9219f..252b60fadc3 100644 --- a/contrib/aws/Jenkinsfile +++ b/contrib/aws/Jenkinsfile @@ -244,7 +244,7 @@ pipeline { stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp) - stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, 
"us-east-1", g4dn12x_lock_label, addl_args_tcp + " --test-list test_nccl_tests") + stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", g4dn12x_lock_label, addl_args_tcp + " --test-list test_nccl_tests --test-iterations fastest") // Multi Node Tests - SOCKETS stages["2_c6g_alinux2_sockets"] = get_test_stage_with_lock("2_c6g_alinux2_sockets", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_sockets) From dedbbfc69ccb0f29c9e6e214a20e813893d65dde Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Fri, 13 Dec 2024 13:35:21 -0600 Subject: [PATCH 370/393] prov/util: Separate uffd and import mem monitors Other memory monitors, such as CUDA, ROCR, and ZE, have a .c file for the implementation. This change cleans up the util_mem_monitor.c code by defining a uffd and import .c file, thus aligning to other memory monitor implementations. Signed-off-by: Mike Uttormark Signed-off-by: Ian Ziemba --- Makefile.am | 2 + include/ofi_mr.h | 5 +- libfabric.vcxproj | 2 + prov/util/src/import_mem_monitor.c | 195 ++++++++++ prov/util/src/uffd_mem_monitor.c | 450 +++++++++++++++++++++++ prov/util/src/util_mem_monitor.c | 566 ----------------------------- 6 files changed, 653 insertions(+), 567 deletions(-) create mode 100644 prov/util/src/import_mem_monitor.c create mode 100644 prov/util/src/uffd_mem_monitor.c diff --git a/Makefile.am b/Makefile.am index 359669202dc..f91f3d1f265 100644 --- a/Makefile.am +++ b/Makefile.am @@ -92,6 +92,8 @@ common_srcs = \ prov/util/src/ze_ipc_monitor.c \ prov/util/src/xpmem_monitor.c \ prov/util/src/kdreg2_mem_monitor.c \ + prov/util/src/uffd_mem_monitor.c \ + prov/util/src/import_mem_monitor.c \ prov/util/src/util_profile.c \ prov/coll/src/coll_attr.c \ prov/coll/src/coll_av.c \ diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 12383413110..b0556eee019 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -118,9 +118,12 @@ static 
inline uint64_t ofi_mr_get_prov_mode(uint32_t version, } } - /* Single lock used by all memory monitors and MR caches. */ extern pthread_mutex_t mm_lock; + +/* Lock used to coordinate monitor states. */ +extern pthread_mutex_t mm_state_lock; + /* The read-write lock is an additional lock used to protect the dlist_entry * list of ofi_mem_monitor. Due to the necessity of releasing the mm_lock * while walking the dlist in ofi_monitor_notify, we need a separate lock to diff --git a/libfabric.vcxproj b/libfabric.vcxproj index f3f3c5e5dc9..2921b8316ca 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -760,6 +760,8 @@ + + diff --git a/prov/util/src/import_mem_monitor.c b/prov/util/src/import_mem_monitor.c new file mode 100644 index 00000000000..e7be581526f --- /dev/null +++ b/prov/util/src/import_mem_monitor.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2017 Cray Inc. All rights reserved. + * Copyright (c) 2017-2021 Intel Inc. All rights reserved. + * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP + * Copyright (C) 2024 Cornelis Networks. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor); +static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor); +static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor); +static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor); +static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry); + +struct ofi_import_monitor { + struct ofi_mem_monitor monitor; + struct fid_mem_monitor *impfid; +}; + +static struct ofi_import_monitor impmon = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_import_monitor_init, + .monitor.cleanup = ofi_import_monitor_cleanup, + .monitor.start = ofi_import_monitor_start, + .monitor.stop = ofi_import_monitor_stop, + .monitor.subscribe = ofi_import_monitor_subscribe, + .monitor.unsubscribe = ofi_import_monitor_unsubscribe, + .monitor.valid = 
ofi_import_monitor_valid, + .monitor.name = "import", +}; + +struct ofi_mem_monitor *import_monitor = &impmon.monitor; + +static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor) +{ + ofi_monitor_init(monitor); +} + +static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor) +{ + assert(!impmon.impfid); + ofi_monitor_cleanup(monitor); +} + +static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor) +{ + if (!impmon.impfid) + return -FI_ENOSYS; + + return impmon.impfid->export_ops->start(impmon.impfid); +} + +static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor) +{ + assert(impmon.impfid); + impmon.impfid->export_ops->stop(impmon.impfid); +} + +static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + assert(impmon.impfid); + return impmon.impfid->export_ops->subscribe(impmon.impfid, addr, len); +} + +static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + assert(impmon.impfid); + impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len); +} + +static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + assert(impmon.impfid); + return impmon.impfid->export_ops->valid(impmon.impfid, + entry->info.iov.iov_base, + entry->info.iov.iov_len); +} + +static void ofi_import_monitor_notify(struct fid_mem_monitor *monitor, + const void *addr, size_t len) +{ + assert(monitor->fid.context == &impmon); + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + ofi_monitor_notify(&impmon.monitor, addr, len); + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); +} + +static int ofi_close_import(struct fid *fid) +{ + pthread_mutex_lock(&mm_state_lock); + impmon.monitor.state = FI_MM_STATE_IDLE; + 
pthread_mutex_unlock(&mm_state_lock); + impmon.impfid = NULL; + return 0; +} + +static struct fi_ops_mem_notify import_ops = { + .size = sizeof(struct fi_ops_mem_notify), + .notify = ofi_import_monitor_notify, +}; + +static struct fi_ops impfid_ops = { + .size = sizeof(struct fi_ops), + .close = ofi_close_import, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, + .tostr = fi_no_tostr, + .ops_set = fi_no_ops_set, +}; + +int ofi_monitor_import(struct fid *fid) +{ + struct fid_mem_monitor *impfid; + + if (fid->fclass != FI_CLASS_MEM_MONITOR) + return -FI_ENOSYS; + + if (impmon.impfid) { + FI_WARN(&core_prov, FI_LOG_MR, + "imported monitor already exists\n"); + return -FI_EBUSY; + } + + if (default_monitor && !dlist_empty(&default_monitor->list)) { + FI_WARN(&core_prov, FI_LOG_MR, + "cannot replace active monitor\n"); + return -FI_EBUSY; + } + + impfid = container_of(fid, struct fid_mem_monitor, fid); + if (impfid->export_ops->size < sizeof(struct fi_ops_mem_monitor)) + return -FI_EINVAL; + + impmon.impfid = impfid; + impfid->fid.context = &impmon; + impfid->fid.ops = &impfid_ops; + impfid->import_ops = &import_ops; + + FI_INFO(&core_prov, FI_LOG_MR, + "setting imported memory monitor as default\n"); + default_monitor = &impmon.monitor; + return 0; +} diff --git a/prov/util/src/uffd_mem_monitor.c b/prov/util/src/uffd_mem_monitor.c new file mode 100644 index 00000000000..c06b49178a1 --- /dev/null +++ b/prov/util/src/uffd_mem_monitor.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2017 Cray Inc. All rights reserved. + * Copyright (c) 2017-2021 Intel Inc. All rights reserved. + * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * (C) Copyright 2020,2024 Hewlett Packard Enterprise Development LP + * Copyright (C) 2024 Cornelis Networks. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include +#include + +#ifndef UFFD_USER_MODE_ONLY +#define UFFD_USER_MODE_ONLY 0 +#endif + +static int ofi_uffd_start(struct ofi_mem_monitor *monitor); +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor); + +static struct ofi_uffd uffd = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = ofi_uffd_start, + .monitor.stop = ofi_uffd_stop, + .monitor.name = "uffd", + .fd = -1, + .exit_pipe = { -1, -1 }, +}; +struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; + +#if HAVE_UFFD_MONITOR + +#include +#include +#include +#include + +static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); + +/* The userfault fd monitor requires for events that could + * trigger it to be handled outside of the monitor functions + * itself. When a fault occurs on a monitored region, the + * faulting thread is put to sleep until the event is read + * via the userfault file descriptor. If this fault occurs + * within the userfault handling thread, no threads will + * read this event and our threads cannot progress, resulting + * in a hang. 
+ */ +static void *ofi_uffd_handler(void *arg) +{ + struct uffd_msg msg; + struct pollfd fds[2]; + int ret; + + fds[0].fd = uffd.fd; + fds[0].events = POLLIN; + fds[1].fd = uffd.exit_pipe[0]; + fds[1].events = POLLIN; + + for (;;) { + ret = poll(fds, 2, -1); + if (ret < 0 || fds[1].revents) + break; + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + ret = read(uffd.fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + if (errno != EAGAIN) + break; + continue; + } + + FI_DBG(&core_prov, FI_LOG_MR, "Received UFFD event %d\n", msg.event); + + switch (msg.event) { + case UFFD_EVENT_REMOVE: + ofi_monitor_unsubscribe(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remove.start, + (size_t) (msg.arg.remove.end - + msg.arg.remove.start), NULL); + /* fall through */ + case UFFD_EVENT_UNMAP: + ofi_monitor_notify(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remove.start, + (size_t) (msg.arg.remove.end - + msg.arg.remove.start)); + break; + case UFFD_EVENT_REMAP: + ofi_monitor_notify(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remap.from, + (size_t) msg.arg.remap.len); + break; + case UFFD_EVENT_PAGEFAULT: + ofi_uffd_pagefault_handler(&msg); + break; + default: + FI_WARN(&core_prov, FI_LOG_MR, + "Unhandled uffd event %d\n", msg.event); + break; + } + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + } + return NULL; +} + +static void ofi_uffd_pagefault_handler(struct uffd_msg *msg) +{ + struct uffdio_zeropage zp; + int i; + int ret; + void * const address = (void *) (uintptr_t) msg->arg.pagefault.address; + uint64_t const flags = (uint64_t) msg->arg.pagefault.flags; +#if HAVE_UFFD_THREAD_ID + uint32_t const ptid = (uint32_t) msg->arg.pagefault.feat.ptid; +#endif + /* ofi_uffd_register sets the mode to + * UFFDIO_REGISTER_MODE_MISSING. As a result, we can + * get read, write or write-protect notifications via + * UFFD_EVENT_PAGEFAULT. 
The only ones we can sensibly + * handle are writes to non-backed pages. + * (Read and write-protect notifications are likely + * application bugs.) + */ + + if (flags != UFFD_PAGEFAULT_FLAG_WRITE) { +#if HAVE_UFFD_THREAD_ID + FI_WARN(&core_prov, FI_LOG_MR, + "UFFD pagefault with unrecognized flags: %lu, address %p, thread %u\n", + flags, address, ptid); +#else + FI_WARN(&core_prov, FI_LOG_MR, + "UFFD pagefault with unrecognized flags: %lu, address %p\n", + flags, address); +#endif + /* The faulting thread is halted at this point. In + * theory we could wake it up with UFFDIO_WAKE. In + * practice that requires the address range of the + * fault, information we don't have from the + * pagefault event. + */ + + return; + } + + /* The event tells us the address of the fault + * (which can be anywhere on the page). It does not + * tell us the size of the page so we have to guess + * from the list of known page_sizes. + * + * We employ the standard resolution: install a zeroed page. + */ + + for (i = 0; i < num_page_sizes; ) { + /* setup a zeropage reqest for this pagesize */ + zp.range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(address, page_sizes[i]); + zp.range.len = (uint64_t) page_sizes[i]; + zp.mode = 0; + zp.zeropage = 0; + + ret = ioctl(uffd.fd, UFFDIO_ZEROPAGE, &zp); + + if (ret == 0) /* success */ + return; + + /* Note: the documentation (man ioctl_userfaultfd) says + * that the ioctl() returns -1 on error and errno is set + * to indicate the error. It also says that the zeropage + * member of struct uffdio_zeropage is set to the negated + * error. The unit tests for uffd say + * real retval in uffdio_zeropage.zeropage + * so that's what we use here. + */ + + if (zp.zeropage == -EAGAIN) + /* This is a tough case. If the memory map is + * changing, the kernel returns EAGAIN before + * installing the zeroed page. So the page + * fault has not been rectified. If we don't try + * again, the application will crash. 
If we add + * a maximum retry count we could still end up + * with an unresolved page fault. + * + * It's likely a kernel bug or (something else + * bad like OOM) if it returns EAGAIN forever. + * So we retry until we get something besides + * EAGAIN. + */ + continue; /* retry this page size */ + + i++; /* try next page size */ + + if (zp.zeropage == -EINVAL) /* wrong page size */ + continue; + + /* If we get here we failed to install the zeroed + * page for this page size and it wasn't a size error. + * We could either stop trying or go on to the + * next pagesize. We choose to print a message and try + * another page size. + */ + + FI_DBG(&core_prov, FI_LOG_MR, + "Unable to install zeroed page of size %zu to handle page fault." + " address = %p zeropage = %lld errno = %d\n", + page_sizes[i], address, zp.zeropage, errno); + } + + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to handle event UFFD_EVENT_PAGEFAULT for address %p.\n", + address); +} + +static int ofi_uffd_register(const void *addr, size_t len, size_t page_size) +{ + struct uffdio_register reg; + int ret; + + reg.range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(addr, page_size); + reg.range.len = ofi_get_page_bytes(addr, len, page_size); + reg.mode = UFFDIO_REGISTER_MODE_MISSING; + ret = ioctl(uffd.fd, UFFDIO_REGISTER, ®); + if (ret < 0) { + if (errno != EINVAL) { + FI_WARN(&core_prov, FI_LOG_MR, + "ioctl/uffd_reg: %s\n", strerror(errno)); + } + return -errno; + } + return 0; +} + +static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + int i; + + assert(monitor == &uffd.monitor); + for (i = 0; i < num_page_sizes; i++) { + if (!ofi_uffd_register(addr, len, page_sizes[i])) + return 0; + } + return -FI_EFAULT; +} + +static int ofi_uffd_unregister(const void *addr, size_t len, size_t page_size) +{ + struct uffdio_range range; + int ret; + + range.start = (uint64_t) (uintptr_t) + ofi_get_page_start(addr, page_size); + 
range.len = ofi_get_page_bytes(addr, len, page_size); + ret = ioctl(uffd.fd, UFFDIO_UNREGISTER, &range); + if (ret < 0) { + if (errno != EINVAL) { + FI_WARN(&core_prov, FI_LOG_MR, + "ioctl/uffd_unreg: %s\n", strerror(errno)); + } + return -errno; + } + return 0; +} + +/* May be called from mr cache notifier callback */ +static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + int i; + + assert(monitor == &uffd.monitor); + for (i = 0; i < num_page_sizes; i++) { + if (!ofi_uffd_unregister(addr, len, page_sizes[i])) + break; + } +} + +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + /* no-op */ + return true; +} + +static void ofi_uffd_close_fd(struct ofi_uffd *monitor) +{ + close(monitor->fd); + monitor->fd = -1; +} + +static void ofi_uffd_close_pipe(struct ofi_uffd *monitor) +{ + close(monitor->exit_pipe[0]); + close(monitor->exit_pipe[1]); + monitor->exit_pipe[0] = -1; + monitor->exit_pipe[1] = -1; +} + +static int ofi_uffd_start(struct ofi_mem_monitor *monitor) +{ + struct uffdio_api api; + int ret; + + if (uffd.fd >= 0) + return 0; + + if (!num_page_sizes) + return -FI_ENODATA; + + ret = pipe(uffd.exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/pipe: %s\n", strerror(errno)); + return -errno; + } + + uffd.fd = syscall(__NR_userfaultfd, + O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); + if (uffd.fd < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "syscall/userfaultfd %s\n", strerror(errno)); + ret = -errno; + goto close_pipe; + } + + api.api = UFFD_API; + api.features = UFFD_FEATURE_EVENT_UNMAP | UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_EVENT_REMAP; + ret = ioctl(uffd.fd, UFFDIO_API, &api); + if (ret < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "ioctl/uffdio: %s\n", strerror(errno)); + ret = -errno; + goto close_fd; + } + + if (api.api != UFFD_API) { + FI_WARN(&core_prov, FI_LOG_MR, "uffd 
features not supported\n"); + ret = -FI_ENOSYS; + goto close_fd; + } + + ret = pthread_create(&uffd.thread, NULL, ofi_uffd_handler, &uffd); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "failed to create handler thread %s\n", strerror(ret)); + ret = -ret; + goto close_fd; + } + + uffd.monitor.subscribe = ofi_uffd_subscribe; + uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; + uffd.monitor.valid = ofi_uffd_valid; + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd started.\n"); + + return 0; + +close_fd: + + ofi_uffd_close_fd(&uffd); + +close_pipe: + + ofi_uffd_close_pipe(&uffd); + + FI_WARN(&core_prov, FI_LOG_MR, + "Memory monitor uffd failed to start: %s.\n", + strerror(-ret)); + + return ret; +} + +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) +{ + ssize_t num_written; + + if (uffd.fd < 0) + return; + + /* tell the thread to exit with the exit_pipe */ + + num_written = write(uffd.exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/close: unable to write to exit pipe: %s", + strerror(errno)); + } + + pthread_join(uffd.thread, NULL); + + ofi_uffd_close_fd(&uffd); + ofi_uffd_close_pipe(&uffd); + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd stopped.\n"); +} + +#else /* HAVE_UFFD_MONITOR */ + +static int ofi_uffd_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) +{ +} + +#endif /* HAVE_UFFD_MONITOR */ diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 2b6cc9b0ed5..f0e094dc6e4 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -39,33 +39,11 @@ #include #include -#include -#include -#include - -#ifndef UFFD_USER_MODE_ONLY -#define UFFD_USER_MODE_ONLY 0 -#endif pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t mm_state_lock = PTHREAD_MUTEX_INITIALIZER; pthread_rwlock_t mm_list_rwlock = PTHREAD_RWLOCK_INITIALIZER; -static int 
ofi_uffd_start(struct ofi_mem_monitor *monitor); -static void ofi_uffd_stop(struct ofi_mem_monitor *monitor); - -static struct ofi_uffd uffd = { - .monitor.iface = FI_HMEM_SYSTEM, - .monitor.init = ofi_monitor_init, - .monitor.cleanup = ofi_monitor_cleanup, - .monitor.start = ofi_uffd_start, - .monitor.stop = ofi_uffd_stop, - .monitor.name = "uffd", - .fd = -1, - .exit_pipe = { -1, -1 }, -}; -struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; - struct ofi_mem_monitor *default_monitor; struct ofi_mem_monitor *default_cuda_monitor; struct ofi_mem_monitor *default_rocr_monitor; @@ -574,547 +552,3 @@ void ofi_monitor_unsubscribe_no_op(struct ofi_mem_monitor *notifier, union ofi_mr_hmem_info *hmem_info) { } - -#if HAVE_UFFD_MONITOR - -#include -#include -#include -#include - -static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); - -/* The userfault fd monitor requires for events that could - * trigger it to be handled outside of the monitor functions - * itself. When a fault occurs on a monitored region, the - * faulting thread is put to sleep until the event is read - * via the userfault file descriptor. If this fault occurs - * within the userfault handling thread, no threads will - * read this event and our threads cannot progress, resulting - * in a hang. 
- */ -static void *ofi_uffd_handler(void *arg) -{ - struct uffd_msg msg; - struct pollfd fds[2]; - int ret; - - fds[0].fd = uffd.fd; - fds[0].events = POLLIN; - fds[1].fd = uffd.exit_pipe[0]; - fds[1].events = POLLIN; - - for (;;) { - ret = poll(fds, 2, -1); - if (ret < 0 || fds[1].revents) - break; - - pthread_rwlock_rdlock(&mm_list_rwlock); - pthread_mutex_lock(&mm_lock); - ret = read(uffd.fd, &msg, sizeof(msg)); - if (ret != sizeof(msg)) { - pthread_mutex_unlock(&mm_lock); - pthread_rwlock_unlock(&mm_list_rwlock); - if (errno != EAGAIN) - break; - continue; - } - - FI_DBG(&core_prov, FI_LOG_MR, "Received UFFD event %d\n", msg.event); - - switch (msg.event) { - case UFFD_EVENT_REMOVE: - ofi_monitor_unsubscribe(&uffd.monitor, - (void *) (uintptr_t) msg.arg.remove.start, - (size_t) (msg.arg.remove.end - - msg.arg.remove.start), NULL); - /* fall through */ - case UFFD_EVENT_UNMAP: - ofi_monitor_notify(&uffd.monitor, - (void *) (uintptr_t) msg.arg.remove.start, - (size_t) (msg.arg.remove.end - - msg.arg.remove.start)); - break; - case UFFD_EVENT_REMAP: - ofi_monitor_notify(&uffd.monitor, - (void *) (uintptr_t) msg.arg.remap.from, - (size_t) msg.arg.remap.len); - break; - case UFFD_EVENT_PAGEFAULT: - ofi_uffd_pagefault_handler(&msg); - break; - default: - FI_WARN(&core_prov, FI_LOG_MR, - "Unhandled uffd event %d\n", msg.event); - break; - } - pthread_mutex_unlock(&mm_lock); - pthread_rwlock_unlock(&mm_list_rwlock); - } - return NULL; -} - -static void ofi_uffd_pagefault_handler(struct uffd_msg *msg) -{ - struct uffdio_zeropage zp; - int i; - int ret; - void * const address = (void *) (uintptr_t) msg->arg.pagefault.address; - uint64_t const flags = (uint64_t) msg->arg.pagefault.flags; -#if HAVE_UFFD_THREAD_ID - uint32_t const ptid = (uint32_t) msg->arg.pagefault.feat.ptid; -#endif - /* ofi_uffd_register sets the mode to - * UFFDIO_REGISTER_MODE_MISSING. As a result, we can - * get read, write or write-protect notifications via - * UFFD_EVENT_PAGEFAULT. 
The only ones we can sensibly - * handle are writes to non-backed pages. - * (Read and write-protect notifications are likely - * application bugs.) - */ - - if (flags != UFFD_PAGEFAULT_FLAG_WRITE) { -#if HAVE_UFFD_THREAD_ID - FI_WARN(&core_prov, FI_LOG_MR, - "UFFD pagefault with unrecognized flags: %lu, address %p, thread %u\n", - flags, address, ptid); -#else - FI_WARN(&core_prov, FI_LOG_MR, - "UFFD pagefault with unrecognized flags: %lu, address %p\n", - flags, address); -#endif - /* The faulting thread is halted at this point. In - * theory we could wake it up with UFFDIO_WAKE. In - * practice that requires the address range of the - * fault, information we don't have from the - * pagefault event. - */ - - return; - } - - /* The event tells us the address of the fault - * (which can be anywhere on the page). It does not - * tell us the size of the page so we have to guess - * from the list of known page_sizes. - * - * We employ the standard resolution: install a zeroed page. - */ - - for (i = 0; i < num_page_sizes; ) { - /* setup a zeropage reqest for this pagesize */ - zp.range.start = (uint64_t) (uintptr_t) - ofi_get_page_start(address, page_sizes[i]); - zp.range.len = (uint64_t) page_sizes[i]; - zp.mode = 0; - zp.zeropage = 0; - - ret = ioctl(uffd.fd, UFFDIO_ZEROPAGE, &zp); - - if (ret == 0) /* success */ - return; - - /* Note: the documentation (man ioctl_userfaultfd) says - * that the ioctl() returns -1 on error and errno is set - * to indicate the error. It also says that the zeropage - * member of struct uffdio_zeropage is set to the negated - * error. The unit tests for uffd say - * real retval in uffdio_zeropage.zeropage - * so that's what we use here. - */ - - if (zp.zeropage == -EAGAIN) - /* This is a tough case. If the memory map is - * changing, the kernel returns EAGAIN before - * installing the zeroed page. So the page - * fault has not been rectified. If we don't try - * again, the application will crash. 
If we add - * a maximum retry count we could still end up - * with an unresolved page fault. - * - * It's likely a kernel bug or (something else - * bad like OOM) if it returns EAGAIN forever. - * So we retry until we get something besides - * EAGAIN. - */ - continue; /* retry this page size */ - - i++; /* try next page size */ - - if (zp.zeropage == -EINVAL) /* wrong page size */ - continue; - - /* If we get here we failed to install the zeroed - * page for this page size and it wasn't a size error. - * We could either stop trying or go on to the - * next pagesize. We choose to print a message and try - * another page size. - */ - - FI_DBG(&core_prov, FI_LOG_MR, - "Unable to install zeroed page of size %zu to handle page fault." - " address = %p zeropage = %lld errno = %d\n", - page_sizes[i], address, zp.zeropage, errno); - } - - FI_WARN(&core_prov, FI_LOG_MR, - "Unable to handle event UFFD_EVENT_PAGEFAULT for address %p.\n", - address); -} - -static int ofi_uffd_register(const void *addr, size_t len, size_t page_size) -{ - struct uffdio_register reg; - int ret; - - reg.range.start = (uint64_t) (uintptr_t) - ofi_get_page_start(addr, page_size); - reg.range.len = ofi_get_page_bytes(addr, len, page_size); - reg.mode = UFFDIO_REGISTER_MODE_MISSING; - ret = ioctl(uffd.fd, UFFDIO_REGISTER, ®); - if (ret < 0) { - if (errno != EINVAL) { - FI_WARN(&core_prov, FI_LOG_MR, - "ioctl/uffd_reg: %s\n", strerror(errno)); - } - return -errno; - } - return 0; -} - -static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - int i; - - assert(monitor == &uffd.monitor); - for (i = 0; i < num_page_sizes; i++) { - if (!ofi_uffd_register(addr, len, page_sizes[i])) - return 0; - } - return -FI_EFAULT; -} - -static int ofi_uffd_unregister(const void *addr, size_t len, size_t page_size) -{ - struct uffdio_range range; - int ret; - - range.start = (uint64_t) (uintptr_t) - ofi_get_page_start(addr, page_size); - 
range.len = ofi_get_page_bytes(addr, len, page_size); - ret = ioctl(uffd.fd, UFFDIO_UNREGISTER, &range); - if (ret < 0) { - if (errno != EINVAL) { - FI_WARN(&core_prov, FI_LOG_MR, - "ioctl/uffd_unreg: %s\n", strerror(errno)); - } - return -errno; - } - return 0; -} - -/* May be called from mr cache notifier callback */ -static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - int i; - - assert(monitor == &uffd.monitor); - for (i = 0; i < num_page_sizes; i++) { - if (!ofi_uffd_unregister(addr, len, page_sizes[i])) - break; - } -} - -static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, - const struct ofi_mr_info *info, - struct ofi_mr_entry *entry) -{ - /* no-op */ - return true; -} - -static void ofi_uffd_close_fd(struct ofi_uffd *monitor) -{ - close(monitor->fd); - monitor->fd = -1; -} - -static void ofi_uffd_close_pipe(struct ofi_uffd *monitor) -{ - close(monitor->exit_pipe[0]); - close(monitor->exit_pipe[1]); - monitor->exit_pipe[0] = -1; - monitor->exit_pipe[1] = -1; -} - -static int ofi_uffd_start(struct ofi_mem_monitor *monitor) -{ - struct uffdio_api api; - int ret; - - if (uffd.fd >= 0) - return 0; - - if (!num_page_sizes) - return -FI_ENODATA; - - ret = pipe(uffd.exit_pipe); - if (ret) { - FI_WARN(&core_prov, FI_LOG_MR, - "uffd/pipe: %s\n", strerror(errno)); - return -errno; - } - - uffd.fd = syscall(__NR_userfaultfd, - O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); - if (uffd.fd < 0) { - FI_WARN(&core_prov, FI_LOG_MR, - "syscall/userfaultfd %s\n", strerror(errno)); - ret = -errno; - goto close_pipe; - } - - api.api = UFFD_API; - api.features = UFFD_FEATURE_EVENT_UNMAP | UFFD_FEATURE_EVENT_REMOVE | - UFFD_FEATURE_EVENT_REMAP; - ret = ioctl(uffd.fd, UFFDIO_API, &api); - if (ret < 0) { - FI_WARN(&core_prov, FI_LOG_MR, - "ioctl/uffdio: %s\n", strerror(errno)); - ret = -errno; - goto close_fd; - } - - if (api.api != UFFD_API) { - FI_WARN(&core_prov, FI_LOG_MR, "uffd 
features not supported\n"); - ret = -FI_ENOSYS; - goto close_fd; - } - - ret = pthread_create(&uffd.thread, NULL, ofi_uffd_handler, &uffd); - if (ret) { - FI_WARN(&core_prov, FI_LOG_MR, - "failed to create handler thread %s\n", strerror(ret)); - ret = -ret; - goto close_fd; - } - - uffd.monitor.subscribe = ofi_uffd_subscribe; - uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; - uffd.monitor.valid = ofi_uffd_valid; - - FI_INFO(&core_prov, FI_LOG_MR, - "Memory monitor uffd started.\n"); - - return 0; - -close_fd: - - ofi_uffd_close_fd(&uffd); - -close_pipe: - - ofi_uffd_close_pipe(&uffd); - - FI_WARN(&core_prov, FI_LOG_MR, - "Memory monitor uffd failed to start: %s.\n", - strerror(-ret)); - - return ret; -} - -static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) -{ - ssize_t num_written; - - if (uffd.fd < 0) - return; - - /* tell the thread to exit with the exit_pipe */ - - num_written = write(uffd.exit_pipe[1], "X", 1); - if (num_written != 1) { - FI_WARN(&core_prov, FI_LOG_MR, - "uffd/close: unable to write to exit pipe: %s", - strerror(errno)); - } - - pthread_join(uffd.thread, NULL); - - ofi_uffd_close_fd(&uffd); - ofi_uffd_close_pipe(&uffd); - - FI_INFO(&core_prov, FI_LOG_MR, - "Memory monitor uffd stopped.\n"); -} - -#else /* HAVE_UFFD_MONITOR */ - -static int ofi_uffd_start(struct ofi_mem_monitor *monitor) -{ - return -FI_ENOSYS; -} - -static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) -{ -} - -#endif /* HAVE_UFFD_MONITOR */ - - -static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor); -static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor); -static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor); -static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor); -static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info); -static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, - const void 
*addr, size_t len, - union ofi_mr_hmem_info *hmem_info); -static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, - const struct ofi_mr_info *info, - struct ofi_mr_entry *entry); - -struct ofi_import_monitor { - struct ofi_mem_monitor monitor; - struct fid_mem_monitor *impfid; -}; - -static struct ofi_import_monitor impmon = { - .monitor.iface = FI_HMEM_SYSTEM, - .monitor.init = ofi_import_monitor_init, - .monitor.cleanup = ofi_import_monitor_cleanup, - .monitor.start = ofi_import_monitor_start, - .monitor.stop = ofi_import_monitor_stop, - .monitor.subscribe = ofi_import_monitor_subscribe, - .monitor.unsubscribe = ofi_import_monitor_unsubscribe, - .monitor.valid = ofi_import_monitor_valid, - .monitor.name = "import", -}; - -struct ofi_mem_monitor *import_monitor = &impmon.monitor; - -static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor) -{ - ofi_monitor_init(monitor); -} - -static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor) -{ - assert(!impmon.impfid); - ofi_monitor_cleanup(monitor); -} - -static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor) -{ - if (!impmon.impfid) - return -FI_ENOSYS; - - return impmon.impfid->export_ops->start(impmon.impfid); -} - -static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor) -{ - assert(impmon.impfid); - impmon.impfid->export_ops->stop(impmon.impfid); -} - -static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - assert(impmon.impfid); - return impmon.impfid->export_ops->subscribe(impmon.impfid, addr, len); -} - -static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ - assert(impmon.impfid); - impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len); -} - -static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier, - const struct ofi_mr_info *info, - 
struct ofi_mr_entry *entry) -{ - assert(impmon.impfid); - return impmon.impfid->export_ops->valid(impmon.impfid, - entry->info.iov.iov_base, - entry->info.iov.iov_len); -} - -static void ofi_import_monitor_notify(struct fid_mem_monitor *monitor, - const void *addr, size_t len) -{ - assert(monitor->fid.context == &impmon); - pthread_rwlock_rdlock(&mm_list_rwlock); - pthread_mutex_lock(&mm_lock); - ofi_monitor_notify(&impmon.monitor, addr, len); - pthread_mutex_unlock(&mm_lock); - pthread_rwlock_unlock(&mm_list_rwlock); -} - -static int ofi_close_import(struct fid *fid) -{ - pthread_mutex_lock(&mm_state_lock); - impmon.monitor.state = FI_MM_STATE_IDLE; - pthread_mutex_unlock(&mm_state_lock); - impmon.impfid = NULL; - return 0; -} - -static struct fi_ops_mem_notify import_ops = { - .size = sizeof(struct fi_ops_mem_notify), - .notify = ofi_import_monitor_notify, -}; - -static struct fi_ops impfid_ops = { - .size = sizeof(struct fi_ops), - .close = ofi_close_import, - .bind = fi_no_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open, - .tostr = fi_no_tostr, - .ops_set = fi_no_ops_set, -}; - -int ofi_monitor_import(struct fid *fid) -{ - struct fid_mem_monitor *impfid; - - if (fid->fclass != FI_CLASS_MEM_MONITOR) - return -FI_ENOSYS; - - if (impmon.impfid) { - FI_WARN(&core_prov, FI_LOG_MR, - "imported monitor already exists\n"); - return -FI_EBUSY; - } - - if (default_monitor && !dlist_empty(&default_monitor->list)) { - FI_WARN(&core_prov, FI_LOG_MR, - "cannot replace active monitor\n"); - return -FI_EBUSY; - } - - impfid = container_of(fid, struct fid_mem_monitor, fid); - if (impfid->export_ops->size < sizeof(struct fi_ops_mem_monitor)) - return -FI_EINVAL; - - impmon.impfid = impfid; - impfid->fid.context = &impmon; - impfid->fid.ops = &impfid_ops; - impfid->import_ops = &import_ops; - - FI_INFO(&core_prov, FI_LOG_MR, - "setting imported memory monitor as default\n"); - default_monitor = &impmon.monitor; - return 0; -} From 
1c71093a1f169b014509b986e458a1e3bb10e7ee Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 15 Jan 2025 21:45:51 -0600 Subject: [PATCH 371/393] prov/util: Support mem monitors with per sub ctx Some memory monitors, such as kdreg2, have a subscription context per MR cache entry. These memory monitors require unsubscribe to be called for each freed MR cache entry. To support this, call unsubscribe when an entry is removed from the MR cache RB tree. If a memory monitor does not support a subscription context per MR, unsubscribe must be implemented as a noop. Update uffd and rocr memory monitors accordingly. Signed-off-by: Ian Ziemba --- prov/util/src/rocr_mem_monitor.c | 17 ++++++++--------- prov/util/src/uffd_mem_monitor.c | 13 +++++++++++-- prov/util/src/util_mem_monitor.c | 8 ++++++++ prov/util/src/util_mr_cache.c | 13 ++++++++----- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/prov/util/src/rocr_mem_monitor.c b/prov/util/src/rocr_mem_monitor.c index c194814c640..fff16fb330a 100644 --- a/prov/util/src/rocr_mem_monitor.c +++ b/prov/util/src/rocr_mem_monitor.c @@ -62,6 +62,11 @@ static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const struct ofi_mr_info *info, struct ofi_mr_entry *entry); +/* Since ROCR may have many MR cache entries for the same VA range and + * ofi_monitor_unsubscribe() is called for every MR cache entry being freed, + * ROCR unsubscribe needs to be a noop. Else, MR cache entries may no longer + * be monitored. 
+ */ static struct rocr_mm rocr_mm = { .mm = { .iface = FI_HMEM_ROCR, @@ -70,7 +75,7 @@ static struct rocr_mm rocr_mm = { .start = rocr_mm_start, .stop = rocr_mm_stop, .subscribe = rocr_mm_subscribe, - .unsubscribe = rocr_mm_unsubscribe, + .unsubscribe = ofi_monitor_unsubscribe_no_op, .valid = rocr_mm_valid, .name = "rocr", }, @@ -136,7 +141,7 @@ static void rocr_mm_dealloc_cb(void *addr, void *user_data) pthread_rwlock_rdlock(&mm_list_rwlock); pthread_mutex_lock(&mm_lock); - ofi_monitor_unsubscribe(rocr_monitor, addr, len, NULL); + rocr_mm_unsubscribe(rocr_monitor, addr, len, NULL); pthread_mutex_unlock(&mm_lock); pthread_rwlock_unlock(&mm_list_rwlock); } @@ -381,12 +386,6 @@ static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, return -FI_ENOSYS; } -static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) -{ -} - static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const struct ofi_mr_info *info, struct ofi_mr_entry *entry) @@ -401,7 +400,7 @@ static struct ofi_mem_monitor rocr_mm = { .start = rocr_mm_start, .stop = rocr_mm_stop, .subscribe = rocr_mm_subscribe, - .unsubscribe = rocr_mm_unsubscribe, + .unsubscribe = ofi_monitor_unsubscribe_no_op, .valid = rocr_mm_valid, .name = "rocr", }; diff --git a/prov/util/src/uffd_mem_monitor.c b/prov/util/src/uffd_mem_monitor.c index c06b49178a1..dfec177a32f 100644 --- a/prov/util/src/uffd_mem_monitor.c +++ b/prov/util/src/uffd_mem_monitor.c @@ -67,6 +67,9 @@ struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; #include static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); +static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); /* The userfault fd monitor requires for events that could * trigger it to be handled outside of the monitor functions @@ -108,7 +111,7 @@ static void *ofi_uffd_handler(void *arg) switch (msg.event) { 
case UFFD_EVENT_REMOVE: - ofi_monitor_unsubscribe(&uffd.monitor, + ofi_uffd_unsubscribe(&uffd.monitor, (void *) (uintptr_t) msg.arg.remove.start, (size_t) (msg.arg.remove.end - msg.arg.remove.start), NULL); @@ -388,7 +391,13 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) } uffd.monitor.subscribe = ofi_uffd_subscribe; - uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; + + /* Since UFFD may have many MR cache entries for the same VA range and + * ofi_monitor_unsubscribe() is called for every MR cache entry being + * freed, UFFD unsubscribe needs to be a noop. Else, MR cache entries + * may no longer be monitored. + */ + uffd.monitor.unsubscribe = ofi_monitor_unsubscribe_no_op; uffd.monitor.valid = ofi_uffd_valid; FI_INFO(&core_prov, FI_LOG_MR, diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index f0e094dc6e4..27caaf66c77 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -504,6 +504,7 @@ void ofi_monitor_flush(struct ofi_mem_monitor *monitor) } } +/* For each new cached MR cache entry, subscribe is called. */ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, union ofi_mr_hmem_info *hmem_info) @@ -522,6 +523,13 @@ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, return ret; } +/* For each cached MR entry freed, unsubscribe is called. + + * If a memory monitor does not have a context per subscribe (e.g., a single + * monitored region serving multiple MRs), the memory monitor must implement + * unsubscribe as a noop. This may result in extra notification events, but is + * harmless to correct operation. 
+ */ void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, union ofi_mr_hmem_info *hmem_info) diff --git a/prov/util/src/util_mr_cache.c b/prov/util/src/util_mr_cache.c index ea8bf15570f..2f0af31fc4c 100644 --- a/prov/util/src/util_mr_cache.c +++ b/prov/util/src/util_mr_cache.c @@ -125,15 +125,18 @@ static void util_mr_free_entry(struct ofi_mr_cache *cache, static void util_mr_uncache_entry_storage(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - /* Without subscription context, we might unsubscribe from - * an address range in use by another region. As a result, - * we remain subscribed. This may result in extra - * notification events, but is harmless to correct operation. - */ + enum fi_hmem_iface iface = entry->info.iface; + struct ofi_mem_monitor *monitor = cache->monitors[iface]; ofi_rbmap_delete(&cache->tree, entry->node); entry->node = NULL; + /* Some memory monitors have a subscription context per MR. These + * memory monitors require ofi_monitor_unsubscribe() to be called. + */ + ofi_monitor_unsubscribe(monitor, entry->info.iov.iov_base, + entry->info.iov.iov_len, &entry->hmem_info); + cache->cached_cnt--; cache->cached_size -= entry->info.iov.iov_len; } From 6990129631b4e508b34b770538fdf70ededcd8f2 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Thu, 16 Jan 2025 11:00:23 -0600 Subject: [PATCH 372/393] prov/util: Fix ROCR and memhooks deadlock ROCR deallocation CB will call rocr_unsubscribe with mm_lock held. If memhooks is used, since rocr_unsubscribe may call free, this can result in memhooks intercepting the free and leading to deadlock. To avoid this, freeing is deferred until locks are released. 
Signed-off-by: Ian Ziemba --- prov/util/src/rocr_mem_monitor.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/prov/util/src/rocr_mem_monitor.c b/prov/util/src/rocr_mem_monitor.c index fff16fb330a..1e78c1ac3c3 100644 --- a/prov/util/src/rocr_mem_monitor.c +++ b/prov/util/src/rocr_mem_monitor.c @@ -44,6 +44,7 @@ struct rocr_mm_entry { struct iovec iov; struct ofi_rbnode *node; + struct dlist_entry entry; }; struct rocr_mm { @@ -57,7 +58,8 @@ static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, union ofi_mr_hmem_info *hmem_info); static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info); + union ofi_mr_hmem_info *hmem_info, + struct dlist_entry *free_list); static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const struct ofi_mr_info *info, struct ofi_mr_entry *entry); @@ -138,15 +140,22 @@ static struct rocr_mm_entry *rocr_mm_entry_find(const void *addr) static void rocr_mm_dealloc_cb(void *addr, void *user_data) { size_t len = (size_t) user_data; + DEFINE_LIST(free_list); + struct rocr_mm_entry *entry; pthread_rwlock_rdlock(&mm_list_rwlock); pthread_mutex_lock(&mm_lock); - rocr_mm_unsubscribe(rocr_monitor, addr, len, NULL); + rocr_mm_unsubscribe(rocr_monitor, addr, len, NULL, &free_list); pthread_mutex_unlock(&mm_lock); pthread_rwlock_unlock(&mm_list_rwlock); + + while (!dlist_empty(&free_list)) { + dlist_pop_front(&free_list, struct rocr_mm_entry, entry, entry); + free(entry); + } } -static void rocr_mm_entry_free(struct rocr_mm_entry *entry) +static void rocr_mm_entry_delete(struct rocr_mm_entry *entry) { hsa_status_t hsa_ret __attribute__((unused)); @@ -166,6 +175,11 @@ static void rocr_mm_entry_free(struct rocr_mm_entry *entry) hsa_ret == HSA_STATUS_ERROR_INVALID_ARGUMENT); ofi_rbmap_delete(rocr_mm.dev_region_tree, entry->node); +} + +static void rocr_mm_entry_free(struct rocr_mm_entry *entry) 
+{ + rocr_mm_entry_delete(entry); free(entry); } @@ -267,7 +281,8 @@ static void rocr_mm_stop(struct ofi_mem_monitor *monitor) static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len, - union ofi_mr_hmem_info *hmem_info) + union ofi_mr_hmem_info *hmem_info, + struct dlist_entry *free_list) { struct rocr_mm_entry *entry; size_t cur_len = len; @@ -291,7 +306,14 @@ static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, next_addr = (void *) ((uintptr_t) ofi_iov_end(&entry->iov) + 1); - rocr_mm_entry_free(entry); + /* Since unsubscribe is called with mm_lock held, calling free + * may result in deadlocks if memhooks is used. To prevent this, + * entries are placed on a list to be freed later. + * + * Entry still needs to be deleted. + */ + rocr_mm_entry_delete(entry); + dlist_insert_tail(&entry->entry, free_list); cur_len -= MIN((uintptr_t) next_addr - (uintptr_t) cur_addr, cur_len); From 0eedfbbc38f14967c8330eacaf6e09711eef3ba3 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Fri, 17 Jan 2025 12:32:59 -0600 Subject: [PATCH 373/393] prov/util: Statically set uffd callbacks Subscribe, unsubscribe, and valid are callbacks which are dynamically set up. Change this to be statically set. 
Signed-off-by: Ian Ziemba --- prov/util/src/uffd_mem_monitor.c | 39 ++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/prov/util/src/uffd_mem_monitor.c b/prov/util/src/uffd_mem_monitor.c index dfec177a32f..7172e1e7441 100644 --- a/prov/util/src/uffd_mem_monitor.c +++ b/prov/util/src/uffd_mem_monitor.c @@ -46,6 +46,12 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor); static void ofi_uffd_stop(struct ofi_mem_monitor *monitor); +static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry); static struct ofi_uffd uffd = { .monitor.iface = FI_HMEM_SYSTEM, @@ -53,6 +59,15 @@ static struct ofi_uffd uffd = { .monitor.cleanup = ofi_monitor_cleanup, .monitor.start = ofi_uffd_start, .monitor.stop = ofi_uffd_stop, + .monitor.subscribe = ofi_uffd_subscribe, + + /* Since UFFD may have many MR cache entries for the same VA range and + * ofi_monitor_unsubscribe() is called for every MR cache entry being + * freed, UFFD unsubscribe needs to be a noop. Else, MR cache entries + * may no longer be monitored. + */ + .monitor.unsubscribe = ofi_monitor_unsubscribe_no_op, + .monitor.valid = ofi_uffd_valid, .monitor.name = "uffd", .fd = -1, .exit_pipe = { -1, -1 }, @@ -390,16 +405,6 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) goto close_fd; } - uffd.monitor.subscribe = ofi_uffd_subscribe; - - /* Since UFFD may have many MR cache entries for the same VA range and - * ofi_monitor_unsubscribe() is called for every MR cache entry being - * freed, UFFD unsubscribe needs to be a noop. Else, MR cache entries - * may no longer be monitored. 
- */ - uffd.monitor.unsubscribe = ofi_monitor_unsubscribe_no_op; - uffd.monitor.valid = ofi_uffd_valid; - FI_INFO(&core_prov, FI_LOG_MR, "Memory monitor uffd started.\n"); @@ -456,4 +461,18 @@ static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) { } +static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + return false; +} + #endif /* HAVE_UFFD_MONITOR */ From 271561742cc7c504c4f9674aede655e5fe8eec0c Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Mon, 16 Dec 2024 15:03:31 -0600 Subject: [PATCH 374/393] prov/cxi: Test monitor unsubscribe An MR cache utilizing kdreg2 will have incorrect MR cache count stats if unsubscribe is not called. Signed-off-by: Mike Uttormark Signed-off-by: Ian Ziemba --- prov/cxi/Makefile.include | 3 +- prov/cxi/test/mr_cache.c | 121 ++++++++++++++++++++++++++++++++++++++ prov/cxi/test/test.sh | 3 + 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 prov/cxi/test/mr_cache.c diff --git a/prov/cxi/Makefile.include b/prov/cxi/Makefile.include index b529f942ce7..9faa7874d8f 100644 --- a/prov/cxi/Makefile.include +++ b/prov/cxi/Makefile.include @@ -129,7 +129,8 @@ nodist_prov_cxi_test_cxitest_SOURCES = \ prov/cxi/test/auth_key.c \ prov/cxi/test/fork.c \ prov/cxi/test/mem_reg.c \ - prov/cxi/test/nic.c + prov/cxi/test/nic.c \ + prov/cxi/test/mr_cache.c prov_cxi_test_cxitest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) diff --git a/prov/cxi/test/mr_cache.c b/prov/cxi/test/mr_cache.c new file mode 100644 index 00000000000..b2035cb8063 --- /dev/null +++ b/prov/cxi/test/mr_cache.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include 
+#include +#include + +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define SETENV_OVERWRITE 1 + +TestSuite(mr_cache, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(mr_cache, cache_full) +{ + static struct { + const char *name; + const char *value; + } envs[] = { + { .name = "FI_MR_CACHE_MONITOR", .value = "kdreg2", }, + { .name = "FI_MR_CACHE_MAX_COUNT", .value = "4", }, + }; + struct { + void *addr; + struct fid_mr *mr; + } *region_data; + size_t i; + int ret; + long page_size; + unsigned long num_regions, total_regions; + struct ofi_mr_cache *cache; + struct cxip_domain *cxip_dom; + + /* setup the environment */ + for (i = 0; i < ARRAY_SIZE(envs); i++) { + ret = setenv(envs[i].name, envs[i].value, SETENV_OVERWRITE); + cr_assert_eq(ret, 0, "Failed to set %s to %s: %d", + envs[i].name, envs[i].value, errno); + } + + /* allocate the memory regions */ + page_size = sysconf(_SC_PAGESIZE); + cr_assert(page_size > 0, + "sysconf(_SC_PAGESIZE) return %ld: errno = %d", page_size, errno); + + ret = sscanf(getenv("FI_MR_CACHE_MAX_COUNT"), "%lu", &num_regions); + cr_assert_eq(ret, 1, "Failed to get number of regions: %d %d:%s", + ret, errno, strerror(errno)); + + /* one extra to push one out of the cache */ + total_regions = num_regions + 1; + region_data = calloc(total_regions, sizeof(*region_data)); + cr_assert_not_null(region_data); + + for (i = 0; i < total_regions; i++) { + region_data[i].addr = mmap(NULL, page_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + -1, 0); + cr_assert_not_null(region_data[i].addr); + } + + /* create the domain */ + cxit_setup_domain(); + cxit_create_domain(); + + /* Register the max number of regions */ + for (i = 0; i < num_regions; i++) { + ret = fi_mr_reg(cxit_domain, region_data[i].addr, + page_size, FI_READ | FI_WRITE, + 0, 0, 0, ®ion_data[i].mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_reg failed for region %lu: %d", i, ret); + } + + /* See that the 
cache is full */ + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cache = &cxip_dom->iomm; + cr_assert(cache->cached_cnt == cache->cached_max_cnt, + "Cache is not full: %zu != %zu", + cache->cached_cnt, cache->cached_max_cnt); + cr_assert(cache->uncached_cnt == 0, + "Cache has uncached entries: %zu", + cache->uncached_cnt); + + /* release the registrations, this should put them on the LRU list */ + for(i = 0; i < num_regions; i++) { + ret = fi_close(®ion_data[i].mr->fid); + cr_assert_eq(ret, FI_SUCCESS, + "Failed to close mr %zu: %d", + i, ret); + } + + /* Register one more, this should push one off LRU list */ + ret = fi_mr_reg(cxit_domain, region_data[num_regions].addr, + page_size, FI_READ | FI_WRITE, + 0, 0, 0, ®ion_data[num_regions].mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_reg failed for region %lu: %d", num_regions, ret); + + /* Cache should remain full */ + cr_assert(cache->cached_cnt == cache->cached_max_cnt, + "Cache is not full: %zu != %zu", + cache->cached_cnt, cache->cached_max_cnt); + cr_assert(cache->uncached_cnt == 0, + "Cache has uncached entries: %zu", + cache->uncached_cnt); + + cxit_teardown_domain(); +} diff --git a/prov/cxi/test/test.sh b/prov/cxi/test/test.sh index ea6a913703f..21914365f87 100755 --- a/prov/cxi/test/test.sh +++ b/prov/cxi/test/test.sh @@ -149,6 +149,8 @@ fork_safe_kdreg2_test=( unlimited_triggered_ops_test=( "FI_CXI_ENABLE_TRIG_OP_LIMIT=0 ./cxitest -j 1 --verbose --filter=\"deferred_work_trig_op_limit/*\" --tap=cxitest-disable-trig-op-limit.tap") +mr_cache_test=("./cxitest --verbose --tap=cxitest-mr_cache_test.tap --filter=\"mr_cache/*\" -j 1") + long_test_suite=( "basic_test" "swget_test" @@ -174,6 +176,7 @@ long_test_suite=( "fork_safe_memhooks_test" "fork_safe_kdreg2_test" "unlimited_triggered_ops_test" + "mr_cache_test" ) # ################################################################ From 83ab8a3fe3e07a325319f3513004f3cc00a84303 Mon Sep 17 00:00:00 2001 From: Nicholas 
Sielicki Date: Fri, 17 Jan 2025 13:46:25 -0800 Subject: [PATCH 375/393] prov/efa: fix leak of dmabuf fd in cuda p2p probe prior to this patch, when efa_hmem_info_check_p2p_support_cuda elected to attempt dmabuf for p2p, we previously leaked the file descriptor returned by cuMemGetHandleForAddressRange in all cases. This ultimately meant the dmabuf stuck around for the lifetime of the process, even after dereg and after releasing the memory back to the device mempool. All calls to cuda_get_dmabuf_fd need a corresponding close call. Signed-off-by: Nicholas Sielicki --- prov/efa/src/efa_hmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 61eca026219..996d171d95b 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -129,6 +129,7 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in if (ret == FI_SUCCESS) { ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset, len, (uint64_t)ptr, dmabuf_fd, ibv_access); + (void)close(dmabuf_fd); if (!ibv_mr) { EFA_INFO(FI_LOG_CORE, "Unable to register CUDA device buffer via dmabuf: %s. " From 837812474343f17261a974d9b6f005190512248d Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 20 Jan 2025 10:06:23 -0600 Subject: [PATCH 376/393] src/hmem: Define ofi_hmem_put_dmabuf_fd For some HMEM ifaces, ofi_hmem_get_dmabuf_fd() may result in a new FD being allocated. Define ofi_hmem_put_dmabuf_fd() to close FD. 
Signed-off-by: Ian Ziemba --- include/ofi_hmem.h | 7 +++++++ src/hmem.c | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h index 9db6d94cd70..65b2ba7534d 100644 --- a/include/ofi_hmem.h +++ b/include/ofi_hmem.h @@ -131,6 +131,7 @@ struct ofi_hmem_ops { const void *src, size_t size); int (*get_dmabuf_fd)(const void *addr, uint64_t size, int *fd, uint64_t *offset); + int (*put_dmabuf_fd)(int fd); }; extern struct ofi_hmem_ops hmem_ops[]; @@ -357,6 +358,11 @@ static inline int ofi_hmem_no_get_dmabuf_fd(const void *addr, uint64_t size, return -FI_ENOSYS; } +static inline int ofi_hmem_no_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + static inline bool ofi_hmem_p2p_disabled(void) { return ofi_hmem_disable_p2p; @@ -450,5 +456,6 @@ int ofi_hmem_dev_reg_copy_from_hmem(enum fi_hmem_iface iface, uint64_t handle, void *dest, const void *src, size_t size); int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface, const void *addr, uint64_t size, int *fd, uint64_t *offset); +int ofi_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd); #endif /* _OFI_HMEM_H_ */ diff --git a/src/hmem.c b/src/hmem.c index a624f8dddff..b4ced3ddc9b 100644 --- a/src/hmem.c +++ b/src/hmem.c @@ -141,6 +141,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = ofi_hmem_system_dev_reg_copy, .dev_reg_copy_from_hmem = ofi_hmem_system_dev_reg_copy, .get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd, + .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, }, [FI_HMEM_CUDA] = { .initialized = false, @@ -167,6 +168,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem, .get_dmabuf_fd = cuda_get_dmabuf_fd, + .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, }, [FI_HMEM_ROCR] = { .initialized = false, @@ -193,6 +195,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = rocr_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = rocr_dev_reg_copy_from_hmem, .get_dmabuf_fd = 
rocr_hmem_get_dmabuf_fd, + .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, }, [FI_HMEM_ZE] = { .initialized = false, @@ -219,6 +222,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = ze_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = ze_dev_reg_copy_from_hmem, .get_dmabuf_fd = ze_hmem_get_dmabuf_fd, + .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, }, [FI_HMEM_NEURON] = { .initialized = false, @@ -244,6 +248,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem, .get_dmabuf_fd = neuron_get_dmabuf_fd, + .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, }, [FI_HMEM_SYNAPSEAI] = { .initialized = false, @@ -269,6 +274,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem, .get_dmabuf_fd = synapseai_get_dmabuf_fd, + .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, }, }; @@ -820,3 +826,8 @@ int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface iface, const void *addr, { return hmem_ops[iface].get_dmabuf_fd(addr, size, fd, offset); } + +int ofi_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd) +{ + return hmem_ops[iface].put_dmabuf_fd(fd); +} From fc611e5a3f6dc4f2e91fb44342abbc7788983d95 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Mon, 20 Jan 2025 10:15:26 -0600 Subject: [PATCH 377/393] hmem/rocr: Support ofi_hmem_put_dmabuf_fd() With ROCR, callers of ofi_hmem_get_dmabuf_fd() should call ofi_hmem_put_dmabuf_fd() once the DMA buf region is no longer used. 
Signed-off-by: Ian Ziemba --- include/ofi_hmem.h | 1 + src/hmem.c | 2 +- src/hmem_rocr.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h index 65b2ba7534d..d7a2983c1ce 100644 --- a/include/ofi_hmem.h +++ b/include/ofi_hmem.h @@ -168,6 +168,7 @@ int rocr_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src, size_t size); int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, uint64_t *offset); +int rocr_hmem_put_dmabuf_fd(int fd); int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size); int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size); diff --git a/src/hmem.c b/src/hmem.c index b4ced3ddc9b..b42e8e720d4 100644 --- a/src/hmem.c +++ b/src/hmem.c @@ -195,7 +195,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = rocr_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = rocr_dev_reg_copy_from_hmem, .get_dmabuf_fd = rocr_hmem_get_dmabuf_fd, - .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, + .put_dmabuf_fd = rocr_hmem_put_dmabuf_fd, }, [FI_HMEM_ZE] = { .initialized = false, diff --git a/src/hmem_rocr.c b/src/hmem_rocr.c index bba705ba8ef..05caf3cf24b 100644 --- a/src/hmem_rocr.c +++ b/src/hmem_rocr.c @@ -130,6 +130,7 @@ struct hsa_ops { #if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF hsa_status_t (*hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset); + hsa_status_t (*hsa_amd_portable_close_dmabuf)(int dmabuf); #endif }; @@ -183,6 +184,7 @@ static struct hsa_ops hsa_ops = { .hsa_iterate_agents = hsa_iterate_agents, #if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF .hsa_amd_portable_export_dmabuf = hsa_amd_portable_export_dmabuf, + .hsa_amd_portable_close_dmabuf = hsa_amd_portable_close_dmabuf, #endif .hsa_system_get_info = hsa_system_get_info, }; @@ -863,6 +865,13 @@ static int rocr_hmem_dl_init(void) "Failed to find hsa_amd_portable_export_dmabuf\n"); goto err; } 
+ + hsa_ops.hsa_amd_portable_close_dmabuf = dlsym(hsa_handle, "hsa_amd_portable_close_dmabuf"); + if (!hsa_ops.hsa_amd_portable_close_dmabuf) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_portable_close_dmabuf\n"); + goto err; + } #endif return FI_SUCCESS; @@ -1184,6 +1193,25 @@ int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, return FI_SUCCESS; } +int rocr_hmem_put_dmabuf_fd(int fd) +{ +#if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF + hsa_status_t hsa_ret; + + hsa_ret = hsa_ops.hsa_amd_portable_close_dmabuf(fd); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to close dmabuf handle: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + return -FI_EIO; + } + + return FI_SUCCESS; +#else + return -FI_ENOSYS; +#endif +} + #else int rocr_copy_from_dev(uint64_t device, void *dest, const void *src, @@ -1310,4 +1338,9 @@ int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, return -FI_ENOSYS; } +int rocr_hmem_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + #endif /* HAVE_ROCR */ From 5357ceecd708d7eedd72e2df292a7a6b55a9df58 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 22 Jan 2025 09:09:40 -0600 Subject: [PATCH 378/393] hmem/cuda: Support ofi_hmem_put_dmabuf_fd() With CUDA, callers of ofi_hmem_get_dmabuf_fd() should call ofi_hmem_put_dmabuf_fd() once the DMA buf region is no longer used. 
Signed-off-by: Ian Ziemba --- include/ofi_hmem.h | 1 + src/hmem.c | 2 +- src/hmem_cuda.c | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h index d7a2983c1ce..72fab37a650 100644 --- a/include/ofi_hmem.h +++ b/include/ofi_hmem.h @@ -195,6 +195,7 @@ bool cuda_is_gdrcopy_enabled(void); bool cuda_is_dmabuf_supported(void); int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, uint64_t *offset); +int cuda_put_dmabuf_fd(int fd); void cuda_gdrcopy_to_dev(uint64_t handle, void *dev, const void *host, size_t size); diff --git a/src/hmem.c b/src/hmem.c index b42e8e720d4..7c3fa57d619 100644 --- a/src/hmem.c +++ b/src/hmem.c @@ -168,7 +168,7 @@ struct ofi_hmem_ops hmem_ops[] = { .dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem, .dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem, .get_dmabuf_fd = cuda_get_dmabuf_fd, - .put_dmabuf_fd = ofi_hmem_no_put_dmabuf_fd, + .put_dmabuf_fd = cuda_put_dmabuf_fd, }, [FI_HMEM_ROCR] = { .initialized = false, diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c index ec626bdada5..7fcd6450940 100644 --- a/src/hmem_cuda.c +++ b/src/hmem_cuda.c @@ -748,6 +748,16 @@ int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, #endif /* HAVE_CUDA_DMABUF */ } +int cuda_put_dmabuf_fd(int fd) +{ +#if HAVE_CUDA_DMABUF + close(fd); + return FI_SUCCESS; +#else + return -FI_ENOSYS; +#endif /* HAVE_CUDA_DMABUF */ +} + int cuda_hmem_init(void) { int ret; @@ -1047,6 +1057,11 @@ int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, return -FI_ENOSYS; } +int cuda_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + int cuda_set_sync_memops(void *ptr) { return FI_SUCCESS; From b1d3bb4af78f25b4ac34b7654e1ef1076aee7d82 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Tue, 21 Jan 2025 09:00:25 -0600 Subject: [PATCH 379/393] prov/cxi: Integrate with ofi_hmem_put_dmabuf_fd Signed-off-by: Ian Ziemba --- prov/cxi/include/cxip.h | 2 ++ prov/cxi/src/cxip_iomm.c | 21 
+++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 34a4a9d242c..70ef46a2a69 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -821,8 +821,10 @@ struct cxip_md { struct cxi_md *md; struct ofi_mr_info info; uint64_t handle; + int dmabuf_fd; bool handle_valid; bool cached; + bool dmabuf_fd_valid; }; #define CXIP_MR_DOMAIN_HT_BUCKETS 16 diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c index b998bd34aee..4723c311d97 100644 --- a/prov/cxi/src/cxip_iomm.c +++ b/prov/cxi/src/cxip_iomm.c @@ -39,6 +39,10 @@ static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base, hints->dmabuf_offset = offset; hints->dmabuf_valid = true; + /* Need to cache DMA buf FD to release later. */ + md->dmabuf_fd = dmabuf_fd; + md->dmabuf_fd_valid = true; + return FI_SUCCESS; } @@ -106,7 +110,7 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) CXIP_WARN(MAP_FAIL_MSG, dom->lni->lni->id, entry->info.iov.iov_base, entry->info.iov.iov_len, map_flags, ret, fi_strerror(-ret)); - goto err; + goto err_free_dmabuf; } /* If the md len is larger than the iov_len, the VA and len have @@ -161,6 +165,9 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) err_unmap: cxil_unmap(md->md); +err_free_dmabuf: + if (md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd); err: md->dom = NULL; return ret; @@ -181,6 +188,9 @@ static void cxip_do_unmap(struct ofi_mr_cache *cache, if (md->handle_valid) ofi_hmem_dev_unregister(entry->info.iface, md->handle); + if (md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(entry->info.iface, md->dmabuf_fd); + ret = cxil_unmap(md->md); if (ret) CXIP_WARN("cxil_unmap failed: %d\n", ret); @@ -426,7 +436,7 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, &uncached_md->md); if (ret) { CXIP_WARN("cxil_map failed: %d:%s\n", ret, 
fi_strerror(-ret)); - goto err_free_uncached_md; + goto err_free_dmabuf; } /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be @@ -466,8 +476,12 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, return FI_SUCCESS; + err_unmap: cxil_unmap(uncached_md->md); +err_free_dmabuf: + if (uncached_md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(attr->iface, uncached_md->dmabuf_fd); err_free_uncached_md: free(uncached_md); @@ -575,6 +589,9 @@ static void cxip_unmap_nocache(struct cxip_md *md) { int ret; + if (md->dmabuf_fd_valid) + ofi_hmem_put_dmabuf_fd(md->info.iface, md->dmabuf_fd); + if (md->handle_valid) ofi_hmem_dev_unregister(md->info.iface, md->handle); From 587e37a0f6eb0ed2049d0fd41d2e642144f860e5 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Tue, 21 Jan 2025 17:39:25 -0600 Subject: [PATCH 380/393] prov/cxi: Test ROCR with DMA buf offset Performing multiple HSA allocations appears to result in a DMA buf offset. Verify that the CXI provider can register a DMA buf offset memory region. 
Signed-off-by: Ian Ziemba --- prov/cxi/test/rocr.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/prov/cxi/test/rocr.c b/prov/cxi/test/rocr.c index 3d9567e133e..b0647d4038f 100644 --- a/prov/cxi/test/rocr.c +++ b/prov/cxi/test/rocr.c @@ -761,3 +761,48 @@ Test(hsa, verify_hmemDevReg_fine) verify_dev_reg_handle(true, FINE); } + +Test(hsa, dmabuf_offset) +{ + hsa_status_t hsa_ret; + void *bufs[2]; + int ret; + int i; + struct fid_mr *mrs[2]; + size_t size = 1024 * 1024; + + ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_MR_ROCR_CACHE_MONITOR_ENABLED", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cxit_setup_msg(); + + hsa_ret = hsa_memory_allocate(coarse_grain, size, &bufs[0]); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", + hsa_ret); + + ret = fi_mr_reg(cxit_domain, bufs[0], size, FI_READ | FI_WRITE, 0, 0, 0, + &mrs[0], NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + hsa_ret = hsa_memory_allocate(coarse_grain, size, &bufs[1]); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", + hsa_ret); + + ret = fi_mr_reg(cxit_domain, bufs[1], size, FI_READ | FI_WRITE, 0, 0, 0, + &mrs[1], NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + for (i = 0; i < 2; i++) { + ret = fi_close(&(mrs[i]->fid)); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + hsa_ret = hsa_memory_free(bufs[i]); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + } + + cxit_teardown_msg(); +} From 4431fe574bf9855b061ef34f774b422f9118e9b0 Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Tue, 21 Jan 2025 17:39:50 -0600 Subject: [PATCH 381/393] prov/cxi: Test ROCR with DMA buf FD recycling When a MR is freed, the CXI provider should free the DMA buf FD used for the ROCR region. Failing to do this will result in FDs being exhausted. 
Signed-off-by: Ian Ziemba --- prov/cxi/test/rocr.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/prov/cxi/test/rocr.c b/prov/cxi/test/rocr.c index b0647d4038f..3328d4bb103 100644 --- a/prov/cxi/test/rocr.c +++ b/prov/cxi/test/rocr.c @@ -806,3 +806,40 @@ Test(hsa, dmabuf_offset) cxit_teardown_msg(); } + +Test(hsa, dmabuf_stress) +{ + hsa_status_t hsa_ret; + int ret; + int i; + void *buf; + size_t size = 1024 * 1024; + struct fid_mr *mr; + + ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_MR_ROCR_CACHE_MONITOR_ENABLED", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + hsa_ret = hsa_memory_allocate(coarse_grain, size, &buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", + hsa_ret); + + cxit_setup_msg(); + + for (i = 0; i < 2048; i++) { + ret = fi_mr_reg(cxit_domain, buf, size, FI_READ | FI_WRITE, + 0, 0, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_close(&mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + } + + cxit_teardown_msg(); + + hsa_ret = hsa_memory_free(buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); +} From ba880cce6cdf84d2c9f99a9eec975092d703727b Mon Sep 17 00:00:00 2001 From: Ian Ziemba Date: Wed, 22 Jan 2025 15:13:44 +0000 Subject: [PATCH 382/393] prov/cxi: Test CUDA with DMA buf FD recycling When a MR is freed, the CXI provider should free the DMA buf FD used for the CUDA region. Failing to do this will result in FDs being exhausted. 
Signed-off-by: Ian Ziemba --- prov/cxi/test/cuda.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/prov/cxi/test/cuda.c b/prov/cxi/test/cuda.c index 4776556635a..b63432c2a73 100644 --- a/prov/cxi/test/cuda.c +++ b/prov/cxi/test/cuda.c @@ -580,3 +580,38 @@ Test(cuda, verify_force_dev_reg_local) cxit_destroy_cqs(); cxit_teardown_ep(); } + +Test(cuda, dmabuf_stress) +{ + int ret; + int i; + void *buf; + size_t size = 1024 * 1024; + struct fid_mr *mr; + cudaError_t cuda_ret; + + ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + ret = setenv("FI_MR_CUDA_CACHE_MONITOR_ENABLED", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + cuda_ret = cudaMalloc(&buf, size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + cxit_setup_msg(); + + for (i = 0; i < 2048; i++) { + ret = fi_mr_reg(cxit_domain, buf, size, FI_READ | FI_WRITE, + 0, 0, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_close(&mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + } + + cxit_teardown_msg(); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); +} From 3f4571d76112b6b90b2efd6239466f621e857187 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 22 Jan 2025 13:22:49 -0800 Subject: [PATCH 383/393] fabtests: Add support for FI_CONTEXT2 This allows testing FI_CONTEXT2 in providers that require this mode bit. 
Signed-off-by: Jessie Yang --- fabtests/benchmarks/dgram_pingpong.c | 2 +- fabtests/benchmarks/rdm_bw.c | 2 +- fabtests/benchmarks/rdm_cntr_pingpong.c | 2 +- fabtests/benchmarks/rdm_pingpong.c | 2 +- fabtests/benchmarks/rdm_tagged_bw.c | 2 +- fabtests/benchmarks/rdm_tagged_pingpong.c | 2 +- fabtests/benchmarks/rma_bw.c | 2 +- fabtests/benchmarks/rma_pingpong.c | 2 +- fabtests/common/shared.c | 4 ++-- fabtests/functional/av_xfer.c | 2 +- fabtests/functional/cq_data.c | 2 +- fabtests/functional/dgram.c | 2 +- fabtests/functional/flood.c | 2 +- fabtests/functional/inject_test.c | 4 ++-- fabtests/functional/loopback.c | 2 +- fabtests/functional/mcast.c | 2 +- fabtests/functional/multi_ep.c | 6 +++--- fabtests/functional/multi_mr.c | 4 ++-- fabtests/functional/multi_recv.c | 4 ++-- fabtests/functional/rdm.c | 2 +- fabtests/functional/rdm_atomic.c | 4 ++-- fabtests/functional/rdm_deferred_wq.c | 2 +- fabtests/functional/rdm_multi_client.c | 2 +- fabtests/functional/rdm_multi_domain.c | 6 +++--- fabtests/functional/rdm_rma_event.c | 6 +++--- fabtests/functional/rdm_rma_trigger.c | 2 +- fabtests/functional/rdm_shared_av.c | 2 +- fabtests/functional/rdm_tagged_peek.c | 4 ++-- fabtests/functional/recv_cancel.c | 4 ++-- fabtests/functional/resmgmt_test.c | 6 +++--- fabtests/functional/shared_ctx.c | 2 +- fabtests/functional/unexpected_msg.c | 2 +- fabtests/functional/unmap_mem.c | 2 +- fabtests/include/shared.h | 2 +- fabtests/man/fabtests.7.md | 2 +- fabtests/multinode/src/core.c | 2 +- fabtests/multinode/src/core_coll.c | 2 +- fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c | 2 +- fabtests/prov/efa/src/rdm_rnr_queue_resend.c | 4 ++-- fabtests/prov/efa/src/rdm_rnr_read_cq_error.c | 2 +- fabtests/regression/sighandler_test.c | 2 +- fabtests/unit/getinfo_test.c | 2 +- 42 files changed, 58 insertions(+), 58 deletions(-) diff --git a/fabtests/benchmarks/dgram_pingpong.c b/fabtests/benchmarks/dgram_pingpong.c index c65460851ab..63774e31aa7 100644 --- 
a/fabtests/benchmarks/dgram_pingpong.c +++ b/fabtests/benchmarks/dgram_pingpong.c @@ -127,7 +127,7 @@ int main(int argc, char **argv) if (opts.options & FT_OPT_SIZE) hints->ep_attr->max_msg_size = opts.transfer_size; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_bw.c b/fabtests/benchmarks/rdm_bw.c index d229cc7c1fc..6b445db6fc0 100644 --- a/fabtests/benchmarks/rdm_bw.c +++ b/fabtests/benchmarks/rdm_bw.c @@ -80,7 +80,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->tx_attr->tclass = FI_TC_BULK_DATA; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_cntr_pingpong.c b/fabtests/benchmarks/rdm_cntr_pingpong.c index 76eebaf515d..6793f10e806 100644 --- a/fabtests/benchmarks/rdm_cntr_pingpong.c +++ b/fabtests/benchmarks/rdm_cntr_pingpong.c @@ -106,7 +106,7 @@ int main(int argc, char **argv) hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; ret = run(); diff --git a/fabtests/benchmarks/rdm_pingpong.c b/fabtests/benchmarks/rdm_pingpong.c index f5c5871e22d..f4d4169672d 100644 --- a/fabtests/benchmarks/rdm_pingpong.c +++ b/fabtests/benchmarks/rdm_pingpong.c @@ -73,7 +73,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff 
--git a/fabtests/benchmarks/rdm_tagged_bw.c b/fabtests/benchmarks/rdm_tagged_bw.c index c2d795edb3a..e40a9be025b 100644 --- a/fabtests/benchmarks/rdm_tagged_bw.c +++ b/fabtests/benchmarks/rdm_tagged_bw.c @@ -105,7 +105,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->caps = FI_TAGGED; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->tx_attr->tclass = FI_TC_BULK_DATA; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rdm_tagged_pingpong.c b/fabtests/benchmarks/rdm_tagged_pingpong.c index 36a11152eb8..70c4ac6dec2 100644 --- a/fabtests/benchmarks/rdm_tagged_pingpong.c +++ b/fabtests/benchmarks/rdm_tagged_pingpong.c @@ -73,7 +73,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_TAGGED; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->tx_attr->tclass = FI_TC_LOW_LATENCY; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rma_bw.c b/fabtests/benchmarks/rma_bw.c index 8247dd79a0c..7d0764f5267 100644 --- a/fabtests/benchmarks/rma_bw.c +++ b/fabtests/benchmarks/rma_bw.c @@ -93,7 +93,7 @@ int main(int argc, char **argv) hints->caps = FI_MSG | FI_RMA; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->threading = FI_THREAD_DOMAIN; hints->addr_format = opts.address_format; diff --git a/fabtests/benchmarks/rma_pingpong.c b/fabtests/benchmarks/rma_pingpong.c index 76742f27d3a..07e9564e9b8 100644 --- a/fabtests/benchmarks/rma_pingpong.c +++ b/fabtests/benchmarks/rma_pingpong.c @@ -93,7 +93,7 @@ int main(int argc, char **argv) hints->caps = FI_MSG | FI_RMA | FI_WRITE | FI_REMOTE_WRITE; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; - hints->mode = FI_CONTEXT; + hints->mode = 
FI_CONTEXT | FI_CONTEXT2; hints->addr_format = opts.address_format; while ((op = getopt_long(argc, argv, "Uh" CS_OPTS INFO_OPTS API_OPTS diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 4d88d2d5f3c..dc99eb574a0 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -73,7 +73,7 @@ struct fid_eq *eq; struct fid_mc *mc; struct fid_mr no_mr; -struct fi_context tx_ctx, rx_ctx; +struct fi_context2 tx_ctx, rx_ctx; struct ft_context *tx_ctx_arr = NULL, *rx_ctx_arr = NULL; uint64_t remote_cq_data = 0; @@ -3231,7 +3231,7 @@ int ft_wait_child(void) int ft_finalize_ep(struct fid_ep *ep) { int ret; - struct fi_context ctx; + struct fi_context2 ctx; ret = ft_sendmsg(ep, remote_fi_addr, tx_buf, 4, &ctx, FI_TRANSMIT_COMPLETE); if (ret) diff --git a/fabtests/functional/av_xfer.c b/fabtests/functional/av_xfer.c index d6e341b4381..cb3a6491993 100644 --- a/fabtests/functional/av_xfer.c +++ b/fabtests/functional/av_xfer.c @@ -235,7 +235,7 @@ int main(int argc, char **argv) hints->caps = hints->ep_attr->type == FI_EP_RDM ? 
FI_TAGGED : FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; base_hints = hints; diff --git a/fabtests/functional/cq_data.c b/fabtests/functional/cq_data.c index ca35ccb06d3..b700778a815 100644 --- a/fabtests/functional/cq_data.c +++ b/fabtests/functional/cq_data.c @@ -164,7 +164,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->domain_attr->cq_data_size = 4; /* required minimum */ - hints->mode |= FI_CONTEXT | FI_RX_CQ_DATA; + hints->mode |= FI_CONTEXT | FI_CONTEXT2 | FI_RX_CQ_DATA; hints->caps = FI_MSG; if (opts.cqdata_op == FT_CQDATA_WRITEDATA) diff --git a/fabtests/functional/dgram.c b/fabtests/functional/dgram.c index b9503cd65fc..47b77379d9a 100644 --- a/fabtests/functional/dgram.c +++ b/fabtests/functional/dgram.c @@ -74,7 +74,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_DGRAM; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/flood.c b/fabtests/functional/flood.c index f85f5274e75..dccb06076c1 100644 --- a/fabtests/functional/flood.c +++ b/fabtests/functional/flood.c @@ -283,7 +283,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/inject_test.c b/fabtests/functional/inject_test.c index 057682f7c4d..a826257153d 100644 --- a/fabtests/functional/inject_test.c +++ b/fabtests/functional/inject_test.c @@ -86,7 +86,7 @@ static int send_msg(int sendmsg, size_t size) static int receive_msg(size_t size) { int ret; - struct fi_context inj_ctx; + struct fi_context2 inj_ctx; ft_tag = 0xabcd; ret = ft_post_rx(ep, size, &inj_ctx); @@ 
-194,7 +194,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->ep_attr->type = FI_EP_RDM; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->caps = FI_TAGGED; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->domain_attr->mr_mode = opts.mr_mode; diff --git a/fabtests/functional/loopback.c b/fabtests/functional/loopback.c index f66e0604bcb..5031d71327e 100644 --- a/fabtests/functional/loopback.c +++ b/fabtests/functional/loopback.c @@ -90,7 +90,7 @@ int main(int argc, char **argv) opts.src_addr = "127.0.0.1"; hints->caps = FI_LOCAL_COMM | FI_MSG | FI_TAGGED; hints->ep_attr->type = FI_EP_RDM; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; while ((op = getopt(argc, argv, "h" INFO_OPTS)) != -1) { switch (op) { diff --git a/fabtests/functional/mcast.c b/fabtests/functional/mcast.c index 7a486c868b0..64c95ae9308 100644 --- a/fabtests/functional/mcast.c +++ b/fabtests/functional/mcast.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_DGRAM; hints->caps = FI_MSG | FI_MULTICAST; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index 2bddb8b9b1c..836df9450e0 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -53,8 +53,8 @@ static char **send_bufs, **recv_bufs; static struct fid_mr **send_mrs, **recv_mrs; static void **send_descs, **recv_descs; static struct fi_rma_iov *peer_iovs; -static struct fi_context *recv_ctx; -static struct fi_context *send_ctx; +static struct fi_context2 *recv_ctx; +static struct fi_context2 *send_ctx; static struct fid_cq **txcqs, **rxcqs; static struct fid_av **avs; static fi_addr_t *remote_fiaddr; @@ -657,7 +657,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_MSG | FI_RMA; - hints->mode = 
FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/multi_mr.c b/fabtests/functional/multi_mr.c index bc0e5521319..fefebc4fabc 100644 --- a/fabtests/functional/multi_mr.c +++ b/fabtests/functional/multi_mr.c @@ -178,7 +178,7 @@ static int init_multi_mr_res(void) static int mr_key_test() { int i, ret = 0; - struct fi_context rma_ctx; + struct fi_context2 rma_ctx; for (i = 0; i < mr_count; i++) { tx_buf = (char *)mr_res_array[i].buf; @@ -319,7 +319,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_RMA | FI_RMA_EVENT | FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/multi_recv.c b/fabtests/functional/multi_recv.c index 58eac21b951..672bb15c727 100644 --- a/fabtests/functional/multi_recv.c +++ b/fabtests/functional/multi_recv.c @@ -41,7 +41,7 @@ #define MAX_XFER_SIZE (1 << 20) static struct fid_mr *mr_multi_recv; -struct fi_context ctx_multi_recv[2]; +struct fi_context2 ctx_multi_recv[2]; static int use_recvmsg, comp_per_buf; @@ -324,7 +324,7 @@ int main(int argc, char **argv) opts.min_multi_recv_size = opts.transfer_size; hints->caps = FI_MSG | FI_MULTI_RECV; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->rx_attr->op_flags = FI_MULTI_RECV; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm.c b/fabtests/functional/rdm.c index a887b70d418..666ca13c671 100644 --- a/fabtests/functional/rdm.c +++ b/fabtests/functional/rdm.c @@ -94,7 +94,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = 
opts.address_format; diff --git a/fabtests/functional/rdm_atomic.c b/fabtests/functional/rdm_atomic.c index b329c6a66e7..915dba884d1 100644 --- a/fabtests/functional/rdm_atomic.c +++ b/fabtests/functional/rdm_atomic.c @@ -44,7 +44,7 @@ static void *cpy_dst; static struct fid_mr *mr_result; static struct fid_mr *mr_compare; -static struct fi_context fi_ctx_atomic; +static struct fi_context2 fi_ctx_atomic; static enum fi_datatype datatype; static int run_all_ops = 1, run_all_datatypes = 1; @@ -591,7 +591,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_ATOMICS; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; ret = run(); diff --git a/fabtests/functional/rdm_deferred_wq.c b/fabtests/functional/rdm_deferred_wq.c index 7526c709861..0f780bee56e 100644 --- a/fabtests/functional/rdm_deferred_wq.c +++ b/fabtests/functional/rdm_deferred_wq.c @@ -633,7 +633,7 @@ int main(int argc, char **argv) tested_op == FI_OP_COMPARE_ATOMIC) hints->caps |= FI_ATOMIC; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_multi_client.c b/fabtests/functional/rdm_multi_client.c index 332a19989f6..f3e5a5cd4dd 100644 --- a/fabtests/functional/rdm_multi_client.c +++ b/fabtests/functional/rdm_multi_client.c @@ -222,7 +222,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_multi_domain.c b/fabtests/functional/rdm_multi_domain.c index 71e0848eebf..14a8ebb440d 100644 --- a/fabtests/functional/rdm_multi_domain.c +++ b/fabtests/functional/rdm_multi_domain.c @@ -55,7 +55,7 @@ struct test_domain { struct fid_av *av; struct 
fid_mr *mr; struct fid_cq *tx_cq; - struct fi_context *rma_ctx; + struct fi_context2 *rma_ctx; }; struct test_domain *domain_res_array; @@ -274,7 +274,7 @@ static void free_domain_res() } static int write_data(void *buffer, size_t size, int dom_idx, - int remote_dom_idx, struct fi_context *rma_ctx) + int remote_dom_idx, struct fi_context2 *rma_ctx) { int ret = -FI_EAGAIN; @@ -427,7 +427,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_RMA | FI_RMA_EVENT | FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_rma_event.c b/fabtests/functional/rdm_rma_event.c index 8aaec557771..cb6786c2294 100644 --- a/fabtests/functional/rdm_rma_event.c +++ b/fabtests/functional/rdm_rma_event.c @@ -39,8 +39,8 @@ struct fi_rma_iov local; -struct fi_context fi_ctx_write; -struct fi_context fi_ctx_read; +struct fi_context2 fi_ctx_write; +struct fi_context2 fi_ctx_read; static int run_test(void) { @@ -126,7 +126,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_RMA | FI_RMA_EVENT; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_rma_trigger.c b/fabtests/functional/rdm_rma_trigger.c index d08191c7cdf..0fdba0c330c 100644 --- a/fabtests/functional/rdm_rma_trigger.c +++ b/fabtests/functional/rdm_rma_trigger.c @@ -154,7 +154,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_RMA | FI_RMA_EVENT | FI_TRIGGER; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_shared_av.c b/fabtests/functional/rdm_shared_av.c index b113f3354be..ce9d6b8c85b 
100644 --- a/fabtests/functional/rdm_shared_av.c +++ b/fabtests/functional/rdm_shared_av.c @@ -189,7 +189,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_SHARED_AV; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/rdm_tagged_peek.c b/fabtests/functional/rdm_tagged_peek.c index c583d37013b..1cce508f3e3 100644 --- a/fabtests/functional/rdm_tagged_peek.c +++ b/fabtests/functional/rdm_tagged_peek.c @@ -42,7 +42,7 @@ #define BASE_TAG 0x900d #define SEND_CNT 10 -static struct fi_context fi_context; +static struct fi_context2 fi_context; static int wait_for_send_comp(int count) { @@ -355,7 +355,7 @@ int main(int argc, char **argv) hints->rx_attr->msg_order = FI_ORDER_SAS; hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_TAGGED; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/recv_cancel.c b/fabtests/functional/recv_cancel.c index d943f7d8259..376cb6f6076 100644 --- a/fabtests/functional/recv_cancel.c +++ b/fabtests/functional/recv_cancel.c @@ -76,7 +76,7 @@ static int recv_cancel_host(void) int ret = 0; int retries = 0; struct fi_cq_err_entry recv_completion, cancel_error_entry; - struct fi_context cancel_recv_ctx, standard_recv_ctx; + struct fi_context2 cancel_recv_ctx, standard_recv_ctx; memset(&cancel_error_entry, 0, sizeof(cancel_error_entry)); @@ -246,7 +246,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_TAGGED; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/resmgmt_test.c b/fabtests/functional/resmgmt_test.c index ef2f3565e82..e27073e8e25 100644 --- 
a/fabtests/functional/resmgmt_test.c +++ b/fabtests/functional/resmgmt_test.c @@ -47,7 +47,7 @@ int delay, tagged; static int send_loop(size_t size) { int q_opts = 0; int ret; - struct fi_context send_ctx[max_opts]; + struct fi_context2 send_ctx[max_opts]; while (q_opts < max_opts) { do { @@ -91,7 +91,7 @@ static int receive_loop(size_t size) { int ret; int q_opts = 0; - struct fi_context recv_ctx[max_opts]; + struct fi_context2 recv_ctx[max_opts]; while (q_opts < max_opts) { do { @@ -262,7 +262,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/shared_ctx.c b/fabtests/functional/shared_ctx.c index 016a56e87fc..52b8711f75e 100644 --- a/fabtests/functional/shared_ctx.c +++ b/fabtests/functional/shared_ctx.c @@ -613,7 +613,7 @@ int main(int argc, char **argv) if (!(hints->caps & FI_TAGGED)) hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/functional/unexpected_msg.c b/fabtests/functional/unexpected_msg.c index f01a029bdf1..70921913178 100644 --- a/fabtests/functional/unexpected_msg.c +++ b/fabtests/functional/unexpected_msg.c @@ -369,7 +369,7 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = argv[optind]; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->caps = FI_TAGGED; diff --git a/fabtests/functional/unmap_mem.c b/fabtests/functional/unmap_mem.c index 5ba36581da4..01de49dd5a0 100644 --- a/fabtests/functional/unmap_mem.c +++ b/fabtests/functional/unmap_mem.c @@ -170,7 +170,7 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = 
argv[optind]; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->caps = FI_MSG; hints->domain_attr->mr_mode = opts.mr_mode; diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index e4dc4b3c643..b57a7dab0ed 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -251,7 +251,7 @@ extern size_t buf_size, tx_size, rx_size, tx_mr_size, rx_mr_size; extern int tx_fd, rx_fd; extern int timeout; -extern struct fi_context tx_ctx, rx_ctx; +extern struct fi_context2 tx_ctx, rx_ctx; extern uint64_t remote_cq_data; extern uint64_t tx_seq, rx_seq, tx_cq_cntr, rx_cq_cntr; diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index bac6c1b3c4c..2cbfab0e5bc 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -339,7 +339,7 @@ The following keys and respective key values may be used in the config file. FI_WRITE, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_TAGGED, FI_DIRECTED_RECV *mode - values OR'ed together* -: FI_CONTEXT, FI_RX_CQ_DATA +: FI_CONTEXT, FI_CONTEXT2, FI_RX_CQ_DATA *ep_type* : FI_EP_MSG, FI_EP_DGRAM, FI_EP_RDM diff --git a/fabtests/multinode/src/core.c b/fabtests/multinode/src/core.c index dbc21cc42e6..d1770fe7a53 100644 --- a/fabtests/multinode/src/core.c +++ b/fabtests/multinode/src/core.c @@ -87,7 +87,7 @@ static int multi_setup_fabric(int argc, char **argv) struct fi_rma_iov remote; hints->ep_attr->type = FI_EP_RDM; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; if (pm_job.transfer_method == multi_msg) { diff --git a/fabtests/multinode/src/core_coll.c b/fabtests/multinode/src/core_coll.c index 7d6b5ddfcf1..d9fe4dc0d09 100644 --- a/fabtests/multinode/src/core_coll.c +++ b/fabtests/multinode/src/core_coll.c @@ -524,7 +524,7 @@ static inline void setup_hints(void) { hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG | FI_COLLECTIVE; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; 
hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; } diff --git a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c index ca38195e48d..72da2063d7a 100644 --- a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c +++ b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; hints->addr_format = opts.address_format; diff --git a/fabtests/prov/efa/src/rdm_rnr_queue_resend.c b/fabtests/prov/efa/src/rdm_rnr_queue_resend.c index 9a8889ca4cf..31139165d45 100644 --- a/fabtests/prov/efa/src/rdm_rnr_queue_resend.c +++ b/fabtests/prov/efa/src/rdm_rnr_queue_resend.c @@ -146,7 +146,7 @@ static int trigger_rnr_queue_resend(enum fi_op atomic_op, void *result, void *co struct fid_mr *mr_result, struct fid_mr *mr_compare) { int i, ret; - struct fi_context fi_ctx_atomic; + struct fi_context2 fi_ctx_atomic; if (opts.rma_op) { for (i = 0; i < global_expected_rnr_error; i++) { @@ -434,7 +434,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps |= FI_MSG | FI_RMA | FI_ATOMICS; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; /* FI_RM_ENABLED to is required for queue/resend logic to happen in RNR case */ diff --git a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c index 4c7edf2886c..85e0e67db1d 100644 --- a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c +++ b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c @@ -176,7 +176,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_MSG; - hints->mode |= FI_CONTEXT; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = opts.mr_mode; /* FI_RM_DISABLED is required to 
get RNR error CQ entry */ diff --git a/fabtests/regression/sighandler_test.c b/fabtests/regression/sighandler_test.c index 84cf532fd4a..dc54fd98a2c 100644 --- a/fabtests/regression/sighandler_test.c +++ b/fabtests/regression/sighandler_test.c @@ -80,7 +80,7 @@ int main(int argc, char **argv) } } hints->caps = FI_MSG; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; if (ft_init_fabric()) { ft_freehints(hints); exit(EXIT_FAILURE); diff --git a/fabtests/unit/getinfo_test.c b/fabtests/unit/getinfo_test.c index 3c3060b4810..37518a94b92 100644 --- a/fabtests/unit/getinfo_test.c +++ b/fabtests/unit/getinfo_test.c @@ -645,7 +645,7 @@ static int test_caps_regression(char *node, char *service, uint64_t flags, /* Limit mode bits to common, older options only */ hints->caps |= fi->caps; - hints->mode = FI_CONTEXT; + hints->mode = FI_CONTEXT | FI_CONTEXT2; hints->domain_attr->mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP; fi_freeinfo(*info); From 427ab3fb322da9e75ca54a83817f2bff929397f1 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Thu, 23 Jan 2025 22:25:34 +0000 Subject: [PATCH 384/393] prov/efa: Use cuda_put_dmabuf_fd Use cuda_put_dmabuf_fd to close fd Signed-off-by: Shi Jin --- prov/efa/src/efa_hmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 996d171d95b..18dba70ca2c 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -129,7 +129,7 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in if (ret == FI_SUCCESS) { ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset, len, (uint64_t)ptr, dmabuf_fd, ibv_access); - (void)close(dmabuf_fd); + (void)cuda_put_dmabuf_fd(dmabuf_fd); if (!ibv_mr) { EFA_INFO(FI_LOG_CORE, "Unable to register CUDA device buffer via dmabuf: %s. 
" From 2e8ef33a633b1f2b2c1b8dd0cb1b3a6696ef20d0 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 23 Jan 2025 11:40:31 -0800 Subject: [PATCH 385/393] contrib/intel/jenkins: Add --send-mail for new ci summary Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 7c956d50a79..7385ad5ae91 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -120,18 +120,19 @@ def gather_logs(cluster, key, dest, source) { } def CI_summarize(verbose=false) { + cmd = """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python ${CI_LOCATION}/summarize.py \ + --log_directory=${env.LOG_DIR} + """ if (verbose) { - sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ - python ${CI_LOCATION}/summarize.py \ - --log_directory=${env.LOG_DIR} \ - -v - """ - } else { - sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ - python ${CI_LOCATION}/summarize.py \ - --log_directory=${env.LOG_DIR} - """ + cmd = "${cmd} -v" } + + if (weekly || RELEASE) { + cmd = "${cmd} --send-mail" + } + + sh "${cmd}" } def summarize(item, verbose=false, release=false, send_mail=false) { From b280fc04ffb1d909962754f5ec464b2b02c70289 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Wed, 22 Jan 2025 14:55:09 -0800 Subject: [PATCH 386/393] fabtests/efa: add rdma check for unsolicited write recv Signed-off-by: Jessie Yang --- fabtests/prov/efa/configure.m4 | 5 ++++ fabtests/prov/efa/src/efa_rdma_checker.c | 31 ++++++++++++++++++------ fabtests/pytest/efa/efa_common.py | 2 +- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/fabtests/prov/efa/configure.m4 b/fabtests/prov/efa/configure.m4 index bf0f3b624e9..b8252b209f5 100644 --- a/fabtests/prov/efa/configure.m4 +++ b/fabtests/prov/efa/configure.m4 @@ -28,5 +28,10 @@ AS_IF([test x"$have_efadv" = x"1"], [ [], 
[efa_rdma_checker_happy=0], [[#include ]]) + + AC_CHECK_DECL(EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV, + [], + [efa_rdma_checker_happy=0], + [[#include ]]) ]) AM_CONDITIONAL([BUILD_EFA_RDMA_CHECKER], [test $efa_rdma_checker_happy -eq 1]) diff --git a/fabtests/prov/efa/src/efa_rdma_checker.c b/fabtests/prov/efa/src/efa_rdma_checker.c index b764bbba91b..9215cc63d91 100644 --- a/fabtests/prov/efa/src/efa_rdma_checker.c +++ b/fabtests/prov/efa/src/efa_rdma_checker.c @@ -14,6 +14,7 @@ enum rdma_op { READ, WRITE, + UNSOLICITED_WRITE_RECV, }; /* @@ -37,8 +38,10 @@ int main(int argc, char *argv[]) op = READ; } else if (!strcasecmp(optarg, "write")) { op = WRITE; + } else if (!strcasecmp(optarg, "writedata")) { + op = UNSOLICITED_WRITE_RECV; } else { - fprintf(stderr, "Unknown operation '%s. Allowed: read | write'\n", optarg); + fprintf(stderr, "Unknown operation '%s. Allowed: read | write | writedata '\n", optarg); return EXIT_FAILURE; } break; @@ -46,7 +49,7 @@ int main(int argc, char *argv[]) case 'h': default: fprintf(stderr, "Usage:\n"); - FT_PRINT_OPTS_USAGE("fi_efa_rdma_checker -o ", "rdma operation type: read|write"); + FT_PRINT_OPTS_USAGE("fi_efa_rdma_checker -o ", "rdma operation type: read | write | writedata"); return EXIT_FAILURE; } } @@ -86,11 +89,25 @@ int main(int argc, char *argv[]) if (op == READ) goto out; - if (efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE) { - fprintf(stdout, "rdma write is enabled \n"); - } else { - fprintf(stderr, "rdma write is NOT enabled \n"); - err = op == WRITE ? 
1 : 0; + if (op == WRITE) { + if (efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE) { + fprintf(stdout, "rdma write is enabled \n"); + } else { + fprintf(stderr, "rdma write is NOT enabled \n"); + err = 1; + } + goto out; + } + + if (op == UNSOLICITED_WRITE_RECV) { + if (efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV) { + fprintf(stdout, + "rdma unsolicited write recv is enabled \n"); + } else { + fprintf(stderr, "rdma unsolicited write recv is NOT " + "enabled \n"); + err = 1; + } } out: diff --git a/fabtests/pytest/efa/efa_common.py b/fabtests/pytest/efa/efa_common.py index d5f0eb959c0..6f5e311a97f 100644 --- a/fabtests/pytest/efa/efa_common.py +++ b/fabtests/pytest/efa/efa_common.py @@ -74,7 +74,7 @@ def has_rdma(cmdline_args, operation): operation: rdma operation name, allowed values are read and write return: a boolean """ - assert operation in ["read", "write"] + assert operation in ["read", "write", "writedata"] binpath = cmdline_args.binpath or "" cmd = "timeout " + str(cmdline_args.timeout) \ + " " + os.path.join(binpath, f"fi_efa_rdma_checker -o {operation}") From 9024dab3d9da87cf492c163b22deafcef0ff75ec Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Wed, 22 Jan 2025 20:49:52 -0600 Subject: [PATCH 387/393] prov/tcp: Fix FI_MULTI_RECV not set on error When multiple multi-recv buffers are posted, FI_MULTI_RECV would only be set on error if an mrecv entry was already created, meaning the buffer would have already been in-use. If the buffer has not been used yet and a cancelation for this buffer has been processed, correctly set FI_MULTI_RECV when reporting the error, indicating that the buffer is no longer in use. 
Signed-off-by: Jerome Soumagne --- prov/tcp/src/xnet_cq.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/prov/tcp/src/xnet_cq.c b/prov/tcp/src/xnet_cq.c index 03ea975371d..2090bdf7170 100644 --- a/prov/tcp/src/xnet_cq.c +++ b/prov/tcp/src/xnet_cq.c @@ -202,13 +202,15 @@ void xnet_report_error(struct xnet_xfer_entry *xfer_entry, int err) err_entry.flags = xfer_entry->cq_flags & ~FI_COMPLETION; if (err_entry.flags & FI_RECV) { - if (xfer_entry->ctrl_flags & XNET_MULTI_RECV && - xfer_entry->mrecv) { - xfer_entry->mrecv->ref_cnt--; - if (!xfer_entry->mrecv->ref_cnt) { + if (xfer_entry->ctrl_flags & XNET_MULTI_RECV) { + if (xfer_entry->mrecv) { + xfer_entry->mrecv->ref_cnt--; + if (!xfer_entry->mrecv->ref_cnt) { + err_entry.flags |= FI_MULTI_RECV; + free(xfer_entry->mrecv); + } + } else err_entry.flags |= FI_MULTI_RECV; - free(xfer_entry->mrecv); - } } xnet_get_cq_info(xfer_entry, &err_entry.flags, &err_entry.data, &err_entry.tag); From 68aaf62e675d542085adaf18e0523e1800b0ab4d Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Mon, 20 Jan 2025 14:00:25 +0100 Subject: [PATCH 388/393] prov/cxi: Add FI_CXI_CURL_LIB_PATH #define from autoconf This ensures that the libcurl dlopen path is correct If the user passes '--with-curl=' to configure, then the dlopen of libcurl should honor that selection and use the file path passed in Signed-off-by: John Biddiscombe --- prov/cxi/configure.m4 | 4 +++- prov/cxi/src/cxip_curl.c | 48 ++++++++++++++++++++++++++-------------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 index ec76590ca2e..cab843f47a2 100644 --- a/prov/cxi/configure.m4 +++ b/prov/cxi/configure.m4 @@ -35,8 +35,10 @@ AC_DEFUN([FI_CXI_CONFIGURE],[ [CPPFLAGS="-I$with_cxi_uapi_headers/include $CPPFLAGS"]) # Support non-standard install path for curl. This is needed by CXI provider. 
+ # Add #define of the path to the curl library for use in the code AC_ARG_WITH([curl], - [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])]) + [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])], + [AC_DEFINE_UNQUOTED([FI_CXI_CURL_LIB_PATH], ["$with_curl"], [Path to the curl install root])]) # Support non-standard install path for json-c. This is needed by CXI provider. AC_ARG_WITH([json-c], diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c index e849ebad806..289d2ea8198 100644 --- a/prov/cxi/src/cxip_curl.c +++ b/prov/cxi/src/cxip_curl.c @@ -189,28 +189,44 @@ int cxip_curl_load_symbols(void) if (cxip_curlhandle) return 0; + char *curl_libpath = NULL; + #ifdef FI_CXI_CURL_LIB_PATH + curl_libpath = strdup(FI_CXI_CURL_LIB_PATH "/%s/libcurl.so.%d"); + TRACE_CURL("FI_CXI_CURL_LIB_PATH set to '%s'\n", curl_libpath); + #else + curl_libpath = strdup("/usr/%s/libcurl.so.%d"); + #endif + /* Try to find latest usable version */ // TODO test earlier versions for (version = 4; version >= 4; version--) { - sprintf(libfile, "/usr/lib64/libcurl.so.%d", version); - libpath = realpath(libfile, NULL); - if (!libpath) { - TRACE_CURL("could not expand '%s'\n", libfile); - CXIP_INFO("could not expand '%s'\n", libfile); - continue; - } - TRACE_CURL("dlopen '%s'\n", libpath); - h = dlopen(libpath, RTLD_NOW); - if (!h) { - TRACE_CURL("%s not found\n", libpath); - CXIP_INFO("%s not found\n", libpath); + const char *lib_dirs[] = {"lib", "lib64"}; + for (int i = 0; i < 2; i++) { + sprintf(libfile, curl_libpath, lib_dirs[i], version); + TRACE_CURL("Checking libcurl at '%s'\n", libfile); + libpath = realpath(libfile, NULL); + if (!libpath) { + TRACE_CURL("could not expand '%s'\n", libfile); + CXIP_INFO("could not expand '%s'\n", libfile); + continue; + } + TRACE_CURL("dlopen '%s'\n", libpath); + h = dlopen(libpath, RTLD_NOW); + if (!h) { + TRACE_CURL("%s not found\n", libpath); + CXIP_INFO("%s not found\n", libpath); + free(libpath); + continue; + } 
+ TRACE_CURL("%s found\n", libpath); free(libpath); - continue; + break; + } + if (h) { + break; } - TRACE_CURL("%s found\n", libpath); - free(libpath); - break; } + free(curl_libpath); if (!h) { TRACE_CURL("libcurl not supported\n"); CXIP_WARN("libcurl not supported\n"); From 0e6f63f3ccb6f7ec180fab516135b2e6863c176f Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Wed, 22 Jan 2025 22:28:57 +0100 Subject: [PATCH 389/393] prov/cxi: Make string setup of FI_CXI_CURL_LIB_PATH safe Signed-off-by: John Biddiscombe --- prov/cxi/src/cxip_curl.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c index 289d2ea8198..f3954327651 100644 --- a/prov/cxi/src/cxip_curl.c +++ b/prov/cxi/src/cxip_curl.c @@ -180,7 +180,7 @@ struct curlfunc curlary[] = { int cxip_curl_load_symbols(void) { struct curlfunc *funcptr; - char libfile[256], *libpath; + char *libfile = NULL, *libpath; int version; int errcnt; void *h; @@ -202,12 +202,19 @@ int cxip_curl_load_symbols(void) for (version = 4; version >= 4; version--) { const char *lib_dirs[] = {"lib", "lib64"}; for (int i = 0; i < 2; i++) { - sprintf(libfile, curl_libpath, lib_dirs[i], version); + int len = snprintf(NULL, 0, curl_libpath, lib_dirs[i], version) + 1; + libfile = malloc(len); + if (!libfile) { + free(curl_libpath); + return -FI_ENOMEM; + } + snprintf(libfile, len, curl_libpath, lib_dirs[i], version); TRACE_CURL("Checking libcurl at '%s'\n", libfile); libpath = realpath(libfile, NULL); if (!libpath) { TRACE_CURL("could not expand '%s'\n", libfile); CXIP_INFO("could not expand '%s'\n", libfile); + free(libfile); continue; } TRACE_CURL("dlopen '%s'\n", libpath); @@ -216,10 +223,12 @@ int cxip_curl_load_symbols(void) TRACE_CURL("%s not found\n", libpath); CXIP_INFO("%s not found\n", libpath); free(libpath); + free(libfile); continue; } TRACE_CURL("%s found\n", libpath); free(libpath); + free(libfile); break; } if (h) { From 
10e61add573a2fe795ac9b0f4b18e58e9b55b12e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 17:51:30 +0000 Subject: [PATCH 390/393] build(deps): bump actions/stale from 9.0.0 to 9.1.0 Bumps [actions/stale](https://github.com/actions/stale) from 9.0.0 to 9.1.0. - [Release notes](https://github.com/actions/stale/releases) - [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/stale/compare/28ca1036281a5e5922ead5184a1bbf96e5fc984e...5bef64f19d7facfb25b37b414482c7164d639639) --- updated-dependencies: - dependency-name: actions/stale dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 9f8db3dcdc7..4c169cb8f64 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,7 @@ jobs: pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} days-before-stale: 360 From b73236665101e2508239344414d9f2930267aee3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 17:51:38 +0000 Subject: [PATCH 391/393] build(deps): bump github/codeql-action from 3.28.1 to 3.28.5 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.28.1 to 3.28.5. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/b6a472f63d85b9c78a3ac5e89422239fc15e9b3c...f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 6 +++--- .github/workflows/scorecard.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 426f0c4f60b..5979537be95 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/init@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -66,7 +66,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/autobuild@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 # â„šī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -79,6 +79,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/analyze@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 9450f4ca1c1..8fa7a945e07 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -68,6 +68,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + uses: github/codeql-action/upload-sarif@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5 with: sarif_file: results.sarif From f88f4b680a302902fe907f308e7440030449420a Mon Sep 17 00:00:00 2001 From: Stephen Oost Date: Wed, 9 Oct 2024 08:47:47 -0700 Subject: [PATCH 392/393] prov/tcp: only progress rx when connected we may receive uring events before we're fully connected so don't try to progress rx until that connection is established Signed-off-by: Stephen Oost --- prov/tcp/src/xnet_progress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prov/tcp/src/xnet_progress.c b/prov/tcp/src/xnet_progress.c index aa76968e175..41eaf8ef944 100644 --- a/prov/tcp/src/xnet_progress.c +++ b/prov/tcp/src/xnet_progress.c @@ -1236,7 +1236,8 @@ static void xnet_uring_rx_done(struct xnet_ep *ep, int res) else xnet_complete_rx(ep, FI_SUCCESS); } - xnet_progress_rx(ep); + if (ep->state == XNET_CONNECTED) + xnet_progress_rx(ep); return; disable_ep: From 11e570ab64f2f8d092fb7c85a4512e3fbb5ae26d Mon Sep 17 00:00:00 2001 From: Stephen Oost Date: Thu, 29 Aug 2024 07:16:59 -0700 Subject: [PATCH 393/393] prov/tcp: use readv2 when passing 
flags to io uring the previously used io_uring_prep_readv function does not support flags, instead flags were being passed as an offset, triggering an illegal seek error Signed-off-by: Stephen Oost --- src/iouring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/iouring.c b/src/iouring.c index ca0b91635ae..1a2f016e902 100644 --- a/src/iouring.c +++ b/src/iouring.c @@ -168,7 +168,10 @@ ssize_t ofi_sockapi_recvv_uring(struct ofi_sockapi *sockapi, SOCKET sock, if (!sqe) return -FI_EOVERFLOW; - io_uring_prep_readv(sqe, sock, iov, cnt, flags); + /* MSG_NOSIGNAL would return ENOTSUP with io_uring */ + flags &= ~MSG_NOSIGNAL; + + io_uring_prep_readv2(sqe, sock, iov, cnt, 0, flags); io_uring_sqe_set_data(sqe, ctx); ctx->uring_sqe_inuse = true; uring->credits--;