Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
7158381
replace ionic header files with ccqe version
QizhouZhang97 May 6, 2026
961c636
compatiable ccqe/non-ccqe
QizhouZhang97 May 6, 2026
b07315f
dlopen ionic_dv_create_cq_ex
QizhouZhang97 May 8, 2026
24ac86d
fix ccqe check logic
QizhouZhang97 May 8, 2026
538f42c
fix judge
QizhouZhang97 May 8, 2026
76efae7
fix print
QizhouZhang97 May 8, 2026
8387688
checkout host ccqe in runtime
QizhouZhang97 May 8, 2026
e81f47f
format
QizhouZhang97 May 8, 2026
ee8b8f2
remove defines
QizhouZhang97 May 8, 2026
a3ec281
retrigger CI
QizhouZhang97 May 9, 2026
b05ee30
restore ccqe defines
QizhouZhang97 May 9, 2026
96dde32
debug
QizhouZhang97 May 9, 2026
93ad1d7
dbg2
QizhouZhang97 May 9, 2026
b6bd5be
fix
QizhouZhang97 May 9, 2026
c37428c
remove printf
QizhouZhang97 May 9, 2026
ae4672d
remove print
QizhouZhang97 May 9, 2026
0084085
dump cmd
QizhouZhang97 May 9, 2026
b6ebc4f
update deect ccqe logic
QizhouZhang97 May 11, 2026
49199be
revise print
QizhouZhang97 May 11, 2026
1ee356b
revise host logic
QizhouZhang97 May 11, 2026
5dfbd7c
format
QizhouZhang97 May 11, 2026
87f5e2a
modify log
QizhouZhang97 May 11, 2026
5b6a686
host log
QizhouZhang97 May 11, 2026
127a139
ci
QizhouZhang97 May 11, 2026
93d600d
test ci
QizhouZhang97 May 11, 2026
6c456b0
fix parse of firmware minor version
May 18, 2026
30c9c19
fix ccqe poll & remove redundant code
QizhouZhang97 May 18, 2026
64fe670
fix local rdma example
May 18, 2026
603925c
format code
QizhouZhang97 May 19, 2026
3bb0ec6
use env var to control ccqe mode
QizhouZhang97 May 20, 2026
9f23058
judge env value
QizhouZhang97 May 20, 2026
ddea7f2
address jhaos comments
QizhouZhang97 May 21, 2026
7edc3b1
remove some code
QizhouZhang97 May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions examples/local_rdma_ops/atomic_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,24 @@ __device__ void SendThreadKernel(RdmaEndpoint& epSend, RdmaMemoryRegion sendMr,
}

__device__ void RecvThreadKernel(RdmaEndpoint& epRecv, RdmaMemoryRegion mr) {
uint32_t postIdx = 0;
uint32_t* addr = reinterpret_cast<uint32_t*>(mr.addr);
uint32_t val = core::AtomicLoadSeqCst(addr);
printf("val = %u\n", val);
while (val != 2) {

// Cross-block, lock-free observation: there is no sync between the send block
// (which issues CAS then FETCH_ADD) and this recv block, so by the time we
// start polling either both atomics or only CAS may have landed. Just wait
// until the value leaves its initial 0, and then until it reaches the final
// expected sum (CAS_swap + FETCH_ADD_value == 2 + 2 == 4).
while (val == 0) {
val = core::AtomicLoadSeqCst(addr);
printf("after compare and swap val = %u\n", val);
}
printf("after compare and swap val = %u\n", val);

while (val != 4) {
val = core::AtomicLoadSeqCst(addr);
printf("after fetch add val = %u\n", val);
}
printf("after fetch add val = %u\n", val);
}

__global__ void SendRecvOnGpu(RdmaEndpoint& epSend, RdmaEndpoint& epRecv, RdmaMemoryRegion mrSend,
Expand Down
25 changes: 19 additions & 6 deletions examples/local_rdma_ops/send_recv_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,17 @@ __device__ void SendThreadKernel(RdmaEndpoint& epSend, RdmaMemoryRegion mr, int
printf("RingDoorbell is done\n");
__threadfence_system();

int snd_opcode =
PollCq<P>(epSend.cqHandle.cqAddr, epSend.cqHandle.cqeNum, &epSend.cqHandle.consIdx);
printf("send PollCq is done\n");
// PSD 4-arg PollCq is non-blocking (returns -1 when the CCQE msg_msn isn't
// there yet); BNXT/MLX5 spin internally. Wrap in a busy-wait loop so this
// example works uniformly across all providers.
uint32_t snd_wqeIdx = 0;
int snd_opcode;
do {
snd_opcode = PollCq<P>(epSend.cqHandle.cqAddr, epSend.cqHandle.cqeNum,
&epSend.cqHandle.consIdx, &snd_wqeIdx);
} while (snd_opcode < 0);
epSend.cqHandle.consIdx += 1;
printf("send PollCq is done, wqeIdx %u\n", snd_wqeIdx);
UpdateCqDbrRecord<P>(epSend.cqHandle, epSend.cqHandle.consIdx);
printf("send UpdateCqDbrRecord is done\n");
// printf("snd_opcode %d val %d\n", snd_opcode, reinterpret_cast<char*>(mrSend.addr)[0]);
Expand Down Expand Up @@ -84,9 +92,14 @@ __device__ void RecvThreadKernel(RdmaEndpoint& epRecv, RdmaMemoryRegion mr, int
printf("recv RingDoorbell is done\n");
}

int rcv_opcode =
PollCq<P>(epRecv.cqHandle.cqAddr, epRecv.cqHandle.cqeNum, &epRecv.cqHandle.consIdx);
printf("recv PollCq is done\n");
uint32_t rcv_wqeIdx = 0;
int rcv_opcode;
do {
rcv_opcode = PollCq<P>(epRecv.cqHandle.cqAddr, epRecv.cqHandle.cqeNum,
&epRecv.cqHandle.consIdx, &rcv_wqeIdx);
} while (rcv_opcode < 0);
epRecv.cqHandle.consIdx += 1;
printf("recv PollCq is done, wqeIdx %u\n", rcv_wqeIdx);
UpdateCqDbrRecord<P>(epRecv.cqHandle, epRecv.cqHandle.consIdx);
printf("recv UpdateCqDbrRecord is done\n");

Expand Down
14 changes: 11 additions & 3 deletions examples/local_rdma_ops/write_inline_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,19 @@ __device__ void SendThreadKernel(RdmaEndpoint& epSend, RdmaMemoryRegion mr) {
RingDoorbell<P>(epSend.wqHandle.dbrAddr, dbr_val);
__threadfence_system();

int opcode =
PollCq<P>(epSend.cqHandle.cqAddr, epSend.cqHandle.cqeNum, &epSend.cqHandle.consIdx);
// PSD 4-arg PollCq is non-blocking (returns -1 when CQE / CCQE msg_msn not
// ready yet), while BNXT/MLX5 internally spin. Wrap in a busy-wait loop so
// this example works uniformly across all providers.
uint32_t wqeIdx = 0;
int opcode;
do {
opcode = PollCq<P>(epSend.cqHandle.cqAddr, epSend.cqHandle.cqeNum, &epSend.cqHandle.consIdx,
&wqeIdx);
} while (opcode < 0);
epSend.cqHandle.consIdx += 1;
__threadfence_system();
UpdateCqDbrRecord<P>(epSend.cqHandle, epSend.cqHandle.consIdx);
// printf("round %d snd_opcode %d\n", i, opcode);
// printf("round %d snd_opcode %d wqeIdx %u\n", i, opcode, wqeIdx);

raddr += i;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ struct IonicDvApi {
using pd_set_sqcmb_t = int (*)(struct ibv_pd*, bool, bool, bool);
using pd_set_rqcmb_t = int (*)(struct ibv_pd*, bool, bool, bool);
using pd_set_udma_mask_t = int (*)(struct ibv_pd*, uint32_t);
using create_cq_ex_t = struct ibv_cq_ex* (*)(struct ibv_context*, struct ibv_cq_init_attr_ex*,
struct ionic_cq_init_attr_ex*);

get_ctx_t get_ctx = nullptr;
qp_get_udma_idx_t qp_get_udma_idx = nullptr;
Expand All @@ -189,6 +191,7 @@ struct IonicDvApi {
pd_set_sqcmb_t pd_set_sqcmb = nullptr;
pd_set_rqcmb_t pd_set_rqcmb = nullptr;
pd_set_udma_mask_t pd_set_udma_mask = nullptr;
create_cq_ex_t create_cq_ex = nullptr;

void* handle = nullptr;

Expand All @@ -203,7 +206,9 @@ struct IonicDvApi {
pd_set_sqcmb = (pd_set_sqcmb_t)DvLoadSymbol(handle, "ionic_dv_pd_set_sqcmb");
pd_set_rqcmb = (pd_set_rqcmb_t)DvLoadSymbol(handle, "ionic_dv_pd_set_rqcmb");
pd_set_udma_mask = (pd_set_udma_mask_t)DvLoadSymbol(handle, "ionic_dv_pd_set_udma_mask");
create_cq_ex = (create_cq_ex_t)DvLoadSymbol(handle, "ionic_dv_create_cq_ex");

// create_cq_ex is optional: nullptr means CCQE not supported by this driver version
return get_ctx && qp_get_udma_idx && get_cq && get_qp && pd_set_sqcmb && pd_set_rqcmb &&
pd_set_udma_mask;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ namespace core {

#define QUEUE_SIZE 1
#define MAX_INLINE_SIZE 32
// #define IONIC_CCQE 1
#undef IONIC_CCQE

} // namespace core
} // namespace mori
Loading
Loading