Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ NIC_PCI=0000:17:00.0
# MAC of enp23s0f0np0 of the other node (for dpdk to assign dest MAC when inter-node)
# to check, unbind NIC from DPDK, then ip l
OTHER_NODE_MAC=40:a6:b7:c3:4b:78
TSO=0

# c6525-25g nodes (Mellanox NICs don't need to be bound to vfio-pci)
# NIC=enp65s0f0np0
Expand Down
2 changes: 1 addition & 1 deletion build_and_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,5 @@ sudo ./build/umanet \
--eth-rx-cores $ETH_RX_CORES --eth-tx-cores $ETH_TX_CORES \
--eth-rx-queues $ETH_RX_QUEUES --eth-tx-queues $ETH_TX_QUEUES \
--vhost-rx-cores $VHOST_RX_CORES --vhost-tx-cores $VHOST_TX_CORES \
--show-dash $SHOW_DASH \
--show-dash $SHOW_DASH --tso $TSO \
> switch.log 2>&1
1 change: 1 addition & 0 deletions env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ set +a
: "${ETH_TX_CORES:?missing ETH_TX_CORES}"
: "${VHOST_RX_CORES:?missing VHOST_RX_CORES}"
: "${VHOST_TX_CORES:?missing VHOST_TX_CORES}"
: "${TSO:?missing TSO}"

MAX_VM_COUNT=64

Expand Down
2 changes: 1 addition & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ sudo ./build/umanet \
--eth-rx-cores $ETH_RX_CORES --eth-tx-cores $ETH_TX_CORES \
--eth-rx-queues $ETH_RX_QUEUES --eth-tx-queues $ETH_TX_QUEUES \
--vhost-rx-cores $VHOST_RX_CORES --vhost-tx-cores $VHOST_TX_CORES \
--show-dash $SHOW_DASH \
--show-dash $SHOW_DASH --tso $TSO \
> switch.log 2>&1
3 changes: 2 additions & 1 deletion src/fast/eth_rx.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ void eth_rx_loop(struct eth_rx_ctx *ctx) {
// NIC to VM path: clear offload flags and recalculate checksums
// Packets from NIC may have pseudo-checksums from sender's TX offload
// Virtio requires valid checksums in packet data, not offloaded
fix_cksum(m);
// uncomment this to enable TSO
// fix_cksum(m);

vm_pkts[dst_vid][vm_cnt[dst_vid]++] = m;
if (vid_seen_mask & (1ULL << dst_vid))
Expand Down
3 changes: 2 additions & 1 deletion src/fast/eth_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ static inline int network_send(struct eth_tx_ctx *ctx, int tx_queue_id, unsigned
// }
// num = gro_num;

pkts_set_tso_flags(pkts, num);
// uncomment this to enable TSO
// pkts_set_tso_flags(pkts, num);
int16_t ret = rte_eth_tx_burst(global->eth_port_id, tx_queue_id, pkts, num);
if (ret < 0)
ret = 0;
Expand Down
20 changes: 19 additions & 1 deletion src/include/fastpath.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
#ifndef FASTPATH_H_
#define FASTPATH_H_

#include "log.h"
#include <rte_ether.h>
#include <rte_icmp.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
Expand Down Expand Up @@ -70,7 +72,7 @@

#define GRO_MAX_FLOWS 2048
#define GRO_MAX_ITEMS_PER_FLOW 32
#define PKT_MTU 9000
#define PKT_MTU 1500

// tells NIC to segment TCP packets into smaller segments
static inline void pkts_set_tso_flags(struct rte_mbuf **pkts, unsigned num) {
Expand Down Expand Up @@ -113,6 +115,22 @@ static inline void pkts_set_tso_flags(struct rte_mbuf **pkts, unsigned num) {
} else if (ip->next_proto_id == IPPROTO_UDP) {
// UDP over IPv4 — only compute checksums, no TSO
m->ol_flags |= RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_UDP_CKSUM;
} else if (ip->next_proto_id == IPPROTO_ICMP) {
// m->ol_flags |= RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM;
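// No NIC offload exists for ICMP: clear the TX offload flags and compute the IPv4 and ICMP checksums in software.
// NOTE(review): rte_raw_cksum() returns the folded sum without the final one's complement, so the ICMP
// checksum stored below probably needs a `~` (compare rte_ipv4_cksum) — confirm with tcpdump -vv.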
m->ol_flags &= ~(RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_TCP_CKSUM |
RTE_MBUF_F_TX_UDP_CKSUM | RTE_MBUF_F_TX_TCP_SEG);

ip->hdr_checksum = 0;
ip->hdr_checksum = rte_ipv4_cksum(ip);

struct rte_icmp_hdr *icmp = (struct rte_icmp_hdr *)((uint8_t *)ip + m->l3_len);

uint16_t icmp_len = rte_be_to_cpu_16(ip->total_length) - m->l3_len;

icmp->icmp_cksum = 0;
icmp->icmp_cksum = rte_raw_cksum(icmp, icmp_len);
icmp->icmp_cksum = __rte_raw_cksum_reduce(icmp->icmp_cksum);
} else {
// Other IPv4 protocols — just IPv4 checksum
m->ol_flags |= RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM;
Expand Down
43 changes: 25 additions & 18 deletions src/network/network.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,12 @@ static struct rte_eth_conf port_conf = {
.rxmode =
{
.mq_mode = RTE_ETH_MQ_RX_RSS,
.offloads = RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_RSS_HASH,
.offloads = 0,
},
.txmode =
{
.mq_mode = RTE_ETH_MQ_TX_NONE,
.offloads = RTE_ETH_TX_OFFLOAD_TCP_TSO | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
RTE_ETH_TX_OFFLOAD_MULTI_SEGS,
.offloads = 0,
},
.rx_adv_conf =
{
Expand Down Expand Up @@ -112,18 +111,24 @@ int network_init() {
rte_eth_macaddr_get(global->eth_port_id, &global->eth_addr);
rte_eth_dev_info_get(global->eth_port_id, &eth_devinfo);

uint64_t rx_offloads = 0;

// Check if NIC supports these features
if (eth_devinfo.rx_offload_capa & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM)
rx_offloads |= RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
if (eth_devinfo.rx_offload_capa & RTE_ETH_RX_OFFLOAD_TCP_CKSUM)
rx_offloads |= RTE_ETH_RX_OFFLOAD_TCP_CKSUM;
port_conf.rxmode.offloads = rx_offloads;

// TSO
eth_devinfo.default_txconf.offloads = port_conf.txmode.offloads;
eth_devinfo.default_rxconf.offloads = port_conf.rxmode.offloads;
if (config.enable_tso) {
port_conf.txmode.offloads = RTE_ETH_TX_OFFLOAD_TCP_TSO | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
RTE_ETH_TX_OFFLOAD_TCP_CKSUM | RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
port_conf.rxmode.offloads =
RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_RSS_HASH;
uint64_t rx_offloads = 0;

// Check if NIC supports these features
if (eth_devinfo.rx_offload_capa & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM)
rx_offloads |= RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
if (eth_devinfo.rx_offload_capa & RTE_ETH_RX_OFFLOAD_TCP_CKSUM)
rx_offloads |= RTE_ETH_RX_OFFLOAD_TCP_CKSUM;
port_conf.rxmode.offloads = rx_offloads;

// TSO
eth_devinfo.default_txconf.offloads = port_conf.txmode.offloads;
eth_devinfo.default_rxconf.offloads = port_conf.rxmode.offloads;
}

if (eth_devinfo.max_rx_queues < config.eth_rx_cores || eth_devinfo.max_tx_queues < config.eth_tx_cores) {
LOG_ERROR("Error: NIC does not support enough hw queues (rx=%u tx=%u)"
Expand Down Expand Up @@ -167,10 +172,12 @@ int network_init() {
goto error_exit;
}

// eth_devinfo.default_rxconf.offloads = 0;
if (!config.enable_tso) {
eth_devinfo.default_rxconf.offloads = 0;

/* enable per-queue checksum offload if requested */
// eth_devinfo.default_txconf.offloads = 0;
/* enable per-queue checksum offload if requested */
eth_devinfo.default_txconf.offloads = 0;
}
// if (config.fp_xsumoffload) {
// uint64_t requested_offloads = RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
// /* mask unsupported TX offloads (use same mask as port-level) */
Expand Down
12 changes: 6 additions & 6 deletions src/slow/arp.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#include <rte_vhost.h>

// only does ARP destined for dataplane, not VMs
int process_arp_req(struct control_ctx *ctx, uint16_t vid, struct rte_mbuf *m, enum slow_src src) {
int process_arp_req(struct control_ctx *ctx, uint16_t vid, uint16_t eth_queue_id, struct rte_mbuf *m,
enum slow_src src) {
struct rte_ether_hdr *eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
struct rte_arp_hdr *arp = (struct rte_arp_hdr *)(eth + 1);
if (arp->arp_opcode != rte_cpu_to_be_16(RTE_ARP_OP_REQUEST)) {
Expand Down Expand Up @@ -47,12 +48,11 @@ int process_arp_req(struct control_ctx *ctx, uint16_t vid, struct rte_mbuf *m, e
int ret = rte_ring_enqueue_burst(global->vhost_tx_rings[vid], (void **)&m, 1, NULL);
if (unlikely(ret == 0))
LOG_WARN("[%d] Failed to enqueue ARP reply to vid=%d\n", ctx->core_id, vid);
} else if (src == SLOW_SRC_ETH) {
int ret = rte_ring_enqueue_burst(global->eth_tx_queue_rings[eth_queue_id], (void **)&m, 1, NULL);
if (unlikely(ret == 0))
LOG_WARN("[%d] Failed to enqueue ARP reply to eth_queue_id=%d\n", ctx->core_id, eth_queue_id);
}
// else if (src == SLOW_SRC_ETH) { // handled in from_eth instead
// // int ret = network_send(ctx, 1, &m);
// if (unlikely(ret == 0))
// LOG_WARN("[%d] Failed to send ARP reply to physical NIC\n", ctx->core_id);
// }

rte_pktmbuf_free(m);

Expand Down
2 changes: 1 addition & 1 deletion src/slow/slowpath.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void slowpath_loop(struct control_ctx *ctx) {
LOG_ERROR("vdev_list->vdevs[%d] is NULL for ARP request\n", slow_msg->vid);
continue;
}
process_arp_req(ctx, slow_msg->vid, slow_msg->mbuf, slow_msg->src);
process_arp_req(ctx, slow_msg->vid, slow_msg->eth_queue_id, slow_msg->mbuf, slow_msg->src);
break;

default:
Expand Down
3 changes: 2 additions & 1 deletion src/slow/slowpath.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ struct slow_msg {
struct rte_mbuf *mbuf;
};

int process_arp_req(struct control_ctx *ctx, uint16_t vid, struct rte_mbuf *m, enum slow_src src);
int process_arp_req(struct control_ctx *ctx, uint16_t vid, uint16_t eth_queue_id, struct rte_mbuf *m,
enum slow_src src);
void slowpath_loop(struct control_ctx *ctx);

void control_tty_init();
Expand Down
125 changes: 65 additions & 60 deletions src/vhost/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,66 +257,71 @@ int register_vhost_drivers() {
continue;
}

// flags describe what the backend (you) and the guest agree on
// GUEST_TSO enabled - beneficial for traffic going through physical NIC with hardware TSO offload
uint64_t features = (1ULL << VIRTIO_NET_F_MTU) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
(1ULL << VIRTIO_NET_F_CTRL_VQ) | (1ULL << VIRTIO_NET_F_CSUM) |
(1ULL << VIRTIO_NET_F_GUEST_CSUM) | (1ULL << VIRTIO_NET_F_GUEST_UFO) |
// (1ULL << VIRTIO_NET_F_GUEST_TSO4) | (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
(1ULL << VIRTIO_NET_F_HOST_TSO4) | (1ULL << VIRTIO_NET_F_HOST_TSO6);

rte_vhost_driver_enable_features(file, features);

// Allow the guest to send TSO packets to the host.
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO4);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO6);

// Allow the host (your DPDK app) to send TSO packets to the guest.
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO4);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO6);

// if (config.mergeable == 0) {
// }
// Allows the host to place one large packet across multiple guest RX buffers
// 1 packet → RX buf 0 + RX buf 1 + RX buf 2
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_MRG_RXBUF);

// if (config.enable_tx_csum == 0) {
// }
// guests won’t compute checksums, host will do it
// guest sends packets with checksum fields = 0, host fills them later
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_CSUM);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_CSUM);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_UFO);
// //
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_ECN);

// if (config.enable_tso == 0) {
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO4);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO6);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO4);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO6);
// // }

// - RTE_VHOST_USER_EXTBUF_SUPPORT (enables external buffer mbufs)
// - RTE_VHOST_USER_LINEARBUF_SUPPORT (required for external buffers)
// - VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD (required for zero copy tracking)
// uint64_t protocol_features = 0;
// if (rte_vhost_driver_get_protocol_features(file, &protocol_features) == 0) {
// protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD);
// if (rte_vhost_driver_set_protocol_features(file, protocol_features) != 0) {
// LOG_WARN("Failed to set INFLIGHT_SHMFD protocol feature for %s (zero copy may not work)\n", file);
// } else {
// LOG_INFO("Enabled INFLIGHT_SHMFD protocol feature for zero copy support (features: 0x%lx)\n",
// protocol_features);
// }
// } else {
// LOG_WARN("Failed to get protocol features for %s, trying to set INFLIGHT_SHMFD directly\n", file);
// protocol_features = (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD);
// if (rte_vhost_driver_set_protocol_features(file, protocol_features) != 0) {
// LOG_WARN("Failed to set INFLIGHT_SHMFD protocol feature for %s (zero copy may not work)\n", file);
// }
// }
if (config.enable_tso) {
// flags describe what the backend (you) and the guest agree on
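// HOST_TSO4/6 enabled (guest may hand unsegmented TCP to the backend) - beneficial for traffic going through
// the physical NIC with hardware TSO offload; GUEST_TSO4/6 are left out of the mask below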
uint64_t features = (1ULL << VIRTIO_NET_F_MTU) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
(1ULL << VIRTIO_NET_F_CTRL_VQ) | (1ULL << VIRTIO_NET_F_CSUM) |
(1ULL << VIRTIO_NET_F_GUEST_CSUM) | (1ULL << VIRTIO_NET_F_GUEST_UFO) |
// (1ULL << VIRTIO_NET_F_GUEST_TSO4) | (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
(1ULL << VIRTIO_NET_F_HOST_TSO4) | (1ULL << VIRTIO_NET_F_HOST_TSO6);

rte_vhost_driver_enable_features(file, features);
} else {

// TSO off: do not let the host deliver unsegmented TSO packets to the guest
// (GUEST_TSO4/6 = the guest driver can receive TSO).
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO4);
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO6);

// TSO off: do not let the guest hand unsegmented TSO packets to the host (your DPDK app)
// (HOST_TSO4/6 = the device can receive TSO).
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO4);
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO6);

// if (config.mergeable == 0) {
// }
// MRG_RXBUF would let the host place one large packet across multiple guest RX buffers
// (1 packet → RX buf 0 + RX buf 1 + RX buf 2); it is disabled here when TSO is off
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_MRG_RXBUF);

// if (config.enable_tx_csum == 0) {
// }
// VIRTIO_NET_F_CSUM lets the guest send packets with only a partial checksum for the host to complete;
// with the feature disabled here, the guest has to fill in complete checksums itself
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_CSUM);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_CSUM);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_UFO);
// //
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_ECN);

// if (config.enable_tso == 0) {
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO4);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_HOST_TSO6);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO4);
// rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_GUEST_TSO6);
// // }

// - RTE_VHOST_USER_EXTBUF_SUPPORT (enables external buffer mbufs)
// - RTE_VHOST_USER_LINEARBUF_SUPPORT (required for external buffers)
// - VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD (required for zero copy tracking)
// uint64_t protocol_features = 0;
// if (rte_vhost_driver_get_protocol_features(file, &protocol_features) == 0) {
// protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD);
// if (rte_vhost_driver_set_protocol_features(file, protocol_features) != 0) {
// LOG_WARN("Failed to set INFLIGHT_SHMFD protocol feature for %s (zero copy may not work)\n",
// file);
// } else {
// LOG_INFO("Enabled INFLIGHT_SHMFD protocol feature for zero copy support (features: 0x%lx)\n",
// protocol_features);
// }
// } else {
// LOG_WARN("Failed to get protocol features for %s, trying to set INFLIGHT_SHMFD directly\n", file);
// protocol_features = (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD);
// if (rte_vhost_driver_set_protocol_features(file, protocol_features) != 0) {
// LOG_WARN("Failed to set INFLIGHT_SHMFD protocol feature for %s (zero copy may not work)\n",
// file);
// }
// }
}

if (rte_vhost_driver_callback_register(file, &virtio_net_device_ops) != 0) {
LOG_ERROR("Failed to register vhost driver callbacks for %s (socket %d/%d)\n", file, i, config.nb_sockets);
Expand Down
1 change: 1 addition & 0 deletions testing/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ df -h
systemctl status iperf
sudo tcpdump -i br0
sudo tcpdump -i enp23s0f0np0
sudo tcpdump -vv -i eth1
echo '{"vm":"vm12","throughput":12345}' | nc 192.168.100.1 9000

lsof -i :9000
Expand Down
Loading