/* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */

#include <assert.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/io_uring.h>
#include <linux/ipv6.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>

#define NOTIF_TAG 0xfffffffULL
#define NONZC_TAG 0
#define ZC_TAG 1

enum {
	MODE_NONZC	= 0,
	MODE_ZC		= 1,
	MODE_ZC_FIXED	= 2,
	MODE_MIXED	= 3,
};

static bool cfg_cork = false;
static int cfg_mode = MODE_ZC_FIXED;
static int cfg_nr_reqs = 8;
static int cfg_family = PF_UNSPEC;
static int cfg_payload_len;
static int cfg_port = 8000;
static int cfg_runtime_ms = 4200;

static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;

static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));

struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;
	struct io_uring_sqe *sqes;

	unsigned sqe_head;
	unsigned sqe_tail;

	size_t ring_sz;
};

struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;
};

struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};

#ifdef __alpha__
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		535
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		536
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	537
# endif
#else /* !__alpha__ */
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		425
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		426
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	427
# endif
#endif

#if defined(__x86_64) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else
#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif

static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		       flags, sig, _NSIG / 8);
}

static int io_uring_register_buffers(struct io_uring *ring,
				     const struct iovec *iovecs,
				     unsigned nr_iovecs)
{
	int ret;

	ret = syscall(__NR_io_uring_register, ring->ring_fd,
		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
	return (ret < 0) ? -errno : ret;
}
/* Map the SQ ring, SQE array and CQ ring into userspace and wire up pointers. */
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}

static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			       unsigned flags)
{
	struct io_uring_params p;
	int fd, ret;

	memset(ring, 0, sizeof(*ring));
	memset(&p, 0, sizeof(p));
	p.flags = flags;

	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return fd;
	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
	if (!ret)
		ring->ring_fd = fd;
	else
		close(fd);
	return ret;
}

/* Flush locally queued sqes into the SQ array and submit them via io_uring_enter(). */
static int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned ktail, submitted, to_submit;
	int ret;

	read_barrier();
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	if (*sq->ktail != ktail) {
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = (__u8) IORING_OP_SEND;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long) buf;
	sqe->len = len;
	sqe->msg_flags = (__u32) flags;
}

/* Same sqe layout as IORING_OP_SEND; zerocopy flags are carried in sqe->ioprio. */
static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
					const void *buf, size_t len, int flags,
					unsigned zc_flags)
{
	io_uring_prep_send(sqe, sockfd, buf, len, flags);
	sqe->opcode = (__u8) IORING_OP_SEND_ZC;
	sqe->ioprio = zc_flags;
}

static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;

	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
		return NULL;
	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
}

/* Wait for at least one CQE, blocking in io_uring_enter() if the CQ is empty. */
static int io_uring_wait_cqe(struct io_uring *ring,
			     struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned mask = *cq->kring_mask;
	unsigned head = *cq->khead;
	int ret;

	*cqe_ptr = NULL;
	do {
		read_barrier();
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			break;
		}
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	} while (1);

	return 0;
}

static inline void io_uring_cqe_seen(struct io_uring *ring)
{
	*(&ring->cq)->khead += 1;
	write_barrier();
}

static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}

static void do_setsockopt(int fd, int level, int optname, int val)
{
	if (setsockopt(fd, level, optname, &val, sizeof(val)))
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}

static int do_setup_tx(int domain, int type, int protocol)
{
	int fd;

	fd = socket(domain, type, protocol);
	if (fd == -1)
		error(1, errno, "socket t");

	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);

	if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
		error(1, errno, "connect");
	return fd;
}

static void do_tx(int domain, int type, int protocol)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned long packets = 0, bytes = 0;
	struct io_uring ring;
	struct iovec iov;
	uint64_t tstop;
	int i, fd, ret;
	int compl_cqes = 0;

	fd = do_setup_tx(domain, type, protocol);

	ret = io_uring_queue_init(512, &ring, 0);
	if (ret)
		error(1, ret, "io_uring: queue init");

	iov.iov_base = payload;
	iov.iov_len = cfg_payload_len;

	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		error(1, ret, "io_uring: buffer registration");

	tstop = gettimeofday_ms() + cfg_runtime_ms;
	do {
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);

		for (i = 0; i < cfg_nr_reqs; i++) {
			unsigned zc_flags = 0;
			unsigned buf_idx = 0;
			unsigned mode = cfg_mode;
			unsigned msg_flags = MSG_WAITALL;

			if (cfg_mode == MODE_MIXED)
				mode = rand() % 3;

			sqe = io_uring_get_sqe(&ring);

			if (mode == MODE_NONZC) {
				io_uring_prep_send(sqe, fd, payload,
						   cfg_payload_len, msg_flags);
				sqe->user_data = NONZC_TAG;
			} else {
				io_uring_prep_sendzc(sqe, fd, payload,
						     cfg_payload_len,
						     msg_flags, zc_flags);
				if (mode == MODE_ZC_FIXED) {
					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
					sqe->buf_index = buf_idx;
				}
				sqe->user_data = ZC_TAG;
			}
		}

		ret = io_uring_submit(&ring);
		if (ret != cfg_nr_reqs)
			error(1, ret, "submit");

		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
		for (i = 0; i < cfg_nr_reqs; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret)
				error(1, ret, "wait cqe");

			if (cqe->user_data != NONZC_TAG &&
			    cqe->user_data != ZC_TAG)
				error(1, -EINVAL, "invalid cqe->user_data");

			/* Zerocopy buffer-release notification: handled
			 * separately, does not count against the batch. */
			if (cqe->flags & IORING_CQE_F_NOTIF) {
				if (cqe->flags & IORING_CQE_F_MORE)
					error(1, -EINVAL, "invalid notif flags");
				if (compl_cqes <= 0)
					error(1, -EINVAL, "notification mismatch");
				compl_cqes--;
				i--;
				io_uring_cqe_seen(&ring);
				continue;
			}
			if (cqe->flags & IORING_CQE_F_MORE) {
				if (cqe->user_data != ZC_TAG)
					error(1, cqe->res, "unexpected F_MORE");
				compl_cqes++;
			}
			if (cqe->res >= 0) {
				packets++;
				bytes += cqe->res;
			} else if (cqe->res != -EAGAIN) {
				error(1, cqe->res, "send failed");
			}
			io_uring_cqe_seen(&ring);
		}
	} while (gettimeofday_ms() < tstop);

	/* Drain the remaining zerocopy notifications before exiting. */
	while (compl_cqes) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			error(1, ret, "wait cqe");
		if (cqe->flags & IORING_CQE_F_MORE)
			error(1, -EINVAL, "invalid notif flags");
		if (!(cqe->flags & IORING_CQE_F_NOTIF))
			error(1, -EINVAL, "missing notif flag");

		io_uring_cqe_seen(&ring);
		compl_cqes--;
	}

	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
			packets, bytes >> 20,
			packets / (cfg_runtime_ms / 1000),
			(bytes >> 20) / (cfg_runtime_ms / 1000));

	if (close(fd))
		error(1, errno, "close");
}

static void do_test(int domain, int type, int protocol)
{
	int i;

	for (i = 0; i < IP_MAXPACKET; i++)
		payload[i] = 'a' + (i % 26);
	do_tx(domain, type, protocol);
}

static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] "
		    "[-t<time s>] [-n<batch>] [-c<cork>] [-p<port>] [-m<mode>]",
		    filepath);
}

static void parse_opts(int argc, char **argv)
{
	const int max_payload_len = sizeof(payload) -
				    sizeof(struct ipv6hdr) -
				    sizeof(struct tcphdr) -
				    40 /* max tcp options */;
	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
	char *daddr = NULL;
	int c;

	if (argc <= 1)
		usage(argv[0]);
	cfg_payload_len = max_payload_len;

	while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
		switch (c) {
		case '4':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET;
			cfg_alen = sizeof(struct sockaddr_in);
			break;
		case '6':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET6;
			cfg_alen = sizeof(struct sockaddr_in6);
			break;
		case 'D':
			daddr = optarg;
			break;
		case 'p':
			cfg_port = strtoul(optarg, NULL, 0);
			break;
		case 's':
			cfg_payload_len = strtoul(optarg, NULL, 0);
			break;
		case 't':
			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
			break;
		case 'n':
			cfg_nr_reqs = strtoul(optarg, NULL, 0);
			break;
		case 'c':
			cfg_cork = strtol(optarg, NULL, 0);
			break;
		case 'm':
			cfg_mode = strtol(optarg, NULL, 0);
			break;
		}
	}

	switch (cfg_family) {
	case PF_INET:
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = AF_INET;
		addr4->sin_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
			error(1, 0, "ipv4 parse error: %s", daddr);
		break;
	case PF_INET6:
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = AF_INET6;
		addr6->sin6_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
			error(1, 0, "ipv6 parse error: %s", daddr);
		break;
	default:
		error(1, 0, "illegal domain");
	}

	if (cfg_payload_len > max_payload_len)
		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
	if (optind != argc - 1)
		usage(argv[0]);
}

int main(int argc, char **argv)
{
	const char *cfg_test = argv[argc - 1];

	parse_opts(argc, argv);

	if (!strcmp(cfg_test, "tcp"))
		do_test(cfg_family, SOCK_STREAM, 0);
	else if (!strcmp(cfg_test, "udp"))
		do_test(cfg_family, SOCK_DGRAM, 0);
	else
		error(1, 0, "unknown cfg_test %s", cfg_test);

	return 0;
}
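
/*
 * Example invocation (a sketch; the binary name and destination address
 * below are placeholders, not defined by this file):
 *
 *   ./io_uring_zc_tx -4 -D 192.0.2.1 -s 4096 -t 10 -m 2 udp
 *
 * -m selects MODE_NONZC (0), MODE_ZC (1), MODE_ZC_FIXED (2) or MODE_MIXED (3).
 * A receiver (e.g. a UDP sink on the default port 8000) is assumed to be
 * running on the destination host.
 */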