/* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/io_uring.h>
#include <linux/ipv6.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>

/* user_data tags telling completion types apart in the CQ loop.
 * NOTE(review): NOTIF_TAG is never used in this file — notification CQEs
 * are recognized via IORING_CQE_F_NOTIF instead; confirm before removing. */
#define NOTIF_TAG 0xfffffffULL
#define NONZC_TAG 0
#define ZC_TAG 1

/* Send modes selectable with -m; MODE_MIXED picks one of the other
 * three at random for every request (see do_tx()). */
enum {
	MODE_NONZC	= 0,	/* plain IORING_OP_SEND */
	MODE_ZC		= 1,	/* IORING_OP_SEND_ZC, unregistered buffer */
	MODE_ZC_FIXED	= 2,	/* IORING_OP_SEND_ZC with registered buffer */
	MODE_MIXED	= 3,	/* random mix of the above */
};

/* Runtime configuration, filled in from the command line by parse_opts(). */
static bool cfg_cork		= false;	/* -c: wrap each batch in UDP_CORK */
static int  cfg_mode		= MODE_ZC_FIXED;	/* -m: send mode, see MODE_* */
static int  cfg_nr_reqs		= 8;		/* -n: requests per submit batch */
static int  cfg_family		= PF_UNSPEC;	/* -4/-6: address family */
static int  cfg_payload_len;			/* -s: bytes per send */
static int  cfg_port		= 8000;		/* -p: destination port */
static int  cfg_runtime_ms	= 4200;		/* -t: run time in ms (200 ms slack added) */

static socklen_t cfg_alen;			/* size of the chosen sockaddr */
static struct sockaddr_storage cfg_dst_addr;	/* parsed -D destination address */

/* Page-aligned send buffer, large enough for a maximum-size IP packet. */
static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));

/* Raw view of the mmap()ed SQ ring fields.
 * NOTE(review): not referenced anywhere in this file — struct io_uring_sq
 * below is what the code actually uses; candidate for removal. */
struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

/* Raw view of the mmap()ed CQ ring fields.
 * NOTE(review): not referenced anywhere in this file — struct io_uring_cq
 * below is what the code actually uses; candidate for removal. */
struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

/* Userspace handle for the submission side. The k* pointers reference
 * fields inside the kernel-shared SQ ring mapping set up by
 * io_uring_mmap(); sqe_head/sqe_tail track SQEs handed out by
 * io_uring_get_sqe() but not yet published by io_uring_submit(). */
struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;		/* SQE index array shared with the kernel */
	struct io_uring_sqe *sqes;	/* separately mmap()ed SQE array */

	unsigned sqe_head;		/* local: first not-yet-submitted SQE */
	unsigned sqe_tail;		/* local: next SQE slot to hand out */

	size_t ring_sz;			/* size of the SQ ring mapping */
};

/* Userspace handle for the completion side; every pointer references the
 * kernel-shared CQ ring mapping set up by io_uring_mmap(). */
struct io_uring_cq {
	unsigned *khead;	/* consumer head, advanced by io_uring_cqe_seen() */
	unsigned *ktail;	/* producer tail, advanced by the kernel */
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;		/* size of the CQ ring mapping */
};

/* Minimal io_uring instance: both ring views plus the ring file descriptor. */
struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};

/* Fallback syscall numbers for libcs whose headers predate io_uring.
 * alpha keeps its own syscall table; every other architecture shares the
 * arch-generic 425..427 slots. Only defined when not already provided. */
#ifdef __alpha__
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		535
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		536
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	537
# endif
#else /* !__alpha__ */
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		425
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		426
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	427
# endif
#endif

/* Barriers ordering ring head/tail accesses against the kernel. On x86 a
 * compiler-only barrier suffices; elsewhere use a full fence. */
#if defined(__x86_64) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else

#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif

/*
 * Thin wrapper around the io_uring_setup(2) syscall: create a ring with
 * @entries SQ slots described by @p. Returns the ring fd, or -1 with
 * errno set on failure.
 */
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	long ret = syscall(__NR_io_uring_setup, entries, p);

	return (int) ret;
}

/*
 * Thin wrapper around the io_uring_enter(2) syscall. The sixth argument
 * is the sigset size in bytes, required whenever @sig may be non-NULL.
 * Returns the syscall result (-1 with errno set on failure).
 */
static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	long ret = syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
			   flags, sig, _NSIG / 8);

	return (int) ret;
}

/*
 * Register @iovecs with the ring as fixed buffers for use with
 * IORING_RECVSEND_FIXED_BUF requests.
 * Returns the syscall result on success, or a negative errno on failure.
 */
static int io_uring_register_buffers(struct io_uring *ring,
				     const struct iovec *iovecs,
				     unsigned nr_iovecs)
{
	int ret = syscall(__NR_io_uring_register, ring->ring_fd,
			  IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);

	if (ret < 0)
		return -errno;
	return ret;
}

/*
 * mmap() the three shared regions of a freshly set-up ring (SQ ring, SQE
 * array, CQ ring) and resolve the kernel-provided offsets in @p into
 * direct pointers in @sq and @cq.
 * Returns 0 on success or -errno, unmapping earlier regions on failure.
 */
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	/* SQ ring: head/tail/mask/etc. plus the array of SQE indices. */
	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	/* The SQE array lives in its own mapping. */
	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		/* sq->khead points at the base of the SQ ring mapping
		 * (sq_off.head bytes in — assumes head is the first field;
		 * matches the kernel layout this selftest targets). */
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	/* CQ ring: head/tail/mask plus the CQE array itself. */
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		/* Unwind the SQE mapping, then the SQ ring via err:. */
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}

/*
 * Create an io_uring with @entries SQ slots and map its rings into @ring.
 * Returns 0 on success or a negative error, with the fd closed on failure.
 */
static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			       unsigned flags)
{
	struct io_uring_params params;
	int fd, ret;

	memset(ring, 0, sizeof(*ring));
	memset(&params, 0, sizeof(params));
	params.flags = flags;

	fd = io_uring_setup(entries, &params);
	if (fd < 0)
		return fd;

	ret = io_uring_mmap(fd, &params, &ring->sq, &ring->cq);
	if (ret) {
		close(fd);
		return ret;
	}

	ring->ring_fd = fd;
	return 0;
}

/*
 * Publish SQEs queued via io_uring_get_sqe() to the shared ring and submit
 * them with io_uring_enter(). Returns the count passed to the kernel, or
 * -errno if the enter syscall fails.
 */
static int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned ktail, submitted, to_submit;
	int ret;

	read_barrier();
	/* Entries already published but not yet consumed by the kernel
	 * (e.g. a previous enter failed): re-enter for the full ring. */
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;	/* nothing queued locally */

	/* Copy the locally queued SQE indices into the shared array. */
	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	if (*sq->ktail != ktail) {
		/* Order array writes before the tail store, and the tail
		 * store before the enter syscall below. */
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
				IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}

/*
 * Fill @sqe with a plain IORING_OP_SEND of @len bytes at @buf over
 * @sockfd; all other SQE fields are zeroed.
 */
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	*sqe = (struct io_uring_sqe) {
		.opcode		= (__u8) IORING_OP_SEND,
		.fd		= sockfd,
		.addr		= (unsigned long) buf,
		.len		= len,
		.msg_flags	= (__u32) flags,
	};
}

/*
 * Fill @sqe like a plain send, then switch it to the zerocopy opcode and
 * stash @zc_flags in the ioprio field (where SEND_ZC takes its flags).
 */
static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
				        const void *buf, size_t len, int flags,
				        unsigned zc_flags)
{
	io_uring_prep_send(sqe, sockfd, buf, len, flags);
	sqe->ioprio = zc_flags;
	sqe->opcode = (__u8) IORING_OP_SEND_ZC;
}

/*
 * Hand out the next free SQE slot, or NULL when every slot is already in
 * use (caller must submit before asking for more).
 */
static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	unsigned in_use = sq->sqe_tail - sq->sqe_head;

	if (in_use >= *sq->kring_entries)
		return NULL;
	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
}

/*
 * Wait until at least one CQE is available and point *@cqe_ptr at it
 * (without consuming it — the caller marks it seen). Returns 0 on
 * success or -errno if the enter syscall fails.
 */
static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned mask = *cq->kring_mask;
	unsigned head = *cq->khead;

	*cqe_ptr = NULL;
	for (;;) {
		int ret;

		read_barrier();
		/* A CQE is ready once the kernel tail has moved past us. */
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			return 0;
		}
		/* Ring empty: block in the kernel for one completion. */
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	}
}

/*
 * Consume one CQE by advancing the ring head, then publish the new head
 * to the kernel.
 */
static inline void io_uring_cqe_seen(struct io_uring *ring)
{
	(*ring->cq.khead)++;
	write_barrier();
}

/*
 * Wall-clock time in milliseconds, used for the run-length bookkeeping
 * in do_tx().
 */
static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	/* Widen tv_sec before scaling: on platforms with a 32-bit signed
	 * time_t, tv_sec * 1000 overflows — signed overflow is UB. */
	return ((unsigned long) tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}

/* setsockopt() wrapper for int-sized options; fatal on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int ret = setsockopt(fd, level, optname, &val, sizeof(val));

	if (ret)
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}

/*
 * Create the TX socket, enlarge its send buffer to 2 MiB and connect it
 * to the destination parsed from the command line. Fatal on any failure.
 */
static int do_setup_tx(int domain, int type, int protocol)
{
	int fd = socket(domain, type, protocol);

	if (fd == -1)
		error(1, errno, "socket t");

	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);

	if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
		error(1, errno, "connect");

	return fd;
}

/*
 * Main transmit loop: submit cfg_nr_reqs send requests per batch, reap
 * their completions (and, for zerocopy sends, the trailing notification
 * CQEs), then report throughput for the whole run on stderr.
 */
static void do_tx(int domain, int type, int protocol)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned long packets = 0, bytes = 0;
	unsigned long seconds;
	struct io_uring ring;
	struct iovec iov;
	uint64_t tstop;
	int i, fd, ret;
	int compl_cqes = 0;	/* zerocopy notifications still outstanding */

	fd = do_setup_tx(domain, type, protocol);

	ret = io_uring_queue_init(512, &ring, 0);
	if (ret)
		error(1, ret, "io_uring: queue init");

	/* Register the payload once so MODE_ZC_FIXED can use buf_index 0. */
	iov.iov_base = payload;
	iov.iov_len = cfg_payload_len;

	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		error(1, ret, "io_uring: buffer registration");

	tstop = gettimeofday_ms() + cfg_runtime_ms;
	do {
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);

		for (i = 0; i < cfg_nr_reqs; i++) {
			unsigned zc_flags = 0;
			unsigned buf_idx = 0;
			unsigned mode = cfg_mode;
			unsigned msg_flags = MSG_WAITALL;

			if (cfg_mode == MODE_MIXED)
				mode = rand() % 3;

			sqe = io_uring_get_sqe(&ring);
			/* BUGFIX: get_sqe returns NULL when the SQ is full;
			 * the prep helpers would dereference it. */
			if (!sqe)
				error(1, 0, "io_uring: submission queue full");

			if (mode == MODE_NONZC) {
				io_uring_prep_send(sqe, fd, payload,
						   cfg_payload_len, msg_flags);
				sqe->user_data = NONZC_TAG;
			} else {
				io_uring_prep_sendzc(sqe, fd, payload,
						     cfg_payload_len,
						     msg_flags, zc_flags);
				if (mode == MODE_ZC_FIXED) {
					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
					sqe->buf_index = buf_idx;
				}
				sqe->user_data = ZC_TAG;
			}
		}

		ret = io_uring_submit(&ring);
		if (ret != cfg_nr_reqs)
			/* BUGFIX: error()'s errnum must be a positive errno;
			 * io_uring_submit() returns -errno on failure. */
			error(1, ret < 0 ? -ret : 0, "submit");

		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
		for (i = 0; i < cfg_nr_reqs; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret)
				error(1, ret, "wait cqe");

			if (cqe->user_data != NONZC_TAG &&
			    cqe->user_data != ZC_TAG)
				error(1, -EINVAL, "invalid cqe->user_data");

			/* A notification CQE is not a send completion:
			 * account it separately and redo this iteration. */
			if (cqe->flags & IORING_CQE_F_NOTIF) {
				if (cqe->flags & IORING_CQE_F_MORE)
					error(1, -EINVAL, "invalid notif flags");
				if (compl_cqes <= 0)
					error(1, -EINVAL, "notification mismatch");
				compl_cqes--;
				i--;
				io_uring_cqe_seen(&ring);
				continue;
			}
			/* F_MORE on a send CQE promises a later notification. */
			if (cqe->flags & IORING_CQE_F_MORE) {
				if (cqe->user_data != ZC_TAG)
					error(1, cqe->res, "unexpected F_MORE");
				compl_cqes++;
			}
			if (cqe->res >= 0) {
				packets++;
				bytes += cqe->res;
			} else if (cqe->res != -EAGAIN) {
				error(1, cqe->res, "send failed");
			}
			io_uring_cqe_seen(&ring);
		}
	} while (gettimeofday_ms() < tstop);

	/* Drain the zerocopy notifications still in flight. */
	while (compl_cqes) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			error(1, ret, "wait cqe");
		if (cqe->flags & IORING_CQE_F_MORE)
			error(1, -EINVAL, "invalid notif flags");
		if (!(cqe->flags & IORING_CQE_F_NOTIF))
			error(1, -EINVAL, "missing notif flag");

		io_uring_cqe_seen(&ring);
		compl_cqes--;
	}

	/* BUGFIX: clamp the divisor so sub-second runtimes (e.g. -t0 gives
	 * 200 ms) don't divide by zero when computing the rates. */
	seconds = cfg_runtime_ms / 1000;
	if (!seconds)
		seconds = 1;

	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
			packets, bytes >> 20,
			packets / seconds,
			(bytes >> 20) / seconds);

	if (close(fd))
		error(1, errno, "close");
}

/* Fill the payload with a repeating a..z pattern, then run the TX loop. */
static void do_test(int domain, int type, int protocol)
{
	size_t off;

	for (off = 0; off < IP_MAXPACKET; off++)
		payload[off] = 'a' + (off % 26);

	do_tx(domain, type, protocol);
}

/* Print the synopsis and terminate with a non-zero exit status. */
static void usage(const char *filepath)
{
	error(1, 0,
	      "Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] "
	      "[-t<time s>] [-n<batch>] [-p<port>] [-m<mode>]",
	      filepath);
}

/*
 * Parse command-line flags into the cfg_* globals and build the
 * destination sockaddr. Exits via usage()/error() on any bad input.
 */
static void parse_opts(int argc, char **argv)
{
	/* Leave room for the largest headers so a full payload still fits
	 * into one maximum-size IP packet. */
	const int max_payload_len = sizeof(payload) -
				    sizeof(struct ipv6hdr) -
				    sizeof(struct tcphdr) -
				    40 /* max tcp options */;
	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
	char *daddr = NULL;
	int c;

	if (argc <= 1)
		usage(argv[0]);
	cfg_payload_len = max_payload_len;

	while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
		switch (c) {
		case '4':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET;
			cfg_alen = sizeof(struct sockaddr_in);
			break;
		case '6':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET6;
			cfg_alen = sizeof(struct sockaddr_in6);
			break;
		case 'D':
			daddr = optarg;
			break;
		case 'p':
			cfg_port = strtoul(optarg, NULL, 0);
			break;
		case 's':
			cfg_payload_len = strtoul(optarg, NULL, 0);
			break;
		case 't':
			/* 200 ms of slack on top of the requested seconds. */
			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
			break;
		case 'n':
			cfg_nr_reqs = strtoul(optarg, NULL, 0);
			break;
		case 'c':
			cfg_cork = strtol(optarg, NULL, 0);
			break;
		case 'm':
			cfg_mode = strtol(optarg, NULL, 0);
			break;
		default:
			/* BUGFIX: unknown/malformed options ('?') were
			 * silently ignored before; fail with the synopsis. */
			usage(argv[0]);
		}
	}

	switch (cfg_family) {
	case PF_INET:
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = AF_INET;
		addr4->sin_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
			error(1, 0, "ipv4 parse error: %s", daddr);
		break;
	case PF_INET6:
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = AF_INET6;
		addr6->sin6_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
			error(1, 0, "ipv6 parse error: %s", daddr);
		break;
	default:
		error(1, 0, "illegal domain");
	}

	/* BUGFIX: also reject non-positive lengths — "-s -1" wraps through
	 * strtoul and the int conversion, and would slip past a max-only
	 * check into iov_len/send lengths. */
	if (cfg_payload_len <= 0 || cfg_payload_len > max_payload_len)
		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
	if (optind != argc - 1)
		usage(argv[0]);
}

/* Entry point: the last positional argument selects the transport. */
int main(int argc, char **argv)
{
	const char *cfg_test = argv[argc - 1];

	parse_opts(argc, argv);

	if (strcmp(cfg_test, "tcp") == 0)
		do_test(cfg_family, SOCK_STREAM, 0);
	else if (strcmp(cfg_test, "udp") == 0)
		do_test(cfg_family, SOCK_DGRAM, 0);
	else
		error(1, 0, "unknown cfg_test %s", cfg_test);

	return 0;
}