// SPDX-License-Identifier: GPL-2.0

#define _GNU_SOURCE

#include <stddef.h>
#include <arpa/inet.h>
#include <error.h>
#include <errno.h>
#include <net/if.h>
#include <linux/in.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef ETH_MAX_MTU
#define ETH_MAX_MTU	0xFFFFU
#endif

#ifndef UDP_SEGMENT
#define UDP_SEGMENT		103
#endif

#ifndef UDP_MAX_SEGMENTS
#define UDP_MAX_SEGMENTS	(1 << 6UL)
#endif

#define CONST_MTU_TEST	1500

#define CONST_HDRLEN_V4		(sizeof(struct iphdr) + sizeof(struct udphdr))
#define CONST_HDRLEN_V6		(sizeof(struct ip6_hdr) + sizeof(struct udphdr))

#define CONST_MSS_V4		(CONST_MTU_TEST - CONST_HDRLEN_V4)
#define CONST_MSS_V6		(CONST_MTU_TEST - CONST_HDRLEN_V6)

#define CONST_MAX_SEGS_V4	(ETH_MAX_MTU / CONST_MSS_V4)
#define CONST_MAX_SEGS_V6	(ETH_MAX_MTU / CONST_MSS_V6)

static bool		cfg_do_ipv4;
static bool		cfg_do_ipv6;
static bool		cfg_do_connected;
static bool		cfg_do_connectionless;
static bool		cfg_do_msgmore;
static bool		cfg_do_setsockopt;
static int		cfg_specific_test_id = -1;

static const char	cfg_ifname[] = "lo";
static unsigned short	cfg_port = 9000;

static char buf[ETH_MAX_MTU];

struct testcase {
	int tlen;		/* send() buffer size, may exceed mss */
	bool tfail;		/* send() call is expected to fail */
	int gso_len;		/* mss after applying gso */
	int r_num_mss;		/* recv(): number of calls of full mss */
	int r_len_last;		/* recv(): size of last non-mss dgram, if any */
};

const struct in6_addr addr6 = IN6ADDR_LOOPBACK_INIT;
const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };

struct testcase testcases_v4[] = {
	{
		/* no GSO: send a single byte */
		.tlen = 1,
		.r_len_last = 1,
	},
	{
		/* no GSO: send a single MSS */
		.tlen = CONST_MSS_V4,
		.r_num_mss = 1,
	},
	{
		/* no GSO: send a single MSS + 1B: fail */
		.tlen = CONST_MSS_V4 + 1,
		.tfail = true,
	},
	{
		/* send a single MSS: will fall back to no GSO */
		.tlen = CONST_MSS_V4,
		.gso_len = CONST_MSS_V4,
		.r_num_mss = 1,
	},
	{
		/* send a single MSS + 1B */
		.tlen = CONST_MSS_V4 + 1,
		.gso_len = CONST_MSS_V4,
		.r_num_mss = 1,
		.r_len_last = 1,
	},
	{
		/* send exactly 2 MSS */
		.tlen = CONST_MSS_V4 * 2,
		.gso_len = CONST_MSS_V4,
		.r_num_mss = 2,
	},
	{
		/* send 2 MSS + 1B */
		.tlen = (CONST_MSS_V4 * 2) + 1,
		.gso_len = CONST_MSS_V4,
		.r_num_mss = 2,
		.r_len_last = 1,
	},
	{
		/* send MAX segs */
		.tlen = (ETH_MAX_MTU / CONST_MSS_V4) * CONST_MSS_V4,
		.gso_len = CONST_MSS_V4,
		.r_num_mss = (ETH_MAX_MTU / CONST_MSS_V4),
	},

	{
		/* send MAX bytes */
		.tlen = ETH_MAX_MTU - CONST_HDRLEN_V4,
		.gso_len = CONST_MSS_V4,
		.r_num_mss = CONST_MAX_SEGS_V4,
		.r_len_last = ETH_MAX_MTU - CONST_HDRLEN_V4 -
			      (CONST_MAX_SEGS_V4 * CONST_MSS_V4),
	},
	{
		/* send MAX + 1: fail */
		.tlen = ETH_MAX_MTU - CONST_HDRLEN_V4 + 1,
		.gso_len = CONST_MSS_V4,
		.tfail = true,
	},
	{
		/* send a single 1B MSS: will fall back to no GSO */
		.tlen = 1,
		.gso_len = 1,
		.r_num_mss = 1,
	},
	{
		/* send 2 1B segments */
		.tlen = 2,
		.gso_len = 1,
		.r_num_mss = 2,
	},
	{
		/* send 2B + 2B + 1B segments */
		.tlen = 5,
		.gso_len = 2,
		.r_num_mss = 2,
		.r_len_last = 1,
	},
	{
		/* send max number of min sized segments */
		.tlen = UDP_MAX_SEGMENTS,
		.gso_len = 1,
		.r_num_mss = UDP_MAX_SEGMENTS,
	},
	{
		/* send max number + 1 of min sized segments: fail */
		.tlen = UDP_MAX_SEGMENTS + 1,
		.gso_len = 1,
		.tfail = true,
	},
	{
		/* EOL */
	}
};

#ifndef IP6_MAX_MTU
#define IP6_MAX_MTU	(ETH_MAX_MTU + sizeof(struct ip6_hdr))
#endif

struct testcase testcases_v6[] = {
	{
		/* no GSO: send a single byte */
		.tlen = 1,
		.r_len_last = 1,
	},
	{
		/* no GSO: send a single MSS */
		.tlen = CONST_MSS_V6,
		.r_num_mss = 1,
	},
	{
		/* no GSO: send a single MSS + 1B: fail */
		.tlen = CONST_MSS_V6 + 1,
		.tfail = true,
	},
	{
		/* send a single MSS: will fall back to no GSO */
		.tlen = CONST_MSS_V6,
		.gso_len = CONST_MSS_V6,
		.r_num_mss = 1,
	},
	{
		/* send a single MSS + 1B */
		.tlen = CONST_MSS_V6 + 1,
		.gso_len = CONST_MSS_V6,
		.r_num_mss = 1,
		.r_len_last = 1,
	},
	{
		/* send exactly 2 MSS */
		.tlen = CONST_MSS_V6 * 2,
		.gso_len = CONST_MSS_V6,
		.r_num_mss = 2,
	},
	{
		/* send 2 MSS + 1B */
		.tlen = (CONST_MSS_V6 * 2) + 1,
		.gso_len = CONST_MSS_V6,
		.r_num_mss = 2,
		.r_len_last = 1,
	},
	{
		/* send MAX segs */
		.tlen = (IP6_MAX_MTU / CONST_MSS_V6) * CONST_MSS_V6,
		.gso_len = CONST_MSS_V6,
		.r_num_mss = (IP6_MAX_MTU / CONST_MSS_V6),
	},

	{
		/* send MAX bytes */
		.tlen = IP6_MAX_MTU - CONST_HDRLEN_V6,
		.gso_len = CONST_MSS_V6,
		.r_num_mss = CONST_MAX_SEGS_V6,
		.r_len_last = IP6_MAX_MTU - CONST_HDRLEN_V6 -
			      (CONST_MAX_SEGS_V6 * CONST_MSS_V6),
	},
	{
		/* send MAX + 1: fail */
		.tlen = IP6_MAX_MTU - CONST_HDRLEN_V6 + 1,
		.gso_len = CONST_MSS_V6,
		.tfail = true,
	},
	{
		/* send a single 1B MSS: will fall back to no GSO */
		.tlen = 1,
		.gso_len = 1,
		.r_num_mss = 1,
	},
	{
		/* send 2 1B segments */
		.tlen = 2,
		.gso_len = 1,
		.r_num_mss = 2,
	},
	{
		/* send 2B + 2B + 1B segments */
		.tlen = 5,
		.gso_len = 2,
		.r_num_mss = 2,
		.r_len_last = 1,
	},
	{
		/* send max number of min sized segments */
		.tlen = UDP_MAX_SEGMENTS,
		.gso_len = 1,
		.r_num_mss = UDP_MAX_SEGMENTS,
	},
	{
		/* send max number + 1 of min sized segments: fail */
		.tlen = UDP_MAX_SEGMENTS + 1,
		.gso_len = 1,
		.tfail = true,
	},
	{
		/* EOL */
	}
};

static unsigned int get_device_mtu(int fd, const char *ifname)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));

	strcpy(ifr.ifr_name, ifname);

	if (ioctl(fd, SIOCGIFMTU, &ifr))
		error(1, errno, "ioctl get mtu");

	return ifr.ifr_mtu;
}

static void __set_device_mtu(int fd, const char *ifname, unsigned int mtu)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));

	ifr.ifr_mtu = mtu;
	strcpy(ifr.ifr_name, ifname);

	if (ioctl(fd, SIOCSIFMTU, &ifr))
		error(1, errno, "ioctl set mtu");
}

static void set_device_mtu(int fd, int mtu)
{
	int val;

	val = get_device_mtu(fd, cfg_ifname);
	fprintf(stderr, "device mtu (orig): %u\n", val);

	__set_device_mtu(fd, cfg_ifname, mtu);
	val = get_device_mtu(fd, cfg_ifname);
	if (val != mtu)
		error(1, 0, "unable to set device mtu to %u\n", val);

	fprintf(stderr, "device mtu (test): %u\n", val);
}

static void set_pmtu_discover(int fd, bool is_ipv4)
{
	int level, name, val;

	if (is_ipv4) {
		level	= SOL_IP;
		name	= IP_MTU_DISCOVER;
		val	= IP_PMTUDISC_DO;
	} else {
		level	= SOL_IPV6;
		name	= IPV6_MTU_DISCOVER;
		val	= IPV6_PMTUDISC_DO;
	}

	if (setsockopt(fd, level, name, &val, sizeof(val)))
		error(1, errno, "setsockopt path mtu");
}

static unsigned int get_path_mtu(int fd, bool is_ipv4)
{
	socklen_t vallen;
	unsigned int mtu;
	int ret;

	vallen = sizeof(mtu);
	if (is_ipv4)
		ret = getsockopt(fd, SOL_IP, IP_MTU, &mtu, &vallen);
	else
		ret = getsockopt(fd, SOL_IPV6, IPV6_MTU, &mtu, &vallen);

	if (ret)
		error(1, errno, "getsockopt mtu");


	fprintf(stderr, "path mtu (read):  %u\n", mtu);
	return mtu;
}

/* very wordy version of system("ip route add dev lo mtu 1500 127.0.0.3/32") */
static void set_route_mtu(int mtu, bool is_ipv4)
{
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct nlmsghdr *nh;
	struct rtattr *rta;
	struct rtmsg *rt;
	char data[NLMSG_ALIGN(sizeof(*nh)) +
		  NLMSG_ALIGN(sizeof(*rt)) +
		  NLMSG_ALIGN(RTA_LENGTH(sizeof(addr6))) +
		  NLMSG_ALIGN(RTA_LENGTH(sizeof(int))) +
		  NLMSG_ALIGN(RTA_LENGTH(0) + RTA_LENGTH(sizeof(int)))];
	int fd, ret, alen, off = 0;

	alen = is_ipv4 ? sizeof(addr4) : sizeof(addr6);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd == -1)
		error(1, errno, "socket netlink");

	memset(data, 0, sizeof(data));

	nh = (void *)data;
	nh->nlmsg_type = RTM_NEWROUTE;
	nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
	off += NLMSG_ALIGN(sizeof(*nh));

	rt = (void *)(data + off);
	rt->rtm_family = is_ipv4 ? AF_INET : AF_INET6;
	rt->rtm_table = RT_TABLE_MAIN;
	rt->rtm_dst_len = alen << 3;
	rt->rtm_protocol = RTPROT_BOOT;
	rt->rtm_scope = RT_SCOPE_UNIVERSE;
	rt->rtm_type = RTN_UNICAST;
	off += NLMSG_ALIGN(sizeof(*rt));

	rta = (void *)(data + off);
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(alen);
	if (is_ipv4)
		memcpy(RTA_DATA(rta), &addr4, alen);
	else
		memcpy(RTA_DATA(rta), &addr6, alen);
	off += NLMSG_ALIGN(rta->rta_len);

	rta = (void *)(data + off);
	rta->rta_type = RTA_OIF;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	*((int *)(RTA_DATA(rta))) = 1; //if_nametoindex("lo");
	off += NLMSG_ALIGN(rta->rta_len);

	/* MTU is a subtype in a metrics type */
	rta = (void *)(data + off);
	rta->rta_type = RTA_METRICS;
	rta->rta_len = RTA_LENGTH(0) + RTA_LENGTH(sizeof(int));
	off += NLMSG_ALIGN(rta->rta_len);

	/* now fill MTU subtype. Note that it fits within above rta_len */
	rta = (void *)(((char *) rta) + RTA_LENGTH(0));
	rta->rta_type = RTAX_MTU;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	*((int *)(RTA_DATA(rta))) = mtu;

	nh->nlmsg_len = off;

	ret = sendto(fd, data, off, 0, (void *)&nladdr, sizeof(nladdr));
	if (ret != off)
		error(1, errno, "send netlink: %uB != %uB\n", ret, off);

	if (close(fd))
		error(1, errno, "close netlink");

	fprintf(stderr, "route mtu (test): %u\n", mtu);
}

static bool __send_one(int fd, struct msghdr *msg, int flags)
{
	int ret;

	ret = sendmsg(fd, msg, flags);
	if (ret == -1 &&
	    (errno == EMSGSIZE || errno == ENOMEM || errno == EINVAL))
		return false;
	if (ret == -1)
		error(1, errno, "sendmsg");
	if (ret != msg->msg_iov->iov_len)
		error(1, 0, "sendto: %d != %llu", ret,
			(unsigned long long)msg->msg_iov->iov_len);
	if (msg->msg_flags)
		error(1, 0, "sendmsg: return flags 0x%x\n", msg->msg_flags);

	return true;
}

static bool send_one(int fd, int len, int gso_len,
		     struct sockaddr *addr, socklen_t alen)
{
	char control[CMSG_SPACE(sizeof(uint16_t))] = {0};
	struct msghdr msg = {0};
	struct iovec iov = {0};
	struct cmsghdr *cm;

	iov.iov_base = buf;
	iov.iov_len = len;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	msg.msg_name = addr;
	msg.msg_namelen = alen;

	if (gso_len && !cfg_do_setsockopt) {
		msg.msg_control = control;
		msg.msg_controllen = sizeof(control);

		cm = CMSG_FIRSTHDR(&msg);
		cm->cmsg_level = SOL_UDP;
		cm->cmsg_type = UDP_SEGMENT;
		cm->cmsg_len = CMSG_LEN(sizeof(uint16_t));
		*((uint16_t *) CMSG_DATA(cm)) = gso_len;
	}

	/* If MSG_MORE, send 1 byte followed by remainder */
	if (cfg_do_msgmore && len > 1) {
		iov.iov_len = 1;
		if (!__send_one(fd, &msg, MSG_MORE))
			error(1, 0, "send 1B failed");

		iov.iov_base++;
		iov.iov_len = len - 1;
	}

	return __send_one(fd, &msg, 0);
}

static int recv_one(int fd, int flags)
{
	int ret;

	ret = recv(fd, buf, sizeof(buf), flags);
	if (ret == -1 && errno == EAGAIN && (flags & MSG_DONTWAIT))
		return 0;
	if (ret == -1)
		error(1, errno, "recv");

	return ret;
}

static void run_one(struct testcase *test, int fdt, int fdr,
		    struct sockaddr *addr, socklen_t alen)
{
	int i, ret, val, mss;
	bool sent;

	fprintf(stderr, "ipv%d tx:%d gso:%d %s\n",
			addr->sa_family == AF_INET ? 4 : 6,
			test->tlen, test->gso_len,
			test->tfail ? "(fail)" : "");

	val = test->gso_len;
	if (cfg_do_setsockopt) {
		if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val)))
			error(1, errno, "setsockopt udp segment");
	}

	sent = send_one(fdt, test->tlen, test->gso_len, addr, alen);
	if (sent && test->tfail)
		error(1, 0, "send succeeded while expecting failure");
	if (!sent && !test->tfail)
		error(1, 0, "send failed while expecting success");
	if (!sent)
		return;

	if (test->gso_len)
		mss = test->gso_len;
	else
		mss = addr->sa_family == AF_INET ? CONST_MSS_V4 : CONST_MSS_V6;


	/* Recv all full MSS datagrams */
	for (i = 0; i < test->r_num_mss; i++) {
		ret = recv_one(fdr, 0);
		if (ret != mss)
			error(1, 0, "recv.%d: %d != %d", i, ret, mss);
	}

	/* Recv the non-full last datagram, if tlen was not a multiple of mss */
	if (test->r_len_last) {
		ret = recv_one(fdr, 0);
		if (ret != test->r_len_last)
			error(1, 0, "recv.%d: %d != %d (last)",
			      i, ret, test->r_len_last);
	}

	/* Verify received all data */
	ret = recv_one(fdr, MSG_DONTWAIT);
	if (ret)
		error(1, 0, "recv: unexpected datagram");
}

static void run_all(int fdt, int fdr, struct sockaddr *addr, socklen_t alen)
{
	struct testcase *tests, *test;

	tests = addr->sa_family == AF_INET ? testcases_v4 : testcases_v6;

	for (test = tests; test->tlen; test++) {
		/* if a specific test is given, then skip all others */
		if (cfg_specific_test_id == -1 ||
		    cfg_specific_test_id == test - tests)
			run_one(test, fdt, fdr, addr, alen);
	}
}

static void run_test(struct sockaddr *addr, socklen_t alen)
{
	struct timeval tv = { .tv_usec = 100 * 1000 };
	int fdr, fdt, val;

	fdr = socket(addr->sa_family, SOCK_DGRAM, 0);
	if (fdr == -1)
		error(1, errno, "socket r");

	if (bind(fdr, addr, alen))
		error(1, errno, "bind");

	/* Have tests fail quickly instead of hang */
	if (setsockopt(fdr, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
		error(1, errno, "setsockopt rcv timeout");

	fdt = socket(addr->sa_family, SOCK_DGRAM, 0);
	if (fdt == -1)
		error(1, errno, "socket t");

	/* Do not fragment these datagrams: only succeed if GSO works */
	set_pmtu_discover(fdt, addr->sa_family == AF_INET);

	if (cfg_do_connectionless) {
		set_device_mtu(fdt, CONST_MTU_TEST);
		run_all(fdt, fdr, addr, alen);
	}

	if (cfg_do_connected) {
		set_device_mtu(fdt, CONST_MTU_TEST + 100);
		set_route_mtu(CONST_MTU_TEST, addr->sa_family == AF_INET);

		if (connect(fdt, addr, alen))
			error(1, errno, "connect");

		val = get_path_mtu(fdt, addr->sa_family == AF_INET);
		if (val != CONST_MTU_TEST)
			error(1, 0, "bad path mtu %u\n", val);

		run_all(fdt, fdr, addr, 0 /* use connected addr */);
	}

	if (close(fdt))
		error(1, errno, "close t");
	if (close(fdr))
		error(1, errno, "close r");
}

static void run_test_v4(void)
{
	struct sockaddr_in addr = {0};

	addr.sin_family = AF_INET;
	addr.sin_port = htons(cfg_port);
	addr.sin_addr = addr4;

	run_test((void *)&addr, sizeof(addr));
}

static void run_test_v6(void)
{
	struct sockaddr_in6 addr = {0};

	addr.sin6_family = AF_INET6;
	addr.sin6_port = htons(cfg_port);
	addr.sin6_addr = addr6;

	run_test((void *)&addr, sizeof(addr));
}

static void parse_opts(int argc, char **argv)
{
	int c;

	while ((c = getopt(argc, argv, "46cCmst:")) != -1) {
		switch (c) {
		case '4':
			cfg_do_ipv4 = true;
			break;
		case '6':
			cfg_do_ipv6 = true;
			break;
		case 'c':
			cfg_do_connected = true;
			break;
		case 'C':
			cfg_do_connectionless = true;
			break;
		case 'm':
			cfg_do_msgmore = true;
			break;
		case 's':
			cfg_do_setsockopt = true;
			break;
		case 't':
			cfg_specific_test_id = strtoul(optarg, NULL, 0);
			break;
		default:
			error(1, 0, "%s: parse error", argv[0]);
		}
	}
}

int main(int argc, char **argv)
{
	parse_opts(argc, argv);

	if (cfg_do_ipv4)
		run_test_v4();
	if (cfg_do_ipv6)
		run_test_v6();

	fprintf(stderr, "OK\n");
	return 0;
}