#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

# Conntrack needs to reassemble fragments in order to have complete
# packets for rule matching.  Reassembly can lead to packet loss.

# Consider the following setup:
#            +--------+       +---------+       +--------+
#            |Router A|-------|Wanrouter|-------|Router B|
#            |        |.IPIP..|         |..IPIP.|        |
#            +--------+       +---------+       +--------+
#           /                  mtu 1400                   \
#          /                                               \
#+--------+                                                 +--------+
#|Client A|                                                 |Client B|
#|        |                                                 |        |
#+--------+                                                 +--------+

# Router A and Router B use IPIP tunnel interfaces to tunnel traffic
# between Client A and Client B over WAN. Wanrouter has MTU 1400 set
# on its interfaces.

# Random 8-character suffix appended to every namespace name below.
# -u only prints a name; nothing is created for it.
rnd="$(mktemp -u XXXXXXXX)"

# Temporary file that receives whatever the listener in Client B reads.
rx="$(mktemp)"

# Namespace names: Router A, Router B, Wanrouter, Client A, Client B.
r_a="ns-ra-${rnd}"
r_b="ns-rb-${rnd}"
r_w="ns-rw-${rnd}"
c_a="ns-ca-${rnd}"
c_b="ns-cb-${rnd}"

# checktool CMD DESC
#
# Run CMD with stdout and stderr discarded. CMD is expanded unquoted on
# purpose so that a single string such as "ip -Version" is split into a
# command and its arguments.
#
# Arguments: $1 - command line to try
#            $2 - text completing the sentence "SKIP: Could not ..."
# Outputs:   the SKIP line on stdout when CMD fails
# Returns:   0 when CMD succeeds; otherwise exits the script with
#            $ksft_skip
checktool (){
	$1 > /dev/null 2>&1 && return 0

	echo "SKIP: Could not $2"
	exit $ksft_skip
}

# cleanup
#
# Delete the five test namespaces and remove the receive file ${rx}.
# It runs from the EXIT trap, so it can fire before some (or any) of the
# namespaces have been created; errors from "ip netns del" are therefore
# discarded.
#
# Globals: r_a, r_b, r_w, c_a, c_b, rx (all read)
cleanup() {
	local n

	for n in "${r_a}" "${r_b}" "${r_w}" "${c_a}" "${c_b}"; do
		ip netns del "${n}" > /dev/null 2>&1
	done
	rm -f "${rx}"
}

# Install the trap before the prerequisite checks: ${rx} already exists
# at this point and has to be removed on the SKIP exits too.
trap cleanup EXIT

checktool "iptables --version" "run test without iptables"
checktool "ip -Version" "run test without ip tool"
# "command -v" is a shell builtin, so the check does not depend on the
# external "which" utility being installed.
checktool "command -v socat" "run test without socat"
# This check doubles as the creation of the first namespace.
checktool "ip netns add ${r_a}" "create net namespace"

for n in "${r_b}" "${r_w}" "${c_a}" "${c_b}"; do
	ip netns add "${n}"
done

# test_path MSG
#
# Start a socat UDP listener on port 5000 in Client B that writes what it
# receives to ${rx}, send a 1400 byte payload of "a" characters from
# Client A to 192.168.20.2:5000 three times, then check that ${rx} holds
# exactly 1400 bytes.
#
# Globals:   c_a, c_b (namespace names, read); rx (receive file, written)
# Arguments: $1 - word placed in the result line (e.g. "with"/"without")
# Outputs:   an "OK: ..." or "FAIL: ..." line on stdout
# Returns:   0 on success; exits the script with status 1 on failure
test_path() {
	local msg="$1"
	local listener bytes i

	ip netns exec "${c_b}" socat -t 3 - udp4-listen:5000,reuseaddr > "${rx}" < /dev/null &
	listener=$!

	# Presumably gives the listener time to start before the first
	# datagram is sent.
	sleep 1
	for i in 1 2 3; do
		head -c1400 /dev/zero | tr "\000" "a" | \
			ip netns exec "${c_a}" socat -t 1 -u STDIN UDP:192.168.20.2:5000
	done

	# Reap only the listener started above, so ${rx} is complete before
	# it is measured.
	wait "${listener}"

	bytes=$(wc -c < "${rx}")

	if [ "${bytes}" -eq 1400 ]; then
		echo "OK: PMTU $msg connection tracking"
	else
		echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400"
		exit 1
	fi
}

# Detailed setup for Router A
# ---------------------------
# Interfaces:
# eth0: 10.2.2.1/24
# eth1: 192.168.10.1/24
# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1
# Routes:
# 192.168.20.0/24 dev ipip0    (192.168.20.0/24 is subnet of Client B)
# 10.4.4.0/24 via 10.2.2.254   (Router B's subnet via Wanrouter)
# No iptables rules at all.

# Router A links: veth0 is paired with veth0 in the Wanrouter namespace,
# veth1 with veth0 in the Client A namespace.
ip link add veth0 netns "${r_a}" type veth \
	peer name veth0 netns "${r_w}"
ip link add veth1 netns "${r_a}" type veth \
	peer name veth0 netns "${c_a}"

# Tunnel endpoints as seen from Router A.
l_addr="10.2.2.1"
r_addr="10.4.4.1"
# If the ipip device cannot be created the test is skipped, not failed.
if ! ip netns exec "${r_a}" ip link add ipip0 type ipip \
		local "${l_addr}" remote "${r_addr}" mode ipip; then
	exit $ksft_skip
fi

for dev in lo veth0 veth1 ipip0; do
	ip -net "${r_a}" link set "${dev}" up
done

ip -net "${r_a}" addr add 10.2.2.1/24 dev veth0
ip -net "${r_a}" addr add 192.168.10.1/24 dev veth1

# Client B's subnet is reached through the tunnel, Router B's subnet
# through the Wanrouter.
ip -net "${r_a}" route add 192.168.20.0/24 dev ipip0
ip -net "${r_a}" route add 10.4.4.0/24 via 10.2.2.254

ip netns exec "${r_a}" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null

# Detailed setup for Router B
# ---------------------------
# Interfaces:
# eth0: 10.4.4.1/24
# eth1: 192.168.20.1/24
# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1
# Routes:
# 192.168.10.0/24 dev ipip0    (192.168.10.0/24 is subnet of Client A)
# 10.2.2.0/24 via 10.4.4.254   (Router A's subnet via Wanrouter)
# No iptables rules at all.

# Router B links: veth0 is paired with veth1 in the Wanrouter namespace,
# veth1 with veth0 in the Client B namespace.
ip link add veth0 netns "${r_b}" type veth \
	peer name veth1 netns "${r_w}"
ip link add veth1 netns "${r_b}" type veth \
	peer name veth0 netns "${c_b}"

# Tunnel endpoints as seen from Router B (mirror image of Router A).
l_addr="10.4.4.1"
r_addr="10.2.2.1"

# If the ipip device cannot be created the test is skipped, not failed.
if ! ip netns exec "${r_b}" ip link add ipip0 type ipip \
		local "${l_addr}" remote "${r_addr}" mode ipip; then
	exit $ksft_skip
fi

for dev in lo veth0 veth1 ipip0; do
	ip -net "${r_b}" link set "${dev}" up
done

ip -net "${r_b}" addr add 10.4.4.1/24 dev veth0
ip -net "${r_b}" addr add 192.168.20.1/24 dev veth1

# Client A's subnet is reached through the tunnel, Router A's subnet
# through the Wanrouter.
ip -net "${r_b}" route add 192.168.10.0/24 dev ipip0
ip -net "${r_b}" route add 10.2.2.0/24 via 10.4.4.254
ip netns exec "${r_b}" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null

# Client A
# One address on the link to Router A, then bring the interfaces up.
ip -net "${c_a}" addr add 192.168.10.2/24 dev veth0
ip -net "${c_a}" link set dev lo up
ip -net "${c_a}" link set dev veth0 up
# Default route points at Router A's veth1 address.
ip -net "${c_a}" route add default via 192.168.10.1

# Client B
# One address on the link to Router B, then bring the interfaces up.
ip -net "${c_b}" addr add 192.168.20.2/24 dev veth0
ip -net "${c_b}" link set dev veth0 up
ip -net "${c_b}" link set dev lo up
# Default route points at Router B's veth1 address.
ip -net "${c_b}" route add default via 192.168.20.1

# Wanrouter: veth0 is the leg towards Router A, veth1 the leg towards
# Router B. Each gets the gateway address used in the routers' routes.
ip -net "${r_w}" addr add 10.2.2.254/24 dev veth0
ip -net "${r_w}" addr add 10.4.4.254/24 dev veth1

ip -net "${r_w}" link set dev lo up
# Both Wanrouter interfaces come up with the reduced MTU of 1400.
ip -net "${r_w}" link set dev veth0 up mtu 1400
ip -net "${r_w}" link set dev veth1 up mtu 1400

# The routers' ends of the same veth pairs get the same MTU.
ip -net "${r_a}" link set dev veth0 mtu 1400
ip -net "${r_b}" link set dev veth0 mtu 1400

ip netns exec "${r_w}" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null

# Path MTU discovery
# ------------------
# Running tracepath from Client A to Client B shows PMTU discovery is working
# as expected:
#
# clienta:~# tracepath 192.168.20.2
# 1?: [LOCALHOST]                      pmtu 1500
# 1:  192.168.10.1                                          0.867ms
# 1:  192.168.10.1                                          0.302ms
# 2:  192.168.10.1                                          0.312ms pmtu 1480
# 2:  no reply
# 3:  192.168.10.1                                          0.510ms pmtu 1380
# 3:  192.168.20.2                                          2.320ms reached
# Resume: pmtu 1380 hops 3 back 3

# ip netns exec ${c_a} traceroute --mtu 192.168.20.2

# Router A has learned PMTU (1400) to Router B from Wanrouter.
# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B
# from Router A.

# Send large UDP packet
# ---------------------
# Now we send a 1400 bytes UDP packet from Client A to Client B:

# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000
# First run: no iptables rule has been added in any namespace yet.
test_path "without"

# The IPv4 stack on Client A already knows the PMTU to Client B, so the
# UDP packet is sent as two fragments (1380 + 20). Router A forwards the
# fragments between eth1 and ipip0. The fragments fit into the tunnel and
# reach their destination.

# When sending the large UDP packet again, Router A now reassembles the
# fragments before routing the packet over ipip0. The resulting IPIP
# packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is
# dropped on Router A before sending.

# The rule below has no target (-j); it only adds a conntrack state match
# to Router A's FORWARD chain. Presumably this is what turns connection
# tracking (and with it reassembly) on in that namespace -- TODO confirm.
# Second run: the same transfer must still deliver 1400 bytes.
ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW
test_path "with"