#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This tests basic flowtable functionality.
# Creates following default topology:
#
# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading, Router2 has no special
# purpose other than having a link that is smaller than either Originator
# and responder, i.e. TCPMSS announced values are too large and will still
# result in fragmentation and/or PMTU discovery.
#
# You can check with different Orgininator/Link/Responder MTU eg:
# nft_flowtable.sh -o8000 -l1500 -r2000
#

sfx=$(mktemp -u "XXXXXXXX")
ns1="ns1-$sfx"
ns2="ns2-$sfx"
nsr1="nsr1-$sfx"
nsr2="nsr2-$sfx"

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0

nsin=""
ns1out=""
ns2out=""

log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)

checktool (){
	if ! $1 > /dev/null 2>&1; then
		echo "SKIP: Could not $2"
		exit $ksft_skip
	fi
}

checktool "nft --version" "run test without nft tool"
checktool "ip -Version" "run test without ip tool"
checktool "which nc" "run test without nc (netcat)"
checktool "ip netns add $nsr1" "create net namespace $nsr1"

ip netns add $ns1
ip netns add $ns2
ip netns add $nsr2

cleanup() {
	ip netns del $ns1
	ip netns del $ns2
	ip netns del $nsr1
	ip netns del $nsr2

	rm -f "$nsin" "$ns1out" "$ns2out"

	[ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
}

trap cleanup EXIT

sysctl -q net.netfilter.nf_log_all_netns=1

ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1
ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2

ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2

for dev in lo veth0 veth1; do
    ip -net $nsr1 link set $dev up
    ip -net $nsr2 link set $dev up
done

ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
ip -net $nsr1 addr add dead:1::1/64 dev veth0

ip -net $nsr2 addr add 10.0.2.1/24 dev veth1
ip -net $nsr2 addr add dead:2::1/64 dev veth1

# set different MTUs so we need to push packets coming from ns1 (large MTU)
# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
# or to do PTMU discovery (send ICMP error back to originator).
# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers
# is NOT the lowest link mtu.

omtu=9000
lmtu=1500
rmtu=2000

usage(){
	echo "nft_flowtable.sh [OPTIONS]"
	echo
	echo "MTU options"
	echo "   -o originator"
	echo "   -l link"
	echo "   -r responder"
	exit 1
}

while getopts "o:l:r:" o
do
	case $o in
		o) omtu=$OPTARG;;
		l) lmtu=$OPTARG;;
		r) rmtu=$OPTARG;;
		*) usage;;
	esac
done

if ! ip -net $nsr1 link set veth0 mtu $omtu; then
	exit 1
fi

ip -net $ns1 link set eth0 mtu $omtu

if ! ip -net $nsr2 link set veth1 mtu $rmtu; then
	exit 1
fi

ip -net $ns2 link set eth0 mtu $rmtu

# transfer-net between nsr1 and nsr2.
# these addresses are not used for connections.
ip -net $nsr1 addr add 192.168.10.1/24 dev veth1
ip -net $nsr1 addr add fee1:2::1/64 dev veth1

ip -net $nsr2 addr add 192.168.10.2/24 dev veth0
ip -net $nsr2 addr add fee1:2::2/64 dev veth0

for i in 0 1; do
  ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
  ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
done

for ns in $ns1 $ns2;do
  ip -net $ns link set lo up
  ip -net $ns link set eth0 up

  if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
	echo "ERROR: Check Originator/Responder values (problem during address addition)"
	exit 1
  fi
  # don't set ip DF bit for first two tests
  ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

ip -net $ns1 addr add 10.0.1.99/24 dev eth0
ip -net $ns2 addr add 10.0.2.99/24 dev eth0
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns2 route add default via 10.0.2.1
ip -net $ns1 addr add dead:1::99/64 dev eth0
ip -net $ns2 addr add dead:2::99/64 dev eth0
ip -net $ns1 route add default via dead:1::1
ip -net $ns2 route add default via dead:2::1

ip -net $nsr1 route add default via 192.168.10.2
ip -net $nsr2 route add default via 192.168.10.1

ip netns exec $nsr1 nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   counter routed_orig { }
   counter routed_repl { }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept

      # count packets supposedly offloaded as per direction.
      ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept

      ct state established,related accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF

if [ $? -ne 0 ]; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

ip netns exec $ns2 nft -f - <<EOF
table inet filter {
   counter ip4dscp0 { }
   counter ip4dscp3 { }

   chain input {
      type filter hook input priority 0; policy accept;
      meta l4proto tcp goto {
	      ip dscp cs3 counter name ip4dscp3 accept
	      ip dscp 0 counter name ip4dscp0 accept
      }
   }
}
EOF

if [ $? -ne 0 ]; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

# test basic connectivity
if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
  echo "ERROR: $ns1 cannot reach ns2" 1>&2
  exit 1
fi

if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
  echo "ERROR: $ns2 cannot reach $ns1" 1>&2
  exit 1
fi

if [ $ret -eq 0 ];then
	echo "PASS: netns routing/connectivity: $ns1 can reach $ns2"
fi

nsin=$(mktemp)
ns1out=$(mktemp)
ns2out=$(mktemp)

make_file()
{
	name=$1

	SIZE=$((RANDOM % (1024 * 128)))
	SIZE=$((SIZE + (1024 * 8)))
	TSIZE=$((SIZE * 1024))

	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null

	SIZE=$((RANDOM % 1024))
	SIZE=$((SIZE + 128))
	TSIZE=$((TSIZE + SIZE))
	dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null
}

check_counters()
{
	local what=$1
	local ok=1

	local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets)
	local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets)

	local orig_cnt=${orig#*bytes}
	local repl_cnt=${repl#*bytes}

	local fs=$(du -sb $nsin)
	local max_orig=${fs%%/*}
	local max_repl=$((max_orig/4))

	if [ $orig_cnt -gt $max_orig ];then
		echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2
		ret=1
		ok=0
	fi

	if [ $repl_cnt -gt $max_repl ];then
		echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2
		ret=1
		ok=0
	fi

	if [ $ok -eq 1 ]; then
		echo "PASS: $what"
	fi
}

check_dscp()
{
	local what=$1
	local ok=1

	local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets)

	local pc4=${counter%*bytes*}
	local pc4=${pc4#*packets}

	local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets)
	local pc4z=${counter%*bytes*}
	local pc4z=${pc4z#*packets}

	case "$what" in
	"dscp_none")
		if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	"dscp_fwd")
		if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	"dscp_ingress")
		if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	"dscp_egress")
		if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	*)
		echo "FAIL: Unknown DSCP check" 1>&2
		ret=1
		ok=0
	esac

	if [ $ok -eq 1 ] ;then
		echo "PASS: $what: dscp packet counters match"
	fi
}

check_transfer()
{
	in=$1
	out=$2
	what=$3

	if ! cmp "$in" "$out" > /dev/null 2>&1; then
		echo "FAIL: file mismatch for $what" 1>&2
		ls -l "$in"
		ls -l "$out"
		return 1
	fi

	return 0
}

test_tcp_forwarding_ip()
{
	local nsa=$1
	local nsb=$2
	local dstip=$3
	local dstport=$4
	local lret=0

	ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" &
	lpid=$!

	sleep 1
	ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" &
	cpid=$!

	sleep 1

	prev="$(ls -l $ns1out $ns2out)"
	sleep 1

	while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do
		sleep 1;
		prev="$(ls -l $ns1out $ns2out)"
	done

	if test -d /proc/"$lpid"/; then
		kill $lpid
	fi

	if test -d /proc/"$cpid"/; then
		kill $cpid
	fi

	wait $lpid
	wait $cpid

	if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
		lret=1
	fi

	if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
		lret=1
	fi

	return $lret
}

test_tcp_forwarding()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345

	return $?
}

test_tcp_forwarding_set_dscp()
{
	check_dscp "dscp_none"

ip netns exec $nsr1 nft -f - <<EOF
table netdev dscpmangle {
   chain setdscp0 {
      type filter hook ingress device "veth0" priority 0; policy accept
	ip dscp set cs3
  }
}
EOF
if [ $? -eq 0 ]; then
	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
	check_dscp "dscp_ingress"

	ip netns exec $nsr1 nft delete table netdev dscpmangle
else
	echo "SKIP: Could not load netdev:ingress for veth0"
fi

ip netns exec $nsr1 nft -f - <<EOF
table netdev dscpmangle {
   chain setdscp0 {
      type filter hook egress device "veth1" priority 0; policy accept
      ip dscp set cs3
  }
}
EOF
if [ $? -eq 0 ]; then
	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
	check_dscp "dscp_egress"

	ip netns exec $nsr1 nft flush table netdev dscpmangle
else
	echo "SKIP: Could not load netdev:egress for veth1"
fi

	# partial.  If flowtable really works, then both dscp-is-0 and dscp-is-cs3
	# counters should have seen packets (before and after ft offload kicks in).
	ip netns exec $nsr1 nft -a insert rule inet filter forward ip dscp set cs3
	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
	check_dscp "dscp_fwd"
}

test_tcp_forwarding_nat()
{
	local lret
	local pmtu

	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
	lret=$?

	pmtu=$3
	what=$4

	if [ $lret -eq 0 ] ; then
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what"
		else
			echo "PASS: flow offload for ns1/ns2 with masquerade $what"
		fi

		test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
		lret=$?
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what"
		elif [ $lret -eq 0 ] ; then
			echo "PASS: flow offload for ns1/ns2 with dnat $what"
		fi
	fi

	return $lret
}

make_file "$nsin"

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
# Due to MTU mismatch in both directions, all packets (except small packets like pure
# acks) have to be handled by normal forwarding path.  Therefore, packet counters
# are not checked.
if test_tcp_forwarding $ns1 $ns2; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# delete default route, i.e. ns2 won't be able to reach ns1 and
# will depend on ns1 being masqueraded in nsr1.
# expect ns1 has nsr1 address.
ip -net $ns2 route del default via 10.0.2.1
ip -net $ns2 route del default via dead:2::1
ip -net $ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same, but with NAT enabled.  Same as in first test: we expect normal forward path
# to handle most packets.
ip netns exec $nsr1 nft -f - <<EOF
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_set_dscp $ns1 $ns2 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2
	exit 0
fi

if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# Third test:
# Same as second test, but with PMTU discovery enabled. This
# means that we expect the fastpath to handle packets as soon
# as the endpoints adjust the packet size.
ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

# reset counters.
# With pmtu in-place we'll also check that nft counters
# are lower than file size and packets were forwarded via flowtable layer.
# For earlier tests (large mtus), packets cannot be handled via flowtable
# (except pure acks and other small packets).
ip netns exec $nsr1 nft reset counters table inet filter >/dev/null

if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec $nsr1 nft list ruleset
fi

# Another test:
# Add bridge interface br0 to Router1, with NAT enabled.
ip -net $nsr1 link add name br0 type bridge
ip -net $nsr1 addr flush dev veth0
ip -net $nsr1 link set up dev veth0
ip -net $nsr1 link set veth0 master br0
ip -net $nsr1 addr add 10.0.1.1/24 dev br0
ip -net $nsr1 addr add dead:1::1/64 dev br0
ip -net $nsr1 link set up dev br0

ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null

# br0 with NAT enabled.
ip netns exec $nsr1 nft -f - <<EOF
flush table ip nat
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi


# Another test:
# Add bridge interface br0 to Router1, with NAT and VLAN.
ip -net $nsr1 link set veth0 nomaster
ip -net $nsr1 link set down dev veth0
ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10
ip -net $nsr1 link set up dev veth0
ip -net $nsr1 link set up dev veth0.10
ip -net $nsr1 link set veth0.10 master br0

ip -net $ns1 addr flush dev eth0
ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10
ip -net $ns1 link set eth0 up
ip -net $ns1 link set eth0.10 up
ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns1 addr add dead:1::99/64 dev eth0.10

if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# restore test topology (remove bridge and VLAN)
ip -net $nsr1 link set veth0 nomaster
ip -net $nsr1 link set veth0 down
ip -net $nsr1 link set veth0.10 down
ip -net $nsr1 link delete veth0.10 type vlan
ip -net $nsr1 link delete br0 type bridge
ip -net $ns1 addr flush dev eth0.10
ip -net $ns1 link set eth0.10 down
ip -net $ns1 link set eth0 down
ip -net $ns1 link delete eth0.10 type vlan

# restore address in ns1 and nsr1
ip -net $ns1 link set eth0 up
ip -net $ns1 addr add 10.0.1.99/24 dev eth0
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns1 addr add dead:1::99/64 dev eth0
ip -net $ns1 route add default via dead:1::1
ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
ip -net $nsr1 addr add dead:1::1/64 dev veth0
ip -net $nsr1 link set up dev veth0

KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1)
KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1)
SPI1=$RANDOM
SPI2=$RANDOM

if [ $SPI1 -eq $SPI2 ]; then
	SPI2=$((SPI2+1))
fi

do_esp() {
    local ns=$1
    local me=$2
    local remote=$3
    local lnet=$4
    local rnet=$5
    local spi_out=$6
    local spi_in=$7

    ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in  enc aes $KEY_AES  auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet
    ip -net $ns xfrm state add src $me  dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet

    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
    ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow
    # to fwd decrypted packets after esp processing:
    ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow

}

do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2

do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1

ip netns exec $nsr1 nft delete table ip nat

# restore default routes
ip -net $ns2 route del 192.168.10.1 via 10.0.2.1
ip -net $ns2 route add default via 10.0.2.1
ip -net $ns2 route add default via dead:2::1

if test_tcp_forwarding $ns1 $ns2; then
	check_counters "ipsec tunnel mode for ns1/ns2"
else
	echo "FAIL: ipsec tunnel mode for ns1/ns2"
	ip netns exec $nsr1 nft list ruleset 1>&2
	ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2
fi

exit $ret