#!/bin/bash
#
# Test connection tracking zone and NAT source port reallocation support.
#

# Exit status the kselftest framework interprets as "test skipped".
ksft_skip=4

# Number of client namespaces to create.  Much larger values (2000 is
# known to work) are possible, but the run can then take several minutes
# on KASAN/debug kernels.
maxclients=100

# Overall result: stays 0 while everything passes, set to 1 by a failing check.
ret=0

# Cleared by the tool probe when iperf3 cannot be run.
have_iperf=1

# client1---.
#            veth1-.
#                  |
#               NAT Gateway --veth0--> Server
#                  | |
#            veth2-' |
# client2---'        |
#  ....              |
# clientX----vethX---'

# All clients share identical IP address.
# NAT Gateway uses policy routing and conntrack zones to isolate client
# namespaces.  Each client connects to Server, each with colliding tuples:
#   clientsaddr:10000 -> serveraddr:dport
#   NAT Gateway is supposed to do port reallocation for each of the
#   connections.

# Random 8-character suffix appended to every namespace name of this run.
sfx=$(mktemp -u "XXXXXXXX")
srv="ns-srv-${sfx}"
gw="ns-gw-${sfx}"
cl1="ns-cl1-${sfx}"
cl2="ns-cl2-${sfx}"

# Remember the current neighbour-table GC thresholds in v4gc1..3 and
# v6gc1..3 so that cleanup() can write them back.  If a value cannot be
# read the corresponding variable is left empty.
for fam in 4 6; do
	for n in 1 2 3; do
		printf -v "v${fam}gc${n}" '%s' \
			"$(sysctl -n "net.ipv${fam}.neigh.default.gc_thresh${n}" 2>/dev/null)"
	done
done
unset fam n

# Exit handler (installed with 'trap cleanup EXIT'): removes the gateway,
# server and client namespaces and writes the saved neighbour-table GC
# thresholds back.
# Globals read: gw, srv, sfx, maxclients, v4gc1..3, v6gc1..3.
cleanup()
{
	local i fam n saved

	ip netns del "$gw"
	ip netns del "$srv"
	# Some client namespaces may not exist if setup stopped early, so
	# errors from deleting them are discarded.
	for ((i = 1; i <= maxclients; i++)); do
		ip netns del "ns-cl$i-$sfx" 2>/dev/null
	done

	for fam in 4 6; do
		for n in 1 2 3; do
			saved="v${fam}gc${n}"
			# The startup read failed for this knob: there is
			# nothing to restore, so do not write an empty value.
			[ -n "${!saved}" ] || continue
			sysctl -q "net.ipv${fam}.neigh.default.gc_thresh${n}=${!saved}" 2>/dev/null
		done
	done
}

# nft, ip and conntrack are mandatory: if any of them cannot be run the
# whole test is reported as skipped.
if ! nft --version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

if ! conntrack -V > /dev/null 2>&1; then
	echo "SKIP: Could not run test without conntrack tool"
	exit $ksft_skip
fi

# iperf3 is optional; its absence is only recorded here and handled after
# the ping part of the test has run.
iperf3 -v >/dev/null 2>&1 || have_iperf=0

# Gateway namespace comes first; the exit trap is not installed yet, so a
# failure here simply skips the test.
if ! ip netns add "$gw"; then
	echo "SKIP: Could not create net namespace $gw"
	exit $ksft_skip
fi
ip -net "$gw" link set lo up

# From this point on every exit path runs cleanup().
trap cleanup EXIT

if ! ip netns add "$srv"; then
	echo "SKIP: Could not create server netns $srv"
	exit $ksft_skip
fi

# Gateway <-> server link: veth0 inside the gateway, eth0 inside the server.
ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv"
ip -net "$gw" link set veth0 up
ip -net "$srv" link set lo up
ip -net "$srv" link set eth0 up

# Raise the default neighbour-table GC thresholds (IPv6 first, then IPv4).
# The previous values were saved at startup and are restored by cleanup().
for fam in 6 4; do
	sysctl -q "net.ipv${fam}.neigh.default.gc_thresh1=512" 2>/dev/null
	sysctl -q "net.ipv${fam}.neigh.default.gc_thresh2=1024" 2>/dev/null
	sysctl -q "net.ipv${fam}.neigh.default.gc_thresh3=4096" 2>/dev/null
done

# One namespace per client, each wired to its own veth$i device inside the
# gateway (the client end is always called eth0).
for ((i = 1; i <= maxclients; i++)); do
	cl="ns-cl$i-$sfx"

	if ! ip netns add "$cl"; then
		echo "SKIP: Could not create client netns $cl"
		exit $ksft_skip
	fi

	if ! ip link add "veth$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1; then
		echo "SKIP: No virtual ethernet pair device support in kernel"
		exit $ksft_skip
	fi
done

# Generate the per-client configuration as a single 'ip -batch' script.
# For client i the emitted commands, in order:
#  - bring up lo/eth0 in the client and veth$i in the gateway, and set
#    tcp_syn_retries (client) and arp_ignore/rp_filter (gateway, per device);
#  - give every client the same addresses (10.1.0.3, dead:1::3) and default
#    routes via 10.1.0.2 / dead:1::2;
#  - give every client-facing gateway device the same addresses
#    (10.1.0.2, dead:1::2);
#  - policy routing in the gateway: table 1000+i holds the routes towards
#    this client (veth$i) and towards the server (veth0), and is selected
#    by fwmark i.
for ((i = 1; i <= maxclients; i++)); do
	cl="ns-cl$i-$sfx"
	dev="veth$i"
	table=$((1000 + i))

	printf '%s\n' \
		"netns exec $cl ip link set lo up" \
		"netns exec $cl ip link set eth0 up" \
		"netns exec $cl sysctl -q net.ipv4.tcp_syn_retries=2" \
		"netns exec $gw ip link set $dev up" \
		"netns exec $gw sysctl -q net.ipv4.conf.$dev.arp_ignore=2" \
		"netns exec $gw sysctl -q net.ipv4.conf.$dev.rp_filter=0" \
		"netns exec $cl ip addr add 10.1.0.3/24 dev eth0" \
		"netns exec $cl ip addr add dead:1::3/64 dev eth0" \
		"netns exec $cl ip route add default via 10.1.0.2 dev eth0" \
		"netns exec $cl ip route add default via dead:1::2 dev eth0" \
		"netns exec $gw ip addr add 10.1.0.2/24 dev $dev" \
		"netns exec $gw ip addr add dead:1::2/64 dev $dev" \
		"netns exec $gw ip route add 10.1.0.0/24 dev $dev table $table" \
		"netns exec $gw ip route add dead:1::0/64 dev $dev table $table" \
		"netns exec $gw ip route add 10.3.0.0/24 dev veth0 table $table" \
		"netns exec $gw ip route add dead:3::0/64 dev veth0 table $table" \
		"netns exec $gw ip rule add fwmark $i lookup $table"
done | ip -batch /dev/stdin

# Server-facing network (10.3.0.0/24, dead:3::/64): the gateway's veth0 is
# .1 / ::1, the server's eth0 is .99 / ::99.
for addr in 10.3.0.1/24 dead:3::1/64; do
	ip -net "$gw" addr add "$addr" dev veth0
done

for addr in 10.3.0.99/24 dead:3::99/64; do
	ip -net "$srv" addr add "$addr" dev eth0
done

# Load the gateway ruleset (table "inet raw"):
#  - maps iiftomark / iiftozone: input interface name -> fwmark / conntrack
#    zone.  They are created empty here and filled in by a later step.
#  - sets inicmp / inflows / inflows6: dynamic sets whose elements carry a
#    counter; the checks further down query them with 'nft get element'.
#  - chain prerouting (priority -64000): sets the conntrack zone and the
#    packet mark from the input interface name, then records TCP flows
#    (IPv4: only packets with ACK set and SYN clear) and IPv4 ICMP packets
#    in the sets above.
#  - chain nat_postrouting: copies the packet mark into the conntrack mark
#    and masquerades traffic leaving via veth0.
#  - chain mangle_prerouting (priority -100): for reply-direction packets,
#    copies the conntrack mark back into the packet mark.
ip netns exec $gw nft -f /dev/stdin<<EOF
table inet raw {
	map iiftomark {
		type ifname : mark
	}

	map iiftozone {
		typeof iifname : ct zone
	}

	set inicmp {
		flags dynamic
		type ipv4_addr . ifname . ipv4_addr
	}
	set inflows {
		flags dynamic
		type ipv4_addr . inet_service . ifname . ipv4_addr . inet_service
	}

	set inflows6 {
		flags dynamic
		type ipv6_addr . inet_service . ifname . ipv6_addr . inet_service
	}

	chain prerouting {
		type filter hook prerouting priority -64000; policy accept;
		ct original zone set meta iifname map @iiftozone
		meta mark set meta iifname map @iiftomark

		tcp flags & (syn|ack) == ack add @inflows { ip saddr . tcp sport . meta iifname . ip daddr . tcp dport counter }
		add @inflows6 { ip6 saddr . tcp sport . meta iifname . ip6 daddr . tcp dport counter }
		ip protocol icmp add @inicmp { ip saddr . meta iifname . ip daddr counter }
	}

	chain nat_postrouting {
		type nat hook postrouting priority 0; policy accept;
                ct mark set meta mark meta oifname veth0 masquerade
	}

	chain mangle_prerouting {
		type filter hook prerouting priority -100; policy accept;
		ct direction reply meta mark set ct mark
	}
}
EOF

# Fill both interface-keyed maps with one '"vethN" : N' entry per client:
# packets arriving on vethN get fwmark N (iiftomark) and conntrack zone N
# (iiftozone).  All but the last entry end in a comma; the last one closes
# the element list.
for map in iiftomark iiftozone; do
	printf 'add element inet raw %s {\n' "$map"
	for ((i = 1; i < maxclients; i++)); do
		printf '"veth%s" : %s,\n' "$i" "$i"
	done
	printf '"veth%s" : %s }\n' "$maxclients" "$maxclients"
done | ip netns exec "$gw" nft -f /dev/stdin

# Gateway-wide sysctls, applied in this order:
#  - IPv4/IPv6 forwarding on, reverse-path filtering off;
#  - fwmark_reflect on: useful for debugging, it allows using 'ping' from
#    the clients to the gateway.
for knob in \
	net.ipv4.conf.all.forwarding=1 \
	net.ipv6.conf.all.forwarding=1 \
	net.ipv4.conf.all.rp_filter=0 \
	net.ipv4.fwmark_reflect=1 \
	net.ipv6.fwmark_reflect=1
do
	ip netns exec "$gw" sysctl -q "$knob" > /dev/null
done

# Ping the server (10.3.0.99) three times from every client namespace, all
# clients in parallel.
#
# '$?' right after 'cmd &' only says that the job was started, so the exit
# status of each ping has to be collected with 'wait <pid>'.  Every job is
# reaped; only the first failing namespace is reported to keep the output
# short.
#
# Globals: maxclients, sfx (read); ret (set to 1 if any ping fails).
ping_server_from_clients()
{
	local i cl reported=0
	local -a pids=()

	for ((i = 1; i <= maxclients; i++)); do
		cl="ns-cl$i-$sfx"
		ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 &
		pids[i]=$!
	done

	for i in "${!pids[@]}"; do
		wait "${pids[i]}" && continue

		ret=1
		if [ "$reported" -eq 0 ]; then
			echo "FAIL: Ping failure from ns-cl$i-$sfx" 1>&2
			reported=1
		fi
	done
}

ping_server_from_clients

# Each client sent 3 echo requests, so the gateway's @inicmp set must hold,
# for every client-facing device, one element for 10.1.0.3 -> 10.3.0.99
# with exactly 3 packets / 252 bytes.  Stop at the first mismatch and dump
# the element that was actually found.
for ((i = 1; i <= maxclients; i++)); do
	icmp_key="{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }"
	icmp_want="{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"

	if ! ip netns exec "$gw" nft get element inet raw inicmp "$icmp_key" | grep -q "$icmp_want"; then
		ret=1
		echo "FAIL: counter icmp mismatch for veth$i" 1>&2
		ip netns exec "$gw" nft get element inet raw inicmp "$icmp_key" 1>&2
		break
	fi
done

# All echo replies come back on veth0 addressed to the gateway (10.3.0.1,
# the masquerade address), so that single element must account for the
# replies of all clients together.
icmp_key="{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }"
icmp_want="{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"

if ! ip netns exec "$gw" nft get element inet raw inicmp "$icmp_key" | grep -q "$icmp_want"; then
	ret=1
	echo "FAIL: counter icmp mismatch for veth0: $icmp_want" 1>&2
	# Dump the same element that was just checked.
	ip netns exec "$gw" nft get element inet raw inicmp "$icmp_key" 1>&2
fi

if [ "$ret" -eq 0 ]; then
	echo "PASS: ping test from all $maxclients namespaces"
fi

# Without iperf3 the TCP part cannot run: exit with the failure recorded so
# far, if any, otherwise report SKIP.
if [ "$have_iperf" -eq 0 ]; then
	echo "SKIP: iperf3 not installed"
	[ "$ret" -ne 0 ] && exit "$ret"
	exit $ksft_skip
fi

ip netns exec "$srv" iperf3 -s > /dev/null 2>&1 &
iperfpid=$!
# Presumably gives the server time to start listening before the first
# client connects.
sleep 1

# Connect from one client after the other, each time from source port
# 10000, so the tuples collide on the gateway.  The loop ends at the first
# failure, and is not entered at all if an earlier check already failed.
for ((i = 1; i <= maxclients && ret == 0; i++)); do
	cl="ns-cl$i-$sfx"
	if ! ip netns exec "$cl" iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null; then
		echo FAIL: Failure to connect for $cl 1>&2
		ip netns exec "$gw" conntrack -S 1>&2
		ret=1
	fi
done

if [ "$ret" -eq 0 ]; then
	echo "PASS: iperf3 connections for all $maxclients net namespaces"
fi

# Stop the background server and reap it.
kill "$iperfpid"
wait

# Every client-facing device must have recorded the client's flow
# 10.1.0.3:10000 -> 10.3.0.99:5201 in @inflows; stop at the first one missing.
for ((i = 1; i <= maxclients; i++)); do
	flow="{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }"
	ip netns exec "$gw" nft get element inet raw inflows "$flow" > /dev/null && continue

	ret=1
	echo "FAIL: can't find expected tcp entry for veth$i" 1>&2
	break
done

if [ "$ret" -eq 0 ]; then
	echo "PASS: Found client connection for all $maxclients net namespaces"
fi

# Return direction: an entry from the server towards the gateway address
# with destination port 10000 must exist on veth0.
flow="{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }"
if ! ip netns exec "$gw" nft get element inet raw inflows "$flow" > /dev/null; then
	ret=1
	echo "FAIL: cannot find return entry on veth0" 1>&2
fi

exit $ret