#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

# +--------------------+                               +----------------------+
# | H1 (vrf)           |                               |             H2 (vrf) |
# |    + $h1           |                               |  + $h2               |
# |    | 192.0.2.1/28  |                               |  | 192.0.2.2/28      |
# +----|---------------+                               +--|-------------------+
#      |                                                  |
# +----|--------------------------------------------------|-------------------+
# | SW |                                                  |                   |
# | +--|--------------------------------------------------|-----------------+ |
# | |  + $swp1                   BR1 (802.1d)             + $swp2           | |
# | |                                                                       | |
# | |  + vx1 (vxlan)                                                        | |
# | |    local 192.0.2.17                                                   | |
# | |    remote 192.0.2.34 192.0.2.50                                       | |
# | |    id 1000 dstport $VXPORT                                            | |
# | +-----------------------------------------------------------------------+ |
# |                                                                           |
# |  192.0.2.32/28 via 192.0.2.18                                             |
# |  192.0.2.48/28 via 192.0.2.18                                             |
# |                                                                           |
# |    + $rp1                                                                 |
# |    | 192.0.2.17/28                                                        |
# +----|----------------------------------------------------------------------+
#      |
# +----|--------------------------------------------------------+
# |    |                                             VRP2 (vrf) |
# |    + $rp2                                                   |
# |      192.0.2.18/28                                          |
# |                                                             |   (maybe) HW
# =============================================================================
# |                                                             |  (likely) SW
# |    + v1 (veth)                             + v3 (veth)      |
# |    | 192.0.2.33/28                         | 192.0.2.49/28  |
# +----|---------------------------------------|----------------+
#      |                                       |
# +----|------------------------------+   +----|------------------------------+
# |    + v2 (veth)        NS1 (netns) |   |    + v4 (veth)        NS2 (netns) |
# |      192.0.2.34/28                |   |      192.0.2.50/28                |
# |                                   |   |                                   |
# |   192.0.2.16/28 via 192.0.2.33    |   |   192.0.2.16/28 via 192.0.2.49    |
# |   192.0.2.50/32 via 192.0.2.33    |   |   192.0.2.34/32 via 192.0.2.49    |
# |                                   |   |                                   |
# | +-------------------------------+ |   | +-------------------------------+ |
# | |                  BR2 (802.1d) | |   | |                  BR2 (802.1d) | |
# | |  + vx2 (vxlan)                | |   | |  + vx2 (vxlan)                | |
# | |    local 192.0.2.34           | |   | |    local 192.0.2.50           | |
# | |    remote 192.0.2.17          | |   | |    remote 192.0.2.17          | |
# | |    remote 192.0.2.50          | |   | |    remote 192.0.2.34          | |
# | |    id 1000 dstport $VXPORT    | |   | |    id 1000 dstport $VXPORT    | |
# | |                               | |   | |                               | |
# | |  + w1 (veth)                  | |   | |  + w1 (veth)                  | |
# | +--|----------------------------+ |   | +--|----------------------------+ |
# |    |                              |   |    |                              |
# | +--|----------------------------+ |   | +--|----------------------------+ |
# | |  |                  VW2 (vrf) | |   | |  |                  VW2 (vrf) | |
# | |  + w2 (veth)                  | |   | |  + w2 (veth)                  | |
# | |    192.0.2.3/28               | |   | |    192.0.2.4/28               | |
# | +-------------------------------+ |   | +-------------------------------+ |
# +-----------------------------------+   +-----------------------------------+

# UDP port used by all VxLAN devices in this test. A non-empty VXPORT taken
# from the environment is kept; otherwise 4789 is used. The variable is
# exported so that child processes see the same value.
export VXPORT="${VXPORT:-4789}"

# Default list of test functions. It is only applied when ALL_TESTS is unset or
# empty, so a caller may pre-set ALL_TESTS to run a subset. ping_ipv4,
# test_flood and test_unicast are listed a second time after reapply_config so
# that they are repeated against the re-applied configuration.
: ${ALL_TESTS:="
	ping_ipv4
	test_flood
	test_unicast
	test_ttl
	test_tos
	test_ecn_encap
	test_ecn_decap
	reapply_config
	ping_ipv4
	test_flood
	test_unicast
	test_learning
    "}

# Six interfaces are used by this test (NETIFS[p1]..NETIFS[p6], see
# setup_prepare).
# NOTE(review): NUM_NETIFS is presumably consumed by lib.sh while it is being
# sourced, hence it is assigned first -- confirm against lib.sh.
NUM_NETIFS=6
source lib.sh

# Configure host interface $h1 with 192.0.2.1/28 through simple_if_init and
# attach a clsact qdisc to it (test_ecn_decap later installs an ingress filter
# on $h1).
h1_create()
{
	local h1_cidr=192.0.2.1/28

	simple_if_init $h1 $h1_cidr
	tc qdisc add dev $h1 clsact
}

# Undo h1_create: drop the clsact qdisc from $h1, then release the interface
# and its 192.0.2.1/28 address through simple_if_fini.
h1_destroy()
{
	local h1_cidr=192.0.2.1/28

	tc qdisc del dev $h1 clsact
	simple_if_fini $h1 $h1_cidr
}

# Configure host interface $h2 with 192.0.2.2/28 through simple_if_init and
# attach a clsact qdisc to it (the flood counters are installed on $h2).
h2_create()
{
	local h2_cidr=192.0.2.2/28

	simple_if_init $h2 $h2_cidr
	tc qdisc add dev $h2 clsact
}

# Undo h2_create: drop the clsact qdisc from $h2, then release the interface
# and its 192.0.2.2/28 address through simple_if_fini.
h2_destroy()
{
	local h2_cidr=192.0.2.2/28

	tc qdisc del dev $h2 clsact
	simple_if_fini $h2 $h2_cidr
}

# Give $rp1 the local underlay address 192.0.2.17/28 and route both remote
# underlay subnets (192.0.2.32/28 and 192.0.2.48/28) via 192.0.2.18.
rp1_set_addr()
{
	local subnet

	ip address add dev $rp1 192.0.2.17/28

	for subnet in 192.0.2.32/28 192.0.2.48/28; do
		ip route add $subnet nexthop via 192.0.2.18
	done
}

# Undo rp1_set_addr in reverse order: remove the two underlay routes via
# 192.0.2.18 first, then the 192.0.2.17/28 address from $rp1.
rp1_unset_addr()
{
	local subnet

	for subnet in 192.0.2.48/28 192.0.2.32/28; do
		ip route del $subnet nexthop via 192.0.2.18
	done

	ip address del dev $rp1 192.0.2.17/28
}

# Build the switch part of the topology: bridge br1 (no VLAN filtering, no
# multicast snooping), the underlay port $rp1 with its address and routes, the
# VxLAN device vx1 (VNI 1000, local 192.0.2.17) and the bridge ports vx1, $swp1
# and $swp2. Finally both remote VTEPs are registered under the all-zeros MAC.
switch_create()
{
	local port
	local remote

	ip link add name br1 type bridge vlan_filtering 0 mcast_snooping 0
	# Pin the bridge MAC to that of the local port, so that it is not taken
	# from the VxLAN device.
	ip link set dev br1 address $(mac_get $swp1)
	ip link set dev br1 up

	ip link set dev $rp1 up
	rp1_set_addr

	ip link add name vx1 type vxlan id 1000 local 192.0.2.17 \
		dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100
	ip link set dev vx1 up

	# vx1 is enslaved first, followed by the two front-panel ports.
	ip link set dev vx1 master br1
	for port in "$swp1" "$swp2"; do
		ip link set dev $port master br1
		ip link set dev $port up
	done

	for remote in 192.0.2.34 192.0.2.50; do
		bridge fdb append dev vx1 00:00:00:00:00:00 dst $remote self
	done
}

# Tear down what switch_create built: underlay address/routes and $rp1, the
# all-zeros FDB entries, vx1, the bridge ports $swp2 and $swp1, and finally
# the bridge br1 itself.
switch_destroy()
{
	local port
	local remote

	rp1_unset_addr
	ip link set dev $rp1 down

	for remote in 192.0.2.50 192.0.2.34; do
		bridge fdb del dev vx1 00:00:00:00:00:00 dst $remote self
	done

	ip link set dev vx1 nomaster
	ip link set dev vx1 down
	ip link del dev vx1

	for port in "$swp2" "$swp1"; do
		ip link set dev $port down
		ip link set dev $port nomaster
	done

	ip link set dev br1 down
	ip link del dev br1
}

# Set up the VRP2 router: $rp2 gets 192.0.2.18/28 via simple_if_init, and the
# veth ends v1 (192.0.2.33/28) and v3 (192.0.2.49/28) are initialized with the
# name "v$rp2" as their second argument. v1 additionally gets a clsact qdisc,
# which the TTL/TOS/ECN encapsulation tests use for egress filters.
# NOTE(review): "v$rp2" is presumably the VRF that simple_if_init creates for
# $rp2 -- confirm against lib.sh.
vrp2_create()
{
	local vrp2_vrf=v$rp2

	simple_if_init $rp2 192.0.2.18/28
	__simple_if_init v1 $vrp2_vrf 192.0.2.33/28
	__simple_if_init v3 $vrp2_vrf 192.0.2.49/28
	tc qdisc add dev v1 clsact
}

# Undo vrp2_create in reverse order: remove the clsact qdisc from v1, release
# v3 and v1 with their addresses, then release $rp2.
vrp2_destroy()
{
	local veth_cidr

	tc qdisc del dev v1 clsact

	# Each entry is "<device> <address>" and is split into two arguments on
	# purpose.
	for veth_cidr in "v3 192.0.2.49/28" "v1 192.0.2.33/28"; do
		__simple_if_fini $veth_cidr
	done

	simple_if_fini $rp2 192.0.2.18/28
}

ns_init_common()
{
	local in_if=$1; shift
	local in_addr=$1; shift
	local other_in_addr=$1; shift
	local nh_addr=$1; shift
	local host_addr=$1; shift

	ip link set dev $in_if up
	ip address add dev $in_if $in_addr/28
	tc qdisc add dev $in_if clsact

	ip link add name br2 type bridge vlan_filtering 0
	ip link set dev br2 up

	ip link add name w1 type veth peer name w2

	ip link set dev w1 master br2
	ip link set dev w1 up

	ip link add name vx2 type vxlan id 1000 local $in_addr dstport "$VXPORT"
	ip link set dev vx2 up
	bridge fdb append dev vx2 00:00:00:00:00:00 dst 192.0.2.17 self
	bridge fdb append dev vx2 00:00:00:00:00:00 dst $other_in_addr self

	ip link set dev vx2 master br2
	tc qdisc add dev vx2 clsact

	simple_if_init w2 $host_addr/28

	ip route add 192.0.2.16/28 nexthop via $nh_addr
	ip route add $other_in_addr/32 nexthop via $nh_addr
}
export -f ns_init_common

# Create namespace ns1, move v2 into it and configure it as the first remote
# VTEP (underlay 192.0.2.34, peer VTEP 192.0.2.50, next hop 192.0.2.33,
# overlay host address 192.0.2.3).
ns1_create()
{
	local netns=ns1
	local -a init_args=(v2 192.0.2.34 192.0.2.50 192.0.2.33 192.0.2.3)

	ip netns add $netns
	ip link set dev v2 netns $netns
	in_ns $netns ns_init_common "${init_args[@]}"
}

# Move v2 out of ns1 again (target "netns 1", presumably the namespace of
# PID 1) and delete the namespace.
ns1_destroy()
{
	local netns=ns1

	ip netns exec $netns ip link set dev v2 netns 1
	ip netns del $netns
}

# Create namespace ns2, move v4 into it and configure it as the second remote
# VTEP (underlay 192.0.2.50, peer VTEP 192.0.2.34, next hop 192.0.2.49,
# overlay host address 192.0.2.4).
ns2_create()
{
	local netns=ns2
	local -a init_args=(v4 192.0.2.50 192.0.2.34 192.0.2.49 192.0.2.4)

	ip netns add $netns
	ip link set dev v4 netns $netns
	in_ns $netns ns_init_common "${init_args[@]}"
}

# Move v4 out of ns2 again (target "netns 1", presumably the namespace of
# PID 1) and delete the namespace.
ns2_destroy()
{
	local netns=ns2

	ip netns exec $netns ip link set dev v4 netns 1
	ip netns del $netns
}

# Build the complete topology shown in the diagram at the top of this file.
#
# Reads NETIFS[p1]..NETIFS[p6] and sets the globals h1, swp1, swp2, h2, rp1 and
# rp2 that the other functions in this file rely on. Also records r1_mac,
# r2_mac (MAC of w2 inside ns1/ns2) and h2_mac, which test_unicast uses.
setup_prepare()
{
	# Interface roles; per the diagram $h1 faces $swp1, $h2 faces $swp2 and
	# $rp1 faces $rp2.
	h1=${NETIFS[p1]}
	swp1=${NETIFS[p2]}

	swp2=${NETIFS[p3]}
	h2=${NETIFS[p4]}

	rp1=${NETIFS[p5]}
	rp2=${NETIFS[p6]}

	# vrf_prepare and forwarding_enable are not defined in this file.
	vrf_prepare
	forwarding_enable

	h1_create
	h2_create
	switch_create

	# The veth pairs must exist before vrp2_create configures v1/v3 and
	# before ns1_create/ns2_create move v2/v4 into their namespaces.
	ip link add name v1 type veth peer name v2
	ip link add name v3 type veth peer name v4
	vrp2_create
	ns1_create
	ns2_create

	# w2 only exists inside the namespaces, hence the lookup via in_ns.
	r1_mac=$(in_ns ns1 mac_get w2)
	r2_mac=$(in_ns ns2 mac_get w2)
	h2_mac=$(mac_get $h2)
}

# Tear the topology down again, in the reverse order of setup_prepare.
# Installed as the EXIT trap at the bottom of this file.
cleanup()
{
	# pre_cleanup is not defined in this file.
	pre_cleanup

	ns2_destroy
	ns1_destroy
	vrp2_destroy
	# Only v3 and v1 are deleted explicitly; v4 and v2 are the peer ends
	# named when the veth pairs were created in setup_prepare.
	ip link del dev v3
	ip link del dev v1

	switch_destroy
	h2_destroy
	h1_destroy

	forwarding_restore
	vrf_cleanup
}

# In the first round of tests vx1 is the first device attached to the bridge,
# and the local IP is already configured at that point. This function sets up
# the other scenario: vx1 is detached and re-attached to a bridge that is
# already offloaded, and the local IP is only configured afterwards.
reapply_config()
{
	local remote

	echo "Reapplying configuration"

	# Take everything down: FDB entries, underlay address, bridge membership.
	for remote in 192.0.2.50 192.0.2.34; do
		bridge fdb del dev vx1 00:00:00:00:00:00 dst $remote self
	done
	rp1_unset_addr
	ip link set dev vx1 nomaster
	sleep 5

	# Bring it back with the address configured last.
	ip link set dev vx1 master br1
	for remote in 192.0.2.34 192.0.2.50; do
		bridge fdb append dev vx1 00:00:00:00:00:00 dst $remote self
	done
	sleep 1
	rp1_set_addr
	sleep 5
}

# Ping from $h1 to the local host H2 (192.0.2.2) and to the overlay hosts
# behind the two remote VTEPs (192.0.2.3 and 192.0.2.4).
ping_ipv4()
{
	local -a dips=(192.0.2.2 192.0.2.3 192.0.2.4)
	local -a whats=(": local->local"
			": local->remote 1"
			": local->remote 2")
	local idx

	for idx in "${!dips[@]}"; do
		ping_test $h1 ${dips[$idx]} "${whats[$idx]}"
	done
}

# Print the command prefix needed to run something in namespace $1:
# "in_ns <name>" when a name is given, an empty line otherwise. Callers use
# the output via command substitution as the leading words of a command.
maybe_in_ns()
{
	if [[ -n $1 ]]; then
		echo in_ns $1
	else
		echo
	fi
}

# Add or delete the ICMP capture rule that serves as a flood counter.
#
# Arguments:
#   $1 add_del  "add" or "del" (as passed by flood_counter_install/uninstall)
#   $2 dev      device to put the rule on
#   $3 ns       optional namespace in which to run the command
#
# Returns the status of the skip_sw attempt if it succeeds, otherwise the
# status of the skip_hw attempt.
__flood_counter_add_del()
{
	local add_del=$1 dev=$2 ns=$3
	local wrapper=$(maybe_in_ns $ns)

	# Putting the ICMP capture both to HW and to SW will end up
	# double-counting the packets that are trapped to slow path, such as for
	# the unicast test. Adding either skip_hw or skip_sw fixes this problem,
	# but with skip_hw, the flooded packets are not counted at all, because
	# those are dropped due to MAC address mismatch; and skip_sw is a no-go
	# for veth-based topologies.
	#
	# So try to install with skip_sw and fall back to skip_hw if that fails.
	# Errors of the first attempt are silenced on purpose.

	if ! $wrapper __icmp_capture_add_del \
			$add_del 100 "" $dev skip_sw 2>/dev/null; then
		$wrapper __icmp_capture_add_del \
			$add_del 100 "" $dev skip_hw
	fi
}

# Install the flood counter. All arguments are forwarded unchanged to
# __flood_counter_add_del: the device, optionally followed by a namespace.
flood_counter_install()
{
	__flood_counter_add_del add "$@"
}

# Remove the flood counter. All arguments are forwarded unchanged to
# __flood_counter_add_del: the device, optionally followed by a namespace.
flood_counter_uninstall()
{
	__flood_counter_add_del del "$@"
}

# Print the current value of the flood counter on device $1, i.e. the stats of
# the pref-100 ingress rule, optionally queried inside namespace $2.
flood_fetch_stat()
{
	local dev=$1 ns=$2
	local wrapper=$(maybe_in_ns $ns)

	$wrapper tc_rule_stats_get $dev 100 ingress
}

# Print one counter value per argument. Each argument is a "<dev> [<ns>]"
# string and is deliberately left unquoted, so that it splits into the
# separate arguments flood_fetch_stat expects.
flood_fetch_stats()
{
	local counter

	for counter in "$@"; do
		flood_fetch_stat $counter
	done
}

# Send 10 ICMP packets from $h1 and check how many of them arrive at each of
# the three observation points: $h2, vx2 in ns1 and vx2 in ns2.
#
# Arguments:
#   $1 mac   destination MAC of the generated packets
#   $2 dst   destination IP of the generated packets
#   $3..$5   expected packet count at $h2, vx2/ns1 and vx2/ns2, in this order
#
# Every mismatch is reported through check_err.
vxlan_flood_test()
{
	local mac=$1; shift
	local dst=$1; shift
	local -a expects=("${@}")

	local -a counters=($h2 "vx2 ns1" "vx2 ns2")
	local -a before
	local -a after
	local counter
	local idx

	for counter in "${counters[@]}"; do
		flood_counter_install $counter
	done

	before=($(flood_fetch_stats "${counters[@]}"))
	$MZ $h1 -c 10 -d 100msec -p 64 -b $mac -B $dst -t icmp -q
	sleep 1
	after=($(flood_fetch_stats "${counters[@]}"))

	for idx in ${!before[@]}; do
		local got=$((after[$idx] - before[$idx]))
		local want=${expects[$idx]}

		((want == got))
		check_err $? "${counters[$idx]}: Expected to capture $want packets, got $got."
	done

	for counter in "${counters[@]}"; do
		flood_counter_uninstall $counter
	done
}

# Run one flood test: packets to MAC $1 / IP $2 are expected to show up 10
# times at every observation point. $3 is the label used in the log line.
__test_flood()
{
	local mac=$1 dst=$2 what=$3

	RET=0

	vxlan_flood_test $mac $dst 10 10 10

	log_test "VXLAN: $what"
}

# Flood test with a destination MAC (de:ad:be:ef:13:37) for which this script
# installs no FDB entry at this point.
test_flood()
{
	local mac=de:ad:be:ef:13:37
	local dst=192.0.2.100

	__test_flood $mac $dst "flood"
}

# Add or delete a static FDB entry for a MAC, both on the device itself
# ("self") and on its bridge master ("master").
#
# Arguments:
#   $1 add_del  bridge fdb sub-command, "add" or "del"
#   $2 mac      MAC address of the entry
#   $3 dev      device the entry belongs to
#   $4 dst      optional remote IP; only then "dst <ip>" is passed for "self"
#
# Error output of both commands is discarded. The return status is that of
# the "master" command.
vxlan_fdb_add_del()
{
	local add_del=$1 mac=$2 dev=$3 dst=$4
	local -a remote=()

	if [[ -n $dst ]]; then
		remote=(dst $dst)
	fi

	bridge fdb $add_del dev $dev $mac self static permanent \
		"${remote[@]}" 2>/dev/null
	bridge fdb $add_del dev $dev $mac master static 2>/dev/null
}

# Run one unicast test: packets to MAC $1 / IP $2 must arrive 10 times at the
# observation point with index $3 (0: $h2, 1: vx2 in ns1, 2: vx2 in ns2) and
# not at all at the other two. $4 is the label used in the log line.
__test_unicast()
{
	local mac=$1 dst=$2 hit_idx=$3 what=$4
	local -a expects=(0 0 0)

	RET=0

	expects[$hit_idx]=10
	vxlan_flood_test $mac $dst "${expects[@]}"

	log_test "VXLAN: $what"
}

# Install static FDB entries for the MAC of $h2 (local) and for the two remote
# hosts (behind 192.0.2.34 and 192.0.2.50 on vx1), verify that traffic to each
# MAC reaches only the matching observation point, and remove the entries.
test_unicast()
{
	# Each entry is "<mac> <dev> [<remote ip>]"; it is expanded unquoted so
	# that it splits into the arguments of vxlan_fdb_add_del.
	local -a fdb_entries=("$h2_mac $h2"
			      "$r1_mac vx1 192.0.2.34"
			      "$r2_mac vx1 192.0.2.50")
	local entry

	for entry in "${fdb_entries[@]}"; do
		vxlan_fdb_add_del add $entry
	done

	__test_unicast $h2_mac 192.0.2.2 0 "local MAC unicast"
	__test_unicast $r1_mac 192.0.2.3 1 "remote MAC 1 unicast"
	__test_unicast $r2_mac 192.0.2.4 2 "remote MAC 2 unicast"

	for entry in "${fdb_entries[@]}"; do
		vxlan_fdb_add_del del $entry
	done
}

# Ping through the tunnel and check how many packets a tc rule captured.
#
# Arguments:
#   $1 ping_dev      device to ping from
#   $2 ping_dip      destination IP
#   $3 ping_args     extra ping arguments, passed on as one word (may be "")
#   $4 capture_dev   device carrying the capturing tc rule
#   $5 capture_dir   direction of that rule (e.g. egress)
#   $6 capture_pref  preference of that rule
#   $7 expect        expected number of captured packets
#
# Reports through check_err unless expect <= captured <= expect + 2.
vxlan_ping_test()
{
	local ping_dev=$1 ping_dip=$2 ping_args=$3
	local capture_dev=$4 capture_dir=$5 capture_pref=$6
	local expect=$7
	local before
	local after
	local delta

	before=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
	ping_do $ping_dev $ping_dip "$ping_args"
	after=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
	delta=$((after - before))

	# Tolerate a couple stray extra packets.
	((expect <= delta && delta <= expect + 2))
	check_err $? "$capture_dev: Expected to capture $expect packets, got $delta."
}

# Check the TTL of the envelope: pinging 192.0.2.3 from $h1 must produce 10
# packets with IP TTL 99 on v1 egress. vx1 is created with "ttl 100" in
# switch_create; the expected value is presumably one lower because the
# envelope is routed once on its way to v1.
test_ttl()
{
	local pref=77

	RET=0

	tc filter add dev v1 egress pref $pref prot ip \
		flower ip_ttl 99 action pass
	vxlan_ping_test $h1 192.0.2.3 "" v1 egress $pref 10
	tc filter del dev v1 egress pref $pref prot ip

	log_test "VXLAN: envelope TTL"
}

# Check TOS inheritance of the envelope (vx1 is created with "tos inherit"):
# with a filter for ip_tos 0x14 on v1 egress, pings sent with -Q 0x14 must be
# captured 10 times and pings sent with -Q 0x18 not at all.
test_tos()
{
	local pref=77

	RET=0

	tc filter add dev v1 egress pref $pref prot ip \
		flower ip_tos 0x14 action pass
	vxlan_ping_test $h1 192.0.2.3 "-Q 0x14" v1 egress $pref 10
	vxlan_ping_test $h1 192.0.2.3 "-Q 0x18" v1 egress $pref 0
	tc filter del dev v1 egress pref $pref prot ip

	log_test "VXLAN: envelope TOS inheritance"
}

# Check ECN handling on encapsulation: pings sent with "-Q $1" must leave v1
# with an envelope TOS of $2 (10 packets expected on the egress filter).
__test_ecn_encap()
{
	local q=$1 tos=$2
	local pref=77

	RET=0

	tc filter add dev v1 egress pref $pref prot ip \
		flower ip_tos $tos action pass
	sleep 1
	vxlan_ping_test $h1 192.0.2.3 "-Q $q" v1 egress $pref 10
	tc filter del dev v1 egress pref $pref prot ip

	log_test "VXLAN: ECN encap: $q->$tos"
}

# Run the ECN encapsulation check for all four inner ECN values.
test_ecn_encap()
{
	local mapping

	# Each entry is "<inner TOS> <expected outer TOS>", in accordance with
	# INET_ECN_encapsulate().
	for mapping in "0x00 0x00" "0x01 0x01" "0x02 0x02" "0x03 0x02"; do
		__test_ecn_encap $mapping
	done
}

vxlan_encapped_ping_do()
{
	local count=$1; shift
	local dev=$1; shift
	local next_hop_mac=$1; shift
	local dest_ip=$1; shift
	local dest_mac=$1; shift
	local inner_tos=$1; shift
	local outer_tos=$1; shift

	$MZ $dev -c $count -d 100msec -q \
		-b $next_hop_mac -B $dest_ip \
		-t udp tos=$outer_tos,sp=23456,dp=$VXPORT,p=$(:
		    )"08:"$(                      : VXLAN flags
		    )"00:00:00:"$(                : VXLAN reserved
		    )"00:03:e8:"$(                : VXLAN VNI
		    )"00:"$(                      : VXLAN reserved
		    )"$dest_mac:"$(               : ETH daddr
		    )"$(mac_get w2):"$(           : ETH saddr
		    )"08:00:"$(                   : ETH type
		    )"45:"$(                      : IP version + IHL
		    )"$inner_tos:"$(              : IP TOS
		    )"00:54:"$(                   : IP total length
		    )"99:83:"$(                   : IP identification
		    )"40:00:"$(                   : IP flags + frag off
		    )"40:"$(                      : IP TTL
		    )"01:"$(                      : IP proto
		    )"00:00:"$(                   : IP header csum
		    )"c0:00:02:03:"$(             : IP saddr: 192.0.2.3
		    )"c0:00:02:01:"$(             : IP daddr: 192.0.2.1
		    )"08:"$(                      : ICMP type
		    )"00:"$(                      : ICMP code
		    )"8b:f2:"$(                   : ICMP csum
		    )"1f:6a:"$(                   : ICMP request identifier
		    )"00:01:"$(                   : ICMP request sequence number
		    )"4f:ff:c5:5b:00:00:00:00:"$( : ICMP payload
		    )"6d:74:0b:00:00:00:00:00:"$( :
		    )"10:11:12:13:14:15:16:17:"$( :
		    )"18:19:1a:1b:1c:1d:1e:1f:"$( :
		    )"20:21:22:23:24:25:26:27:"$( :
		    )"28:29:2a:2b:2c:2d:2e:2f:"$( :
		    )"30:31:32:33:34:35:36:37"
}
export -f vxlan_encapped_ping_do

vxlan_encapped_ping_test()
{
	local ping_dev=$1; shift
	local nh_dev=$1; shift
	local ping_dip=$1; shift
	local inner_tos=$1; shift
	local outer_tos=$1; shift
	local stat_get=$1; shift
	local expect=$1; shift

	local t0=$($stat_get)

	in_ns ns1 \
		vxlan_encapped_ping_do 10 $ping_dev $(mac_get $nh_dev) \
			$ping_dip $(mac_get $h1) \
			$inner_tos $outer_tos

	local t1=$($stat_get)
	local delta=$((t1 - t0))

	# Tolerate a couple stray extra packets.
	((expect <= delta && delta <= expect + 2))
	check_err $? "Expected to capture $expect packets, got $delta."
}
export -f vxlan_encapped_ping_test

# Check ECN handling on decapsulation: packets injected from ns1 with inner
# TOS $1 and outer TOS $2 must reach $h1 with TOS $3. A dropping ingress
# filter on $h1 matching $3 serves as the counter; 10 hits are expected.
__test_ecn_decap()
{
	local orig_inner_tos=$1 orig_outer_tos=$2 decapped_tos=$3
	local pref=77

	RET=0

	tc filter add dev $h1 ingress pref $pref prot ip \
		flower ip_tos $decapped_tos action drop
	sleep 1
	vxlan_encapped_ping_test v2 v1 192.0.2.17 \
				 $orig_inner_tos $orig_outer_tos \
				 "tc_rule_stats_get $h1 $pref ingress" 10
	tc filter del dev $h1 ingress pref $pref

	log_test "VXLAN: ECN decap: $orig_outer_tos/$orig_inner_tos->$decapped_tos"
}

# Check the one ECN combination that is not expected to be delivered: inner
# 00 with outer 03. Instead of a tc rule, the value printed by
# "link_stats_rx_errors_get vx1" must grow by 10.
test_ecn_decap_error()
{
	local orig_inner_tos=00
	local orig_outer_tos=03
	local stat_cmd="link_stats_rx_errors_get vx1"

	RET=0

	vxlan_encapped_ping_test v2 v1 192.0.2.17 \
				 $orig_inner_tos $orig_outer_tos \
				 "$stat_cmd" 10

	log_test "VXLAN: ECN decap: $orig_outer_tos/$orig_inner_tos->error"
}

# Run the ECN decapsulation check for every inner/outer ECN combination.
test_ecn_decap()
{
	local combo

	# Each entry is "<inner TOS> <outer TOS> <expected decapsulated TOS>",
	# in accordance with INET_ECN_decapsulate(). The combination 00 03 is
	# tested in test_ecn_decap_error(), which runs last.
	for combo in \
		"00 00 0x00" "00 01 0x00" "00 02 0x00" \
		"01 00 0x01" "01 01 0x01" "01 02 0x01" "01 03 0x03" \
		"02 00 0x02" "02 01 0x01" "02 02 0x02" "02 03 0x03" \
		"03 00 0x03" "03 01 0x03" "03 02 0x03" "03 03 0x03"
	do
		__test_ecn_decap $combo
	done

	test_ecn_decap_error
}

# Exercise FDB learning on vx1. In order: flooding before anything is learned,
# creation of a learned entry, forwarding along the learned entry, manual
# deletion, ageing, and toggling of learning on the bridge port. The learning
# and ageing settings changed at the start are restored at the end.
test_learning()
{
	local mac=de:ad:be:ef:13:37
	local dst=192.0.2.100
	# One broadcast ICMP packet with source MAC $mac, sent from w2 in ns1.
	local -a learn_pkt=(in_ns ns1 $MZ w2 -c 1 -p 64 -a $mac
			    -b ff:ff:ff:ff:ff:ff -B $dst -t icmp -q)

	# Enable learning on the VxLAN device and set ageing time to 10 seconds
	ip link set dev br1 type bridge ageing_time 1000
	ip link set dev vx1 type vxlan ageing 10
	ip link set dev vx1 type vxlan learning
	reapply_config

	# Nothing is learned yet, so packets must be flooded everywhere.
	RET=0

	vxlan_flood_test $mac $dst 10 10 10

	log_test "VXLAN: flood before learning"

	# A packet sourced from $mac behind ns1 must create an entry both in
	# the VxLAN device ("self") and in the bridge (no "self").
	RET=0

	"${learn_pkt[@]}"
	sleep 1

	bridge fdb show brport vx1 | grep $mac | grep -q self
	check_err $?
	bridge fdb show brport vx1 | grep $mac | grep -q -v self
	check_err $?

	log_test "VXLAN: show learned FDB entry"

	# With the entry in place, packets must only reach host w2 in ns1.
	RET=0

	vxlan_flood_test $mac $dst 0 10 0

	log_test "VXLAN: learned FDB entry"

	# After deleting the learned entry from the VxLAN and bridge devices,
	# packets must be flooded again.
	RET=0

	bridge fdb del dev vx1 $mac master self
	sleep 1

	vxlan_flood_test $mac $dst 10 10 10

	log_test "VXLAN: deletion of learned FDB entry"

	# Learn the entry again and check that it is correctly aged out.
	RET=0

	"${learn_pkt[@]}"
	sleep 1

	bridge fdb show brport vx1 | grep $mac | grep -q self
	check_err $?
	bridge fdb show brport vx1 | grep $mac | grep -q -v self
	check_err $?

	vxlan_flood_test $mac $dst 0 10 0

	# Twice the ageing time configured above.
	sleep 20

	bridge fdb show brport vx1 | grep $mac | grep -q self
	check_fail $?
	bridge fdb show brport vx1 | grep $mac | grep -q -v self
	check_fail $?

	vxlan_flood_test $mac $dst 10 10 10

	log_test "VXLAN: Ageing of learned FDB entry"

	# Toggle learning on the bridge port: the bridge FDB must only be
	# populated while learning is on.
	RET=0

	ip link set dev vx1 type bridge_slave learning off

	"${learn_pkt[@]}"
	sleep 1

	bridge fdb show brport vx1 | grep $mac | grep -q -v self
	check_fail $?

	ip link set dev vx1 type bridge_slave learning on

	"${learn_pkt[@]}"
	sleep 1

	bridge fdb show brport vx1 | grep $mac | grep -q -v self
	check_err $?

	log_test "VXLAN: learning toggling on bridge port"

	# Restore previous settings
	ip link set dev vx1 type vxlan nolearning
	ip link set dev vx1 type vxlan ageing 300
	ip link set dev br1 type bridge ageing_time 30000
	reapply_config
}

# Print the UDP port under test, then hand over to tests_run (not defined in
# this file).
# NOTE(review): tests_run presumably iterates over ALL_TESTS -- confirm
# against lib.sh.
test_all()
{
	echo "Running tests with UDP port $VXPORT"
	tests_run
}

# Register cleanup before anything is configured, so that the topology is torn
# down on every exit path.
trap cleanup EXIT

# setup_wait is not defined in this file.
setup_prepare
setup_wait
test_all

# NOTE(review): EXIT_STATUS is not assigned anywhere in this file; presumably
# it is maintained by lib.sh -- confirm.
exit $EXIT_STATUS