#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# A test for switch behavior under MC overload. An issue in Spectrum chips
# causes throughput of UC traffic to drop severely when a switch is under heavy
# MC load. This issue can be overcome by putting the switch to MC-aware mode.
# This test verifies that UC performance stays intact even as the switch is
# under MC flood, and therefore that the MC-aware mode is enabled and correctly
# configured.
#
# Because mlxsw throttles CPU port, the traffic can't actually reach userspace
# at full speed. That makes it impossible to use iperf3 to simply measure the
# throughput, because many packets (that reach $h3) don't get to the kernel at
# all even in UDP mode (the situation is even worse in TCP mode, where one can't
# hope to see more than a couple Mbps).
#
# So instead we send traffic with mausezahn and use RX ethtool counters at $h3.
# Multicast traffic is untagged, unicast traffic is tagged with PCP 1. Therefore
# each gets a different priority and we can use per-prio ethtool counters to
# measure the throughput. In order to avoid prioritizing unicast traffic, prio
# qdisc is installed on $swp3 and maps all priorities to the same band #7 (and
# thus TC 0).
#
# Mausezahn can't actually saturate the links unless it's using large frames.
# Thus we set MTU to 10K on all involved interfaces. Then both unicast and
# multicast traffic uses 8K frames.
#
# +---------------------------+            +----------------------------------+
# | H1                        |            |                               H2 |
# |                           |            |  unicast --> + $h2.111           |
# |                 multicast |            |  traffic     | 192.0.2.129/28    |
# |                 traffic   |            |              | e-qos-map 0:1     |
# |           $h1 + <-----    |            |              |                   |
# | 192.0.2.65/28 |           |            |              + $h2               |
# +---------------|-----------+            +--------------|-------------------+
#                 |                                       |
# +---------------|---------------------------------------|-------------------+
# |         $swp1 +                                       + $swp2             |
# |        >1Gbps |                                       | >1Gbps            |
# | +-------------|------+                     +----------|----------------+  |
# | |     $swp1.1 +      |                     |          + $swp2.111      |  |
# | |                BR1 |             SW      | BR111                     |  |
# | |     $swp3.1 +      |                     |          + $swp3.111      |  |
# | +-------------|------+                     +----------|----------------+  |
# |               \_______________________________________/                   |
# |                                    |                                      |
# |                                    + $swp3                                |
# |                                    | 1Gbps bottleneck                     |
# |                                    | prio qdisc: {0..7} -> 7              |
# +------------------------------------|--------------------------------------+
#                                      |
#                                   +--|-----------------+
#                                   |  + $h3          H3 |
#                                   |  | 192.0.2.66/28   |
#                                   |  |                 |
#                                   |  + $h3.111         |
#                                   |    192.0.2.130/28  |
#                                   +--------------------+

ALL_TESTS="
	ping_ipv4
	test_mc_aware
	test_uc_aware
"

lib_dir=$(dirname $0)/../../../net/forwarding

NUM_NETIFS=6
source $lib_dir/lib.sh
source $lib_dir/devlink_lib.sh
source qos_lib.sh

h1_create()
{
	simple_if_init $h1 192.0.2.65/28
	mtu_set $h1 10000
}

h1_destroy()
{
	mtu_restore $h1
	simple_if_fini $h1 192.0.2.65/28
}

h2_create()
{
	simple_if_init $h2
	mtu_set $h2 10000

	vlan_create $h2 111 v$h2 192.0.2.129/28
	ip link set dev $h2.111 type vlan egress-qos-map 0:1
}

h2_destroy()
{
	vlan_destroy $h2 111

	mtu_restore $h2
	simple_if_fini $h2
}

h3_create()
{
	simple_if_init $h3 192.0.2.66/28
	mtu_set $h3 10000

	vlan_create $h3 111 v$h3 192.0.2.130/28
}

h3_destroy()
{
	vlan_destroy $h3 111

	mtu_restore $h3
	simple_if_fini $h3 192.0.2.66/28
}

switch_create()
{
	ip link set dev $swp1 up
	mtu_set $swp1 10000

	ip link set dev $swp2 up
	mtu_set $swp2 10000

	ip link set dev $swp3 up
	mtu_set $swp3 10000

	vlan_create $swp2 111
	vlan_create $swp3 111

	tc qdisc replace dev $swp3 root handle 3: tbf rate 1gbit \
		burst 128K limit 1G
	tc qdisc replace dev $swp3 parent 3:3 handle 33: \
		prio bands 8 priomap 7 7 7 7 7 7 7 7

	ip link add name br1 type bridge vlan_filtering 0
	ip link set dev br1 addrgenmode none
	ip link set dev br1 up
	ip link set dev $swp1 master br1
	ip link set dev $swp3 master br1

	ip link add name br111 type bridge vlan_filtering 0
	ip link set dev br111 addrgenmode none
	ip link set dev br111 up
	ip link set dev $swp2.111 master br111
	ip link set dev $swp3.111 master br111

	# Make sure that ingress quotas are smaller than egress so that there is
	# room for both streams of traffic to be admitted to shared buffer.
	devlink_port_pool_th_save $swp1 0
	devlink_port_pool_th_set $swp1 0 5
	devlink_tc_bind_pool_th_save $swp1 0 ingress
	devlink_tc_bind_pool_th_set $swp1 0 ingress 0 5

	devlink_port_pool_th_save $swp2 0
	devlink_port_pool_th_set $swp2 0 5
	devlink_tc_bind_pool_th_save $swp2 1 ingress
	devlink_tc_bind_pool_th_set $swp2 1 ingress 0 5

	devlink_port_pool_th_save $swp3 4
	devlink_port_pool_th_set $swp3 4 12
}

switch_destroy()
{
	devlink_port_pool_th_restore $swp3 4

	devlink_tc_bind_pool_th_restore $swp2 1 ingress
	devlink_port_pool_th_restore $swp2 0

	devlink_tc_bind_pool_th_restore $swp1 0 ingress
	devlink_port_pool_th_restore $swp1 0

	ip link del dev br111
	ip link del dev br1

	tc qdisc del dev $swp3 parent 3:3 handle 33:
	tc qdisc del dev $swp3 root handle 3:

	vlan_destroy $swp3 111
	vlan_destroy $swp2 111

	mtu_restore $swp3
	ip link set dev $swp3 down

	mtu_restore $swp2
	ip link set dev $swp2 down

	mtu_restore $swp1
	ip link set dev $swp1 down
}

setup_prepare()
{
	h1=${NETIFS[p1]}
	swp1=${NETIFS[p2]}

	swp2=${NETIFS[p3]}
	h2=${NETIFS[p4]}

	swp3=${NETIFS[p5]}
	h3=${NETIFS[p6]}

	h3mac=$(mac_get $h3)

	vrf_prepare

	h1_create
	h2_create
	h3_create
	switch_create
}

cleanup()
{
	pre_cleanup

	switch_destroy
	h3_destroy
	h2_destroy
	h1_destroy

	vrf_cleanup
}

ping_ipv4()
{
	ping_test $h2 192.0.2.130
}

test_mc_aware()
{
	RET=0

	local -a uc_rate
	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
	uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_1 "UC-only"))
	check_err $? "Could not get high enough UC-only ingress rate"
	stop_traffic
	local ucth1=${uc_rate[1]}

	start_traffic $h1 192.0.2.65 bc bc

	local d0=$(date +%s)
	local t0=$(ethtool_stats_get $h3 rx_octets_prio_0)
	local u0=$(ethtool_stats_get $swp1 rx_octets_prio_0)

	local -a uc_rate_2
	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
	uc_rate_2=($(measure_rate $swp2 $h3 rx_octets_prio_1 "UC+MC"))
	check_err $? "Could not get high enough UC+MC ingress rate"
	stop_traffic
	local ucth2=${uc_rate_2[1]}

	local d1=$(date +%s)
	local t1=$(ethtool_stats_get $h3 rx_octets_prio_0)
	local u1=$(ethtool_stats_get $swp1 rx_octets_prio_0)

	local deg=$(bc <<< "
			scale=2
			ret = 100 * ($ucth1 - $ucth2) / $ucth1
			if (ret > 0) { ret } else { 0 }
		    ")

	# Minimum shaper of 200Mbps on MC TCs should cause about 20% of
	# degradation on 1Gbps link.
	check_err $(bc <<< "$deg < 15") "Minimum shaper not in effect"
	check_err $(bc <<< "$deg > 25") "MC traffic degrades UC performance too much"

	local interval=$((d1 - d0))
	local mc_ir=$(rate $u0 $u1 $interval)
	local mc_er=$(rate $t0 $t1 $interval)

	stop_traffic

	log_test "UC performance under MC overload"

	echo "UC-only throughput  $(humanize $ucth1)"
	echo "UC+MC throughput    $(humanize $ucth2)"
	echo "Degradation         $deg %"
	echo
	echo "Full report:"
	echo "  UC only:"
	echo "    ingress UC throughput $(humanize ${uc_rate[0]})"
	echo "    egress UC throughput  $(humanize ${uc_rate[1]})"
	echo "  UC+MC:"
	echo "    ingress UC throughput $(humanize ${uc_rate_2[0]})"
	echo "    egress UC throughput  $(humanize ${uc_rate_2[1]})"
	echo "    ingress MC throughput $(humanize $mc_ir)"
	echo "    egress MC throughput  $(humanize $mc_er)"
	echo
}

test_uc_aware()
{
	RET=0

	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac

	local d0=$(date +%s)
	local t0=$(ethtool_stats_get $h3 rx_octets_prio_1)
	local u0=$(ethtool_stats_get $swp2 rx_octets_prio_1)
	sleep 1

	local attempts=50
	local passes=0
	local i

	for ((i = 0; i < attempts; ++i)); do
		if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then
			((passes++))
		fi

		sleep 0.1
	done

	local d1=$(date +%s)
	local t1=$(ethtool_stats_get $h3 rx_octets_prio_1)
	local u1=$(ethtool_stats_get $swp2 rx_octets_prio_1)

	local interval=$((d1 - d0))
	local uc_ir=$(rate $u0 $u1 $interval)
	local uc_er=$(rate $t0 $t1 $interval)

	((attempts == passes))
	check_err $?

	stop_traffic

	log_test "MC performance under UC overload"
	echo "    ingress UC throughput $(humanize ${uc_ir})"
	echo "    egress UC throughput  $(humanize ${uc_er})"
	echo "    sent $attempts BC ARPs, got $passes responses"
}

trap cleanup EXIT

setup_prepare
setup_wait

tests_run

exit $EXIT_STATUS