#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Test that packets are sampled when tc-sample is used and that reported
# metadata is correct. Two sets of hosts (with and without LAG) are used, since
# metadata extraction in mlxsw is a bit different when LAG is involved.
#
# +---------------------------------+       +---------------------------------+
# | H1 (vrf)                        |       | H3 (vrf)                        |
# |    + $h1                        |       |    + $h3_lag                    |
# |    | 192.0.2.1/28               |       |    | 192.0.2.17/28              |
# |    |                            |       |    |                            |
# |    |  default via 192.0.2.2     |       |    |  default via 192.0.2.18    |
# +----|----------------------------+       +----|----------------------------+
#      |                                         |
# +----|-----------------------------------------|----------------------------+
# |    | 192.0.2.2/28                            | 192.0.2.18/28              |
# |    + $rp1                                    + $rp3_lag                   |
# |                                                                           |
# |    + $rp2                                    + $rp4_lag                   |
# |    | 198.51.100.2/28                         | 198.51.100.18/28           |
# +----|-----------------------------------------|----------------------------+
#      |                                         |
# +----|----------------------------+       +----|----------------------------+
# |    |  default via 198.51.100.2  |       |    |  default via 198.51.100.18 |
# |    |                            |       |    |                            |
# |    | 198.51.100.1/28            |       |    | 198.51.100.17/28           |
# |    + $h2                        |       |    + $h4_lag                    |
# | H2 (vrf)                        |       | H4 (vrf)                        |
# +---------------------------------+       +---------------------------------+

# Directory holding the shared forwarding selftest libraries, resolved relative
# to this script. "$0" and "$lib_dir" are quoted so that a checkout path that
# contains whitespace does not break the lookup or the sourcing below.
lib_dir=$(dirname "$0")/../../../net/forwarding

# Names of the test functions defined in this file.
ALL_TESTS="
	tc_sample_rate_test
	tc_sample_max_rate_test
	tc_sample_conflict_test
	tc_sample_group_conflict_test
	tc_sample_md_iif_test
	tc_sample_md_lag_iif_test
	tc_sample_md_oif_test
	tc_sample_md_lag_oif_test
	tc_sample_md_out_tc_test
	tc_sample_md_out_tc_occ_test
	tc_sample_md_latency_test
	tc_sample_acl_group_conflict_test
	tc_sample_acl_rate_test
	tc_sample_acl_max_rate_test
"
# setup_prepare reads NETIFS[p1] through NETIFS[p8], hence eight interfaces.
NUM_NETIFS=8
# Scratch file that psample_capture_start fills with psample output.
CAPTURE_FILE=$(mktemp)
source "$lib_dir/lib.sh"
source "$lib_dir/devlink_lib.sh"
source mlxsw_lib.sh

# Available at https://github.com/Mellanox/libpsample
require_command psample

# Set up host H1: hand $h1 and its 192.0.2.1/28 address to simple_if_init,
# then add an IPv4 default route in VRF v$h1 via 192.0.2.2.
h1_create()
{
	local -a default_route=(default vrf v$h1 nexthop via 192.0.2.2)

	simple_if_init $h1 192.0.2.1/28
	ip -4 route add "${default_route[@]}"
}

# Undo h1_create in reverse order: remove the default route from VRF v$h1,
# then release $h1 and its 192.0.2.1/28 address through simple_if_fini.
h1_destroy()
{
	local -a default_route=(default vrf v$h1 nexthop via 192.0.2.2)

	ip -4 route del "${default_route[@]}"
	simple_if_fini $h1 192.0.2.1/28
}

# Set up host H2: initialize $h2 with 198.51.100.1/28 and route everything in
# VRF v$h2 through 198.51.100.2.
h2_create()
{
	local h2_addr=198.51.100.1/28
	local h2_gw=198.51.100.2

	simple_if_init $h2 "$h2_addr"
	ip -4 route add default vrf v$h2 nexthop via "$h2_gw"
}

# Undo h2_create in reverse order: delete the default route of VRF v$h2, then
# release $h2 and its 198.51.100.1/28 address.
h2_destroy()
{
	local h2_addr=198.51.100.1/28
	local h2_gw=198.51.100.2

	ip -4 route del default vrf v$h2 nexthop via "$h2_gw"
	simple_if_fini $h2 "$h2_addr"
}

# Set up host H3 behind a LAG: take $h3 down, enslave it to a new 802.3ad bond
# named ${h3}_bond, initialize the bond with 192.0.2.17/28 and add a default
# route via 192.0.2.18 in the bond's VRF.
h3_create()
{
	local lag=${h3}_bond

	ip link set dev $h3 down
	ip link add name $lag type bond mode 802.3ad
	ip link set dev $h3 master $lag

	simple_if_init $lag 192.0.2.17/28

	ip -4 route add default vrf v$lag nexthop via 192.0.2.18
}

# Undo h3_create: drop the default route, release the bond and its address,
# detach $h3 from the bond and finally delete the bond device.
h3_destroy()
{
	local lag=${h3}_bond

	ip -4 route del default vrf v$lag nexthop via 192.0.2.18

	simple_if_fini $lag 192.0.2.17/28

	ip link set dev $h3 nomaster
	ip link del dev $lag
}

# Set up host H4 behind a LAG: take $h4 down, enslave it to a new 802.3ad bond
# named ${h4}_bond, initialize the bond with 198.51.100.17/28 and add a default
# route via 198.51.100.18 in the bond's VRF.
h4_create()
{
	local bond_if=${h4}_bond
	local -a default_route=(default vrf v${h4}_bond nexthop via 198.51.100.18)

	ip link set dev $h4 down
	ip link add name $bond_if type bond mode 802.3ad
	ip link set dev $h4 master $bond_if

	simple_if_init $bond_if 198.51.100.17/28

	ip -4 route add "${default_route[@]}"
}

# Undo h4_create: drop the default route, release the bond and its address,
# detach $h4 from the bond and finally delete the bond device.
h4_destroy()
{
	local bond_if=${h4}_bond
	local -a default_route=(default vrf v${h4}_bond nexthop via 198.51.100.18)

	ip -4 route del "${default_route[@]}"

	simple_if_fini $bond_if 198.51.100.17/28

	ip link set dev $h4 nomaster
	ip link del dev $bond_if
}

# Configure the four router ports. Each entry below is written as
# "<name of the port variable>:<address>"; the port variable is dereferenced
# with ${!port}.
router_create()
{
	local spec port

	# Plain router ports facing H1 and H2: bring the port up, assign the
	# address and attach a clsact qdisc for the sampling filters.
	for spec in rp1:192.0.2.2/28 rp2:198.51.100.2/28; do
		port=${spec%%:*}
		ip link set dev ${!port} up
		__addr_add_del ${!port} add "${spec#*:}"
		tc qdisc add dev ${!port} clsact
	done

	# LAG router ports facing H3 and H4: the address goes on the 802.3ad
	# bond, while the clsact qdisc is attached to the member port itself.
	for spec in rp3:192.0.2.18/28 rp4:198.51.100.18/28; do
		port=${spec%%:*}
		ip link add name ${!port}_bond type bond mode 802.3ad
		ip link set dev ${!port} master ${!port}_bond
		__addr_add_del ${!port}_bond add "${spec#*:}"
		tc qdisc add dev ${!port} clsact
		ip link set dev ${!port}_bond up
	done
}

# Tear the router ports down in the reverse order of router_create. Each entry
# below is written as "<name of the port variable>:<address>"; the port
# variable is dereferenced with ${!port}.
router_destroy()
{
	local spec port

	# LAG ports first ($rp4, then $rp3): bond down, qdisc and address
	# removed, member released, bond deleted.
	for spec in rp4:198.51.100.18/28 rp3:192.0.2.18/28; do
		port=${spec%%:*}
		ip link set dev ${!port}_bond down
		tc qdisc del dev ${!port} clsact
		__addr_add_del ${!port}_bond del "${spec#*:}"
		ip link set dev ${!port} nomaster
		ip link del dev ${!port}_bond
	done

	# Then the plain ports ($rp2, then $rp1).
	for spec in rp2:198.51.100.2/28 rp1:192.0.2.2/28; do
		port=${spec%%:*}
		tc qdisc del dev ${!port} clsact
		__addr_add_del ${!port} del "${spec#*:}"
		ip link set dev ${!port} down
	done
}

# Map the NETIFS entries p1..p8 onto the global port variables used throughout
# this file, then build the topology: VRF preparation, the four hosts and
# finally the router.
setup_prepare()
{
	local step

	# Host-facing ends of the four links.
	h1=${NETIFS[p1]}
	h2=${NETIFS[p4]}
	h3=${NETIFS[p5]}
	h4=${NETIFS[p7]}

	# Router-facing ends of the four links.
	rp1=${NETIFS[p2]}
	rp2=${NETIFS[p3]}
	rp3=${NETIFS[p6]}
	rp4=${NETIFS[p8]}

	for step in vrf_prepare h1_create h2_create h3_create h4_create \
		    router_create; do
		$step
	done
}

# EXIT handler: run pre_cleanup, delete the capture file, then tear the
# topology down in the reverse order of setup_prepare.
cleanup()
{
	local teardown

	pre_cleanup

	rm -f $CAPTURE_FILE

	for teardown in router_destroy h4_destroy h3_destroy h2_destroy \
			h1_destroy; do
		$teardown
	done

	vrf_cleanup
}

# Start a fresh capture: remove the previous capture file, launch psample in
# the background with stdout and stderr both written to $CAPTURE_FILE, then
# sleep for one second before returning.
psample_capture_start()
{
	rm -f $CAPTURE_FILE

	psample > $CAPTURE_FILE 2>&1 &

	sleep 1
}

# Stop the capture: signal the current background job (%%) and, if the signal
# was delivered, wait for the job to finish. Diagnostics from kill and wait
# are discarded.
psample_capture_stop()
{
	{
		kill %% &&
			wait %%
	} 2>/dev/null
}

# Check the accuracy of matchall sampling on $rp1 ingress.
#
# $1 - description appended to the test name
# $2 - destination IP of the generated traffic
#
# 320000 packets are sent with a sampling rate of 1:32, so 10000 samples of
# group 1 are expected; a deviation of up to 25% is tolerated.
__tc_sample_rate_test()
{
	local desc=$1; shift
	local dip=$1; shift
	local -a rule=(dev $rp1 ingress protocol all pref 1 handle 101 matchall)
	local sampled deviation

	RET=0

	tc filter add "${rule[@]}" skip_sw action sample rate 32 group 1
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip vrf exec v$h1 $MZ $h1 -c 320000 -d 100usec -p 64 -A 192.0.2.1 \
		-B $dip -t udp dp=52768,sp=42768 -q
	psample_capture_stop

	# One captured line per sampled packet.
	sampled=$(grep -c -e "group 1 " $CAPTURE_FILE)
	deviation=$((100 * (sampled - 10000) / 10000))
	((-25 <= deviation && deviation <= 25))
	check_err $? "Expected 10000 packets, got $sampled packets, which is $deviation% off. Required accuracy is +-25%"

	log_test "tc sample rate ($desc)"

	tc filter del "${rule[@]}"
}

# Run the sampling rate check twice: once with traffic addressed to H2
# ("forward") and once with traffic addressed to the router port address
# 192.0.2.2 ("local receive").
tc_sample_rate_test()
{
	local -a forwarded=("forward" 198.51.100.1)
	local -a local_rx=("local receive" 192.0.2.2)

	__tc_sample_rate_test "${forwarded[@]}"
	__tc_sample_rate_test "${local_rx[@]}"
}

# Check the upper bound of the matchall sampling rate: 3,500,000,000 must be
# accepted and 3,500,000,001 must be rejected.
tc_sample_max_rate_test()
{
	local -a rule=(dev $rp1 ingress protocol all pref 1 handle 101 matchall)
	local max_rate=$((35 * 10 ** 8))

	RET=0

	tc filter add "${rule[@]}" skip_sw action sample rate "$max_rate" group 1
	check_err $? "Failed to configure sampling rule with max rate"

	tc filter del "${rule[@]}"

	# One above the maximum; the expected error output is discarded.
	tc filter add "${rule[@]}" skip_sw action sample \
		rate "$((max_rate + 1))" group 1 &> /dev/null
	check_fail $? "Managed to configure sampling rate above maximum"

	log_test "tc sample maximum rate"
}

# Check that a port accepts only one matchall sampling rule at a time, even
# when the second rule uses the same parameters, and that the second rule
# becomes installable once the first one is deleted.
tc_sample_conflict_test()
{
	local -a rule_101=(dev $rp1 ingress protocol all pref 1 handle 101 matchall)
	local -a rule_102=(dev $rp1 ingress protocol all pref 2 handle 102 matchall)
	local -a sample_action=(skip_sw action sample rate 1024 group 1)

	RET=0

	tc filter add "${rule_101[@]}" "${sample_action[@]}"
	check_err $? "Failed to configure sampling rule"

	# Identical parameters, same port: expected to be refused.
	tc filter add "${rule_102[@]}" "${sample_action[@]}" &> /dev/null
	check_fail $? "Managed to configure second sampling rule"

	# With the first rule gone, the second one must be accepted.
	tc filter del "${rule_101[@]}"

	tc filter add "${rule_102[@]}" "${sample_action[@]}"
	check_err $? "Failed to configure sampling rule after deletion"

	log_test "tc sample conflict test"

	tc filter del "${rule_102[@]}"
}

# Check that a second matchall sampling rule using a different psample group
# is refused on a port that already has a sampling rule.
tc_sample_group_conflict_test()
{
	local -a port_filter=(dev $rp1 ingress protocol all)
	local -a sample_1024=(skip_sw action sample rate 1024)

	RET=0

	tc filter add "${port_filter[@]}" pref 1 handle 101 matchall \
		"${sample_1024[@]}" group 1
	check_err $? "Failed to configure sampling rule"

	# Same port, group 2 instead of group 1: expected to be refused.
	tc filter add "${port_filter[@]}" pref 2 handle 102 matchall \
		"${sample_1024[@]}" group 2 &> /dev/null
	check_fail $? "Managed to configure sampling rule with conflicting group"

	log_test "tc sample group conflict test"

	tc filter del "${port_filter[@]}" pref 1 handle 101 matchall
}

# Check the in-ifindex metadata: packets sampled on $rp1 ingress must be
# reported with the ifindex of $rp1.
tc_sample_md_iif_test()
{
	local -a rule=(dev $rp1 ingress protocol all pref 1 handle 101 matchall)
	local ingress_ifindex

	RET=0

	tc filter add "${rule[@]}" skip_sw action sample rate 5 group 1
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
		-B 198.51.100.1 -t udp dp=52768,sp=42768 -q
	psample_capture_stop

	ingress_ifindex=$(ip -j -p link show dev $rp1 | jq '.[]["ifindex"]')
	grep -q -e "in-ifindex $ingress_ifindex " $CAPTURE_FILE
	check_err $? "Sampled packets do not have expected in-ifindex"

	log_test "tc sample iif"

	tc filter del "${rule[@]}"
}

# Check the in-ifindex metadata with a LAG: traffic enters through the bond
# member $rp3, and samples must be reported with the ifindex of $rp3.
tc_sample_md_lag_iif_test()
{
	local -a rule=(dev $rp3 ingress protocol all pref 1 handle 101 matchall)
	local h3_lag=${h3}_bond
	local member_ifindex

	RET=0

	tc filter add "${rule[@]}" skip_sw action sample rate 5 group 1
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip vrf exec v$h3_lag $MZ $h3_lag -c 3200 -d 1msec -p 64 \
		-A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q
	psample_capture_stop

	member_ifindex=$(ip -j -p link show dev $rp3 | jq '.[]["ifindex"]')
	grep -q -e "in-ifindex $member_ifindex " $CAPTURE_FILE
	check_err $? "Sampled packets do not have expected in-ifindex"

	log_test "tc sample lag iif"

	tc filter del "${rule[@]}"
}

# Check the out-ifindex metadata: packets sampled on $rp1 ingress and sent
# towards 198.51.100.1 must be reported with the ifindex of $rp2.
tc_sample_md_oif_test()
{
	local -a rule=(dev $rp1 ingress protocol all pref 1 handle 101 matchall)
	local -a traffic=(vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64
			  -A 192.0.2.1 -B 198.51.100.1 -t udp
			  dp=52768,sp=42768 -q)
	local egress_ifindex

	RET=0

	tc filter add "${rule[@]}" skip_sw action sample rate 5 group 1
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip "${traffic[@]}"
	psample_capture_stop

	egress_ifindex=$(ip -j -p link show dev $rp2 | jq '.[]["ifindex"]')
	grep -q -e "out-ifindex $egress_ifindex " $CAPTURE_FILE
	check_err $? "Sampled packets do not have expected out-ifindex"

	log_test "tc sample oif"

	tc filter del "${rule[@]}"
}

# Check the out-ifindex metadata with a LAG: packets sampled on $rp3 ingress
# and sent towards 198.51.100.17 must be reported with the ifindex of the bond
# member $rp4.
tc_sample_md_lag_oif_test()
{
	local -a rule=(dev $rp3 ingress protocol all pref 1 handle 101 matchall)
	local -a traffic=(vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec
			  -p 64 -A 192.0.2.17 -B 198.51.100.17 -t udp
			  dp=52768,sp=42768 -q)
	local member_ifindex

	RET=0

	tc filter add "${rule[@]}" skip_sw action sample rate 5 group 1
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip "${traffic[@]}"
	psample_capture_stop

	member_ifindex=$(ip -j -p link show dev $rp4 | jq '.[]["ifindex"]')
	grep -q -e "out-ifindex $member_ifindex " $CAPTURE_FILE
	check_err $? "Sampled packets do not have expected out-ifindex"

	log_test "tc sample lag oif"

	tc filter del "${rule[@]}"
}

# Check the out-tc metadata in two rounds: first with the default egress
# configuration on $rp2 (traffic class 0 expected), then with a prio qdisc
# whose priomap sends every priority to band 0 (traffic class 7 expected).
# Skipped unless mlxsw_only_on_spectrum accepts "2+".
tc_sample_md_out_tc_test()
{
	local -a rule=(dev $rp1 ingress protocol all pref 1 handle 101 matchall)
	local -a traffic=(vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64
			  -A 192.0.2.1 -B 198.51.100.1 -t udp
			  dp=52768,sp=42768 -q)

	RET=0

	# Output traffic class is not supported on Spectrum-1.
	mlxsw_only_on_spectrum 2+ || return

	tc filter add "${rule[@]}" skip_sw action sample rate 5 group 1
	check_err $? "Failed to configure sampling rule"

	# Round 1: by default, all the packets should go to the same traffic
	# class (0).
	psample_capture_start
	ip "${traffic[@]}"
	psample_capture_stop

	grep -q -e "out-tc 0 " $CAPTURE_FILE
	check_err $? "Sampled packets do not have expected out-tc (0)"

	# Round 2: map all priorities to highest traffic class (7) and check
	# reported out-tc.
	tc qdisc replace dev $rp2 root handle 1: \
		prio bands 3 priomap 0 0 0 0 0 0 0 0

	psample_capture_start
	ip "${traffic[@]}"
	psample_capture_stop

	grep -q -e "out-tc 7 " $CAPTURE_FILE
	check_err $? "Sampled packets do not have expected out-tc (7)"

	log_test "tc sample out-tc"

	tc qdisc del dev $rp2 root handle 1:
	tc filter del "${rule[@]}"
}

# Check the out-tc-occ metadata: congest $rp2 with a 1Mbit tbf shaper, read
# the qdisc backlog after ten seconds of traffic and compare it with the
# out-tc-occ value of the last sampled packet. The two must agree to within
# 1%. Skipped unless mlxsw_only_on_spectrum accepts "2+".
tc_sample_md_out_tc_occ_test()
{
	local backlog pct occ

	RET=0

	# Output traffic class occupancy is not supported on Spectrum-1.
	mlxsw_only_on_spectrum 2+ || return

	tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
		skip_sw action sample rate 1024 group 1
	check_err $? "Failed to configure sampling rule"

	# Configure a shaper on egress to create congestion.
	tc qdisc replace dev $rp2 root handle 1: \
		tbf rate 1Mbit burst 256k limit 1M

	psample_capture_start

	# "-c 0" keeps mausezahn running until it is killed below.
	ip vrf exec v$h1 $MZ $h1 -c 0 -d 1usec -p 1400 -A 192.0.2.1 \
		-B 198.51.100.1 -t udp dp=52768,sp=42768 -q &

	# Allow congestion to reach steady state.
	sleep 10

	backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]')

	# Kill mausezahn. It is the most recently started background job, so
	# %% refers to it here; psample becomes the current job again
	# afterwards.
	{ kill %% && wait %%; } 2>/dev/null

	psample_capture_stop

	# Record last congestion sample.
	occ=$(grep -e "out-tc-occ " $CAPTURE_FILE | tail -n 1 | \
		cut -d ' ' -f 16)

	# Both values feed shell arithmetic. A zero, missing ("null") or
	# non-numeric backlog would raise a division-by-zero or syntax error
	# in the shell instead of a test failure, so validate the inputs and
	# report a proper error.
	if ! [[ $backlog =~ ^[1-9][0-9]*$ ]]; then
		check_err 1 "Failed to read a non-zero backlog from $rp2, got '$backlog'"
	elif ! [[ $occ =~ ^[0-9]+$ ]]; then
		check_err 1 "Failed to extract out-tc-occ from sampled packets, got '$occ'"
	else
		pct=$((100 * (occ - backlog) / backlog))
		(( -1 <= pct && pct <= 1))
		# The message states the tolerance that is actually enforced.
		check_err $? "Recorded a congestion of $backlog bytes, but sampled congestion is $occ bytes, which is $pct% off. Required accuracy is +-1%"
	fi

	log_test "tc sample out-tc-occ"

	tc qdisc del dev $rp2 root handle 1:
	tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
}

# Check that packets sampled on $rp2 egress carry a latency attribute.
# Skipped unless mlxsw_only_on_spectrum accepts "2+".
tc_sample_md_latency_test()
{
	local -a rule=(dev $rp2 egress protocol all pref 1 handle 101 matchall)

	RET=0

	# Egress sampling not supported on Spectrum-1.
	mlxsw_only_on_spectrum 2+ || return

	tc filter add "${rule[@]}" skip_sw action sample rate 5 group 1
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
		-B 198.51.100.1 -t udp dp=52768,sp=42768 -q
	psample_capture_stop

	grep -q -e "latency " $CAPTURE_FILE
	check_err $? "Sampled packets do not have latency attribute"

	log_test "tc sample latency"

	tc filter del "${rule[@]}"
}

# Check that flower sampling rules on the same port may share a psample group
# but may not use different groups. Skipped unless mlxsw_only_on_spectrum
# accepts "2+".
tc_sample_acl_group_conflict_test()
{
	local -a port_filter=(dev $rp1 ingress protocol ip)
	local -a sample_1024=(skip_sw action sample rate 1024)

	RET=0

	# Policy-based sampling is not supported on Spectrum-1.
	mlxsw_only_on_spectrum 2+ || return

	tc filter add "${port_filter[@]}" pref 1 handle 101 flower \
		"${sample_1024[@]}" group 1
	check_err $? "Failed to configure sampling rule"

	# A second rule with the same group is fine.
	tc filter add "${port_filter[@]}" pref 2 handle 102 flower \
		"${sample_1024[@]}" group 1
	check_err $? "Failed to configure sampling rule with same group"

	# A third rule with another group is expected to be refused.
	tc filter add "${port_filter[@]}" pref 3 handle 103 flower \
		"${sample_1024[@]}" group 2 &> /dev/null
	check_fail $? "Managed to configure sampling rule with conflicting group"

	log_test "tc sample (w/ flower) group conflict test"

	tc filter del "${port_filter[@]}" pref 2 handle 102 flower
	tc filter del "${port_filter[@]}" pref 1 handle 101 flower
}

# Check the accuracy of flower-based sampling.
#
# $1 - bind point of the filter ("ingress" or "egress")
# $2 - port the filter is installed on
#
# Round 1 matches the generated traffic (dst_ip 198.51.100.1) at a rate of
# 1:32 and expects 10000 samples (+-25%) out of 320000 packets. Round 2
# matches a destination that the traffic does not use and expects no samples.
# Skipped unless mlxsw_only_on_spectrum accepts "2+".
__tc_sample_acl_rate_test()
{
	local bind=$1; shift
	local port=$1; shift
	local -a rule=(dev $port $bind protocol ip pref 1 handle 101 flower)
	local -a sample_32=(action sample rate 32 group 1)
	local sampled deviation

	RET=0

	# Policy-based sampling is not supported on Spectrum-1.
	mlxsw_only_on_spectrum 2+ || return

	tc filter add "${rule[@]}" skip_sw dst_ip 198.51.100.1 "${sample_32[@]}"
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip vrf exec v$h1 $MZ $h1 -c 320000 -d 100usec -p 64 -A 192.0.2.1 \
		-B 198.51.100.1 -t udp dp=52768,sp=42768 -q
	psample_capture_stop

	# One captured line per sampled packet.
	sampled=$(grep -c -e "group 1 " $CAPTURE_FILE)
	deviation=$((100 * (sampled - 10000) / 10000))
	((-25 <= deviation && deviation <= 25))
	check_err $? "Expected 10000 packets, got $sampled packets, which is $deviation% off. Required accuracy is +-25%"

	# Setup a filter that should not match any packet and make sure packets
	# are not sampled.
	tc filter del "${rule[@]}"

	tc filter add "${rule[@]}" skip_sw dst_ip 198.51.100.10 "${sample_32[@]}"
	check_err $? "Failed to configure sampling rule"

	psample_capture_start
	ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
		-B 198.51.100.1 -t udp dp=52768,sp=42768 -q
	psample_capture_stop

	grep -q -e "group 1 " $CAPTURE_FILE
	check_fail $? "Sampled packets when should not"

	log_test "tc sample (w/ flower) rate ($bind)"

	tc filter del "${rule[@]}"
}

# Run the flower sampling rate check for both bind points: ingress of $rp1
# and egress of $rp2.
tc_sample_acl_rate_test()
{
	local -a ingress_case=(ingress $rp1)
	local -a egress_case=(egress $rp2)

	__tc_sample_acl_rate_test "${ingress_case[@]}"
	__tc_sample_acl_rate_test "${egress_case[@]}"
}

# Check the upper bound of the flower sampling rate: 2^24 - 1 must be accepted
# and 2^24 must be rejected. Skipped unless mlxsw_only_on_spectrum accepts
# "2+".
tc_sample_acl_max_rate_test()
{
	local -a rule=(dev $rp1 ingress protocol ip pref 1 handle 101 flower)
	local max_rate=$((2 ** 24 - 1))

	RET=0

	# Policy-based sampling is not supported on Spectrum-1.
	mlxsw_only_on_spectrum 2+ || return

	tc filter add "${rule[@]}" skip_sw action sample rate "$max_rate" group 1
	check_err $? "Failed to configure sampling rule with max rate"

	tc filter del "${rule[@]}"

	# One above the maximum; the expected error output is discarded.
	tc filter add "${rule[@]}" skip_sw action sample \
		rate "$((max_rate + 1))" group 1 &> /dev/null
	check_fail $? "Managed to configure sampling rate above maximum"

	log_test "tc sample (w/ flower) maximum rate"
}

# Register cleanup as the EXIT handler before anything is configured, so the
# teardown runs whichever way the script exits.
trap cleanup EXIT

# Build the topology. setup_wait is not defined in this file; presumably it
# comes from lib.sh and waits for the interfaces to become ready.
setup_prepare
setup_wait

# tests_run is not defined in this file; presumably lib.sh provides it and it
# runs the functions listed in ALL_TESTS.
tests_run

# EXIT_STATUS is not assigned in this file; presumably the lib.sh helpers
# maintain it.
exit $EXIT_STATUS