// SPDX-License-Identifier: GPL-2.0

/*
 * Based on Christian Brauner's clone3() example.
 * These tests are assuming to be running in the host's
 * PID namespace.
 */

/* capabilities related code based on selftests/bpf/test_verifier.c */

#define _GNU_SOURCE
#include <errno.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/capability.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sched.h>

#include "../kselftest_harness.h"
#include "clone3_selftests.h"

#ifndef MAX_PID_NS_LEVEL
#define MAX_PID_NS_LEVEL 32
#endif

static void child_exit(int ret)
{
	fflush(stdout);
	fflush(stderr);
	_exit(ret);
}

static int call_clone3_set_tid(struct __test_metadata *_metadata,
			       pid_t *set_tid, size_t set_tid_size)
{
	int status;
	pid_t pid = -1;

	struct __clone_args args = {
		.exit_signal = SIGCHLD,
		.set_tid = ptr_to_u64(set_tid),
		.set_tid_size = set_tid_size,
	};

	pid = sys_clone3(&args, sizeof(args));
	if (pid < 0) {
		TH_LOG("%s - Failed to create new process", strerror(errno));
		return -errno;
	}

	if (pid == 0) {
		int ret;
		char tmp = 0;

		TH_LOG("I am the child, my PID is %d (expected %d)", getpid(), set_tid[0]);

		if (set_tid[0] != getpid())
			child_exit(EXIT_FAILURE);
		child_exit(EXIT_SUCCESS);
	}

	TH_LOG("I am the parent (%d). My child's pid is %d", getpid(), pid);

	if (waitpid(pid, &status, 0) < 0) {
		TH_LOG("Child returned %s", strerror(errno));
		return -errno;
	}

	if (!WIFEXITED(status))
		return -1;

	return WEXITSTATUS(status);
}

static int test_clone3_set_tid(struct __test_metadata *_metadata,
			       pid_t *set_tid, size_t set_tid_size)
{
	int ret;

	TH_LOG("[%d] Trying clone3() with CLONE_SET_TID to %d", getpid(), set_tid[0]);
	ret = call_clone3_set_tid(_metadata, set_tid, set_tid_size);
	TH_LOG("[%d] clone3() with CLONE_SET_TID %d says:%d", getpid(), set_tid[0], ret);
	return ret;
}

struct libcap {
	struct __user_cap_header_struct hdr;
	struct __user_cap_data_struct data[2];
};

static int set_capability(void)
{
	cap_value_t cap_values[] = { CAP_SETUID, CAP_SETGID };
	struct libcap *cap;
	int ret = -1;
	cap_t caps;

	caps = cap_get_proc();
	if (!caps) {
		perror("cap_get_proc");
		return -1;
	}

	/* Drop all capabilities */
	if (cap_clear(caps)) {
		perror("cap_clear");
		goto out;
	}

	cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_values, CAP_SET);
	cap_set_flag(caps, CAP_PERMITTED, 2, cap_values, CAP_SET);

	cap = (struct libcap *) caps;

	/* 40 -> CAP_CHECKPOINT_RESTORE */
	cap->data[1].effective |= 1 << (40 - 32);
	cap->data[1].permitted |= 1 << (40 - 32);

	if (cap_set_proc(caps)) {
		perror("cap_set_proc");
		goto out;
	}
	ret = 0;
out:
	if (cap_free(caps))
		perror("cap_free");
	return ret;
}

TEST(clone3_cap_checkpoint_restore)
{
	pid_t pid;
	int status;
	int ret = 0;
	pid_t set_tid[1];

	test_clone3_supported();

	EXPECT_EQ(getuid(), 0)
		SKIP(return, "Skipping all tests as non-root");

	memset(&set_tid, 0, sizeof(set_tid));

	/* Find the current active PID */
	pid = fork();
	if (pid == 0) {
		TH_LOG("Child has PID %d", getpid());
		child_exit(EXIT_SUCCESS);
	}
	ASSERT_GT(waitpid(pid, &status, 0), 0)
		TH_LOG("Waiting for child %d failed", pid);

	/* After the child has finished, its PID should be free. */
	set_tid[0] = pid;

	ASSERT_EQ(set_capability(), 0)
		TH_LOG("Could not set CAP_CHECKPOINT_RESTORE");

	ASSERT_EQ(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), 0);

	EXPECT_EQ(setgid(65534), 0)
		TH_LOG("Failed to setgid(65534)");
	ASSERT_EQ(setuid(65534), 0);

	set_tid[0] = pid;
	/* This would fail without CAP_CHECKPOINT_RESTORE */
	ASSERT_EQ(test_clone3_set_tid(_metadata, set_tid, 1), -EPERM);
	ASSERT_EQ(set_capability(), 0)
		TH_LOG("Could not set CAP_CHECKPOINT_RESTORE");
	/* This should work as we have CAP_CHECKPOINT_RESTORE as non-root */
	ASSERT_EQ(test_clone3_set_tid(_metadata, set_tid, 1), 0);
}

TEST_HARNESS_MAIN