// SPDX-License-Identifier: GPL-2.0-only
/*
 * KSM functional tests
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/userfaultfd.h>

#include "../kselftest.h"
#include "vm_util.h"

#define KiB 1024u
#define MiB (1024 * KiB)

static int mem_fd;
static int ksm_fd;
static int ksm_full_scans_fd;
static int proc_self_ksm_stat_fd;
static int proc_self_ksm_merging_pages_fd;
static int ksm_use_zero_pages_fd;
static int pagemap_fd;
static size_t pagesize;

static bool range_maps_duplicates(char *addr, unsigned long size)
{
	unsigned long offs_a, offs_b, pfn_a, pfn_b;

	/*
	 * There is no easy way to check if there are KSM pages mapped into
	 * this range. We only check that the range does not map the same PFN
	 * twice by comparing each pair of mapped pages.
	 */
	for (offs_a = 0; offs_a < size; offs_a += pagesize) {
		pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
		/* Page not present or PFN not exposed by the kernel. */
		if (pfn_a == -1ul || !pfn_a)
			continue;

		for (offs_b = offs_a + pagesize; offs_b < size;
		     offs_b += pagesize) {
			pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
			if (pfn_b == -1ul || !pfn_b)
				continue;
			if (pfn_a == pfn_b)
				return true;
		}
	}
	return false;
}

static long get_my_ksm_zero_pages(void)
{
	char buf[200];
	char *substr_ksm_zero;
	size_t value_pos;
	ssize_t read_size;
	unsigned long my_ksm_zero_pages;

	if (!proc_self_ksm_stat_fd)
		return 0;

	read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0);
	if (read_size < 0)
		return -errno;

	buf[read_size] = 0;

	substr_ksm_zero = strstr(buf, "ksm_zero_pages");
	if (!substr_ksm_zero)
		return 0;

	value_pos = strcspn(substr_ksm_zero, "0123456789");
	my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10);

	return my_ksm_zero_pages;
}

static long get_my_merging_pages(void)
{
	char buf[10];
	ssize_t ret;

	if (proc_self_ksm_merging_pages_fd < 0)
		return proc_self_ksm_merging_pages_fd;

	ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0);
	if (ret <= 0)
		return -errno;
	buf[ret] = 0;

	return strtol(buf, NULL, 10);
}

static long ksm_get_full_scans(void)
{
	char buf[10];
	ssize_t ret;

	ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
	if (ret <= 0)
		return -errno;
	buf[ret] = 0;

	return strtol(buf, NULL, 10);
}

static int ksm_merge(void)
{
	long start_scans, end_scans;

	/* Wait for two full scans such that any possible merging happened. */
	start_scans = ksm_get_full_scans();
	if (start_scans < 0)
		return start_scans;
	if (write(ksm_fd, "1", 1) != 1)
		return -errno;
	do {
		end_scans = ksm_get_full_scans();
		if (end_scans < 0)
			return end_scans;
	} while (end_scans < start_scans + 2);

	return 0;
}

static int ksm_unmerge(void)
{
	if (write(ksm_fd, "2", 1) != 1)
		return -errno;
	return 0;
}

static char *mmap_and_merge_range(char val, unsigned long size, int prot,
				  bool use_prctl)
{
	char *map;
	int ret;

	/* Stabilize accounting by disabling KSM completely. */
	if (ksm_unmerge()) {
		ksft_test_result_fail("Disabling (unmerging) KSM failed\n");
		goto unmap;
	}

	if (get_my_merging_pages() > 0) {
		ksft_test_result_fail("Still pages merged\n");
		goto unmap;
	}

	map = mmap(NULL, size, PROT_READ|PROT_WRITE,
		   MAP_PRIVATE|MAP_ANON, -1, 0);
	if (map == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return MAP_FAILED;
	}

	/* Don't use THP. Ignore if THP are not around on a kernel. */
	if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto unmap;
	}

	/* Make sure each page contains the same values to merge them. */
	memset(map, val, size);

	if (mprotect(map, size, prot)) {
		ksft_test_result_skip("mprotect() failed\n");
		goto unmap;
	}

	if (use_prctl) {
		ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
		if (ret < 0 && errno == EINVAL) {
			ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
			goto unmap;
		} else if (ret) {
			ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
			goto unmap;
		}
	} else if (madvise(map, size, MADV_MERGEABLE)) {
		ksft_test_result_fail("MADV_MERGEABLE failed\n");
		goto unmap;
	}

	/* Run KSM to trigger merging and wait. */
	if (ksm_merge()) {
		ksft_test_result_fail("Running KSM failed\n");
		goto unmap;
	}

	/*
	 * Check if anything was merged at all. Ignore the zero page that is
	 * accounted differently (depending on kernel support).
	 */
	if (val && !get_my_merging_pages()) {
		ksft_test_result_fail("No pages got merged\n");
		goto unmap;
	}

	return map;
unmap:
	munmap(map, size);
	return MAP_FAILED;
}

static void test_unmerge(void)
{
	const unsigned int size = 2 * MiB;
	char *map;

	ksft_print_msg("[RUN] %s\n", __func__);

	map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false);
	if (map == MAP_FAILED)
		return;

	if (madvise(map, size, MADV_UNMERGEABLE)) {
		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
		goto unmap;
	}

	ksft_test_result(!range_maps_duplicates(map, size),
			 "Pages were unmerged\n");
unmap:
	munmap(map, size);
}

static void test_unmerge_zero_pages(void)
{
	const unsigned int size = 2 * MiB;
	char *map;
	unsigned int offs;
	unsigned long pages_expected;

	ksft_print_msg("[RUN] %s\n", __func__);

	if (proc_self_ksm_stat_fd < 0) {
		ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n");
		return;
	}
	if (ksm_use_zero_pages_fd < 0) {
		ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
		return;
	}
	if (write(ksm_use_zero_pages_fd, "1", 1) != 1) {
		ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
		return;
	}

	/* Let KSM deduplicate zero pages. */
	map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false);
	if (map == MAP_FAILED)
		return;

	/* Check if ksm_zero_pages is updated correctly after KSM merging */
	pages_expected = size / pagesize;
	if (pages_expected != get_my_ksm_zero_pages()) {
		ksft_test_result_fail("'ksm_zero_pages' updated after merging\n");
		goto unmap;
	}

	/* Try to unmerge half of the region */
	if (madvise(map, size / 2, MADV_UNMERGEABLE)) {
		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
		goto unmap;
	}

	/* Check if ksm_zero_pages is updated correctly after unmerging */
	pages_expected /= 2;
	if (pages_expected != get_my_ksm_zero_pages()) {
		ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n");
		goto unmap;
	}

	/* Trigger unmerging of the other half by writing to the pages. */
	for (offs = size / 2; offs < size; offs += pagesize)
		*((unsigned int *)&map[offs]) = offs;

	/* Now we should have no zeropages remaining. */
	if (get_my_ksm_zero_pages()) {
		ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n");
		goto unmap;
	}

	/* Check if ksm zero pages are really unmerged */
	ksft_test_result(!range_maps_duplicates(map, size),
			"KSM zero pages were unmerged\n");
unmap:
	munmap(map, size);
}

static void test_unmerge_discarded(void)
{
	const unsigned int size = 2 * MiB;
	char *map;

	ksft_print_msg("[RUN] %s\n", __func__);

	map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false);
	if (map == MAP_FAILED)
		return;

	/* Discard half of all mapped pages so we have pte_none() entries. */
	if (madvise(map, size / 2, MADV_DONTNEED)) {
		ksft_test_result_fail("MADV_DONTNEED failed\n");
		goto unmap;
	}

	if (madvise(map, size, MADV_UNMERGEABLE)) {
		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
		goto unmap;
	}

	ksft_test_result(!range_maps_duplicates(map, size),
			 "Pages were unmerged\n");
unmap:
	munmap(map, size);
}

#ifdef __NR_userfaultfd
static void test_unmerge_uffd_wp(void)
{
	struct uffdio_writeprotect uffd_writeprotect;
	const unsigned int size = 2 * MiB;
	struct uffdio_api uffdio_api;
	char *map;
	int uffd;

	ksft_print_msg("[RUN] %s\n", __func__);

	map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false);
	if (map == MAP_FAILED)
		return;

	/* See if UFFD is around. */
	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		ksft_test_result_skip("__NR_userfaultfd failed\n");
		goto unmap;
	}

	/* See if UFFD-WP is around. */
	uffdio_api.api = UFFD_API;
	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
		ksft_test_result_fail("UFFDIO_API failed\n");
		goto close_uffd;
	}
	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
		ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
		goto close_uffd;
	}

	/* Register UFFD-WP, no need for an actual handler. */
	if (uffd_register(uffd, map, size, false, true, false)) {
		ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
		goto close_uffd;
	}

	/* Write-protect the range using UFFD-WP. */
	uffd_writeprotect.range.start = (unsigned long) map;
	uffd_writeprotect.range.len = size;
	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
		ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
		goto close_uffd;
	}

	if (madvise(map, size, MADV_UNMERGEABLE)) {
		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
		goto close_uffd;
	}

	ksft_test_result(!range_maps_duplicates(map, size),
			 "Pages were unmerged\n");
close_uffd:
	close(uffd);
unmap:
	munmap(map, size);
}
#endif

/* Verify that KSM can be enabled / queried with prctl. */
static void test_prctl(void)
{
	int ret;

	ksft_print_msg("[RUN] %s\n", __func__);

	ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
	if (ret < 0 && errno == EINVAL) {
		ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
		return;
	} else if (ret) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
		return;
	}

	ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0);
	if (ret < 0) {
		ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n");
		return;
	} else if (ret != 1) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 not effective\n");
		return;
	}

	ret = prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
	if (ret) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
		return;
	}

	ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0);
	if (ret < 0) {
		ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n");
		return;
	} else if (ret != 0) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 not effective\n");
		return;
	}

	ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n");
}

/* Verify that prctl ksm flag is inherited. */
static void test_prctl_fork(void)
{
	int ret, status;
	pid_t child_pid;

	ksft_print_msg("[RUN] %s\n", __func__);

	ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
	if (ret < 0 && errno == EINVAL) {
		ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
		return;
	} else if (ret) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
		return;
	}

	child_pid = fork();
	if (!child_pid) {
		exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
	} else if (child_pid < 0) {
		ksft_test_result_fail("fork() failed\n");
		return;
	}

	if (waitpid(child_pid, &status, 0) < 0) {
		ksft_test_result_fail("waitpid() failed\n");
		return;
	} else if (WEXITSTATUS(status) != 1) {
		ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n");
		return;
	}

	if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
		return;
	}

	ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n");
}

static void test_prctl_unmerge(void)
{
	const unsigned int size = 2 * MiB;
	char *map;

	ksft_print_msg("[RUN] %s\n", __func__);

	map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true);
	if (map == MAP_FAILED)
		return;

	if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
		ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
		goto unmap;
	}

	ksft_test_result(!range_maps_duplicates(map, size),
			 "Pages were unmerged\n");
unmap:
	munmap(map, size);
}

static void test_prot_none(void)
{
	const unsigned int size = 2 * MiB;
	char *map;
	int i;

	ksft_print_msg("[RUN] %s\n", __func__);

	map = mmap_and_merge_range(0x11, size, PROT_NONE, false);
	if (map == MAP_FAILED)
		goto unmap;

	/* Store a unique value in each page on one half using ptrace */
	for (i = 0; i < size / 2; i += pagesize) {
		lseek(mem_fd, (uintptr_t) map + i, SEEK_SET);
		if (write(mem_fd, &i, sizeof(i)) != sizeof(i)) {
			ksft_test_result_fail("ptrace write failed\n");
			goto unmap;
		}
	}

	/* Trigger unsharing on the other half. */
	if (madvise(map + size / 2, size / 2, MADV_UNMERGEABLE)) {
		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
		goto unmap;
	}

	ksft_test_result(!range_maps_duplicates(map, size),
			 "Pages were unmerged\n");
unmap:
	munmap(map, size);
}

int main(int argc, char **argv)
{
	unsigned int tests = 7;
	int err;

#ifdef __NR_userfaultfd
	tests++;
#endif

	ksft_print_header();
	ksft_set_plan(tests);

	pagesize = getpagesize();

	mem_fd = open("/proc/self/mem", O_RDWR);
	if (mem_fd < 0)
		ksft_exit_fail_msg("opening /proc/self/mem failed\n");
	ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
	if (ksm_fd < 0)
		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
	ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
	if (ksm_full_scans_fd < 0)
		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
	proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY);
	proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages",
					      O_RDONLY);
	ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR);

	test_unmerge();
	test_unmerge_zero_pages();
	test_unmerge_discarded();
#ifdef __NR_userfaultfd
	test_unmerge_uffd_wp();
#endif

	test_prot_none();

	test_prctl();
	test_prctl_fork();
	test_prctl_unmerge();

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}