/* SPDX-License-Identifier: GPL-2.0 */

#define _GNU_SOURCE

#include <errno.h>
#include <fcntl.h>
#include <linux/limits.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/inotify.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include "cgroup_util.h"
#include "../clone3/clone3_selftests.h"

/* Returns read len on success, or -errno on failure. */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	len = read(fd, buf, max_len - 1);

	if (len >= 0)
		buf[len] = 0;

	close(fd);
	return len < 0 ? -errno : len;
}

/* Returns written len on success, or -errno on failure. */
static ssize_t write_text(const char *path, char *buf, ssize_t len)
{
	int fd;

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return -errno;

	len = write(fd, buf, len);
	close(fd);
	return len < 0 ? -errno : len;
}

char *cg_name(const char *root, const char *name)
{
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	snprintf(ret, len, "%s/%s", root, name);

	return ret;
}

char *cg_name_indexed(const char *root, const char *name, int index)
{
	size_t len = strlen(root) + strlen(name) + 10;
	char *ret = malloc(len);

	snprintf(ret, len, "%s/%s_%d", root, name, index);

	return ret;
}

char *cg_control(const char *cgroup, const char *control)
{
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	snprintf(ret, len, "%s/%s", cgroup, control);

	return ret;
}

/* Returns 0 on success, or -errno on failure. */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char path[PATH_MAX];
	ssize_t ret;

	snprintf(path, sizeof(path), "%s/%s", cgroup, control);

	ret = read_text(path, buf, len);
	return ret >= 0 ? 0 : ret;
}

int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	size_t size;
	char *buf;
	int ret;

	/* Handle the case of comparing against empty string */
	if (!expected)
		return -1;
	else
		size = strlen(expected) + 1;

	buf = malloc(size);
	if (!buf)
		return -1;

	if (cg_read(cgroup, control, buf, size)) {
		free(buf);
		return -1;
	}

	ret = strcmp(expected, buf);
	free(buf);
	return ret;
}

int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
{
	char buf[PAGE_SIZE];

	if (cg_read(cgroup, control, buf, sizeof(buf)))
		return -1;

	return strstr(buf, needle) ? 0 : -1;
}

long cg_read_long(const char *cgroup, const char *control)
{
	char buf[128];

	if (cg_read(cgroup, control, buf, sizeof(buf)))
		return -1;

	return atol(buf);
}

long cg_read_key_long(const char *cgroup, const char *control, const char *key)
{
	char buf[PAGE_SIZE];
	char *ptr;

	if (cg_read(cgroup, control, buf, sizeof(buf)))
		return -1;

	ptr = strstr(buf, key);
	if (!ptr)
		return -1;

	return atol(ptr + strlen(key));
}

long cg_read_lc(const char *cgroup, const char *control)
{
	char buf[PAGE_SIZE];
	const char delim[] = "\n";
	char *line;
	long cnt = 0;

	if (cg_read(cgroup, control, buf, sizeof(buf)))
		return -1;

	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
		cnt++;

	return cnt;
}

/* Returns 0 on success, or -errno on failure. */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char path[PATH_MAX];
	ssize_t len = strlen(buf), ret;

	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
	ret = write_text(path, buf, len);
	return ret == len ? 0 : ret;
}

int cg_write_numeric(const char *cgroup, const char *control, long value)
{
	char buf[64];
	int ret;

	ret = sprintf(buf, "%lu", value);
	if (ret < 0)
		return ret;

	return cg_write(cgroup, control, buf);
}

int cg_find_unified_root(char *root, size_t len)
{
	char buf[10 * PAGE_SIZE];
	char *fs, *mount, *type;
	const char delim[] = "\n\t ";

	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
		return -1;

	/*
	 * Example:
	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
	 */
	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
		mount = strtok(NULL, delim);
		type = strtok(NULL, delim);
		strtok(NULL, delim);
		strtok(NULL, delim);
		strtok(NULL, delim);

		if (strcmp(type, "cgroup2") == 0) {
			strncpy(root, mount, len);
			return 0;
		}
	}

	return -1;
}

int cg_create(const char *cgroup)
{
	return mkdir(cgroup, 0755);
}

int cg_wait_for_proc_count(const char *cgroup, int count)
{
	char buf[10 * PAGE_SIZE] = {0};
	int attempts;
	char *ptr;

	for (attempts = 10; attempts >= 0; attempts--) {
		int nr = 0;

		if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
			break;

		for (ptr = buf; *ptr; ptr++)
			if (*ptr == '\n')
				nr++;

		if (nr >= count)
			return 0;

		usleep(100000);
	}

	return -1;
}

int cg_killall(const char *cgroup)
{
	char buf[PAGE_SIZE];
	char *ptr = buf;

	/* If cgroup.kill exists use it. */
	if (!cg_write(cgroup, "cgroup.kill", "1"))
		return 0;

	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
		return -1;

	while (ptr < buf + sizeof(buf)) {
		int pid = strtol(ptr, &ptr, 10);

		if (pid == 0)
			break;
		if (*ptr)
			ptr++;
		else
			break;
		if (kill(pid, SIGKILL))
			return -1;
	}

	return 0;
}

int cg_destroy(const char *cgroup)
{
	int ret;

	if (!cgroup)
		return 0;
retry:
	ret = rmdir(cgroup);
	if (ret && errno == EBUSY) {
		cg_killall(cgroup);
		usleep(100);
		goto retry;
	}

	if (ret && errno == ENOENT)
		ret = 0;

	return ret;
}

int cg_enter(const char *cgroup, int pid)
{
	char pidbuf[64];

	snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
	return cg_write(cgroup, "cgroup.procs", pidbuf);
}

int cg_enter_current(const char *cgroup)
{
	return cg_write(cgroup, "cgroup.procs", "0");
}

int cg_enter_current_thread(const char *cgroup)
{
	return cg_write(cgroup, "cgroup.threads", "0");
}

int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, retcode;

	pid = fork();
	if (pid < 0) {
		return pid;
	} else if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	} else {
		waitpid(pid, &retcode, 0);
		if (WIFEXITED(retcode))
			return WEXITSTATUS(retcode);
		else
			return -1;
	}
}

pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	pid_t pid;

	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};

	pid = sys_clone3(&args, sizeof(struct __clone_args));
	/*
	 * Verify that this is a genuine test failure:
	 * ENOSYS -> clone3() not available
	 * E2BIG  -> CLONE_INTO_CGROUP not available
	 */
	if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
		goto pretend_enosys;

	return pid;

pretend_enosys:
#endif
	errno = ENOSYS;
	return -ENOSYS;
}

int clone_reap(pid_t pid, int options)
{
	int ret;
	siginfo_t info = {
		.si_signo = 0,
	};

again:
	ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
	if (ret < 0) {
		if (errno == EINTR)
			goto again;
		return -1;
	}

	if (options & WEXITED) {
		if (WIFEXITED(info.si_status))
			return WEXITSTATUS(info.si_status);
	}

	if (options & WSTOPPED) {
		if (WIFSTOPPED(info.si_status))
			return WSTOPSIG(info.si_status);
	}

	if (options & WCONTINUED) {
		if (WIFCONTINUED(info.si_status))
			return 0;
	}

	return -1;
}

int dirfd_open_opath(const char *dir)
{
	return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
}

#define close_prot_errno(fd)                                                   \
	if (fd >= 0) {                                                         \
		int _e_ = errno;                                               \
		close(fd);                                                     \
		errno = _e_;                                                   \
	}

static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	int cgroup_fd;
	pid_t pid;

	cgroup_fd =  dirfd_open_opath(cgroup);
	if (cgroup_fd < 0)
		return -1;

	pid = clone_into_cgroup(cgroup_fd);
	close_prot_errno(cgroup_fd);
	if (pid == 0)
		exit(fn(cgroup, arg));

	return pid;
}

int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	int pid;

	pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
	if (pid > 0)
		return pid;

	/* Genuine test failure. */
	if (pid < 0 && errno != ENOSYS)
		return -1;

	pid = fork();
	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	return pid;
}

int get_temp_fd(void)
{
	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
}

int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	int i;

	if (fstat(fd, &st))
		goto cleanup;

	size += st.st_size;

	if (ftruncate(fd, size))
		goto cleanup;

	for (i = 0; i < size; i += sizeof(buf))
		read(fd, buf, sizeof(buf));

	return 0;

cleanup:
	return -1;
}

int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	free(buf);
	return 0;
}

int is_swap_enabled(void)
{
	char buf[PAGE_SIZE];
	const char delim[] = "\n";
	int cnt = 0;
	char *line;

	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
		return -1;

	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
		cnt++;

	return cnt > 1;
}

int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, len;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	len = dprintf(fd, "%d", score);
	if (len < 0) {
		close(fd);
		return len;
	}

	close(fd);
	return 0;
}

int proc_mount_contains(const char *option)
{
	char buf[4 * PAGE_SIZE];
	ssize_t read;

	read = read_text("/proc/mounts", buf, sizeof(buf));
	if (read < 0)
		return read;

	return strstr(buf, option) != NULL;
}

ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char path[PATH_MAX];
	ssize_t ret;

	if (!pid)
		snprintf(path, sizeof(path), "/proc/%s/%s",
			 thread ? "thread-self" : "self", item);
	else
		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);

	ret = read_text(path, buf, size);
	return ret < 0 ? -1 : ret;
}

int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
{
	char buf[PAGE_SIZE];

	if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
		return -1;

	return strstr(buf, needle) ? 0 : -1;
}

int clone_into_cgroup_run_wait(const char *cgroup)
{
	int cgroup_fd;
	pid_t pid;

	cgroup_fd =  dirfd_open_opath(cgroup);
	if (cgroup_fd < 0)
		return -1;

	pid = clone_into_cgroup(cgroup_fd);
	close_prot_errno(cgroup_fd);
	if (pid < 0)
		return -1;

	if (pid == 0)
		exit(EXIT_SUCCESS);

	/*
	 * We don't care whether this fails. We only care whether the initial
	 * clone succeeded.
	 */
	(void)clone_reap(pid, WEXITED);
	return 0;
}

static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	int fd, ret = -1;

	fd = inotify_init1(0);
	if (fd == -1)
		return fd;

	ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY);
	if (ret == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}

int cg_prepare_for_wait(const char *cgroup)
{
	return __prepare_for_wait(cgroup, "cgroup.events");
}

int memcg_prepare_for_wait(const char *cgroup)
{
	return __prepare_for_wait(cgroup, "memory.events");
}

int cg_wait_for(int fd)
{
	int ret = -1;
	struct pollfd fds = {
		.fd = fd,
		.events = POLLIN,
	};

	while (true) {
		ret = poll(&fds, 1, 10000);

		if (ret == -1) {
			if (errno == EINTR)
				continue;

			break;
		}

		if (ret > 0 && fds.revents & POLLIN) {
			ret = 0;
			break;
		}
	}

	return ret;
}