// SPDX-License-Identifier: GPL-2.0
/*
 * dlfilter-show-cycles.c: Print the number of cycles at the start of each line
 * Copyright (c) 2021, Intel Corporation.
 */
#include <perf/perf_dlfilter.h>
#include <string.h>
#include <stdio.h>

/*
 * Number of per-CPU counter slots. Samples whose CPU number is outside
 * [0, MAX_CPU) are accounted per thread in 'table' instead.
 */
#define MAX_CPU 4096

/*
 * Counter categories, used as the second index of the counter arrays.
 * event_entry() selects the category from the event name.
 */
enum {
	/* Event name starts with "instructions" */
	INSTR_CYC,
	/* Event name starts with "branches" */
	BRNCH_CYC,
	/* Any other event, including a NULL event name */
	OTHER_CYC,
	/* Number of categories (array dimension, not a category itself) */
	MAX_ENTRY
};

/* Cycles accumulated so far, per CPU and category */
static __u64 cycles[MAX_CPU][MAX_ENTRY];
/* Value of 'cycles' when it was last printed, used to compute the delta */
static __u64 cycles_rpt[MAX_CPU][MAX_ENTRY];

/*
 * Geometry of the per-thread hash table:
 *   TABLESZ  - number of slots (a power of two so MASK can be used)
 *   TABLEMAX - most slots that may be claimed (half of TABLESZ)
 *   MASK     - reduces a tid to its initial slot index
 */
#define BITS		16
#define TABLESZ		(1 << BITS)
#define TABLEMAX	(TABLESZ / 2)
#define MASK		(TABLESZ - 1)

/*
 * Per-thread counters, used when a sample has no usable CPU number.
 * Slots are located by find_entry().
 */
static struct entry {
	/* Non-zero once the slot has been claimed for a tid */
	__u32 used;
	/* Thread id owning this slot (meaningful only when 'used' is set) */
	__s32 tid;
	/* Cycles accumulated so far, per category */
	__u64 cycles[MAX_ENTRY];
	/* Value of 'cycles' when it was last printed */
	__u64 cycles_rpt[MAX_ENTRY];
} table[TABLESZ];

/* Number of slots of 'table' claimed so far */
static int tid_cnt;

/*
 * Map an event name to the counter category it is accounted under.
 *
 * Only the leading characters are compared, so any text following the
 * "instructions" or "branches" prefix is ignored. A NULL name, or a name
 * matching neither prefix, is classed as OTHER_CYC.
 */
static int event_entry(const char *event)
{
	static const char instr_prefix[] = "instructions";
	static const char brnch_prefix[] = "branches";
	int category = OTHER_CYC;

	if (event) {
		/* sizeof - 1 drops the terminating NUL from the prefix length */
		if (strncmp(event, instr_prefix, sizeof(instr_prefix) - 1) == 0)
			category = INSTR_CYC;
		else if (strncmp(event, brnch_prefix, sizeof(brnch_prefix) - 1) == 0)
			category = BRNCH_CYC;
	}

	return category;
}

/*
 * Return the table slot belonging to @tid, claiming a free slot for it if
 * the thread has not been seen before.
 *
 * The table is probed linearly, starting at the slot selected by the low
 * bits of @tid and wrapping at the end of the table. Returns NULL, after
 * printing a message to stderr, when @tid is not present and TABLEMAX slots
 * are already claimed.
 */
static struct entry *find_entry(__s32 tid)
{
	__u32 idx = tid & MASK;
	struct entry *slot;

	/*
	 * Within this file slots are only ever claimed below, where the count
	 * is capped at TABLEMAX, so the probe always reaches an unused slot.
	 */
	for (slot = &table[idx]; slot->used; slot = &table[idx]) {
		if (slot->tid == tid)
			return slot;
		/* Step to the next slot, wrapping from the last back to 0 */
		idx = (idx + 1) & MASK;
	}

	if (tid_cnt < TABLEMAX) {
		/* 'slot' is the first unused slot on the probe path: claim it */
		tid_cnt++;
		slot->used = 1;
		slot->tid = tid;
		return slot;
	}

	fprintf(stderr, "Too many threads\n");
	return NULL;
}

/*
 * Add @cnt cycles to category @pos of thread @tid.
 *
 * The count is silently dropped when find_entry() cannot supply a slot
 * for the thread.
 */
static void add_entry(__s32 tid, int pos, __u64 cnt)
{
	struct entry *slot = find_entry(tid);

	if (!slot)
		return;

	slot->cycles[pos] += cnt;
}

/*
 * Accumulate the cycle count carried by @sample.
 *
 * Samples with a zero cycle count are ignored. Otherwise the count is added
 * to the per-CPU counters when the CPU number is in [0, MAX_CPU), or else to
 * the per-thread counters when the tid is not -1. @data and @ctx are unused.
 *
 * Always returns 0.
 * NOTE(review): presumably 0 means "do not filter out the sample" - confirm
 * against the perf dlfilter API documentation.
 */
int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
{
	const __s32 cpu = sample->cpu;
	const __s32 tid = sample->tid;
	int category;

	/* Nothing to account for */
	if (sample->cyc_cnt == 0)
		return 0;

	category = event_entry(sample->event);

	if (cpu >= 0 && cpu < MAX_CPU) {
		cycles[cpu][category] += sample->cyc_cnt;
		return 0;
	}

	/* No usable CPU number: fall back to per-thread accounting */
	if (tid != -1)
		add_entry(tid, category, sample->cyc_cnt);

	return 0;
}

/*
 * Print the running total @cycles followed by @delta, each right-aligned in
 * a 10-character column and followed by a space. A zero @delta is shown as
 * blanks rather than "0".
 */
static void print_vals(__u64 cycles, __u64 delta)
{
	const unsigned long long total = cycles;
	const unsigned long long change = delta;

	if (!change) {
		/* Keep the column width but leave the delta field empty */
		printf("%10llu %10s ", total, "");
		return;
	}

	printf("%10llu %10llu ", total, change);
}

int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
{
	__s32 cpu = sample->cpu;
	__s32 tid = sample->tid;
	int pos;

	pos = event_entry(sample->event);

	if (cpu >= 0 && cpu < MAX_CPU) {
		print_vals(cycles[cpu][pos], cycles[cpu][pos] - cycles_rpt[cpu][pos]);
		cycles_rpt[cpu][pos] = cycles[cpu][pos];
		return 0;
	}

	if (tid != -1) {
		struct entry *e = find_entry(tid);

		if (e) {
			print_vals(e->cycles[pos], e->cycles[pos] - e->cycles_rpt[pos]);
			e->cycles_rpt[pos] = e->cycles[pos];
			return 0;
		}
	}

	printf("%22s", "");
	return 0;
}

/*
 * Describe this filter.
 *
 * Stores a pointer to the long description in *@long_description and
 * returns the one-line description. Both strings have static storage
 * duration and must not be modified or freed by the caller.
 * @long_description is dereferenced unconditionally, so it must not be NULL.
 */
const char *filter_description(const char **long_description)
{
	static const char summary[] =
		"Print the number of cycles at the start of each line";
	static const char details[] =
		"Cycle counts are accumulated per CPU (or "
		"per thread if CPU is not recorded) from IPC information, and "
		"printed together with the change since the last print, at the "
		"start of each line. Separate counts are kept for branches, "
		"instructions or other events.";

	*long_description = details;
	return summary;
}