// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) // Copyright (c) 2022 Google #include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <bpf/bpf_core_read.h> /* task->flags for off-cpu analysis */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ /* task->state for off-cpu analysis */ #define TASK_INTERRUPTIBLE 0x0001 #define TASK_UNINTERRUPTIBLE 0x0002 /* create a new thread */ #define CLONE_THREAD 0x10000 #define MAX_STACKS 32 #define MAX_ENTRIES 102400 struct tstamp_data { __u32 stack_id; __u32 state; __u64 timestamp; }; struct offcpu_key { __u32 pid; __u32 tgid; __u32 stack_id; __u32 state; __u64 cgroup_id; }; struct { __uint(type, BPF_MAP_TYPE_STACK_TRACE); __uint(key_size, sizeof(__u32)); __uint(value_size, MAX_STACKS * sizeof(__u64)); __uint(max_entries, MAX_ENTRIES); } stacks SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, struct tstamp_data); } tstamp SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(struct offcpu_key)); __uint(value_size, sizeof(__u64)); __uint(max_entries, MAX_ENTRIES); } off_cpu SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u8)); __uint(max_entries, 1); } cpu_filter SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u8)); __uint(max_entries, 1); } task_filter SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(key_size, sizeof(__u64)); __uint(value_size, sizeof(__u8)); __uint(max_entries, 1); } cgroup_filter SEC(".maps"); /* new kernel task_struct definition */ struct task_struct___new { long __state; } __attribute__((preserve_access_index)); /* old kernel task_struct definition */ struct task_struct___old { long state; } __attribute__((preserve_access_index)); int enabled = 0; int has_cpu = 0; int has_task = 0; int has_cgroup = 0; int uses_tgid = 0; const volatile bool has_prev_state = false; const volatile bool needs_cgroup = false; const volatile bool uses_cgroup_v1 = false; int perf_subsys_id = -1; /* * Old kernel used to call it task_struct->state and now it's '__state'. * Use BPF CO-RE "ignored suffix rule" to deal with it like below: * * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes */ static inline int get_task_state(struct task_struct *t) { /* recast pointer to capture new type for compiler */ struct task_struct___new *t_new = (void *)t; if (bpf_core_field_exists(t_new->__state)) { return BPF_CORE_READ(t_new, __state); } else { /* recast pointer to capture old type for compiler */ struct task_struct___old *t_old = (void *)t; return BPF_CORE_READ(t_old, state); } } static inline __u64 get_cgroup_id(struct task_struct *t) { struct cgroup *cgrp; if (!uses_cgroup_v1) return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); if (perf_subsys_id == -1) { #if __has_builtin(__builtin_preserve_enum_value) perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, perf_event_cgrp_id); #else perf_subsys_id = perf_event_cgrp_id; #endif } cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); return BPF_CORE_READ(cgrp, kn, id); } static inline int can_record(struct task_struct *t, int state) { /* kernel threads don't have user stack */ if (t->flags & PF_KTHREAD) return 0; if (state != TASK_INTERRUPTIBLE && state != TASK_UNINTERRUPTIBLE) return 0; if (has_cpu) { __u32 cpu = bpf_get_smp_processor_id(); __u8 *ok; ok = bpf_map_lookup_elem(&cpu_filter, &cpu); if (!ok) return 0; } if (has_task) { __u8 *ok; __u32 pid; if (uses_tgid) pid = t->tgid; else pid = t->pid; ok = bpf_map_lookup_elem(&task_filter, &pid); if (!ok) return 0; } if (has_cgroup) { __u8 *ok; __u64 cgrp_id = get_cgroup_id(t); ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); if (!ok) return 0; } return 1; } static int off_cpu_stat(u64 *ctx, struct task_struct *prev, struct task_struct *next, int state) { __u64 ts; __u32 stack_id; struct tstamp_data *pelem; ts = bpf_ktime_get_ns(); if (!can_record(prev, state)) goto next; stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); pelem = bpf_task_storage_get(&tstamp, prev, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!pelem) goto next; pelem->timestamp = ts; pelem->state = state; pelem->stack_id = stack_id; next: pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); if (pelem && pelem->timestamp) { struct offcpu_key key = { .pid = next->pid, .tgid = next->tgid, .stack_id = pelem->stack_id, .state = pelem->state, .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, }; __u64 delta = ts - pelem->timestamp; __u64 *total; total = bpf_map_lookup_elem(&off_cpu, &key); if (total) *total += delta; else bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); /* prevent to reuse the timestamp later */ pelem->timestamp = 0; } return 0; } SEC("tp_btf/task_newtask") int on_newtask(u64 *ctx) { struct task_struct *task; u64 clone_flags; u32 pid; u8 val = 1; if (!uses_tgid) return 0; task = (struct task_struct *)bpf_get_current_task(); pid = BPF_CORE_READ(task, tgid); if (!bpf_map_lookup_elem(&task_filter, &pid)) return 0; task = (struct task_struct *)ctx[0]; clone_flags = ctx[1]; pid = task->tgid; if (!(clone_flags & CLONE_THREAD)) bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); return 0; } SEC("tp_btf/sched_switch") int on_switch(u64 *ctx) { struct task_struct *prev, *next; int prev_state; if (!enabled) return 0; prev = (struct task_struct *)ctx[1]; next = (struct task_struct *)ctx[2]; if (has_prev_state) prev_state = (int)ctx[3]; else prev_state = get_task_state(prev); return off_cpu_stat(ctx, prev, next, prev_state & 0xff); } char LICENSE[] SEC("license") = "Dual BSD/GPL";