#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
#include <linux/sched/signal.h>
#include <linux/syscalls_api.h>
#include <linux/debug_locks.h>
#include <linux/prefetch.h>
#include <linux/capability.h>
#include <linux/pgtable_api.h>
#include <linux/wait_bit.h>
#include <linux/jiffies.h>
#include <linux/spinlock_api.h>
#include <linux/cpumask_api.h>
#include <linux/lockdep_api.h>
#include <linux/hardirq.h>
#include <linux/softirq.h>
#include <linux/refcount_api.h>
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/rt.h>
#include <linux/blkdev.h>
#include <linux/context_tracking.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/interrupt.h>
#include <linux/ioprio.h>
#include <linux/kallsyms.h>
#include <linux/kcov.h>
#include <linux/kprobes.h>
#include <linux/llist_api.h>
#include <linux/mmu_context.h>
#include <linux/mmzone.h>
#include <linux/mutex_api.h>
#include <linux/nmi.h>
#include <linux/nospec.h>
#include <linux/perf_event_api.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vtime.h>
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_ENTRY
# include <linux/entry-common.h>
# endif
#endif
#include <uapi/linux/sched/types.h>
#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
#include "stats.h"
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
/*
 * Tracepoints exported (GPL-only) so that code outside this file can attach
 * to them: IPI send events, PELT signals and a few scheduler state changes.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
/* The per-CPU runqueue instances (one struct rq per CPU). */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#ifdef CONFIG_SCHED_DEBUG
/*
 * Build the sysctl_sched_features bitmask at compile time: every
 * SCHED_FEAT(name, enabled) entry in features.h expands to
 * "(1UL << __SCHED_FEAT_name) * enabled |", and the trailing 0 terminates
 * the resulting OR chain.
 */
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
/* Resched-latency warning tunables; defaults are 100 (ms, per the name) and warn-once on. */
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
#endif /* CONFIG_SCHED_DEBUG */
/* Tunable initialised to SCHED_NR_MIGRATE_BREAK. */
const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
/* Zero-initialised flag; it is not written anywhere in this part of the file. */
__read_mostly int scheduler_running;
#ifdef CONFIG_SCHED_CORE
/*
 * Static key for core scheduling; defined in the FALSE state and toggled by
 * __sched_core_enable() / __sched_core_disable().
 */
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/*
 * Map a task onto a single integer priority used by the core-scheduling
 * comparators:
 *   stop class        -> -2
 *   rt_prio(p->prio)  -> p->prio
 *   idle class        -> MAX_RT_PRIO + NICE_WIDTH
 *   everything else   -> MAX_RT_PRIO + MAX_NICE
 */
static inline int __task_prio(const struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class)
		return -2;

	if (rt_prio(p->prio))
		return p->prio;

	return (p->sched_class == &idle_sched_class) ?
		MAX_RT_PRIO + NICE_WIDTH :
		MAX_RT_PRIO + MAX_NICE;
}
/*
 * Return true when @a has lower priority than @b.
 *
 * Tasks are first compared by their __task_prio() value (a numerically
 * larger value is the lower priority). On a tie, value -1 is decided by
 * deadline and value MAX_RT_PRIO + MAX_NICE by cfs_prio_less(); every other
 * tie yields false.
 */
static inline bool prio_less(const struct task_struct *a,
			     const struct task_struct *b, bool in_fi)
{
	const int prio_a = __task_prio(a);
	const int prio_b = __task_prio(b);

	if (prio_a != prio_b)
		return prio_a > prio_b;

	if (prio_a == -1)
		return !dl_time_before(a->dl.deadline, b->dl.deadline);

	if (prio_a == MAX_RT_PRIO + MAX_NICE)
		return cfs_prio_less(a, b, in_fi);

	return false;
}
/*
 * Ordering used for the per-rq core tree: primarily by core_cookie
 * (ascending); among equal cookies, @a sorts before @b when @b has lower
 * priority than @a (note the swapped arguments to prio_less()).
 */
static inline bool __sched_core_less(const struct task_struct *a,
				     const struct task_struct *b)
{
	if (a->core_cookie != b->core_cookie)
		return a->core_cookie < b->core_cookie;

	if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
		return true;

	return false;
}
/* Convert a core_node rb_node pointer back to its containing task_struct. */
#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
/* rb_add() "less" callback: compare the tasks that own nodes @a and @b. */
static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
{
	const struct task_struct *task_a = __node_2_sc(a);
	const struct task_struct *task_b = __node_2_sc(b);

	return __sched_core_less(task_a, task_b);
}
/*
 * rb_find_first() comparator: @key is a cookie value smuggled through a
 * pointer. Returns -1 / 0 / 1 when the cookie is below / equal to / above
 * the cookie of the task owning @node.
 */
static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
{
	const struct task_struct *p = __node_2_sc(node);
	unsigned long cookie = (unsigned long)key;

	if (cookie == p->core_cookie)
		return 0;

	return (cookie < p->core_cookie) ? -1 : 1;
}
/*
 * Bump the core's task sequence counter and, for tasks that carry a
 * non-zero cookie, insert them into @rq's core tree.
 */
void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->core->core_task_seq++;

	if (p->core_cookie)
		rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
/*
 * Bump the core's task sequence counter and remove @p from @rq's core tree
 * if it is in it.
 *
 * Unless this is a DEQUEUE_SAVE operation, also reschedule @rq when it has
 * exactly one runnable task, the core has a non-zero force-idle count, and
 * the CPU is currently running its idle task.
 */
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
	rq->core->core_task_seq++;

	if (sched_core_enqueued(p)) {
		rb_erase(&p->core_node, &rq->core_tree);
		RB_CLEAR_NODE(&p->core_node);
	}

	if (flags & DEQUEUE_SAVE)
		return;

	if (rq->nr_running != 1)
		return;

	if (!rq->core->core_forceidle_count)
		return;

	if (rq->curr == rq->idle)
		resched_curr(rq);
}
/*
 * Ask @p's scheduling class whether @p is throttled on @cpu. Classes that
 * do not implement the hook are treated as "not throttled" (0).
 */
static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
	if (!p->sched_class->task_is_throttled)
		return 0;

	return p->sched_class->task_is_throttled(p, cpu);
}
/*
 * Starting after @p in the core tree, return the next task that still has
 * @cookie and is not throttled on @p's CPU. Returns NULL when the tree ends
 * or a task with a different cookie is reached.
 */
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
	struct rb_node *node = &p->core_node;
	int cpu = task_cpu(p);

	for (;;) {
		node = rb_next(node);
		if (!node)
			return NULL;

		p = __node_2_sc(node);
		if (p->core_cookie != cookie)
			return NULL;

		if (!sched_task_is_throttled(p, cpu))
			return p;
	}
}
/*
 * Look up the first task in @rq's core tree matching @cookie that is not
 * throttled on @rq's CPU; NULL if there is none.
 */
static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
	struct rb_node *first;
	struct task_struct *candidate;

	first = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
	if (!first)
		return NULL;

	candidate = __node_2_sc(first);
	if (sched_task_is_throttled(candidate, rq->cpu))
		return sched_core_next(candidate, cookie);

	return candidate;
}
/* Serialises the slow paths of sched_core_get() and __sched_core_put(). */
static DEFINE_MUTEX(sched_core_mutex);
/* Number of sched_core_get() references currently held. */
static atomic_t sched_core_count;
/* Scratch mask used by __sched_core_flip() while walking online CPUs. */
static struct cpumask sched_core_mask;
/*
 * Disable local interrupts (saving the state in *@flags) and take the
 * rq->__lock of every CPU in @cpu's SMT mask. Each successive lock is taken
 * with an incrementing lockdep nesting subclass.
 */
static void sched_core_lock(int cpu, unsigned long *flags)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
int t, i = 0;
local_irq_save(*flags);
for_each_cpu(t, smt_mask)
raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}
/*
 * Counterpart of sched_core_lock(): release the rq->__lock of every CPU in
 * @cpu's SMT mask, then restore the interrupt state saved in *@flags.
 */
static void sched_core_unlock(int cpu, unsigned long *flags)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
int t;
for_each_cpu(t, smt_mask)
raw_spin_unlock(&cpu_rq(t)->__lock);
local_irq_restore(*flags);
}
/*
 * Set rq->core_enabled to @enabled on every possible CPU, under
 * cpus_read_lock().
 *
 * Online CPUs are processed one SMT group at a time with all of that
 * group's rq locks held; the group's forceidle start stamp is reset to 0.
 * Siblings handled as part of a group are removed from the working mask so
 * each group is visited once. CPUs that are possible but not online are
 * updated afterwards without taking rq locks.
 */
static void __sched_core_flip(bool enabled)
{
unsigned long flags;
int cpu, t;
cpus_read_lock();
/* Work on a private copy of the online mask; it is consumed as we go. */
cpumask_copy(&sched_core_mask, cpu_online_mask);
for_each_cpu(cpu, &sched_core_mask) {
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
sched_core_lock(cpu, &flags);
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
cpu_rq(cpu)->core->core_forceidle_start = 0;
sched_core_unlock(cpu, &flags);
/* Drop this whole SMT group from the remaining work. */
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
}
/* Possible-but-offline CPUs. */
for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
cpu_rq(cpu)->core_enabled = enabled;
cpus_read_unlock();
}
/* Warn (once) if any possible CPU's rq still has entries in its core tree. */
static void sched_core_assert_empty(void)
{
int cpu;
for_each_possible_cpu(cpu)
WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
}
/*
 * Switch core scheduling on: enable the static key, wait for an RCU grace
 * period, flip every rq's core_enabled to true and check that all core
 * trees are empty.
 */
static void __sched_core_enable(void)
{
static_branch_enable(&__sched_core_enabled);
/*
 * NOTE(review): presumably pairs with the preempt-disabled sections in
 * raw_spin_rq_lock_nested() / raw_spin_rq_trylock(), so that lockers which
 * sampled the old key state have finished before the flip — confirm.
 */
synchronize_rcu();
__sched_core_flip(true);
sched_core_assert_empty();
}
/*
 * Switch core scheduling off: check that all core trees are empty, flip
 * every rq's core_enabled to false, then disable the static key.
 */
static void __sched_core_disable(void)
{
sched_core_assert_empty();
__sched_core_flip(false);
static_branch_disable(&__sched_core_enabled);
}
/*
 * Take a reference on core scheduling, enabling it on the 0 -> 1
 * transition.
 *
 * Fast path: if the count is already non-zero just increment it. Otherwise
 * take sched_core_mutex, enable core scheduling if the count is still zero,
 * and only then increment the count.
 */
void sched_core_get(void)
{
if (atomic_inc_not_zero(&sched_core_count))
return;
mutex_lock(&sched_core_mutex);
if (!atomic_read(&sched_core_count))
__sched_core_enable();
/*
 * NOTE(review): barrier before the increment — presumably so the enable
 * work is visible before fast-path callers can observe a non-zero count.
 */
smp_mb__before_atomic();
atomic_inc(&sched_core_count);
mutex_unlock(&sched_core_mutex);
}
/*
 * Work callback that drops the final reference: if the decrement takes the
 * count to zero (in which case the mutex is returned held), disable core
 * scheduling and release the mutex.
 */
static void __sched_core_put(struct work_struct *work)
{
	if (!atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex))
		return;

	__sched_core_disable();
	mutex_unlock(&sched_core_mutex);
}
/*
 * Drop a reference on core scheduling.
 *
 * The count is decremented in place unless it is currently 1; in that case
 * the final drop is handed to a work item running __sched_core_put().
 * NOTE(review): presumably deferred because the final put takes a mutex and
 * callers may be in a context that cannot sleep — confirm.
 */
void sched_core_put(void)
{
static DECLARE_WORK(_work, __sched_core_put);
if (!atomic_add_unless(&sched_core_count, -1, 1))
schedule_work(&_work);
}
#else /* !CONFIG_SCHED_CORE */
/* No-op stubs used when CONFIG_SCHED_CORE is not set. */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
/*
 * Acquire the spinlock that protects @rq, using lockdep nesting @subclass.
 *
 * With core scheduling disabled the lock is simply rq->__lock. Otherwise
 * the lock returned by __rq_lockp(rq) can change between looking it up and
 * acquiring it, so the lookup is repeated after acquisition and the
 * operation retried until both agree.
 *
 * Preemption is disabled while the lock is being selected; it is re-enabled
 * without a reschedule check once the spinlock is held.
 */
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
raw_spinlock_t *lock;
preempt_disable();
if (sched_core_disabled()) {
raw_spin_lock_nested(&rq->__lock, subclass);
preempt_enable_no_resched();
return;
}
for (;;) {
lock = __rq_lockp(rq);
raw_spin_lock_nested(lock, subclass);
/* Still the right lock for this rq? Then we are done. */
if (likely(lock == __rq_lockp(rq))) {
preempt_enable_no_resched();
return;
}
/* The rq's lock changed under us; drop the stale one and retry. */
raw_spin_unlock(lock);
}
}
/*
 * Try to acquire the spinlock that protects @rq without spinning.
 *
 * Returns true with the lock held, or false if it was contended. As in
 * raw_spin_rq_lock_nested(), when core scheduling is enabled the lock
 * identity is re-checked after a successful trylock and the attempt is
 * repeated if __rq_lockp(rq) changed in the meantime.
 */
bool raw_spin_rq_trylock(struct rq *rq)
{
raw_spinlock_t *lock;
bool ret;
preempt_disable();
if (sched_core_disabled()) {
ret = raw_spin_trylock(&rq->__lock);
preempt_enable();
return ret;
}
for (;;) {
lock = __rq_lockp(rq);
ret = raw_spin_trylock(lock);
/* Stop on failure, or on success with the lock still being the right one. */
if (!ret || (likely(lock == __rq_lockp(rq)))) {
preempt_enable();
return ret;
}
raw_spin_unlock(lock);
}
}
/* Release the spinlock currently protecting @rq (as given by rq_lockp()). */
void raw_spin_rq_unlock(struct rq *rq)
{
raw_spin_unlock(rq_lockp(rq));
}
#ifdef CONFIG_SMP
/*
 * Lock two runqueues. Interrupts must already be disabled (asserted).
 *
 * The two rqs are ordered with rq_order_less() before locking, so the
 * acquisition order does not depend on argument order. If both rqs share
 * the same underlying lock, it is only taken once.
 */
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
lockdep_assert_irqs_disabled();
if (rq_order_less(rq2, rq1))
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
if (__rq_lockp(rq1) != __rq_lockp(rq2))
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
double_rq_clock_clear_update(rq1, rq2);
}
#endif
/*
 * Lock the runqueue @p currently resides on and return it, pinned via @rf.
 * The caller must hold p->pi_lock (asserted).
 *
 * Because @p's rq can change before the lock is obtained, the rq is
 * re-checked after locking; if it changed, or the task is in the middle of
 * a migration, the lock is dropped, we spin until the migration finishes,
 * and retry.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock)
{
struct rq *rq;
lockdep_assert_held(&p->pi_lock);
for (;;) {
rq = task_rq(p);
raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
/*
 * Take p->pi_lock (saving interrupt state in rf->flags) and then the lock of
 * the runqueue @p resides on. Returns the rq with both locks held and the
 * rq lock pinned via @rf.
 *
 * If @p changed runqueue or is migrating once the rq lock is held, both
 * locks are released, we spin until the migration completes, and the whole
 * sequence is retried.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
/*
 * Advance rq->clock_task by the part of @delta that was available to tasks.
 *
 * @rq:    runqueue whose task clock is updated
 * @delta: amount by which rq->clock just advanced
 *
 * Time spent in interrupts (CONFIG_IRQ_TIME_ACCOUNTING) and time stolen by
 * the hypervisor (CONFIG_PARAVIRT_TIME_ACCOUNTING) is subtracted from @delta
 * before it is added to rq->clock_task. Each deduction is capped at the
 * remaining @delta so clock_task never moves backwards. The remaining delta
 * is then fed to the PELT clock, and the deducted time is optionally
 * reported as IRQ load (CONFIG_HAVE_SCHED_AVG_IRQ).
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
	/* Only used under the config options below, hence __maybe_unused. */
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/* Never attribute more IRQ time than the clock actually advanced. */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
	psi_account_irqtime(rq->curr, irq_delta);
	delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	/*
	 * Fixed: the key's address was written with a garbled character
	 * ("¶virt_..."); it must be &paravirt_steal_rq_enabled.
	 */
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* Same capping as for IRQ time above. */
		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}
/*
 * Bring rq->clock up to date with sched_clock_cpu() for @rq's CPU and
 * propagate the elapsed time to the task clock. The rq lock must be held
 * (asserted).
 *
 * Nothing is done when RQCF_ACT_SKIP is set in clock_update_flags, or when
 * the computed delta is negative.
 */
void update_rq_clock(struct rq *rq)
{
s64 delta;
lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
#ifdef CONFIG_SCHED_DEBUG
/* Debug aid: optionally warn if the clock was already marked updated. */
if (sched_feat(WARN_DOUBLE_CLOCK))
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
#ifdef CONFIG_SCHED_HRTICK
/* Cancel @rq's hrtick timer if it is currently active. */
static void hrtick_clear(struct rq *rq)
{
	if (!hrtimer_active(&rq->hrtick_timer))
		return;

	hrtimer_cancel(&rq->hrtick_timer);
}
/*
 * hrtimer callback for the per-rq hrtick timer.
 *
 * Under the rq lock, updates the rq clock and invokes the current task's
 * scheduling-class task_tick() hook with 1 as its last argument. Warns once
 * if it runs on a CPU other than the rq's own. The timer is never re-armed
 * from here (HRTIMER_NORESTART).
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
#ifdef CONFIG_SMP
/*
 * (Re)arm @rq's hrtick timer to fire at the absolute time stored in
 * rq->hrtick_time.
 */
static void __hrtick_restart(struct rq *rq)
{
	hrtimer_start(&rq->hrtick_timer, rq->hrtick_time,
		      HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
 * Cross-CPU call target (installed as the hrtick_csd function in
 * hrtick_rq_init()): @arg is the rq; re-arm its hrtick timer under the rq
 * lock.
 */
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
struct rq_flags rf;
rq_lock(rq, &rf);
__hrtick_restart(rq);
rq_unlock(rq, &rf);
}
/*
 * SMP variant: arm @rq's hrtick timer to fire @delay nanoseconds from now.
 *
 * The delay is raised to at least 10000 ns. The absolute expiry is stored
 * in rq->hrtick_time; the timer is then started directly when @rq belongs
 * to the calling CPU, or via an asynchronous cross-CPU call to the rq's CPU
 * otherwise.
 */
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
s64 delta;
delta = max_t(s64, delay, 10000LL);
rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
#else
/*
 * !SMP variant: arm @rq's hrtick timer relative to now, @delay nanoseconds
 * ahead, with the delay raised to at least 10000 ns.
 */
void hrtick_start(struct rq *rq, u64 delay)
{
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
/*
 * Initialise @rq's hrtick state: on SMP, the call-single data used to start
 * the timer remotely; always, the CLOCK_MONOTONIC hrtimer with hrtick() as
 * its callback.
 */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
/* No-op stubs used when CONFIG_SCHED_HRTICK is not set. */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */
/*
 * fetch_or(ptr, mask) - atomically OR @mask into *@ptr.
 *
 * Implemented as a try_cmpxchg() retry loop with an empty body; on failure
 * try_cmpxchg() refreshes _val with the current contents. The expression
 * evaluates to the value *@ptr held immediately before the OR took effect.
 * @ptr and @mask are each evaluated once.
 */
#define fetch_or(ptr, mask) \
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
typeof(*_ptr) _val = *_ptr; \
\
do { \
} while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
_val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set _TIF_NEED_RESCHED on @p and report whether @p was NOT
 * polling at that moment, i.e. returns true when _TIF_POLLING_NRFLAG was
 * clear in the flags observed by the atomic OR.
 */
static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
/*
 * Set _TIF_NEED_RESCHED on @p only if @p is polling.
 *
 * Returns false (flags untouched) when _TIF_POLLING_NRFLAG is clear. Returns
 * true when the task is polling and _TIF_NEED_RESCHED is set, whether it was
 * already set or was set here by the cmpxchg.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
/* On failure try_cmpxchg() reloads val, so the checks above re-run. */
if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
}
return true;
}
#else
/*
 * Fallback when polling-flag support is not compiled in: just mark @p as
 * needing a reschedule and report it as "not polling".
 */
static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
/* Fallback without polling-flag support: a task is never considered polling. */
static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
#endif
/*
 * Try to append @task to the wake queue @head.
 *
 * The task's wake_q node is claimed by atomically changing node->next from
 * NULL to WAKE_Q_TAIL. If it was already non-NULL the task is treated as
 * already queued and false is returned. Otherwise the node is linked at the
 * tail of @head and true is returned.
 *
 * No reference counting is done here; see the callers.
 */
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
/* The cmpxchg below is relaxed; this barrier provides ordering before it. */
smp_mb__before_atomic();
if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
return false;
*head->lastp = node;
head->lastp = &node->next;
return true;
}
/*
 * Queue @task on @head for a later wake_up_q(). A task reference is taken
 * only when the task was actually queued by this call.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
if (__wake_q_add(head, task))
get_task_struct(task);
}
/*
 * Like wake_q_add(), but no reference is taken here: if the task could not
 * be queued, one reference is dropped instead.
 * NOTE(review): implies the caller passes in a reference it already holds,
 * which the queue then owns on success — confirm against callers.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
if (!__wake_q_add(head, task))
put_task_struct(task);
}
/*
 * Wake every task queued on @head, in queue order, dropping one task
 * reference per entry.
 */
void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;
while (node != WAKE_Q_TAIL) {
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
/* Advance first: clearing ->next below makes the node claimable again. */
node = node->next;
task->wake_q.next = NULL;
wake_up_process(task);
put_task_struct(task);
}
}
/*
 * Mark the task currently running on @rq as needing a reschedule. The rq
 * lock must be held (asserted).
 *
 * Nothing is done if the flag is already set. For the local CPU the flag is
 * set directly together with the preempt need-resched indication. For a
 * remote CPU the flag is set atomically and a reschedule IPI is sent unless
 * the target was observed polling, in which case only a trace event is
 * emitted.
 */
void resched_curr(struct rq *rq)
{
struct task_struct *curr = rq->curr;
int cpu;
lockdep_assert_rq_held(rq);
if (test_tsk_need_resched(curr))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
/*
 * Request a reschedule on @cpu. Takes that CPU's rq lock with interrupts
 * disabled and calls resched_curr(), but only if @cpu is online or is the
 * calling CPU.
 */
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * Choose a CPU for a timer, preferring non-idle HK_TYPE_TIMER housekeeping
 * CPUs.
 *
 * Order of preference:
 *   1. the current CPU, if it is a housekeeping CPU and not idle;
 *   2. the first non-idle housekeeping CPU found while walking the current
 *      CPU's sched domains;
 *   3. the current CPU if it is a housekeeping CPU (even though idle);
 *   4. whatever housekeeping_any_cpu(HK_TYPE_TIMER) returns.
 */
int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
/* Scoped RCU guard covering the domain walk below. */
guard(rcu)();
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
if (!idle_cpu(i))
return i;
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
return default_cpu;
}
/*
 * Kick @cpu's idle task: set need-resched on rq->idle and send a reschedule
 * IPI unless the idle task was observed polling (then only a trace event is
 * emitted). Does nothing when @cpu is the calling CPU.
 */
static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (cpu == smp_processor_id())
return;
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
/*
 * Handle the cases where @cpu must not (or need not) get the idle-CPU kick.
 *
 * Returns true when the caller has nothing more to do: either @cpu is
 * offline, or it is a nohz_full CPU (which is kicked here unless it is the
 * local CPU with its tick still running). Returns false otherwise.
 */
static bool wake_up_full_nohz_cpu(int cpu)
{
	if (cpu_is_offline(cpu))
		return true;

	if (!tick_nohz_full_cpu(cpu))
		return false;

	if (cpu != smp_processor_id() || tick_nohz_tick_stopped())
		tick_nohz_full_kick_cpu(cpu);

	return true;
}
/*
 * Wake @cpu: use the nohz_full path when it applies, otherwise fall back to
 * kicking the CPU's idle task.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (wake_up_full_nohz_cpu(cpu))
		return;

	wake_up_idle_cpu(cpu);
}
/*
 * Cross-CPU call handler; @info is the target rq.
 *
 * Atomically fetches and clears the NOHZ kick bits for this CPU (warning if
 * no NOHZ_KICK_MASK bit was set), records whether the CPU is idle in
 * rq->idle_balance and, if it is idle with no reschedule pending, stores
 * the fetched flags in rq->nohz_idle_balance and raises SCHED_SOFTIRQ.
 */
static void nohz_csd_func(void *info)
{
struct rq *rq = info;
int cpu = cpu_of(rq);
unsigned int flags;
flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
if (rq->idle_balance && !need_resched()) {
rq->nohz_idle_balance = flags;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
}
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
/*
 * A bandwidth check is only needed when @rq has exactly one runnable task
 * and @p is a queued fair-class task.
 */
static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
{
	return rq->nr_running == 1 &&
	       p->sched_class == &fair_sched_class &&
	       task_on_rq_queued(p);
}
/*
 * Decide whether the tick may be stopped on @rq's CPU.
 *
 *  - any deadline task running: no;
 *  - round-robin tasks present: yes only if there is exactly one;
 *  - otherwise, any remaining RT (i.e. FIFO) task: yes;
 *  - more than one runnable task: no;
 *  - with the HZ_BW feature, a lone fair task that is bandwidth
 *    constrained: no;
 *  - everything else: yes.
 */
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	if (rq->dl.dl_nr_running)
		return false;

	if (rq->rt.rr_nr_running)
		return rq->rt.rr_nr_running == 1;

	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running != 0)
		return true;

	if (rq->nr_running > 1)
		return false;

	if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr) &&
	    cfs_task_bw_constrained(rq->curr))
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterative depth-first walk of the task_group tree rooted at @from.
 *
 * @down is called when a group is first entered and @up when it is left
 * (after all of its children). A non-zero return from either callback stops
 * the walk and is returned to the caller; otherwise 0 is returned once @up
 * has run for @from.
 *
 * The walk uses no recursion: descending is a jump to "down" from inside
 * the children loop, and after a child is finished the "up" label jumps
 * back *into* the loop body so list_for_each_entry_rcu() continues with the
 * next sibling of that child.
 */
int walk_tg_tree_from(struct task_group *from,
tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
parent = from;
down:
ret = (*down)(parent, data);
if (ret)
goto out;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
ret = (*up)(parent, data);
if (ret || parent == from)
goto out;
/* Step back up: resume iterating the parent's children after this one. */
child = parent;
parent = parent->parent;
if (parent)
goto up;
out:
return ret;
}
/* tg_visitor that does nothing and lets the walk continue (returns 0). */
int tg_nop(struct task_group *tg, void *data)
{
return 0;
}
#endif
/*
 * Set @p's load weight from its static priority.
 *
 * Idle-policy tasks get the fixed WEIGHT_IDLEPRIO / WMULT_IDLEPRIO pair.
 * For fair-class tasks with @update_load set, the change is delegated to
 * reweight_task(); in all other cases the weight and inverse weight are
 * looked up in the sched_prio_to_* tables and stored directly.
 */
static void set_load_weight(struct task_struct *p, bool update_load)
{
	struct load_weight *lw = &p->se.load;
	int idx = p->static_prio - MAX_RT_PRIO;

	if (task_has_idle_policy(p)) {
		lw->weight = scale_load(WEIGHT_IDLEPRIO);
		lw->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, idx);
		return;
	}

	lw->weight = scale_load(sched_prio_to_weight[idx]);
	lw->inv_weight = sched_prio_to_wmult[idx];
}
#ifdef CONFIG_UCLAMP_TASK
/* Held by sysctl_sched_uclamp_handler() for the duration of a sysctl access. */
static DEFINE_MUTEX(uclamp_mutex);
/* System-wide clamp limits exposed through sysctl; all start at full scale. */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/* Default UCLAMP_MIN applied to RT tasks without a user-defined request. */
static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* Per-clamp-id system default; acts as an upper bound in uclamp_eff_get(). */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
/* Static key, initially off; enabled the first time clamping is configured. */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
/* Width of one clamp bucket in capacity units. */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
/* Iterate @clamp_id over all clamp indexes, 0 .. UCLAMP_CNT - 1. */
#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
/*
 * The "no clamping" value for @clamp_id: 0 for UCLAMP_MIN,
 * SCHED_CAPACITY_SCALE for anything else.
 */
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
	return (clamp_id == UCLAMP_MIN) ? 0 : SCHED_CAPACITY_SCALE;
}
/*
 * Fill in a uclamp_se: store @value, the bucket index derived from it, and
 * whether the value was explicitly requested by the user.
 */
static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->user_defined = user_defined;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->value = value;
}
/*
 * Clamp value to use for @rq when no bucket has tasks.
 *
 * For UCLAMP_MAX the rq is flagged UCLAMP_FLAG_IDLE and @clamp_value is
 * returned unchanged; for any other clamp id the "no clamp" minimum is
 * returned.
 */
static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
		  unsigned int clamp_value)
{
	if (clamp_id != UCLAMP_MAX)
		return uclamp_none(UCLAMP_MIN);

	rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
	return clamp_value;
}
/*
 * If @rq is flagged UCLAMP_FLAG_IDLE, overwrite its @clamp_id value with
 * @clamp_value; otherwise leave it alone. The flag itself is not cleared
 * here.
 */
static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
				     unsigned int clamp_value)
{
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		uclamp_rq_set(rq, clamp_id, clamp_value);
}
/*
 * Return the value of the highest-indexed bucket of @rq/@clamp_id that has
 * tasks in it. When every bucket is empty, fall back to
 * uclamp_idle_value().
 */
static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
				 unsigned int clamp_value)
{
	struct uclamp_bucket *buckets = rq->uclamp[clamp_id].bucket;
	int idx;

	for (idx = UCLAMP_BUCKETS - 1; idx >= 0; idx--) {
		if (buckets[idx].tasks)
			return buckets[idx].value;
	}

	return uclamp_idle_value(rq, clamp_id, clamp_value);
}
/*
 * Apply the current RT default to @p's UCLAMP_MIN request, unless that
 * request was set by the user. Caller must hold p->pi_lock (asserted).
 */
static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
	struct uclamp_se *min_req;

	lockdep_assert_held(&p->pi_lock);

	min_req = &p->uclamp_req[UCLAMP_MIN];
	if (!min_req->user_defined)
		uclamp_se_set(min_req, sysctl_sched_uclamp_util_min_rt_default,
			      false);
}
/*
 * For RT tasks only: refresh @p's default UCLAMP_MIN request while holding
 * the task's pi_lock and rq lock (via task_rq_lock()).
 */
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
if (!rt_task(p))
return;
rq = task_rq_lock(p, &rf);
__uclamp_update_util_min_rt_default(p);
task_rq_unlock(rq, p, &rf);
}
/*
 * Return @p's requested clamp for @clamp_id, restricted by its task group.
 *
 * Without CONFIG_UCLAMP_TASK_GROUP, or for tasks in an autogroup or in the
 * root task group, the request is returned unchanged. Otherwise the
 * requested value is clamped into the group's [UCLAMP_MIN, UCLAMP_MAX]
 * range and returned as a non-user-defined clamp.
 */
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
/* Work on a copy; the task's stored request is not modified. */
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
unsigned int tg_min, tg_max, value;
if (task_group_is_autogroup(task_group(p)))
return uc_req;
if (task_group(p) == &root_task_group)
return uc_req;
tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
value = uc_req.value;
value = clamp(value, tg_min, tg_max);
uclamp_se_set(&uc_req, value, false);
#endif
return uc_req;
}
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
if (unlikely(uc_req.value > uc_max.value))
return uc_max;
return uc_req;
}
/*
 * Return @p's effective clamp value for @clamp_id.
 *
 * While the task's clamp is active, the value cached in p->uclamp[] is
 * used; otherwise the effective value is computed on the fly.
 */
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_eff;

	if (!p->uclamp[clamp_id].active) {
		uc_eff = uclamp_eff_get(p, clamp_id);
		return (unsigned long)uc_eff.value;
	}

	return (unsigned long)p->uclamp[clamp_id].value;
}
/*
 * Account @p in @rq's clamp bucket for @clamp_id. The rq lock must be held
 * (asserted).
 *
 * The task's effective clamp is recomputed and cached in p->uclamp[], the
 * matching bucket's task count is incremented and the task marked active.
 * The bucket value and the rq-wide value are raised if the task's value is
 * higher (the bucket value is also set when this is the bucket's first
 * task).
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
lockdep_assert_rq_held(rq);
/* uc_se points at this slot, so it sees the refreshed value below. */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
bucket = &uc_rq->bucket[uc_se->bucket_id];
bucket->tasks++;
uc_se->active = true;
uclamp_idle_reset(rq, clamp_id, uc_se->value);
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
if (uc_se->value > uclamp_rq_get(rq, clamp_id))
uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
 * Remove @p from @rq's clamp bucket accounting for @clamp_id. The rq lock
 * must be held (asserted).
 *
 * Does nothing if the task's clamp is not active. Otherwise the bucket's
 * task count is decremented (with a warning, and no decrement, if it was
 * already zero) and the task marked inactive. If the bucket became empty
 * and its value was at least the rq-wide value, the rq-wide value is
 * recomputed from the remaining buckets.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
unsigned int bkt_clamp;
unsigned int rq_clamp;
lockdep_assert_rq_held(rq);
if (unlikely(!uc_se->active))
return;
bucket = &uc_rq->bucket[uc_se->bucket_id];
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
uc_se->active = false;
/* Bucket still populated: the rq-wide value cannot have changed. */
if (likely(bucket->tasks))
return;
rq_clamp = uclamp_rq_get(rq, clamp_id);
/* A bucket value above the rq-wide value is unexpected; warn about it. */
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
/*
 * Account @p in @rq's clamp buckets for every clamp id, then clear the rq's
 * UCLAMP_FLAG_IDLE. Skipped entirely while the sched_uclamp_used key is off
 * or when @p's scheduling class does not have uclamp enabled.
 */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
/*
 * Remove @p from @rq's clamp buckets for every clamp id. Skipped under the
 * same conditions as uclamp_rq_inc().
 */
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
/*
 * Re-account an active task for @clamp_id by removing and re-adding it, so
 * that its effective clamp is recomputed. No-op for inactive clamps.
 */
static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
if (!p->uclamp[clamp_id].active)
return;
uclamp_rq_dec_id(rq, p, clamp_id);
uclamp_rq_inc_id(rq, p, clamp_id);
/* The dec step may have set the idle flag for UCLAMP_MAX; clear it again. */
if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
/*
 * Refresh @p's rq clamp accounting for all clamp ids, under the task's
 * pi_lock and rq lock.
 */
static inline void
uclamp_update_active(struct task_struct *p)
{
enum uclamp_id clamp_id;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
for_each_clamp_id(clamp_id)
uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Refresh the rq clamp accounting of every task attached to @css. */
static inline void
uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
while ((p = css_task_iter_next(&it)))
uclamp_update_active(p);
css_task_iter_end(&it);
}
/* Forward declaration; the definition is not in this part of the file. */
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
#endif
#ifdef CONFIG_SYSCTL
#ifdef CONFIG_UCLAMP_TASK
#ifdef CONFIG_UCLAMP_TASK_GROUP
/*
 * Copy the current sysctl min/max clamps into the root task group's
 * requests, then call cpu_util_update_eff() on the root css under
 * rcu_read_lock().
 */
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
rcu_read_lock();
cpu_util_update_eff(&root_task_group.css);
rcu_read_unlock();
}
#else
/* No task-group clamping configured: nothing to propagate. */
static void uclamp_update_root_tg(void) { }
#endif
/*
 * Apply the current RT default UCLAMP_MIN to every thread in the system
 * (non-RT tasks are skipped by the per-task helper).
 */
static void uclamp_sync_util_min_rt_default(void)
{
struct task_struct *g, *p;
/*
 * The tasklist_lock is taken and released with only a barrier inside.
 * NOTE(review): presumably this synchronises with concurrent fork so that
 * new tasks either appear in the walk below or see the new sysctl value
 * themselves (cf. uclamp_post_fork()) — confirm.
 */
read_lock(&tasklist_lock);
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
rcu_read_lock();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
rcu_read_unlock();
}
/*
 * sysctl handler for the three uclamp tunables (util_min, util_max and
 * util_min_rt_default). Runs with uclamp_mutex held for its whole scope.
 *
 * On a write, the new values are validated (min <= max, max and the RT
 * default both <= SCHED_CAPACITY_SCALE). Invalid values or a parse error
 * restore all three tunables to their previous values and return the error.
 * Valid changes are propagated to uclamp_default[], the root task group and
 * — for the RT default — to all tasks; any change also enables the
 * sched_uclamp_used static key.
 *
 * Returns 0 on success (including plain reads), -EINVAL for out-of-range
 * values, or the error from proc_dointvec().
 */
static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max, old_min_rt;
int result;
guard(mutex)(&uclamp_mutex);
/* Snapshot the current values so a failed write can be rolled back. */
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
goto undo;
if (!write)
return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
result = -EINVAL;
goto undo;
}
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
update_root_tg = true;
}
if (update_root_tg) {
static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
static_branch_enable(&sched_uclamp_used);
uclamp_sync_util_min_rt_default();
}
return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
#endif
#endif
/*
 * Validate the utilisation-clamp part of a sched_attr for task @p.
 *
 * Each clamp that @attr sets must be either -1 or within
 * 0..SCHED_CAPACITY_SCALE; the "+ 1" on both sides of the range checks lets
 * -1 pass (it becomes 0) while rejecting anything else out of range. A
 * clamp not set in @attr takes the task's current request for the final
 * min <= max check, which is skipped if either side is -1.
 *
 * Returns 0 if valid (and enables the sched_uclamp_used static key),
 * -EINVAL otherwise.
 */
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
int util_min = p->uclamp_req[UCLAMP_MIN].value;
int util_max = p->uclamp_req[UCLAMP_MAX].value;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
util_min = attr->sched_util_min;
if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
return -EINVAL;
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
util_max = attr->sched_util_max;
if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
return -EINVAL;
}
if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;
static_branch_enable(&sched_uclamp_used);
return 0;
}
/*
 * Decide whether the request @uc_se for @clamp_id should be reset to its
 * default.
 *
 * True when @attr carries no clamp flags and the current request is not
 * user defined, or when @attr explicitly sets this clamp id to -1.
 */
static bool uclamp_reset(const struct sched_attr *attr,
			 enum uclamp_id clamp_id,
			 struct uclamp_se *uc_se)
{
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
	    !uc_se->user_defined)
		return true;

	if (clamp_id == UCLAMP_MIN) {
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
		    attr->sched_util_min == -1)
			return true;
	}

	if (clamp_id == UCLAMP_MAX) {
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
		    attr->sched_util_max == -1)
			return true;
	}

	return false;
}
/*
 * Update @p's clamp requests from @attr.
 *
 * First, every clamp id that uclamp_reset() selects is reset to its
 * default: the RT default sysctl for UCLAMP_MIN of an RT task, otherwise
 * the "no clamp" value. Then, if @attr carries clamp flags, each explicitly
 * provided value other than -1 is stored as a user-defined request.
 */
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
unsigned int value;
if (!uclamp_reset(attr, clamp_id, uc_se))
continue;
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
value = sysctl_sched_uclamp_util_min_rt_default;
else
value = uclamp_none(clamp_id);
uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
return;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
attr->sched_util_min != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
attr->sched_util_min, true);
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
attr->sched_util_max != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
attr->sched_util_max, true);
}
}
/*
 * Prepare clamp state for a newly forked task: mark every clamp inactive
 * and, if the task has sched_reset_on_fork set, reset every clamp request
 * to its "no clamp" value.
 */
static void uclamp_fork(struct task_struct *p)
{
	enum uclamp_id clamp_id;

	for_each_clamp_id(clamp_id) {
		p->uclamp[clamp_id].active = false;
	}

	if (likely(!p->sched_reset_on_fork))
		return;

	for_each_clamp_id(clamp_id) {
		struct uclamp_se *req = &p->uclamp_req[clamp_id];

		uclamp_se_set(req, uclamp_none(clamp_id), false);
	}
}
/* After fork: apply the current RT default UCLAMP_MIN to @p if it is an RT task. */
static void uclamp_post_fork(struct task_struct *p)
{
uclamp_update_util_min_rt_default(p);
}
/*
 * Boot-time initialisation of @rq's clamp state: each clamp id starts at
 * its "no clamp" value with all other fields zeroed, and the rq is flagged
 * UCLAMP_FLAG_IDLE.
 */
static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
struct uclamp_rq *uc_rq = rq->uclamp;
for_each_clamp_id(clamp_id) {
uc_rq[clamp_id] = (struct uclamp_rq) {
.value = uclamp_none(clamp_id)
};
}
rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
/*
 * Boot-time initialisation of utilisation clamping: set up every possible
 * CPU's rq, give init_task "no clamp" requests, and set the system defaults
 * (and, with task-group support, the root task group's requested and
 * effective clamps) to the maximum for both clamp ids.
 */
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
enum uclamp_id clamp_id;
int cpu;
for_each_possible_cpu(cpu)
init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
uclamp_none(clamp_id), false);
}
/* Note: the same max-valued entry is used for UCLAMP_MIN and UCLAMP_MAX. */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
#ifdef CONFIG_UCLAMP_TASK_GROUP
root_task_group.uclamp_req[clamp_id] = uc_max;
root_task_group.uclamp[clamp_id] = uc_max;
#endif
}
}
#else /* CONFIG_UCLAMP_TASK */
/*
 * Stubs used when CONFIG_UCLAMP_TASK is not set. All are no-ops except
 * uclamp_validate(), which rejects clamp requests with -EOPNOTSUPP.
 */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
/* Non-inline wrapper around task_on_rq_queued() for @p. */
bool sched_task_on_rq(struct task_struct *p)
{
return task_on_rq_queued(p);
}
/*
 * Return the value of __get_wchan() for @p, or 0 if it cannot be sampled.
 *
 * 0 is returned for a NULL task, for the calling task itself, and for any
 * task that is TASK_RUNNING, TASK_WAKING or on a runqueue. The check and the
 * sample are done with p->pi_lock held and interrupts disabled.
 */
unsigned long get_wchan(struct task_struct *p)
{
unsigned long ip = 0;
unsigned int state;
if (!p || p == current)
return 0;
raw_spin_lock_irq(&p->pi_lock);
state = READ_ONCE(p->__state);
/* Order the state read before the p->on_rq read below. */
smp_rmb();
if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
ip = __get_wchan(p);
raw_spin_unlock_irq(&p->pi_lock);
return ip;
}
/*
 * enqueue_task - put @p on @rq through its scheduling class
 * @flags: ENQUEUE_* flags.
 *
 * ENQUEUE_NOCLOCK skips the rq clock update; ENQUEUE_RESTORE skips the
 * sched_info/PSI accounting. PSI is told about a wakeup only when
 * ENQUEUE_WAKEUP is set without ENQUEUE_MIGRATED. The uclamp accounting is
 * bumped before the class callback runs; core-scheduling enqueue (when
 * enabled for @rq) happens last.
 */
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
/*
 * dequeue_task - remove @p from @rq through its scheduling class
 * @flags: DEQUEUE_* flags.
 *
 * Mirror image of enqueue_task(): the core-scheduling dequeue (when enabled
 * for @rq) runs first, DEQUEUE_NOCLOCK skips the rq clock update and
 * DEQUEUE_SAVE skips the sched_info/PSI accounting. PSI is passed whether
 * DEQUEUE_SLEEP is set.
 */
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
/*
 * activate_task - enqueue @p on @rq and mark it TASK_ON_RQ_QUEUED
 *
 * A task that is still marked as migrating gets ENQUEUE_MIGRATED added to
 * @flags; any migrated enqueue first calls sched_mm_cid_migrate_to().
 * ->on_rq is only updated after the enqueue has completed.
 */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
if (flags & ENQUEUE_MIGRATED)
sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
}
/*
 * deactivate_task - take @p off @rq
 *
 * ->on_rq is updated before the dequeue: a sleeping dequeue
 * (DEQUEUE_SLEEP) clears it, anything else marks the task as
 * TASK_ON_RQ_MIGRATING.
 */
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (flags & DEQUEUE_SLEEP)
		p->on_rq = 0;
	else
		p->on_rq = TASK_ON_RQ_MIGRATING;

	dequeue_task(rq, p, flags);
}
/*
 * Map (policy, rt_prio, nice) onto a kernel priority value:
 *  - deadline policies get MAX_DL_PRIO - 1,
 *  - realtime policies get MAX_RT_PRIO - 1 - @rt_prio,
 *  - everything else is derived from @nice via NICE_TO_PRIO().
 */
static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	if (dl_policy(policy))
		return MAX_DL_PRIO - 1;

	if (rt_policy(policy))
		return MAX_RT_PRIO - 1 - rt_prio;

	return NICE_TO_PRIO(nice);
}
/*
 * Compute the priority of @p from its own policy, rt_priority and the nice
 * value derived from static_prio; see __normal_prio() for the mapping.
 */
static inline int normal_prio(struct task_struct *p)
{
return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
 * Refresh p->normal_prio and return the priority to use for @p: the
 * current p->prio is kept when it lies in the realtime range (rt_prio()),
 * otherwise the freshly computed normal priority is returned.
 */
static int effective_prio(struct task_struct *p)
{
	int base = normal_prio(p);

	p->normal_prio = base;

	return rt_prio(p->prio) ? p->prio : base;
}
/*
 * task_curr - is @p the task recorded as current on its CPU?
 *
 * Returns non-zero when cpu_curr() of @p's CPU is @p itself.
 */
inline int task_curr(const struct task_struct *p)
{
return cpu_curr(task_cpu(p)) == p;
}
/*
 * Notify the scheduling classes after @p's class and/or priority changed.
 *
 * Same class as before: ->prio_changed() is invoked if the priority differs
 * from @oldprio or @p is a deadline task. Different class: the old class
 * gets its optional ->switched_from() callback, then the new class gets
 * ->switched_to().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class == p->sched_class) {
		if (oldprio != p->prio || dl_task(p))
			p->sched_class->prio_changed(rq, p, oldprio);
		return;
	}

	if (prev_class->switched_from)
		prev_class->switched_from(rq, p);

	p->sched_class->switched_to(rq, p);
}
/*
 * check_preempt_curr - decide whether @p should preempt rq->curr
 *
 * Within the same scheduling class the decision is left to that class;
 * across classes, the current task is rescheduled if @p's class ranks
 * above it (sched_class_above()).
 */
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
 * A queued current task that now has need_resched set will go through
 * the scheduler shortly, so request that the next rq clock update be
 * skipped.
 */
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
/*
 * Lockless state test.
 *
 * Returns 1 if p->__state intersects @state, -1 if (PREEMPT_RT only)
 * p->saved_state intersects @state instead, and 0 if neither matches.
 */
static __always_inline
int __task_state_match(struct task_struct *p, unsigned int state)
{
if (READ_ONCE(p->__state) & state)
return 1;
#ifdef CONFIG_PREEMPT_RT
if (READ_ONCE(p->saved_state) & state)
return -1;
#endif
return 0;
}
/*
 * Same contract as __task_state_match(), except that on PREEMPT_RT the
 * check is done with p->pi_lock held (IRQs disabled) so that __state and
 * saved_state are sampled together.
 */
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
#ifdef CONFIG_PREEMPT_RT
int match;
raw_spin_lock_irq(&p->pi_lock);
match = __task_state_match(p, state);
raw_spin_unlock_irq(&p->pi_lock);
return match;
#else
return __task_state_match(p, state);
#endif
}
/*
 * wait_task_inactive - wait until @p is off its CPU and off the runqueue
 * @p: task to wait for
 * @match_state: state mask @p must keep matching while we wait
 *
 * Returns 0 as soon as @p is seen not matching @match_state. Otherwise
 * returns p->nvcsw with the most significant bit forced on (so the value is
 * never 0), sampled under the rq lock at a moment when @p was neither on a
 * CPU nor queued.
 */
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
int running, queued, match;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
for (;;) {
/* Lockless peek at the runqueue; re-validated under the lock below. */
rq = task_rq(p);
/* Busy-wait while @p is on a CPU, bailing out if its state changed. */
while (task_on_cpu(rq, p)) {
if (!task_state_match(p, match_state))
return 0;
cpu_relax();
}
/* Now take the locks and sample a consistent view. */
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
if ((match = __task_state_match(p, match_state))) {
/* Matched via saved_state only: treat as still queued and retry. */
if (match < 0)
queued = 1;
/* OR-ing in LONG_MIN sets the top bit so a match is never 0. */
ncsw = p->nvcsw | LONG_MIN;
}
task_rq_unlock(rq, p, &rf);
/* State no longer matches: give up and return 0. */
if (unlikely(!ncsw))
break;
/* Raced with @p getting back on a CPU: spin again. */
if (unlikely(running)) {
cpu_relax();
continue;
}
/* Still queued: sleep for one tick (NSEC_PER_SEC / HZ ns) and retry. */
if (unlikely(queued)) {
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
break;
}
return ncsw;
}
#ifdef CONFIG_SMP
/*
 * Forward declarations: both helpers are defined further down in this file
 * but are already used by migrate_disable_switch() and migrate_enable().
 */
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
struct affinity_context ac = {
.new_mask = cpumask_of(rq->cpu),
.flags = SCA_MIGRATE_DISABLE,
};
if (likely(!p->migration_disabled))
return;
if (p->cpus_ptr != &p->cpus_mask)
return;
__do_set_cpus_allowed(p, &ac);
}
/*
 * migrate_disable - forbid migration of the current task (nestable)
 *
 * Nested calls only bump the per-task counter. The outermost call also
 * accounts the task in this runqueue's nr_pinned; that part runs with
 * preemption disabled so the runqueue and the counter update stay on the
 * same CPU.
 */
void migrate_disable(void)
{
struct task_struct *p = current;
if (p->migration_disabled) {
p->migration_disabled++;
return;
}
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
/*
 * migrate_enable - undo one migrate_disable() for the current task
 *
 * Nested calls only decrement the counter; an unbalanced call warns once
 * and returns. The outermost call restores cpus_ptr to the task's own
 * cpus_mask (via an SCA_MIGRATE_ENABLE affinity update) if it had been
 * redirected, then clears the counter and drops the runqueue's nr_pinned.
 */
void migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
.new_mask = &p->cpus_mask,
.flags = SCA_MIGRATE_ENABLE,
};
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
if (WARN_ON_ONCE(!p->migration_disabled))
return;
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
/*
 * Compiler barrier: keep the counter reset below from being reordered
 * (by the compiler) before the affinity restore above.
 */
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
/* True when @rq accounts at least one migrate-disabled task (nr_pinned != 0). */
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return rq->nr_pinned;
}
/*
 * is_cpu_allowed - may @p run on @cpu right now?
 *
 * @cpu must be in p->cpus_ptr. Beyond that:
 *  - migrate-disabled tasks only need the CPU to be online,
 *  - user tasks need an active CPU that is possible for the task,
 *  - per-CPU kthreads only need the CPU to be online,
 *  - other kthreads need an online CPU that is not dying.
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	if (is_migration_disabled(p))
		return cpu_online(cpu);

	if (!(p->flags & PF_KTHREAD))
		return cpu_active(cpu) && task_cpu_possible(cpu, p);

	/* Only regular (non per-CPU) kthreads are evicted from a dying CPU. */
	if (!kthread_is_per_cpu(p) && cpu_dying(cpu))
		return false;

	return cpu_online(cpu);
}
/*
 * move_queued_task - move a queued task from @rq to @new_cpu's runqueue
 *
 * Called with @rq locked. The task is dequeued, retargeted to @new_cpu,
 * @rq is unlocked and the destination runqueue is locked (reusing @rf)
 * before the task is enqueued there and checked for preemption.
 *
 * Returns the destination runqueue, which is locked on return.
 */
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
lockdep_assert_rq_held(rq);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
return rq;
}
/* Argument block handed to migration_cpu_stop(). */
struct migration_arg {
/* Task to be moved. */
struct task_struct *task;
/* CPU the task should end up on. */
int dest_cpu;
/* Affinity request being served, or NULL when there is none. */
struct set_affinity_pending *pending;
};
/*
 * State of an in-flight affinity change, installed in p->migration_pending
 * by affine_move_task() and shared by all callers waiting on it.
 */
struct set_affinity_pending {
/* One reference per waiter in affine_move_task(). */
refcount_t refs;
/* Set while a migration_cpu_stop() work item is queued/running. */
unsigned int stop_pending;
/* Completed (complete_all) once the request has been satisfied. */
struct completion done;
/* Stopper work used to queue migration_cpu_stop(). */
struct cpu_stop_work stop_work;
/* Argument passed to migration_cpu_stop(). */
struct migration_arg arg;
};
/*
 * Move queued task @p to @dest_cpu if that CPU is (still) allowed for it.
 *
 * Returns the runqueue that is locked on return: the destination runqueue
 * after a successful move, the unchanged @rq otherwise.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	if (is_cpu_allowed(p, dest_cpu))
		rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}
/*
 * migration_cpu_stop - stopper callback that moves a task off this CPU
 * @data: struct migration_arg naming the task, the destination CPU and,
 *        optionally, the set_affinity_pending request being served.
 *
 * With p->pi_lock and this CPU's rq lock held:
 *  - if the task is on this runqueue and not migrate-disabled, it is moved
 *    (when queued) or has its wake_cpu redirected (when not); a pending
 *    request is completed, and the move is skipped when the task already
 *    sits on a CPU in its cpus_mask;
 *  - if the task is elsewhere and a request is pending, the request is
 *    either completed (task already on an allowed CPU) or the stopper work
 *    is re-queued on the task's current CPU.
 *
 * Always returns 0.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct set_affinity_pending *pending = arg->pending;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	bool complete = false;
	struct rq_flags rf;

	/* IRQs off for the whole operation; the flags live in @rf. */
	local_irq_save(rf.flags);
	flush_smp_call_function_queue();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);

	/* A request we serve must still be the one installed on the task. */
	WARN_ON_ONCE(pending && pending != p->migration_pending);

	if (task_rq(p) == rq) {
		/* Migration currently forbidden: leave the request in place. */
		if (is_migration_disabled(p))
			goto out;

		if (pending) {
			p->migration_pending = NULL;
			complete = true;

			/* Already on an allowed CPU: nothing to move. */
			if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
				goto out;
		}

		if (task_on_rq_queued(p)) {
			update_rq_clock(rq);
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		} else {
			p->wake_cpu = arg->dest_cpu;
		}
	} else if (pending) {
		/* The task moved away from this CPU under us. */
		if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
			p->migration_pending = NULL;
			complete = true;
			goto out;
		}

		/* Chase the task: re-queue ourselves on its current CPU. */
		WARN_ON_ONCE(!pending->stop_pending);
		/*
		 * Keep preemption disabled from before the locks are dropped
		 * until the work is queued, so that no preemption (and hence
		 * no CPU hotplug progress that relies on it) can slip in
		 * between the unlock and stop_one_cpu_nowait().
		 */
		preempt_disable();
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
				    &pending->arg, &pending->stop_work);
		preempt_enable();
		return 0;
	}
out:
	if (pending)
		pending->stop_pending = false;
	task_rq_unlock(rq, p, &rf);

	/* Wake the waiters only after all locks have been dropped. */
	if (complete)
		complete_all(&pending->done);

	return 0;
}
/*
 * push_cpu_stop - stopper callback that pushes a task to another runqueue
 * @arg: the task to push; the reference held on it is dropped here.
 *
 * The target runqueue is chosen (and locked) by the task's scheduling
 * class via the optional ->find_lock_rq() callback. A migrate-disabled
 * task is not pushed but flagged MDF_PUSH instead. rq->push_busy is cleared
 * on every exit path. Always returns 0.
 */
int push_cpu_stop(void *arg)
{
struct rq *lowest_rq = NULL, *rq = this_rq();
struct task_struct *p = arg;
raw_spin_lock_irq(&p->pi_lock);
raw_spin_rq_lock(rq);
/* The task already left this runqueue: nothing to push. */
if (task_rq(p) != rq)
goto out_unlock;
if (is_migration_disabled(p)) {
p->migration_flags |= MDF_PUSH;
goto out_unlock;
}
p->migration_flags &= ~MDF_PUSH;
if (p->sched_class->find_lock_rq)
lowest_rq = p->sched_class->find_lock_rq(p, rq);
if (!lowest_rq)
goto out_unlock;
/* Re-check: the task may have moved while the target was being found. */
if (task_rq(p) == rq) {
deactivate_task(rq, p, 0);
set_task_cpu(p, lowest_rq->cpu);
activate_task(lowest_rq, p, 0);
resched_curr(lowest_rq);
}
double_unlock_balance(rq, lowest_rq);
out_unlock:
rq->push_busy = false;
raw_spin_rq_unlock(rq);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
return 0;
}
void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
p->cpus_ptr = ctx->new_mask;
return;
}
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
if (ctx->flags & SCA_USER)
swap(p->user_cpus_ptr, ctx->user_mask);
}
/*
 * __do_set_cpus_allowed - change @p's affinity through its sched class
 *
 * A queued task is dequeued (DEQUEUE_SAVE) and a task that is current on
 * its runqueue is put, before the class ->set_cpus_allowed() callback runs;
 * both are restored afterwards in the same order.
 *
 * Locking: p->pi_lock must be held, except for SCA_MIGRATE_DISABLE where
 * the task is only expected to be on a CPU. The rq lock is asserted when
 * the task is queued.
 */
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
if (ctx->flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) {
lockdep_assert_rq_held(rq);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, ctx);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
}
/*
 * do_set_cpus_allowed - set @p's affinity to @new_mask, dropping any
 * user-requested mask.
 *
 * The context carries SCA_USER with a NULL user_mask, so after the update
 * ac.user_mask holds whatever p->user_cpus_ptr pointed to before (see the
 * swap in set_cpus_allowed_common()); that old mask is freed after an RCU
 * grace period.
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
.user_mask = NULL,
.flags = SCA_USER,
};
/* Overlay that lets kfree_rcu() find an rcu_head inside the mask buffer. */
union cpumask_rcuhead {
cpumask_t cpumask;
struct rcu_head rcu;
};
__do_set_cpus_allowed(p, &ac);
kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
}
/*
 * Allocate a user cpumask buffer on NUMA node @node. The buffer is sized
 * to hold whichever is larger, a cpumask or a struct rcu_head (the latter
 * is needed for the kfree_rcu() overlay in do_set_cpus_allowed()).
 * Returns NULL on allocation failure.
 */
static cpumask_t *alloc_user_cpus_ptr(int node)
{
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
return kmalloc_node(size, GFP_KERNEL, node);
}
/*
 * dup_user_cpus_ptr - copy @src's user-requested cpumask into @dst
 * @node: NUMA node for the allocation
 *
 * @dst->user_cpus_ptr is always reset first. Returns 0 on success
 * (including when @src has no user mask) and -ENOMEM when the copy could
 * not be allocated.
 */
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
cpumask_t *user_mask;
unsigned long flags;
dst->user_cpus_ptr = NULL;
/* Racy fast-path check; re-checked under src->pi_lock below. */
if (data_race(!src->user_cpus_ptr))
return 0;
user_mask = alloc_user_cpus_ptr(node);
if (!user_mask)
return -ENOMEM;
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
/* Hand the buffer to @dst; user_mask becomes NULL after the swap. */
swap(dst->user_cpus_ptr, user_mask);
cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
}
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
/* Still non-NULL only if @src lost its mask meanwhile: free the spare. */
if (unlikely(user_mask))
kfree(user_mask);
return 0;
}
/*
 * Detach @p's user-requested cpumask: p->user_cpus_ptr is set to NULL and
 * the previous pointer is returned to the caller, who now owns it.
 */
static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
{
	struct cpumask *detached = p->user_cpus_ptr;

	p->user_cpus_ptr = NULL;

	return detached;
}
/* Detach and free @p's user-requested cpumask, if any. */
void release_user_cpus_ptr(struct task_struct *p)
{
kfree(clear_user_cpus_ptr(p));
}
/*
 * affine_move_task - move @p to a CPU allowed by its new affinity
 * @rq: @p's runqueue, locked on entry
 * @p: task whose affinity was just changed (p->pi_lock held on entry)
 * @rf: rq_flags used to take the locks
 * @dest_cpu: CPU to move the task to if it has to move
 * @flags: SCA_* flags of the affinity change
 *
 * Both locks are released before returning. Unless SCA_MIGRATE_ENABLE is
 * set, the caller installs (or joins) a set_affinity_pending request on the
 * task and waits until that request has completed; the request installed
 * by this invocation lives on its stack, so the function additionally waits
 * for every other reference to it to be dropped.
 *
 * Returns 0 on success, -EINVAL if no pending request exists where one is
 * required.
 */
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
			    int dest_cpu, unsigned int flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	struct set_affinity_pending my_pending = { }, *pending = NULL;
	bool stop_pending, complete = false;

	/* Fast path: the task already sits on a CPU of its cpus_mask. */
	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
		struct task_struct *push_task = NULL;

		/* A push was deferred while migration was disabled: do it now. */
		if ((flags & SCA_MIGRATE_ENABLE) &&
		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
			rq->push_busy = true;
			push_task = get_task_struct(p);
		}

		/* A pending request without stopper in flight is satisfied. */
		pending = p->migration_pending;
		if (pending && !pending->stop_pending) {
			p->migration_pending = NULL;
			complete = true;
		}

		/*
		 * Keep preemption disabled from before the locks are dropped
		 * until the stopper work is queued, so that no preemption
		 * (and hence no CPU hotplug progress that relies on it) can
		 * slip in between the unlock and stop_one_cpu_nowait().
		 */
		preempt_disable();
		task_rq_unlock(rq, p, rf);
		if (push_task) {
			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
					    p, &rq->push_work);
		}
		preempt_enable();

		if (complete)
			complete_all(&pending->done);

		return 0;
	}

	if (!(flags & SCA_MIGRATE_ENABLE)) {
		if (!p->migration_pending) {
			/* Install our on-stack request. */
			refcount_set(&my_pending.refs, 1);
			init_completion(&my_pending.done);
			my_pending.arg = (struct migration_arg) {
				.task = p,
				.dest_cpu = dest_cpu,
				.pending = &my_pending,
			};

			p->migration_pending = &my_pending;
		} else {
			/* Join the existing request and retarget it. */
			pending = p->migration_pending;
			refcount_inc(&pending->refs);
			pending->arg.dest_cpu = dest_cpu;
		}
	}
	pending = p->migration_pending;
	/* Only reachable without a request on the SCA_MIGRATE_ENABLE path. */
	if (WARN_ON_ONCE(!pending)) {
		task_rq_unlock(rq, p, rf);
		return -EINVAL;
	}

	if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
		/* Running or waking: the move must be done by the stopper. */
		stop_pending = pending->stop_pending;
		if (!stop_pending)
			pending->stop_pending = true;

		if (flags & SCA_MIGRATE_ENABLE)
			p->migration_flags &= ~MDF_PUSH;

		/* Same reasoning as above: unlock and queue without preemption. */
		preempt_disable();
		task_rq_unlock(rq, p, rf);
		if (!stop_pending) {
			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
					    &pending->arg, &pending->stop_work);
		}
		preempt_enable();

		/* migrate_enable() never waits for the request. */
		if (flags & SCA_MIGRATE_ENABLE)
			return 0;
	} else {
		/* Neither running nor waking: move it right here if allowed. */
		if (!is_migration_disabled(p)) {
			if (task_on_rq_queued(p))
				rq = move_queued_task(rq, rf, p, dest_cpu);

			if (!pending->stop_pending) {
				p->migration_pending = NULL;
				complete = true;
			}
		}
		task_rq_unlock(rq, p, rf);

		if (complete)
			complete_all(&pending->done);
	}

	wait_for_completion(&pending->done);

	/* Only the address of ->refs is used as the wait key here. */
	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs);

	/*
	 * my_pending lives on this stack: block until nobody references it
	 * any more. If it was never installed its refcount is still 0.
	 */
	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

	WARN_ON_ONCE(my_pending.stop_pending);

	return 0;
}
/*
 * __set_cpus_allowed_ptr_locked - change @p's affinity with locks held
 * @ctx: affinity context (new mask, optional user mask, SCA_* flags)
 * @rq, @rf: @p's locked runqueue and the flags used to lock it
 *
 * Called with p->pi_lock and the rq lock held; both are released on every
 * return path (directly or through affine_move_task()).
 *
 * Returns 0 on success, -EINVAL for an unusable mask or a task flagged
 * PF_NO_SETAFFINITY under SCA_CHECK, -EBUSY when a migrate-disabled current
 * task would be moved off its CPU.
 */
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
struct affinity_context *ctx,
struct rq *rq,
struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool kthread = p->flags & PF_KTHREAD;
unsigned int dest_cpu;
int ret = 0;
update_rq_clock(rq);
/* Kthreads and migrate-disabled tasks may target any online CPU. */
if (kthread || is_migration_disabled(p)) {
cpu_valid_mask = cpu_online_mask;
}
/* User tasks must stay within the CPUs possible for them. */
if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
ret = -EINVAL;
goto out;
}
if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
/* Mask unchanged: at most the user mask needs swapping. */
if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
if (ctx->flags & SCA_USER)
swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
}
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
!cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
ret = -EBUSY;
goto out;
}
}
/* Pick a destination among the valid CPUs of the new mask. */
dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
__do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
out:
task_rq_unlock(rq, p, rf);
return ret;
}
/*
 * __set_cpus_allowed_ptr - lock @p and apply the affinity context @ctx
 *
 * For requests that are neither user-initiated (SCA_USER) nor part of
 * migrate enable/disable, the new mask is intersected with the task's
 * user-requested mask when that intersection is non-empty; the result is
 * kept in the runqueue's scratch mask.
 *
 * Returns the result of __set_cpus_allowed_ptr_locked().
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx)
{
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
if (p->user_cpus_ptr &&
!(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
ctx->new_mask = rq->scratch_mask;
return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}
/*
 * set_cpus_allowed_ptr - set @p's CPU affinity to @new_mask
 *
 * Plain request without any SCA_* flags. Returns 0 on success or a
 * negative errno from __set_cpus_allowed_ptr().
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
.flags = 0,
};
return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
/*
 * restrict_cpus_allowed_ptr - limit @p's affinity to @subset_mask
 * @new_mask: caller-provided scratch mask; receives the intersection of
 *            the task's user CPUs with @subset_mask
 *
 * Returns -EPERM for deadline tasks while deadline bandwidth control is
 * enabled, -EINVAL when the intersection is empty, otherwise the result of
 * __set_cpus_allowed_ptr_locked().
 */
static int restrict_cpus_allowed_ptr(struct task_struct *p,
struct cpumask *new_mask,
const struct cpumask *subset_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
.flags = 0,
};
struct rq_flags rf;
struct rq *rq;
int err;
rq = task_rq_lock(p, &rf);
if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
err = -EPERM;
goto err_unlock;
}
if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
err = -EINVAL;
goto err_unlock;
}
/* Drops the locks taken above on every path. */
return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
err_unlock:
task_rq_unlock(rq, p, &rf);
return err;
}
/*
 * force_compatible_cpus_allowed_ptr - force @p onto CPUs it can run on
 *
 * First tries to restrict the task's affinity to the CPUs possible for it.
 * If that fails, the affinity is overridden (with a rate-limited message):
 * with the cpuset-provided mask when the temporary mask could be allocated,
 * or with the task's possible mask when it could not.
 */
void force_compatible_cpus_allowed_ptr(struct task_struct *p)
{
cpumask_var_t new_mask;
const struct cpumask *override_mask = task_cpu_possible_mask(p);
/* Allocation failure is detected via cpumask_available() below. */
alloc_cpumask_var(&new_mask, GFP_KERNEL);
cpus_read_lock();
if (!cpumask_available(new_mask))
goto out_set_mask;
/* A return of 0 means the restriction succeeded: nothing to override. */
if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
goto out_free_mask;
cpuset_cpus_allowed(p, new_mask);
override_mask = new_mask;
out_set_mask:
if (printk_ratelimit()) {
printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
task_pid_nr(p), p->comm,
cpumask_pr_args(override_mask));
}
WARN_ON(set_cpus_allowed_ptr(p, override_mask));
out_free_mask:
cpus_read_unlock();
free_cpumask_var(new_mask);
}
static int
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
struct affinity_context ac = {
.new_mask = task_user_cpus(p),
.flags = 0,
};
int ret;
ret = __sched_setaffinity(p, &ac);
WARN_ON_ONCE(ret);
}
/*
 * set_task_cpu - record that @p now belongs to @new_cpu
 *
 * With CONFIG_SCHED_DEBUG a series of sanity checks is run first. When the
 * CPU actually changes, the scheduling class (optional ->migrate_task_rq()),
 * the migration counter, rseq, mm_cid and perf are notified before the
 * task's CPU is updated.
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
unsigned int state = READ_ONCE(p->__state);
/* Only running, waking or queued tasks are expected here. */
WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
/* A running fair task that is queued must be marked as migrating. */
WARN_ON_ONCE(state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));
#ifdef CONFIG_LOCKDEP
/* Caller must hold p->pi_lock or the lock of the task's runqueue. */
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
check_preempt_curr(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
} else {
p->wake_cpu = cpu;
}
}
/* Argument block for migrate_swap_stop(): the two tasks and their CPUs. */
struct migration_swap_arg {
struct task_struct *src_task, *dst_task;
int src_cpu, dst_cpu;
};
/*
 * migrate_swap_stop - stopper callback that swaps two tasks between CPUs
 * @data: struct migration_swap_arg describing the swap
 *
 * Returns 0 after swapping, or -EAGAIN if either CPU is no longer active,
 * either task has left its expected CPU, or the affinity of either task no
 * longer allows the other CPU.
 */
static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
/* Scope-based guards: both lock pairs are dropped on every return. */
guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
guard(double_rq_lock)(src_rq, dst_rq);
/* Re-validate everything now that the locks are held. */
if (task_cpu(arg->dst_task) != arg->dst_cpu)
return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
return 0;
}
int migrate_swap(struct task_struct *cur, struct task_struct *p,
int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = curr_cpu,
.dst_task = p,
.dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
goto out;
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
 * kick_process - send a reschedule IPI to the CPU running @p
 *
 * Nothing is sent when @p is on the local CPU or is not the current task
 * of its CPU. Preemption is disabled so the local CPU id stays valid for
 * the comparison.
 */
void kick_process(struct task_struct *p)
{
int cpu;
preempt_disable();
cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
/*
 * select_fallback_rq - find some CPU @p is allowed to run on
 * @cpu: the CPU the task can no longer use (starting point of the search)
 *
 * Search order: allowed CPUs on @cpu's NUMA node, then any allowed CPU in
 * p->cpus_ptr. If none is found the affinity is widened step by step: first
 * through the cpuset fallback, then to the task's possible mask; running
 * out of options is a BUG().
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
/* Tracks how far the affinity has been widened so far. */
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
if (nid != -1) {
nodemask = cpumask_of_node(nid);
for_each_cpu(dest_cpu, nodemask) {
if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
for (;;) {
for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
goto out;
}
/* Nothing usable: widen the affinity and search again. */
switch (state) {
case cpuset:
if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
fallthrough;
case possible:
do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
case fail:
BUG();
break;
}
}
out:
/* Report (rate-limited, tasks with an mm only) that affinity was widened. */
if (state != cpuset) {
if (p->mm && printk_ratelimit()) {
printk_deferred("process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
}
return dest_cpu;
}
/*
 * select_task_rq - choose the CPU @p should run on
 * @cpu: hint passed on to the scheduling class
 *
 * Requires p->pi_lock. Tasks that may use more than one CPU and are not
 * migrate-disabled are placed by their scheduling class; all others get
 * any CPU from p->cpus_ptr. If the chosen CPU turns out not to be allowed,
 * select_fallback_rq() picks a replacement.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
}
/*
 * sched_set_stop_task - install @stop as the stop task of @cpu
 * @cpu: CPU whose runqueue's ->stop pointer is updated
 * @stop: new stop task, or NULL to just remove the current one
 *
 * A new stop task is first made SCHED_FIFO at priority MAX_RT_PRIO - 1,
 * then switched to the stop scheduling class, and its pi_lock gets a
 * dedicated lockdep class. The previous stop task, if any, is handed back
 * to the RT scheduling class.
 */
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	static struct lock_class_key stop_pi_lock;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/* Pass the address of the local sched_param. */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;

		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		old_stop->sched_class = &rt_sched_class;
	}
}
#else /* CONFIG_SMP */
/*
 * !CONFIG_SMP stubs: affinity changes are forwarded to
 * set_cpus_allowed_ptr(), the migrate-disable context-switch hook is empty,
 * no runqueue ever has pinned tasks and no user cpumask is allocated.
 */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx)
{
return set_cpus_allowed_ptr(p, ctx->new_mask);
}
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return false;
}
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
return NULL;
}
#endif /* !CONFIG_SMP */
/*
 * ttwu_stat - account a wakeup of @p on @cpu in the schedstats
 *
 * Does nothing unless schedstats are enabled. On SMP the wakeup is counted
 * as local or remote relative to the CPU doing the wakeup; for remote
 * wakeups the first sched domain spanning @cpu is credited as well.
 * WF_MIGRATED and WF_SYNC wakeups get their own counters.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq;
if (!schedstat_enabled())
return;
rq = this_rq();
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
__schedstat_inc(p->stats.nr_wakeups_remote);
/* RCU read-side section covers the domain walk until scope exit. */
guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
}
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
__schedstat_inc(p->stats.nr_wakeups_sync);
}
/* Mark @p TASK_RUNNING and emit the sched_wakeup tracepoint. */
static inline void ttwu_do_wakeup(struct task_struct *p)
{
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
}
/*
 * ttwu_do_activate - enqueue a woken task on @rq and finish the wakeup
 *
 * Called with @rq locked. Fixes up the uninterruptible and iowait
 * accounting, activates the task, checks for preemption and marks it
 * runnable. On SMP the class ->task_woken() callback is invoked with the
 * rq lock unpinned, and the runqueue's idle-time averages are updated if it
 * had been idle.
 */
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_rq_held(rq);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
/*
 * Note the dangling else: on SMP the iowait accounting below is only
 * done for wakeups that are NOT flagged WF_MIGRATED; on UP it is done
 * unconditionally.
 */
#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
activate_task(rq, p, en_flags);
check_preempt_curr(rq, p, wake_flags);
ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, rf);
}
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
u64 max = 2*rq->max_idle_balance_cost;
update_avg(&rq->avg_idle, delta);
/* Clamp the average at twice the max idle-balance cost. */
if (rq->avg_idle > max)
rq->avg_idle = max;
rq->wake_stamp = jiffies;
rq->wake_avg_idle = rq->avg_idle / 2;
rq->idle_stamp = 0;
}
#endif
}
/*
 * ttwu_runnable - wake a task that is still queued on its runqueue
 *
 * If @p is queued it only needs to be marked TASK_RUNNING again; a
 * preemption check is done unless the task is currently on a CPU.
 *
 * Returns 1 if the task was queued (wakeup done here), 0 otherwise.
 */
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
int ret = 0;
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
if (!task_on_cpu(rq, p)) {
update_rq_clock(rq);
check_preempt_curr(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
return ret;
}
#ifdef CONFIG_SMP
/*
 * sched_ttwu_pending - activate the tasks queued on this CPU's wake list
 * @arg: head of the llist of tasks (linked through wake_entry.llist);
 *       NULL means there is nothing to do.
 *
 * Each task is activated on the local runqueue, as a migrated wakeup if it
 * was queued with sched_remote_wakeup set. rq->ttwu_pending is cleared once
 * the list has been processed.
 */
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
struct rq *rq = this_rq();
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
/* Not expected; if it happens, wait for the task to get off its CPU. */
if (WARN_ON_ONCE(p->on_cpu))
smp_cond_load_acquire(&p->on_cpu, !VAL);
/* Not expected either; repair the task's CPU if it is wrong. */
if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
set_task_cpu(p, cpu_of(rq));
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
/*
 * call_function_single_prep_ipi - does @cpu need an IPI?
 *
 * Returns false (after emitting the wake_idle_without_ipi tracepoint) when
 * set_nr_if_polling() succeeds on @cpu's idle task, true otherwise.
 */
bool call_function_single_prep_ipi(int cpu)
{
	if (!set_nr_if_polling(cpu_rq(cpu)->idle))
		return true;

	trace_sched_wake_idle_without_ipi(cpu);
	return false;
}
/*
 * Queue @p on @cpu's wake list: remember whether this is a migrated wakeup,
 * flag the runqueue as having a wakeup pending and hand the task's
 * wake_entry to the single-call queue of @cpu.
 */
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
__smp_call_single_queue(cpu, &p->wake_entry.llist);
}
/*
 * wake_up_if_idle - reschedule @cpu if it is running its idle task
 *
 * The idle check is first done locklessly under RCU and then repeated with
 * the rq lock held before the reschedule is requested. Both guards are
 * released automatically at the end of their scope.
 */
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
guard(rcu)();
if (is_idle_task(rcu_dereference(rq->curr))) {
guard(rq_lock_irqsave)(rq);
if (is_idle_task(rq->curr))
resched_curr(rq);
}
}
/*
 * cpus_share_cache - do two CPUs share a cache domain?
 *
 * A CPU trivially shares with itself; otherwise two CPUs share when their
 * per-CPU sd_llc_id values are equal.
 */
bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return this_cpu == that_cpu ||
	       per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
/*
 * ttwu_queue_cond - should the wakeup of @p be queued on @cpu's wake list?
 *
 * Never for an inactive CPU or one outside p->cpus_ptr. Otherwise: always
 * when @cpu does not share a cache with the local CPU; never for the local
 * CPU itself; and for a cache-sharing remote CPU only if it has nothing
 * running.
 */
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
	if (!cpu_active(cpu) || !cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	if (!cpus_share_cache(smp_processor_id(), cpu))
		return true;

	if (cpu == smp_processor_id())
		return false;

	return !cpu_rq(cpu)->nr_running;
}
/*
 * ttwu_queue_wakelist - try to queue @p's wakeup on @cpu's wake list
 *
 * Only done when the TTWU_QUEUE feature is on and ttwu_queue_cond() agrees.
 * Returns true if the wakeup was queued, false if the caller has to do the
 * wakeup itself.
 */
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	if (!sched_feat(TTWU_QUEUE) || !ttwu_queue_cond(p, cpu))
		return false;

	sched_clock_cpu(cpu);
	__ttwu_queue_wakelist(p, cpu, wake_flags);

	return true;
}
#else /* !CONFIG_SMP */
/* !CONFIG_SMP stub: wakeups are never queued on a wake list. */
static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
return false;
}
#endif /* CONFIG_SMP */
/*
 * ttwu_queue - make @p runnable on @cpu
 *
 * Prefers handing the wakeup to @cpu's wake list; if that is not possible
 * the task is activated directly under @cpu's rq lock.
 */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
rq_lock(rq, &rf);
update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf);
rq_unlock(rq, &rf);
}
/*
 * ttwu_state_match - check whether @p is in a state @state may wake
 * @success: set to 1 if either __state or (PREEMPT_RT) saved_state matched,
 *           to 0 otherwise
 *
 * Returns true only when p->__state itself matched, i.e. when the caller
 * must carry on with the wakeup. On PREEMPT_RT a match on saved_state alone
 * is handled here by setting saved_state to TASK_RUNNING.
 */
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
int match;
/* TASK_RTLOCK_WAIT must not be combined with other state bits. */
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
*success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
if (match < 0)
p->saved_state = TASK_RUNNING;
#endif
return match > 0;
}
/*
 * try_to_wake_up - wake up a task
 * @p: task to wake
 * @state: mask of task states that may be woken
 * @wake_flags: WF_* flags
 *
 * Runs with preemption disabled for its whole duration (scope guard).
 *
 * Returns 1 if @p's state (or, on PREEMPT_RT, its saved state) matched
 * @state, 0 otherwise. Wakeup statistics are recorded on success.
 */
int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
guard(preempt)();
int cpu, success = 0;
/* Waking ourselves: no locking, just flip the state back to running. */
if (p == current) {
if (!ttwu_state_match(p, state, &success))
goto out;
trace_sched_waking(p);
ttwu_do_wakeup(p);
goto out;
}
/* 'break' inside this block leaves the guarded scope and drops pi_lock. */
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
smp_mb__after_spinlock();
if (!ttwu_state_match(p, state, &success))
break;
trace_sched_waking(p);
/* Order the ->__state load above before the ->on_rq load below. */
smp_rmb();
/* Still queued: marking it runnable under the rq lock is enough. */
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
#ifdef CONFIG_SMP
smp_acquire__after_ctrl_dep();
WRITE_ONCE(p->__state, TASK_WAKING);
/* Task still on a CPU: try to queue the wakeup on that CPU. */
if (smp_load_acquire(&p->on_cpu) &&
ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
break;
/* Otherwise wait until the task is off its CPU. */
smp_cond_load_acquire(&p->on_cpu, !VAL);
cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
if (task_cpu(p) != cpu) {
/* iowait is accounted against the old runqueue: settle it now. */
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
#else
cpu = task_cpu(p);
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
}
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
return success;
}
/*
 * __task_needs_rq_lock - must the rq lock be taken to stabilise @p?
 *
 * Returns true for tasks that are running, waking or on a runqueue. For
 * any other task it returns false, on SMP only after waiting for the task
 * to be off its CPU.
 */
static bool __task_needs_rq_lock(struct task_struct *p)
{
unsigned int state = READ_ONCE(p->__state);
if (state == TASK_RUNNING || state == TASK_WAKING)
return true;
/* Order the ->__state load before the ->on_rq load. */
smp_rmb();
if (p->on_rq)
return true;
#ifdef CONFIG_SMP
/* Order the ->on_rq load before the ->on_cpu wait. */
smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
#endif
return false;
}
/*
 * task_call_func - call @func on @p with the task's state pinned
 * @arg: opaque argument passed through to @func
 *
 * p->pi_lock is always held (IRQs off) around the call; the task's rq lock
 * is taken in addition when __task_needs_rq_lock() says so.
 *
 * Returns whatever @func returned.
 */
int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
struct rq *rq = NULL;
struct rq_flags rf;
int ret;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (__task_needs_rq_lock(p))
rq = __task_rq_lock(p, &rf);
ret = func(p, arg);
if (rq)
rq_unlock(rq, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
/*
 * cpu_curr_snapshot - sample the task currently recorded as running on @cpu
 *
 * The load is fenced by a full memory barrier on each side. The pointer is
 * obtained with rcu_dereference(); NOTE(review): callers presumably need
 * RCU protection to use the result - confirm.
 */
struct task_struct *cpu_curr_snapshot(int cpu)
{
struct task_struct *t;
smp_mb();
t = rcu_dereference(cpu_curr(cpu));
smp_mb();
return t;
}
/*
 * wake_up_process - wake @p if it is in a TASK_NORMAL state
 *
 * Returns the result of try_to_wake_up(): 1 on a matching state, else 0.
 */
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
/*
 * wake_up_state - wake @p if its state matches the mask @state
 *
 * Returns the result of try_to_wake_up().
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
}
/*
 * __sched_fork - reset the scheduler-private state of a new task
 * @clone_flags: clone flags, forwarded to init_numa_balancing()
 * @p: the newly created task
 *
 * Clears the fair, deadline and realtime entity state, the schedstats and
 * assorted per-task scheduler fields so that nothing is inherited from the
 * parent.
 */
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
/* Fair scheduling entity. */
p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->stats, 0, sizeof(p->stats));
#endif
/* Deadline scheduling entity. */
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
/* Realtime scheduling entity. */
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
#ifdef CONFIG_COMPACTION
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
/* Tag the wake_entry so the call-single queue treats it as a ttwu entry. */
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
init_sched_mm_cid(p);
}
/* Static key gating NUMA balancing; defaults to off. */
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
/* Current NUMA balancing mode (NUMA_BALANCING_* values). */
int sysctl_numa_balancing_mode;
/* Flip the sched_numa_balancing static key to match @enabled. */
static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
else
static_branch_disable(&sched_numa_balancing);
}
/*
 * set_numabalancing_state - switch NUMA balancing on or off
 *
 * Updates sysctl_numa_balancing_mode to NUMA_BALANCING_NORMAL or
 * NUMA_BALANCING_DISABLED and then flips the static key accordingly.
 */
void set_numabalancing_state(bool enabled)
{
	sysctl_numa_balancing_mode = enabled ? NUMA_BALANCING_NORMAL
					     : NUMA_BALANCING_DISABLED;

	__set_numabalancing_state(enabled);
}
#ifdef CONFIG_PROC_SYSCTL
/*
 * Reset the per-node promotion-threshold state of every online node: the
 * threshold is cleared, the candidate count is re-sampled from the
 * PGPROMOTE_CANDIDATE counter and the start time is set to now (in ms).
 */
static void reset_memory_tiering(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
pgdat->nbp_threshold = 0;
pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
}
}
/*
 * sysctl_numa_balancing - proc handler for the "numa_balancing" sysctl
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise). The value is parsed into
 * a local copy through proc_dointvec_minmax(); on a successful write the
 * memory-tiering state is reset if that mode bit is newly set, then the
 * mode and the static key are updated.
 *
 * Returns 0 on success or a negative errno.
 */
static int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* Work on a copy of the table whose data points at the local value. */
t = *table;
t.data = &state;
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
if (write) {
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
(state & NUMA_BALANCING_MEMORY_TIERING))
reset_memory_tiering();
sysctl_numa_balancing_mode = state;
/* Any non-zero mode enables the static key. */
__set_numabalancing_state(state);
}
return err;
}
#endif
#endif
#ifdef CONFIG_SCHEDSTATS
/* Static key gating schedstats collection; defaults to off. */
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
/* Flip the sched_schedstats static key to match @enabled. */
static void set_schedstats(bool enabled)
{
if (enabled)
static_branch_enable(&sched_schedstats);
else
static_branch_disable(&sched_schedstats);
}
/*
 * force_schedstat_enabled - turn schedstats on if they are currently off,
 * logging an informational message when doing so.
 */
void force_schedstat_enabled(void)
{
	if (schedstat_enabled())
		return;

	pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
	static_branch_enable(&sched_schedstats);
}
/*
 * setup_schedstats - handle the "schedstats=" boot parameter
 * @str: parameter value; "enable" or "disable" are accepted
 *
 * Returns 1 when the value was recognised and applied, 0 (after a warning)
 * when it was missing or not understood.
 */
static int __init setup_schedstats(char *str)
{
	bool parsed = false;

	if (str && !strcmp(str, "enable")) {
		set_schedstats(true);
		parsed = true;
	} else if (str && !strcmp(str, "disable")) {
		set_schedstats(false);
		parsed = true;
	}

	if (!parsed)
		pr_warn("Unable to parse schedstats=\n");

	return parsed;
}
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
/*
 * sysctl_schedstats - proc handler for the "sched_schedstats" sysctl
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise). The current state of the
 * static key is exposed through a local copy; a successful write applies
 * the new value with set_schedstats().
 *
 * Returns 0 on success or a negative errno.
 */
static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
int state = static_branch_likely(&sched_schedstats);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* Work on a copy of the table whose data points at the local value. */
t = *table;
t.data = &state;
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
if (write)
set_schedstats(state);
return err;
}
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SYSCTL
/*
 * Scheduler core sysctl table, registered under "kernel" by
 * sched_core_sysctl_init(). Entries are present only when the matching
 * config option is enabled; the table ends with an empty sentinel entry.
 */
static struct ctl_table sched_core_sysctls[] = {
#ifdef CONFIG_SCHEDSTATS
/* 0/1 switch; .data is NULL because the handler uses a local copy. */
{
.procname = "sched_schedstats",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_UCLAMP_TASK
/* The three uclamp knobs share one handler. */
{
.procname = "sched_util_clamp_min",
.data = &sysctl_sched_uclamp_util_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_max",
.data = &sysctl_sched_uclamp_util_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_min_rt_default",
.data = &sysctl_sched_uclamp_util_min_rt_default,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif /* CONFIG_UCLAMP_TASK */
#ifdef CONFIG_NUMA_BALANCING
/* Mode value bounded to 0..4; .data is NULL, the handler uses a local copy. */
{
.procname = "numa_balancing",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
{}
};
/* Register sched_core_sysctls under "kernel"; run as a late initcall. */
static int __init sched_core_sysctl_init(void)
{
register_sysctl_init("kernel", sched_core_sysctls);
return 0;
}
late_initcall(sched_core_sysctl_init);
#endif /* CONFIG_SYSCTL */
/*
 * sched_fork - scheduler-side initialisation of a freshly forked task @p.
 *
 * Returns 0 on success, or -EAGAIN when @p would end up with a deadline
 * priority.
 */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
__sched_fork(clone_flags, p);
/* Mark the task as new and start it from current's normal priority. */
p->__state = TASK_NEW;
p->prio = current->normal_prio;
uclamp_fork(p);
/* Reset-on-fork requested: revert policy/priority to the defaults. */
if (unlikely(p->sched_reset_on_fork)) {
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/* The reset is one-shot: clear the flag once applied. */
p->sched_reset_on_fork = 0;
}
/* Pick the scheduling class from the resulting priority. */
if (dl_prio(p->prio))
return -EAGAIN;
else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
return 0;
}
/*
 * sched_cgroup_fork - attach a forked task to its task group and CPU.
 *
 * Under p->pi_lock: sets p->sched_task_group from @kargs->cset (when
 * CONFIG_CGROUP_SCHED is enabled), places @p on the current CPU and calls
 * the scheduling class's task_fork hook if it has one.
 */
void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
/* "if (1)" only opens a scope for the local tg declaration. */
if (1) {
struct task_group *tg;
tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
struct task_group, css);
tg = autogroup_task_group(p, tg);
p->sched_task_group = tg;
}
#endif
rseq_migrate(p);
__set_task_cpu(p, smp_processor_id());
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
/* Post-fork scheduler hook for @p; currently only runs uclamp_post_fork(). */
void sched_post_fork(struct task_struct *p)
{
uclamp_post_fork(p);
}
/*
 * to_ratio - express @runtime / @period as a BW_SHIFT fixed-point ratio.
 *
 * Returns BW_UNIT when @runtime is RUNTIME_INF and 0 when @period is 0
 * (which also avoids dividing by zero).
 */
unsigned long to_ratio(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)
		return BW_UNIT;

	return period ? div64_u64(runtime << BW_SHIFT, period) : 0;
}
/*
 * wake_up_new_task - make a newly created task runnable for the first time.
 *
 * Sets @p to TASK_RUNNING, selects a CPU for it on SMP, enqueues it on that
 * runqueue and checks whether it should preempt the task running there.
 */
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
/* Choose the CPU with WF_FORK, remembering the CPU used until now. */
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
/* The rq clock was updated just above, hence ENQUEUE_NOCLOCK. */
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/* The hook is called with the rq lock unpinned; repin afterwards. */
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* Static key guarding the preempt-notifier calls; defined initially false. */
static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
/* Increment the preempt_notifier_key static key. */
void preempt_notifier_inc(void)
{
static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);
/* Decrement the preempt_notifier_key static key. */
void preempt_notifier_dec(void)
{
static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);
void preempt_notifier_register(struct preempt_notifier *notifier)
{
if (!static_branch_unlikely(&preempt_notifier_key))
WARN(1, "registering preempt_notifier while notifiers disabled\n");
hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
hlist_del(¬ifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
/* Fire the sched-in notifiers of @curr, but only while the key is enabled. */
static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	if (!static_branch_unlikely(&preempt_notifier_key))
		return;

	__fire_sched_in_preempt_notifiers(curr);
}
static void
__fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
/* Fire the sched-out notifiers of @curr, but only while the key is enabled. */
static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	if (!static_branch_unlikely(&preempt_notifier_key))
		return;

	__fire_sched_out_preempt_notifiers(curr, next);
}
#else /* !CONFIG_PREEMPT_NOTIFIERS */
/* No-op when CONFIG_PREEMPT_NOTIFIERS is disabled. */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
/* No-op when CONFIG_PREEMPT_NOTIFIERS is disabled. */
static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
}
#endif /* CONFIG_PREEMPT_NOTIFIERS */
/* On SMP, mark @next as running on a CPU (next->on_cpu = 1). */
static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
WRITE_ONCE(next->on_cpu, 1);
#endif
}
/* On SMP, clear prev->on_cpu using a store with release semantics. */
static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
smp_store_release(&prev->on_cpu, 0);
#endif
}
#ifdef CONFIG_SMP
/*
 * Run every callback on the list starting at @head with the rq lock held.
 * Each entry is unlinked (->next cleared) before its function is invoked.
 */
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
	struct balance_callback *cb = head;
	struct balance_callback *following;

	lockdep_assert_rq_held(rq);

	while (cb) {
		void (*fn)(struct rq *rq) = (void (*)(struct rq *))cb->func;

		following = cb->next;
		cb->next = NULL;
		cb = following;

		fn(rq);
	}
}
/* Defined later in this file; needed here for the callback initialiser. */
static void balance_push(struct rq *rq);
/*
 * Callback entry that runs balance_push(); __splice_balance_callbacks()
 * compares against its address to special-case it.
 */
struct balance_callback balance_push_callback = {
.next = NULL,
.func = balance_push,
};
/*
 * Detach and return the pending balance callback list of @rq, or NULL if
 * there is none.
 *
 * With @split set and the list head being balance_push_callback, the list
 * is left installed on the rq and NULL is returned instead.
 */
static inline struct balance_callback *
__splice_balance_callbacks(struct rq *rq, bool split)
{
struct balance_callback *head = rq->balance_callback;
if (likely(!head))
return NULL;
lockdep_assert_rq_held(rq);
if (split && head == &balance_push_callback)
head = NULL;
else
rq->balance_callback = NULL;
return head;
}
/* Splice off @rq's callbacks in "split" mode (balance_push_callback stays). */
static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return __splice_balance_callbacks(rq, true);
}
/* Detach all pending balance callbacks of @rq (no split) and run them. */
static void __balance_callbacks(struct rq *rq)
{
	struct balance_callback *pending = __splice_balance_callbacks(rq, false);

	do_balance_callbacks(rq, pending);
}
/*
 * Run a previously spliced callback list @head, taking the rq lock with
 * interrupts saved/disabled around the run. Does nothing for a NULL list.
 */
static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
	unsigned long irqflags;

	if (likely(!head))
		return;

	raw_spin_rq_lock_irqsave(rq, irqflags);
	do_balance_callbacks(rq, head);
	raw_spin_rq_unlock_irqrestore(rq, irqflags);
}
#else
/* !CONFIG_SMP: there are no balance callbacks to run. */
static inline void __balance_callbacks(struct rq *rq)
{
}
/* !CONFIG_SMP: always returns an empty (NULL) callback list. */
static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return NULL;
}
/* !CONFIG_SMP: nothing to do. */
static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
}
#endif
/*
 * Prepare the rq lock state for a context switch: unpin the lock, release
 * it as far as lockdep is concerned and, with CONFIG_DEBUG_SPINLOCK, record
 * @next as the lock owner. The lock itself is not unlocked here.
 */
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
rq_unpin_lock(rq, rf);
spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
rq_lockp(rq)->owner = next;
#endif
}
/*
 * Counterpart of prepare_lock_switch(): re-acquire the rq lock for lockdep,
 * run pending balance callbacks, then unlock the rq and enable interrupts.
 */
static inline void finish_lock_switch(struct rq *rq)
{
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
/* Default no-op hooks, used when nothing else has defined these macros. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
/* With CONFIG_KMAP_LOCAL: call the slow path if current->kmap_ctrl.idx is set. */
static inline void kmap_local_sched_out(void)
{
#ifdef CONFIG_KMAP_LOCAL
if (unlikely(current->kmap_ctrl.idx))
__kmap_local_sched_out();
#endif
}
/* With CONFIG_KMAP_LOCAL: call the slow path if current->kmap_ctrl.idx is set. */
static inline void kmap_local_sched_in(void)
{
#ifdef CONFIG_KMAP_LOCAL
if (unlikely(current->kmap_ctrl.idx))
__kmap_local_sched_in();
#endif
}
/*
 * prepare_task_switch - run the pre-switch hooks for switching @prev -> @next.
 *
 * Called by context_switch() before the actual switch; paired with
 * finish_task_switch() afterwards.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
}
/*
 * finish_task_switch - clean up after a task switch.
 * @prev: the task that was switched away from
 *
 * Runs the post-switch hooks, releases the rq lock, drops the mm reference
 * stashed in rq->prev_mm and, if @prev was TASK_DEAD, releases its stack
 * and task_struct references. Returns this CPU's runqueue.
 */
static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
unsigned int prev_state;
/* The preempt count is expected to be exactly 2*PREEMPT_DISABLE_OFFSET. */
if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
"corrupted preempt_count: %s/%d/0x%x\n",
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
rq->prev_mm = NULL;
/* Sample prev's state before finish_task() clears prev->on_cpu. */
prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
/* mm is whatever context_switch() left in rq->prev_mm, if anything. */
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
mmdrop_lazy_tlb_sched(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
/* Drop the dead task's stack and task_struct references. */
put_task_stack(prev);
put_task_struct_rcu_user(prev);
}
return rq;
}
/*
 * schedule_tail - complete the switch from @prev.
 *
 * Finishes the task switch, re-enables preemption, stores the task's pid at
 * current->set_child_tid when that pointer is set, and recalculates pending
 * signals.
 */
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
finish_task_switch(prev);
preempt_enable();
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
calculate_sigpending();
}
/*
 * context_switch - switch from @prev to @next: address space, then
 * registers and stack via switch_to().
 *
 * Returns the runqueue reported by finish_task_switch(), which also
 * releases the rq lock.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
prepare_task_switch(rq, prev, next);
arch_start_context_switch(prev);
if (!next->mm) {
/* @next has no mm of its own: it borrows prev's active_mm. */
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm)
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else {
/* @next has its own mm: switch to it. */
membarrier_switch_mm(rq, prev->active_mm, next->mm);
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
if (!prev->mm) {
/* Hand the borrowed mm to finish_task_switch() via rq->prev_mm. */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
switch_mm_cid(rq, prev, next);
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
switch_to(prev, next, prev);
barrier();
return finish_task_switch(prev);
}
unsigned int nr_running(void)
{
unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
return sum;
}
/* True when the runqueue returned by raw_rq() has exactly one task running. */
bool single_task_running(void)
{
return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);
/* Return the nr_switches counter of @cpu's runqueue. */
unsigned long long nr_context_switches_cpu(int cpu)
{
return cpu_rq(cpu)->nr_switches;
}
unsigned long long nr_context_switches(void)
{
int i;
unsigned long long sum = 0;
for_each_possible_cpu(i)
sum += cpu_rq(i)->nr_switches;
return sum;
}
/* Return the current value of the atomic nr_iowait counter of @cpu's rq. */
unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
unsigned int nr_iowait(void)
{
unsigned int i, sum = 0;
for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
return sum;
}
#ifdef CONFIG_SMP
/*
 * sched_exec - ask the scheduling class for a CPU with WF_EXEC and, if it
 * names a different, active CPU, migrate the current task there through the
 * stopper (migration_cpu_stop).
 */
void sched_exec(void)
{
struct task_struct *p = current;
struct migration_arg arg;
int dest_cpu;
/* p->pi_lock is held (IRQs saved) only for the duration of this scope. */
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
if (dest_cpu == smp_processor_id())
return;
if (unlikely(!cpu_active(dest_cpu)))
return;
arg = (struct migration_arg){ p, dest_cpu };
}
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
/* Per-CPU kernel statistics objects, exported for use outside this file. */
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
 * Prefetch the current CFS entity (and its exec_start field) of the cfs_rq
 * associated with @p.
 */
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *curr = (&p->se)->cfs_rq->curr;
#else
struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
#endif
prefetch(curr);
prefetch(&curr->exec_start);
}
/*
 * task_sched_runtime - return @p's se.sum_exec_runtime.
 *
 * If @p is the task currently running on its runqueue, the class's
 * update_curr() is called first so the value is brought up to date.
 */
unsigned long long task_sched_runtime(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/* Lockless fast path when @p is not on a CPU or not queued. */
if (!p->on_cpu || !task_on_rq_queued(p))
return p->se.sum_exec_runtime;
#endif
rq = task_rq_lock(p, &rf);
if (task_current(rq, p) && task_on_rq_queued(p)) {
prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &rf);
return ns;
}
#ifdef CONFIG_SCHED_DEBUG
/*
 * cpu_resched_latency - how long need_resched() has been pending on @rq.
 *
 * Returns the latency in nanoseconds once it exceeds
 * sysctl_resched_latency_warn_ms, and 0 otherwise (including the first
 * time a pending resched is observed, which only records the timestamp).
 */
static u64 cpu_resched_latency(struct rq *rq)
{
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
u64 resched_latency, now = rq_clock(rq);
static bool warned_once;
/* In warn-once mode, stay silent after the first report. */
if (sysctl_resched_latency_warn_once && warned_once)
return 0;
if (!need_resched() || !latency_warn_ms)
return 0;
if (system_state == SYSTEM_BOOTING)
return 0;
/* First observation: start the measurement window. */
if (!rq->last_seen_need_resched_ns) {
rq->last_seen_need_resched_ns = now;
rq->ticks_without_resched = 0;
return 0;
}
rq->ticks_without_resched++;
resched_latency = now - rq->last_seen_need_resched_ns;
if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
return 0;
warned_once = true;
return resched_latency;
}
/*
 * Parse the "resched_latency_warn_ms=" boot parameter into
 * sysctl_resched_latency_warn_ms. A parse failure only logs a warning;
 * the function returns 1 in either case.
 */
static int __init setup_resched_latency_warn_ms(char *str)
{
	long parsed;

	if (kstrtol(str, 0, &parsed))
		pr_warn("Unable to set resched_latency_warn_ms\n");
	else
		sysctl_resched_latency_warn_ms = parsed;

	return 1;
}
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
#else
/* !CONFIG_SCHED_DEBUG: resched latency is not tracked; always reports 0. */
static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
#endif /* CONFIG_SCHED_DEBUG */
/*
 * scheduler_tick - periodic scheduler work for the current CPU.
 *
 * Under the rq lock: updates the rq clock and thermal load average, calls
 * the current task's class task_tick() hook and runs the load, core-sched
 * and mm_cid tick helpers. After unlocking it optionally reports resched
 * latency, ticks perf and workqueue workers, and on SMP triggers load
 * balancing.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;
	struct rq_flags rf;
	unsigned long thermal_pressure;
	/*
	 * Initialised because the assignment and the later read are guarded
	 * by two separate sched_feat(LATENCY_WARN) evaluations; if the feature
	 * is switched on in between, the read must not see garbage.
	 */
	u64 resched_latency = 0;

	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
		arch_scale_freq_tick();

	sched_clock_tick();

	rq_lock(rq, &rf);

	update_rq_clock(rq);
	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
	curr->sched_class->task_tick(rq, curr, 0);
	if (sched_feat(LATENCY_WARN))
		resched_latency = cpu_resched_latency(rq);
	calc_global_load_tick(rq);
	sched_core_tick(rq);
	task_tick_mm_cid(rq, curr);

	rq_unlock(rq, &rf);

	/* The warning itself is issued after the rq lock has been dropped. */
	if (sched_feat(LATENCY_WARN) && resched_latency)
		resched_latency_warn(cpu, resched_latency);

	perf_event_task_tick();

	if (curr->flags & PF_WQ_WORKER)
		wq_worker_tick(curr);

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
}
#ifdef CONFIG_NO_HZ_FULL
/* Per-CPU bookkeeping for the remote tick delayed work. */
struct tick_work {
int cpu;
atomic_t state;
struct delayed_work work;
};
/*
 * Values of tick_work::state. sched_tick_start() sets RUNNING,
 * sched_tick_stop() sets OFFLINING, and sched_tick_remote() decrements any
 * non-RUNNING state by one each time it runs.
 */
#define TICK_SCHED_REMOTE_OFFLINE 0
#define TICK_SCHED_REMOTE_OFFLINING 1
#define TICK_SCHED_REMOTE_RUNNING 2
/* Allocated by sched_tick_offload_init(). */
static struct tick_work __percpu *tick_work_cpu;
/*
 * sched_tick_remote - delayed-work handler that ticks a CPU remotely.
 *
 * If the target CPU's tick is stopped and the CPU is online, runs the
 * current task's task_tick() hook and the nohz load accounting under that
 * CPU's rq lock. Re-queues itself after HZ jiffies while the state is
 * TICK_SCHED_REMOTE_RUNNING.
 */
static void sched_tick_remote(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
int os;
if (tick_nohz_tick_stopped_cpu(cpu)) {
/* rq lock (IRQs off) is held until the end of this scope. */
guard(rq_lock_irq)(rq);
struct task_struct *curr = rq->curr;
if (cpu_online(cpu)) {
update_rq_clock(rq);
if (!is_idle_task(curr)) {
/* Warn if the task has gone more than 3 seconds since exec_start. */
u64 delta = rq_clock_task(rq) - curr->se.exec_start;
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
}
curr->sched_class->task_tick(rq, curr, 0);
calc_load_nohz_remote(rq);
}
}
/* Decrement the state unless it is RUNNING; os is the previous value. */
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
if (os == TICK_SCHED_REMOTE_RUNNING)
queue_delayed_work(system_unbound_wq, dwork, HZ);
}
/*
 * sched_tick_start - start the remote tick for @cpu.
 *
 * Does nothing for HK_TYPE_TICK housekeeping CPUs. The work is initialised
 * and queued only when the previous state was OFFLINE.
 */
static void sched_tick_start(int cpu)
{
int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
if (os == TICK_SCHED_REMOTE_OFFLINE) {
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
queue_delayed_work(system_unbound_wq, &twork->work, HZ);
}
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * sched_tick_stop - mark @cpu's remote tick as OFFLINING.
 *
 * Does nothing for HK_TYPE_TICK housekeeping CPUs. Warns if the previous
 * state was not RUNNING.
 */
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
int os;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
}
#endif /* CONFIG_HOTPLUG_CPU */
/* Allocate the per-CPU tick_work array; BUGs on allocation failure. */
int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
return 0;
}
#else /* !CONFIG_NO_HZ_FULL */
/* !CONFIG_NO_HZ_FULL: there is no remote tick to start or stop. */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
 * When the preempt count equals @val, record the caller's parent IP (with
 * CONFIG_DEBUG_PREEMPT) and emit the preempt-off trace event.
 */
static inline void preempt_latency_start(int val)
{
if (preempt_count() == val) {
unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
trace_preempt_off(CALLER_ADDR0, ip);
}
}
/*
 * preempt_count_add - add @val to the preempt count, with debug checks
 * under CONFIG_DEBUG_PREEMPT, then start preempt-latency tracing.
 */
void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/* A negative count means it has already underflowed; refuse to add. */
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
/* Warn when the PREEMPT_MASK bits are within 10 of their maximum. */
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
/* Emit the preempt-on trace event when the preempt count equals @val. */
static inline void preempt_latency_stop(int val)
{
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
}
/*
 * preempt_count_sub - subtract @val from the preempt count, with debug
 * checks under CONFIG_DEBUG_PREEMPT. Tracing is stopped before the count
 * is actually decremented.
 */
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/* Subtracting more than the current count would underflow. */
if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
return;
/* A sub-PREEMPT_MASK decrement needs some PREEMPT_MASK bits to be set. */
if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
!(preempt_count() & PREEMPT_MASK)))
return;
#endif
preempt_latency_stop(val);
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
#else
/* No-op variants for builds without preempt debugging/toggle tracing. */
static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
#endif
/* Return @p's recorded preempt_disable_ip, or 0 without CONFIG_DEBUG_PREEMPT. */
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
return p->preempt_disable_ip;
#else
return 0;
#endif
}
/*
 * __schedule_bug - report a "scheduling while atomic" bug for @prev.
 *
 * Prints the diagnostic, held locks, modules and a stack dump, and taints
 * the kernel. Silent while an oops is already in progress.
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
/* Captured up front, before any of the reporting calls below run. */
unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
if (oops_in_progress)
return;
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
prev->comm, prev->pid, preempt_count());
debug_show_held_locks(prev);
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
 * schedule_debug - sanity checks and statistics run on entry to __schedule().
 * @prev: the task about to be scheduled out
 * @preempt: true when the schedule was entered with a non-zero sched_mode
 */
static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
if (task_scs_end_corrupted(prev))
panic("corrupted shadow stack detected inside scheduler\n");
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/* Blocking (non-preempt, non-running state) inside a non-block section. */
if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
prev->comm, prev->pid, prev->non_block_count);
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
#endif
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
/* Reset the count so scheduling can continue after the report. */
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
SCHED_WARN_ON(ct_state() == CONTEXT_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq()->sched_count);
}
/*
 * On SMP, call ->balance() for each class from prev's class up to (but not
 * including) the idle class, stopping at the first one that returns
 * non-zero; then put @prev.
 */
static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
{
#ifdef CONFIG_SMP
const struct sched_class *class;
for_class_range(class, prev->sched_class, &idle_sched_class) {
if (class->balance(rq, prev, rf))
break;
}
#endif
put_prev_task(rq, prev);
}
/*
 * __pick_next_task - choose the next task to run on @rq.
 *
 * Fast path: when prev's class is not above the fair class and every
 * runnable task is accounted in the CFS hierarchy, ask the fair class
 * directly and fall back to idle. Otherwise walk all classes in order.
 */
static inline struct task_struct *
__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
/* RETRY_TASK from the fair class means: redo via the slow path. */
if (unlikely(p == RETRY_TASK))
goto restart;
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
}
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
}
/* Reaching here means no class returned a task. */
BUG();
}
#ifdef CONFIG_SCHED_CORE
/* True when @t is the idle task of the runqueue it belongs to. */
static inline bool is_task_rq_idle(struct task_struct *t)
{
return (task_rq(t)->idle == t);
}
static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
{
return is_task_rq_idle(a) || (a->core_cookie == cookie);
}
/* True when either task is an idle task or both carry the same core cookie. */
static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
{
	return is_task_rq_idle(a) || is_task_rq_idle(b) ||
	       a->core_cookie == b->core_cookie;
}
/*
 * Return the first task offered by any class's ->pick_task() for @rq, in
 * class order. BUGs if no class returns one.
 */
static inline struct task_struct *pick_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
for_each_class(class) {
p = class->pick_task(rq);
if (p)
return p;
}
BUG();
}
/* Defined outside this file. */
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
/* Defined further down; pick_next_task() needs it first. */
static void queue_core_balance(struct rq *rq);
/*
 * pick_next_task - core-scheduling aware task selection.
 *
 * Without core scheduling enabled on @rq (or when the CPU is offline) this
 * is just __pick_next_task(). Otherwise a pick is made for every SMT
 * sibling so that all siblings run tasks matching one core-wide cookie (or
 * idle), and siblings whose pick changed are asked to reschedule.
 */
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
bool need_sync;
if (!sched_core_enabled(rq))
return __pick_next_task(rq, prev, rf);
cpu = cpu_of(rq);
if (cpu_is_offline(cpu)) {
rq->core_pick = NULL;
return __pick_next_task(rq, prev, rf);
}
/*
 * A core-wide pick made elsewhere is still current (pick_seq == task_seq),
 * this rq has not consumed it yet, and it left a task for us: use it.
 */
if (rq->core->core_pick_seq == rq->core->core_task_seq &&
rq->core->core_pick_seq != rq->core_sched_seq &&
rq->core_pick) {
WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
next = rq->core_pick;
if (next != prev) {
put_prev_task(rq, prev);
set_next_task(rq, next);
}
rq->core_pick = NULL;
goto out;
}
put_prev_task_balance(rq, prev, rf);
smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
/* Reset the core-wide state before making a new selection. */
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle_count) {
if (!core_clock_updated) {
update_rq_clock(rq->core);
core_clock_updated = true;
}
sched_core_account_forceidle(rq);
/* Clear the force-idle state now that it has been accounted. */
rq->core->core_forceidle_start = 0;
rq->core->core_forceidle_count = 0;
rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
}
/* Bumping task_seq invalidates any previous core-wide pick. */
rq->core->core_task_seq++;
if (!need_sync) {
/* No cookie was in effect: an uncookied local pick needs no sync. */
next = pick_task(rq);
if (!next->core_cookie) {
rq->core_pick = NULL;
WARN_ON_ONCE(fi_before);
task_vruntime_update(rq, next, false);
goto out_set_next;
}
}
/* Pass 1: pick per sibling and remember the highest-priority pick. */
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
/* Skip clock updates for this CPU and for an already-updated core rq. */
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
p = rq_i->core_pick = pick_task(rq_i);
if (!max || prio_less(max, p, fi_before))
max = p;
}
cookie = rq->core->core_cookie = max->core_cookie;
/* Pass 2: make every sibling's pick compatible with the chosen cookie. */
for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
p = rq_i->core_pick;
if (!cookie_equals(p, cookie)) {
p = NULL;
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
p = idle_sched_class.pick_task(rq_i);
}
rq_i->core_pick = p;
if (p == rq_i->idle) {
/* Idle picked although tasks are runnable: counts as forced idle. */
if (rq_i->nr_running) {
rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
} else {
occ++;
}
}
if (schedstat_enabled() && rq->core->core_forceidle_count) {
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
/* A task must have been selected for this CPU by now. */
WARN_ON_ONCE(!next);
/* Pass 3: publish the picks and kick siblings whose task changes. */
for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
if (!rq_i->core_pick)
continue;
if (!(fi_before && rq->core->core_forceidle_count))
task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
if (i == cpu) {
rq_i->core_pick = NULL;
continue;
}
/* Every sibling's pick must be cookie-compatible with ours. */
WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
/* Sibling already runs its pick: nothing to reschedule. */
if (rq_i->curr == rq_i->core_pick) {
rq_i->core_pick = NULL;
continue;
}
resched_curr(rq_i);
}
out_set_next:
set_next_task(rq, next);
out:
/* Going idle while the core is force-idled: try to pull a matching task. */
if (rq->core->core_forceidle_count && next == rq->idle)
queue_core_balance(rq);
return next;
}
/*
 * try_steal_cookie - try to move one task matching the destination core's
 * cookie from CPU @that to the idle CPU @this.
 *
 * Runs with IRQs disabled and both runqueues locked for the whole function.
 * Returns true when a task was migrated.
 */
static bool try_steal_cookie(int this, int that)
{
struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
struct task_struct *p;
unsigned long cookie;
bool success = false;
guard(irq)();
guard(double_rq_lock)(dst, src);
cookie = dst->core->core_cookie;
if (!cookie)
return false;
/* Only steal for a destination that is currently running idle. */
if (dst->curr != dst->idle)
return false;
p = sched_core_find(src, cookie);
if (!p)
return false;
do {
/* Skip the source's running or already-picked task. */
if (p == src->core_pick || p == src->curr)
goto next;
if (!is_cpu_allowed(p, this))
goto next;
if (p->core_occupation > dst->idle->core_occupation)
goto next;
if (sched_task_is_throttled(p, this))
goto next;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
activate_task(dst, p, 0);
resched_curr(dst);
success = true;
break;
next:
p = sched_core_next(p, cookie);
} while (p);
return success;
}
/*
 * Walk the CPUs of @sd (wrapping, starting after @cpu) and try to steal a
 * cookie-matching task from each; gives up as soon as a reschedule is
 * pending. Returns true if a task was stolen.
 */
static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
if (need_resched())
break;
if (try_steal_cookie(cpu, i))
return true;
}
return false;
}
/*
 * sched_core_balance - balance callback that tries to steal a
 * cookie-matching task for @rq's CPU, walking its sched domains.
 *
 * Entered with the rq lock held; the lock is dropped around the search and
 * re-taken before returning, with preemption disabled and RCU read-locked
 * in between.
 */
static void sched_core_balance(struct rq *rq)
{
struct sched_domain *sd;
int cpu = cpu_of(rq);
preempt_disable();
rcu_read_lock();
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
break;
if (steal_cookie_task(cpu, sd))
break;
}
raw_spin_rq_lock_irq(rq);
rcu_read_unlock();
preempt_enable();
}
/* Per-CPU callback node used to queue sched_core_balance(). */
static DEFINE_PER_CPU(struct balance_callback, core_balance_head);

/*
 * Queue sched_core_balance() as a balance callback on @rq, but only when
 * core scheduling is enabled, the core has a cookie and @rq has runnable
 * tasks.
 */
static void queue_core_balance(struct rq *rq)
{
	if (!sched_core_enabled(rq) || !rq->core->core_cookie || !rq->nr_running)
		return;

	queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
/*
 * Scope guard "core_lock": takes a pointer to a CPU number, calls
 * sched_core_lock() on entry and sched_core_unlock() on exit, keeping the
 * saved flags in the guard.
 */
DEFINE_LOCK_GUARD_1(core_lock, int,
sched_core_lock(*_T->lock, &_T->flags),
sched_core_unlock(*_T->lock, &_T->flags),
unsigned long flags)
/*
 * sched_core_cpu_starting - attach a starting CPU to its core's leader rq.
 *
 * Finds the SMT sibling whose rq is its own ->core (the leader) and points
 * this CPU's rq->core at it. A CPU alone in its SMT mask stays its own
 * leader.
 */
static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
int t;
guard(core_lock)(&cpu);
WARN_ON_ONCE(rq->core != rq);
/* Only CPU in the mask: nothing to join. */
if (cpumask_weight(smt_mask) == 1)
return;
/* Look for the existing leader among the other siblings. */
for_each_cpu(t, smt_mask) {
if (t == cpu)
continue;
rq = cpu_rq(t);
if (rq->core == rq) {
core_rq = rq;
break;
}
}
if (WARN_ON_ONCE(!core_rq))
return;
/* Install the leader on this CPU and verify all siblings agree. */
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);
if (t == cpu)
rq->core = core_rq;
WARN_ON_ONCE(rq->core != core_rq);
}
}
/*
 * sched_core_cpu_deactivate - hand core leadership over when the leader CPU
 * is deactivated.
 *
 * If @cpu's rq is the core leader and has siblings, another sibling's rq
 * becomes the leader: the shared core state is copied to it and every
 * sibling's rq->core is repointed.
 */
static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
int t;
guard(core_lock)(&cpu);
/* Only CPU in the mask: no one to hand over to. */
if (cpumask_weight(smt_mask) == 1) {
WARN_ON_ONCE(rq->core != rq);
return;
}
/* Not the leader: nothing to do. */
if (rq->core != rq)
return;
/* The first sibling other than @cpu becomes the new leader. */
for_each_cpu(t, smt_mask) {
if (t == cpu)
continue;
core_rq = cpu_rq(t);
break;
}
if (WARN_ON_ONCE(!core_rq))
return;
/* Copy the shared core state to the new leader. */
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle_count = rq->core_forceidle_count;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
/* The force-idle start timestamp is reset rather than copied. */
core_rq->core_forceidle_start = 0;
/* Point every sibling, including @cpu, at the new leader. */
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);
rq->core = core_rq;
}
}
/* Make a dying CPU's rq its own core leader again. */
static inline void sched_core_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (rq->core != rq)
rq->core = rq;
}
#else /* !CONFIG_SCHED_CORE */
/* !CONFIG_SCHED_CORE: the core-scheduling hotplug hooks are no-ops. */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
static inline void sched_core_cpu_dying(unsigned int cpu) {}
/* !CONFIG_SCHED_CORE: task selection is simply __pick_next_task(). */
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return __pick_next_task(rq, prev, rf);
}
#endif /* CONFIG_SCHED_CORE */
/*
 * sched_mode values passed to __schedule().
 *
 * SM_MASK_PREEMPT selects which modes __schedule() treats as preemption:
 * every non-zero mode without CONFIG_PREEMPT_RT, only SM_PREEMPT with it.
 */
#define SM_NONE 0x0
#define SM_PREEMPT 0x1
#define SM_RTLOCK_WAIT 0x2
#ifndef CONFIG_PREEMPT_RT
# define SM_MASK_PREEMPT (~0U)
#else
# define SM_MASK_PREEMPT SM_PREEMPT
#endif
/*
 * __schedule - the main scheduler function.
 * @sched_mode: SM_NONE, SM_PREEMPT or SM_RTLOCK_WAIT
 *
 * Picks the next task for this CPU and switches to it. For a
 * non-preemption entry with @prev in a non-running state, @prev is either
 * set back to TASK_RUNNING (signal pending) or dequeued from the runqueue.
 */
static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
schedule_debug(prev, !!sched_mode);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
rcu_note_context_switch(!!sched_mode);
rq_lock(rq, &rf);
smp_mb__after_spinlock();
/* Shift the clock-update request flags up by one bit position. */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
/* Default to counting an involuntary switch. */
switch_count = &prev->nivcsw;
prev_state = READ_ONCE(prev->__state);
if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
/* A signal is pending: keep the task runnable instead. */
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
/* Uninterruptible sleepers that are not NOLOAD/FROZEN count as load. */
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
/* The task gave up the CPU itself: count a voluntary switch. */
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
#endif
if (likely(prev != next)) {
rq->nr_switches++;
RCU_INIT_POINTER(rq->curr, next);
++*switch_count;
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
/* context_switch() also releases the rq lock. */
rq = context_switch(rq, prev, next, &rf);
} else {
/* Same task keeps running: clear skip flags, run callbacks, unlock. */
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
}
/*
 * do_task_dead - final schedule of an exiting task.
 *
 * Marks the current task TASK_DEAD and non-freezable, then schedules away.
 * __schedule() is not expected to return here: BUG() follows, and the
 * endless loop backs up the __noreturn annotation.
 */
void __noreturn do_task_dead(void)
{
set_special_state(TASK_DEAD);
current->flags |= PF_NOFREEZE;
__schedule(SM_NONE);
BUG();
for (;;)
cpu_relax();
}
/*
 * sched_submit_work - work done before a task blocks in schedule().
 *
 * Skipped when @tsk is still running. Otherwise notifies the workqueue or
 * io_wq machinery that a worker is going to sleep and flushes the task's
 * block plug.
 */
static inline void sched_submit_work(struct task_struct *tsk)
{
	unsigned int flags;

	if (task_is_running(tsk))
		return;

	flags = tsk->flags;
	if (flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);
	else if (flags & PF_IO_WORKER)
		io_wq_worker_sleeping(tsk);

	SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);

	blk_flush_plug(tsk->plug, true);
}
/*
 * Tell the workqueue or io_wq machinery that worker @tsk is running again.
 * Tasks that are neither kind of worker are left alone.
 */
static void sched_update_worker(struct task_struct *tsk)
{
	if (!(tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)))
		return;

	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_running(tsk);
	else
		io_wq_worker_running(tsk);
}
/*
 * schedule - voluntarily give up the CPU.
 *
 * Submits pending work for the current task, then calls __schedule() with
 * preemption disabled, repeating while a reschedule is still needed, and
 * finally updates worker state.
 */
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
do {
preempt_disable();
__schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
/*
 * schedule_idle - schedule variant without the sched_submit_work() /
 * sched_update_worker() steps and without touching the preempt count
 * itself. Warns if the current task's state is non-zero.
 */
void __sched schedule_idle(void)
{
WARN_ON_ONCE(current->__state);
do {
__schedule(SM_NONE);
} while (need_resched());
}
#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
/* schedule() bracketed by exception_enter()/exception_exit(). */
asmlinkage __visible void __sched schedule_user(void)
{
enum ctx_state prev_state = exception_enter();
schedule();
exception_exit(prev_state);
}
#endif
/*
 * schedule_preempt_disabled - schedule() for callers that hold a preempt
 * disable: drops it without rescheduling, schedules, then disables
 * preemption again before returning.
 */
void __sched schedule_preempt_disabled(void)
{
sched_preempt_enable_no_resched();
schedule();
preempt_disable();
}
#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: schedule loop that enters __schedule() with SM_RTLOCK_WAIT. */
void __sched notrace schedule_rtlock(void)
{
do {
preempt_disable();
__schedule(SM_RTLOCK_WAIT);
sched_preempt_enable_no_resched();
} while (need_resched());
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
/*
 * Common preemption loop: call __schedule(SM_PREEMPT) with preemption
 * disabled (untraced) and latency tracing around it, until no reschedule
 * is pending.
 */
static void __sched notrace preempt_schedule_common(void)
{
do {
preempt_disable_notrace();
preempt_latency_start(1);
__schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
}
#ifdef CONFIG_PREEMPTION
/* Preemption entry point: reschedule only if the context is preemptible. */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
if (likely(!preemptible()))
return;
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
/*
 * Static-call flavour of dynamic preemption: unless already defined, the
 * "enabled" target is preempt_schedule() and the "disabled" target is NULL.
 */
#ifndef preempt_schedule_dynamic_enabled
#define preempt_schedule_dynamic_enabled preempt_schedule
#define preempt_schedule_dynamic_disabled NULL
#endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
return;
preempt_schedule();
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
#endif
#endif
/*
 * preempt_schedule_notrace - preempt_schedule() variant whose loop also
 * enters and leaves exception context around __schedule().
 *
 * Returns immediately when !preemptible(). Otherwise it loops, with
 * preemption disabled through the notrace helpers, calling
 * exception_enter(), __schedule(SM_PREEMPT) and exception_exit() until
 * need_resched() is clear.
 */
asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
if (likely(!preemptible()))
return;
do {
preempt_disable_notrace();
preempt_latency_start(1);
/* Remember the context state so it can be restored after scheduling. */
prev_ctx = exception_enter();
__schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
/*
 * Runtime-switchable preempt_schedule_notrace(): a static call with
 * CONFIG_HAVE_PREEMPT_DYNAMIC_CALL, a static-key gated wrapper with
 * CONFIG_HAVE_PREEMPT_DYNAMIC_KEY.
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
/* Default enabled/disabled targets, used only if not already defined. */
#ifndef preempt_schedule_notrace_dynamic_enabled
#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
#define preempt_schedule_notrace_dynamic_disabled NULL
#endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
/* The key starts out true, i.e. the wrapper calls through. */
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
return;
preempt_schedule_notrace();
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
#endif
#endif
#endif /* CONFIG_PREEMPTION */
/*
 * preempt_schedule_irq - preemption entry point for callers running with
 * interrupts disabled and a zero preempt count.
 *
 * BUG()s if either precondition is violated. Each loop iteration enables
 * interrupts only around __schedule(SM_PREEMPT), so interrupts are
 * disabled again when the function returns.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
/* Catch callers that violate the entry conditions. */
BUG_ON(preempt_count() || !irqs_disabled());
prev_state = exception_enter();
do {
preempt_disable();
local_irq_enable();
__schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
}
/*
 * default_wake_function - wait-queue callback that wakes the task stored
 * in @curr->private.
 *
 * @curr:       wait queue entry; ->private is handed to try_to_wake_up()
 * @mode:       passed to try_to_wake_up() as its second argument
 * @wake_flags: wake flags; with CONFIG_SCHED_DEBUG any bit other than
 *              WF_SYNC or WF_CURRENT_CPU triggers a one-time warning
 * @key:        unused
 *
 * Returns the result of try_to_wake_up().
 */
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
static void __setscheduler_prio(struct task_struct *p, int prio)
{
if (dl_prio(prio))
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
p->prio = prio;
}
#ifdef CONFIG_RT_MUTEXES
/*
 * Return the numerically smaller of @prio and @pi_task->prio, or @prio
 * unchanged when there is no donor task.
 */
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
	if (!pi_task)
		return prio;

	return min(prio, pi_task->prio);
}
/*
 * Effective priority of @p for a requested @prio, taking the task returned
 * by rt_mutex_get_top_task() into account.
 */
static inline int rt_effective_prio(struct task_struct *p, int prio)
{
	return __rt_effective_prio(rt_mutex_get_top_task(p), prio);
}
/*
 * rt_mutex_setprio - apply (or remove) an inherited priority on @p.
 *
 * @p:       task whose effective priority is updated
 * @pi_task: donor task, or NULL; recorded in p->pi_top_task
 *
 * The new priority is the numerically smaller of p->normal_prio and
 * @pi_task->prio. If it differs from the current one (or is a deadline
 * priority), the task is dequeued/put, its class and priority are
 * switched, and it is enqueued/set again, all with the rq lock held.
 */
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
prio = __rt_effective_prio(pi_task, p->normal_prio);
/* Lockless fast path: same donor, same non-deadline priority. */
if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
return;
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
p->pi_top_task = pi_task;
/* Only the donor changed; priority itself needs no update. */
if (prio == p->prio && !dl_prio(prio))
goto out_unlock;
/* Never alter the idle task; the WARNs flag unexpected states. */
if (unlikely(p == rq->idle)) {
WARN_ON(p != rq->curr);
WARN_ON(p->pi_blocked_on);
goto out_unlock;
}
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
/* Same priority value: no need to move the task between queues. */
if (oldprio == prio)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
if (dl_prio(prio)) {
/*
 * Use the donor's deadline entity when @p is not a deadline task by
 * itself, or when the donor's entity preempts @p's own.
 */
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
} else {
p->dl.pi_se = &p->dl;
}
} else if (rt_prio(prio)) {
/* Leaving the deadline range: point pi_se back at the task's own entity. */
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
/* Numerically larger priority than before: requeue at the head. */
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
}
__setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
/* Keep preemption off until callbacks ran and the rq lock is released. */
preempt_disable();
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock(rq);
preempt_enable();
}
#else
/* Without CONFIG_RT_MUTEXES there is no boosting: @prio is returned as is. */
static inline int rt_effective_prio(struct task_struct *p, int prio)
{
return prio;
}
#endif
/*
 * set_user_nice - change the nice value of @p.
 *
 * @p:    task to modify
 * @nice: new nice value; the call is a no-op if it equals the current
 *        value or lies outside [MIN_NICE, MAX_NICE]
 *
 * For deadline and realtime policy tasks only static_prio is recorded.
 * Otherwise the task is dequeued, its static priority, load weight and
 * effective priority are updated, it is requeued, and the class's
 * prio_changed() hook is invoked with the old priority.
 */
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
int old_prio;
struct rq_flags rf;
struct rq *rq;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
/* Only remember the value for deadline/realtime policy tasks. */
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
p->sched_class->prio_changed(rq, p, old_prio);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
/*
 * Return true when @nice, converted with nice_to_rlimit(), does not exceed
 * the RLIMIT_NICE limit of @p.
 */
static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
	int wanted = nice_to_rlimit(nice);
	unsigned long limit = task_rlimit(p, RLIMIT_NICE);

	return wanted <= limit;
}
/*
 * can_nice - may the caller set @p to nice value @nice?
 *
 * Allowed when the value is within @p's RLIMIT_NICE, or when the caller
 * has CAP_SYS_NICE. Returns 1 if allowed, 0 otherwise.
 */
int can_nice(const struct task_struct *p, const int nice)
{
	if (is_nice_reduction(p, nice))
		return 1;

	return capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
/*
 * sys_nice - adjust the nice value of the current task by @increment.
 *
 * The increment is clamped to [-NICE_WIDTH, NICE_WIDTH] and the resulting
 * nice value to [MIN_NICE, MAX_NICE]. A negative increment additionally
 * requires can_nice() to succeed.
 *
 * Returns 0 on success, -EPERM if the check fails, or the error returned
 * by security_task_setnice().
 */
SYSCALL_DEFINE1(nice, int, increment)
{
long nice, retval;
/* Clamp first so the addition below cannot leave a small range. */
increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
retval = security_task_setnice(current, nice);
if (retval)
return retval;
set_user_nice(current, nice);
return 0;
}
#endif
/*
 * task_prio - priority of @p relative to MAX_RT_PRIO.
 *
 * Returns p->prio - MAX_RT_PRIO, so priorities below MAX_RT_PRIO yield a
 * negative value.
 */
int task_prio(const struct task_struct *p)
{
return p->prio - MAX_RT_PRIO;
}
int idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (rq->curr != rq->idle)
return 0;
if (rq->nr_running)
return 0;
#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
#endif
return 1;
}
/*
 * available_idle_cpu - is @cpu idle and not reported as preempted by
 * vcpu_is_preempted()?
 *
 * Returns 1 if so, 0 otherwise.
 */
int available_idle_cpu(int cpu)
{
	return idle_cpu(cpu) && !vcpu_is_preempted(cpu);
}
struct task_struct *idle_task(int cpu)
{
return cpu_rq(cpu)->idle;
}
#ifdef CONFIG_SCHED_CORE
/*
 * sched_core_idle_cpu - idle check that is aware of core scheduling.
 *
 * With core scheduling enabled on the runqueue, a CPU whose current task
 * is the idle task is reported idle right away; in every other case the
 * answer comes from idle_cpu().
 */
int sched_core_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	bool core_idle = sched_core_enabled(rq) && rq->curr == rq->idle;

	return core_idle ? 1 : idle_cpu(cpu);
}
#endif
#ifdef CONFIG_SMP
/*
 * effective_cpu_util - combine the CFS, RT, DL and IRQ utilization of @cpu
 * into one value, capped at the CPU's capacity.
 *
 * @cpu:      CPU to evaluate
 * @util_cfs: CFS utilization to start from
 * @type:     FREQUENCY_UTIL or ENERGY_UTIL; selects whether uclamp is
 *            applied and which deadline term is added
 * @p:        task handed to uclamp_rq_util_with(); sched_cpu_util() in
 *            this file passes NULL
 *
 * Returns a value no larger than arch_scale_cpu_capacity(@cpu).
 */
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
enum cpu_util_type type,
struct task_struct *p)
{
unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
max = arch_scale_cpu_capacity(cpu);
/* Without uclamp, a runnable RT rq makes a frequency request go to max. */
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
/* IRQ utilization alone already fills the CPU. */
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return max;
util = util_cfs + cpu_util_rt(rq);
/* Clamping is applied for frequency requests only. */
if (type == FREQUENCY_UTIL)
util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
/* CFS + RT + DL reach the capacity: report saturation. */
if (util + dl_util >= max)
return max;
/* Energy estimation counts the deadline utilization itself. */
if (type == ENERGY_UTIL)
util += dl_util;
/* Scale the task utilization by the non-IRQ share, then add IRQ back. */
util = scale_irq_capacity(util, irq, max);
util += irq;
/* Frequency selection adds the deadline bandwidth instead. */
if (type == FREQUENCY_UTIL)
util += cpu_bw_dl(rq);
return min(max, util);
}
/*
 * sched_cpu_util - ENERGY_UTIL flavoured effective utilization of @cpu,
 * based on its current CFS utilization and no specific task.
 */
unsigned long sched_cpu_util(int cpu)
{
	unsigned long cfs = cpu_util_cfs(cpu);

	return effective_cpu_util(cpu, cfs, ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
/*
 * Look up a task by @pid; a @pid of 0 means the current task. The result
 * of find_task_by_vpid() is returned unchanged for non-zero pids.
 */
static struct task_struct *find_process_by_pid(pid_t pid)
{
	if (!pid)
		return current;

	return find_task_by_vpid(pid);
}
/* Policy value meaning "keep the task's current policy". */
#define SETPARAM_POLICY -1
/*
 * __setscheduler_params - store the parameters from @attr in @p.
 *
 * Sets the policy (keeping the current one for SETPARAM_POLICY), the
 * deadline parameters or static priority depending on the policy, the
 * realtime priority, the derived normal_prio and the load weight. The
 * effective priority and the scheduling class are not touched here.
 */
static void __setscheduler_params(struct task_struct *p,
const struct sched_attr *attr)
{
int policy = attr->sched_policy;
if (policy == SETPARAM_POLICY)
policy = p->policy;
p->policy = policy;
if (dl_policy(policy))
__setparam_dl(p, attr);
else if (fair_policy(policy))
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
/* rt_priority is stored for every policy, not only realtime ones. */
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
set_load_weight(p, true);
}
/*
 * Return true when the caller's effective uid equals either the effective
 * or the real uid of @p. The target's credentials are read under RCU.
 */
static bool check_same_owner(struct task_struct *p)
{
	const struct cred *mine = current_cred();
	const struct cred *theirs;
	bool same;

	rcu_read_lock();
	theirs = __task_cred(p);
	same = uid_eq(mine->euid, theirs->euid) ||
	       uid_eq(mine->euid, theirs->uid);
	rcu_read_unlock();

	return same;
}
/*
 * user_check_sched_setscheduler - permission check for a scheduling change
 * requested on @p.
 *
 * @p:             target task
 * @attr:          requested attributes
 * @policy:        resolved policy (never SETPARAM_POLICY at this point —
 *                 NOTE(review): relies on the caller, confirm)
 * @reset_on_fork: requested reset-on-fork state
 *
 * Any of the conditions below requires CAP_SYS_NICE. Returns 0 if the
 * change is permitted, -EPERM otherwise.
 */
static int user_check_sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
int policy, int reset_on_fork)
{
/* Lowering the nice value beyond what RLIMIT_NICE allows. */
if (fair_policy(policy)) {
if (attr->sched_nice < task_nice(p) &&
!is_nice_reduction(p, attr->sched_nice))
goto req_priv;
}
if (rt_policy(policy)) {
unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
/* Switching to a realtime policy with a zero RLIMIT_RTPRIO. */
if (policy != p->policy && !rlim_rtprio)
goto req_priv;
/* Raising the realtime priority above both the old value and the limit. */
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
goto req_priv;
}
/* Deadline policies always need the capability. */
if (dl_policy(policy))
goto req_priv;
/* Leaving the idle policy when the current nice value exceeds the limit. */
if (task_has_idle_policy(p) && !idle_policy(policy)) {
if (!is_nice_reduction(p, task_nice(p)))
goto req_priv;
}
/* Changing a task owned by somebody else. */
if (!check_same_owner(p))
goto req_priv;
/* Clearing an already set reset-on-fork flag. */
if (p->sched_reset_on_fork && !reset_on_fork)
goto req_priv;
return 0;
req_priv:
if (!capable(CAP_SYS_NICE))
return -EPERM;
return 0;
}
/*
 * __sched_setscheduler - change the scheduling policy and/or parameters of
 * a task.
 *
 * @p:    task to modify
 * @attr: requested policy, priority, nice value, flags and clamps; a
 *        negative sched_policy means "keep the current policy"
 * @user: true to apply the permission and security checks used for
 *        user-space requests
 * @pi:   true to take priority inheritance into account and to adjust the
 *        PI chain afterwards; must not be set in interrupt context
 *
 * Returns 0 on success or a negative errno: -EINVAL for invalid parameters
 * or an attempt to modify the stop task, -EPERM when a permission or
 * bandwidth check fails, -EBUSY when sched_dl_overflow() rejects the
 * change, or whatever the security / uclamp validation hooks return.
 */
static int __sched_setscheduler(struct task_struct *p,
				const struct sched_attr *attr,
				bool user, bool pi)
{
	int oldpolicy = -1, policy = attr->sched_policy;
	int retval, oldprio, newprio, queued, running;
	const struct sched_class *prev_class;
	struct balance_callback *head;
	struct rq_flags rf;
	int reset_on_fork;
	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	struct rq *rq;
	bool cpuset_locked = false;

	BUG_ON(pi && in_interrupt());
recheck:
	/* A negative policy keeps the current one; it is re-checked below. */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

		if (!valid_policy(policy))
			return -EINVAL;
	}

	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
		return -EINVAL;

	/*
	 * Realtime policies need a non-zero priority of at most
	 * MAX_RT_PRIO-1; every other policy needs priority 0.
	 */
	if (attr->sched_priority > MAX_RT_PRIO-1)
		return -EINVAL;
	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
	    (rt_policy(policy) != (attr->sched_priority != 0)))
		return -EINVAL;

	if (user) {
		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
		if (retval)
			return retval;

		if (attr->sched_flags & SCHED_FLAG_SUGOV)
			return -EINVAL;

		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
		retval = uclamp_validate(p, attr);
		if (retval)
			return retval;
	}

	/* Changes involving a deadline policy are done under the cpuset lock. */
	if (dl_policy(policy) || dl_policy(p->policy)) {
		cpuset_locked = true;
		cpuset_lock();
	}

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/* The stop task's policy must never be changed. */
	if (p == rq->stop) {
		retval = -EINVAL;
		goto unlock;
	}

	/*
	 * Same policy and nothing else to change: just record a possibly
	 * updated reset_on_fork and leave.
	 */
	if (unlikely(policy == p->policy)) {
		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
			goto change;
		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
			goto change;
		if (dl_policy(policy) && dl_param_changed(p, attr))
			goto change;
		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
			goto change;

		p->sched_reset_on_fork = reset_on_fork;
		retval = 0;
		goto unlock;
	}
change:

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/* No realtime tasks in (non-autogroup) groups without runtime. */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			retval = -EPERM;
			goto unlock;
		}
#endif
#ifdef CONFIG_SMP
		if (dl_bandwidth_enabled() && dl_policy(policy) &&
				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
			cpumask_t *span = rq->rd->span;

			/*
			 * The affinity mask must cover the whole root domain
			 * and the domain must have deadline bandwidth.
			 */
			if (!cpumask_subset(span, p->cpus_ptr) ||
			    rq->rd->dl_bw.bw == 0) {
				retval = -EPERM;
				goto unlock;
			}
		}
#endif
	}

	/* The policy changed while unlocked: drop the locks and start over. */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &rf);
		if (cpuset_locked)
			cpuset_unlock();
		goto recheck;
	}

	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
		retval = -EBUSY;
		goto unlock;
	}

	p->sched_reset_on_fork = reset_on_fork;
	oldprio = p->prio;

	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
	if (pi) {
		/*
		 * Account for an inherited priority; when the effective
		 * priority stays the same the task need not move queues.
		 */
		newprio = rt_effective_prio(p, newprio);
		if (newprio == oldprio)
			queue_flags &= ~DEQUEUE_MOVE;
	}

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);

	prev_class = p->sched_class;

	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
		__setscheduler_params(p, attr);
		__setscheduler_prio(p, newprio);
	}
	__setscheduler_uclamp(p, attr);

	if (queued) {
		/* Numerically larger priority than before: requeue at the head. */
		if (oldprio < p->prio)
			queue_flags |= ENQUEUE_HEAD;

		enqueue_task(rq, p, queue_flags);
	}
	if (running)
		set_next_task(rq, p);

	check_class_changed(rq, p, prev_class, oldprio);

	/* Keep preemption off until the balance callbacks have run. */
	preempt_disable();
	head = splice_balance_callbacks(rq);
	task_rq_unlock(rq, p, &rf);

	/*
	 * Release the cpuset lock on every successful exit, not only when @pi
	 * is set: the lock is taken independently of @pi above, so a !pi
	 * caller changing a deadline task would otherwise leave it held.
	 */
	if (cpuset_locked)
		cpuset_unlock();
	if (pi)
		rt_mutex_adjust_pi(p);

	/* Run the callbacks only after the PI chain has been adjusted. */
	balance_callbacks(rq, head);
	preempt_enable();

	return 0;

unlock:
	task_rq_unlock(rq, p, &rf);
	if (cpuset_locked)
		cpuset_unlock();
	return retval;
}
/*
 * _sched_setscheduler - sched_param based front end for
 * __sched_setscheduler().
 *
 * @p:      task to modify
 * @policy: new policy, optionally ORed with SCHED_RESET_ON_FORK, or
 *          SETPARAM_POLICY to keep the current one
 * @param:  supplies the priority
 * @check:  forwarded as the "user" argument (permission checks on/off)
 *
 * The nice value is taken from the task's current static priority.
 * Priority inheritance handling is always requested.
 */
static int _sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param, bool check)
{
struct sched_attr attr = {
.sched_policy = policy,
.sched_priority = param->sched_priority,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
/* Move the reset-on-fork request out of the policy value into the flags. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
policy &= ~SCHED_RESET_ON_FORK;
attr.sched_policy = policy;
}
return __sched_setscheduler(p, &attr, check, true);
}
/*
 * sched_setscheduler - change policy and priority of @p with permission
 * checks enabled.
 *
 * Returns 0 on success or a negative errno from __sched_setscheduler().
 */
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
return _sched_setscheduler(p, policy, param, true);
}
/*
 * sched_setattr - apply @attr to @p with permission checks and priority
 * inheritance handling enabled.
 *
 * Returns 0 on success or a negative errno from __sched_setscheduler().
 */
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
/*
 * sched_setattr_nocheck - like sched_setattr() but with the permission and
 * security checks for user requests turned off.
 */
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/*
 * sched_setscheduler_nocheck - like sched_setscheduler() but with the
 * permission and security checks for user requests turned off.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
return _sched_setscheduler(p, policy, param, false);
}
/*
 * sched_set_fifo - make @p a SCHED_FIFO task with priority MAX_RT_PRIO / 2.
 *
 * No permission checks are done; a failure only triggers a one-time
 * warning.
 */
void sched_set_fifo(struct task_struct *p)
{
struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);
/*
 * sched_set_fifo_low - make @p a SCHED_FIFO task with priority 1.
 *
 * No permission checks are done; a failure only triggers a one-time
 * warning.
 */
void sched_set_fifo_low(struct task_struct *p)
{
struct sched_param sp = { .sched_priority = 1 };
WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);
/*
 * sched_set_normal - make @p a SCHED_NORMAL task with nice value @nice.
 *
 * No permission checks are done; a failure only triggers a one-time
 * warning.
 */
void sched_set_normal(struct task_struct *p, int nice)
{
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
.sched_nice = nice,
};
WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);
/*
 * Common implementation of the sched_setscheduler() and sched_setparam()
 * system calls.
 *
 * @pid:    target pid, 0 for the current task
 * @policy: policy to set, or SETPARAM_POLICY to keep the current one
 * @param:  user pointer to the new parameters
 *
 * Returns 0 on success, -EINVAL for a NULL @param or negative @pid,
 * -EFAULT if @param cannot be read, -ESRCH if no task is found, or the
 * error returned by sched_setscheduler().
 */
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param lparam;
	struct task_struct *p;
	int retval;

	if (!param || pid < 0)
		return -EINVAL;
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
		return -EFAULT;

	/* Pin the task so it can be used after leaving the RCU section. */
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);
	rcu_read_unlock();

	if (unlikely(!p))
		return -ESRCH;

	retval = sched_setscheduler(p, policy, &lparam);
	put_task_struct(p);

	return retval;
}
/*
 * sched_copy_attr - copy a sched_attr structure from user space.
 *
 * @uattr: user-space structure; its ->size field tells how much to copy
 * @attr:  kernel destination, zeroed first so a short copy leaves the
 *         remaining fields at 0
 *
 * Returns 0 on success, the error from get_user()/copy_struct_from_user(),
 * -EINVAL if util clamps are requested with a structure smaller than
 * SCHED_ATTR_SIZE_VER1, or -E2BIG for an unacceptable size (in which case
 * the kernel's structure size is written back to @uattr->size).
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
u32 size;
int ret;
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
/* A size of 0 is treated as the first structure version. */
if (!size)
size = SCHED_ATTR_SIZE_VER0;
if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
if (ret) {
if (ret == -E2BIG)
goto err_size;
return ret;
}
if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
/* Out-of-range nice values are clamped rather than rejected. */
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
return 0;
err_size:
/* Best effort: the result of this write-back is not checked. */
put_user(sizeof(*attr), &uattr->size);
return -E2BIG;
}
/*
 * Fill @attr with the parameters that belong to @p's current policy: the
 * deadline parameters, the realtime priority, or the nice value.
 */
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
	if (task_has_dl_policy(p)) {
		__getparam_dl(p, attr);
		return;
	}

	if (task_has_rt_policy(p)) {
		attr->sched_priority = p->rt_priority;
		return;
	}

	attr->sched_nice = task_nice(p);
}
/*
 * sys_sched_setscheduler - set the scheduling policy and parameters of a
 * task.
 *
 * @pid:    target pid, 0 for the caller
 * @policy: new policy; negative values are rejected with -EINVAL
 * @param:  user pointer to the new parameters
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
if (policy < 0)
return -EINVAL;
return do_sched_setscheduler(pid, policy, param);
}
/*
 * sys_sched_setparam - set the scheduling parameters of a task while
 * keeping its current policy.
 *
 * @pid:   target pid, 0 for the caller
 * @param: user pointer to the new parameters
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
/*
 * sys_sched_setattr - set the extended scheduling attributes of a task.
 *
 * @pid:   target pid, 0 for the caller
 * @uattr: user pointer to the requested attributes
 * @flags: must be 0
 *
 * Returns 0 on success, -EINVAL for bad arguments or a negative policy,
 * -ESRCH if no task is found, or the error returned by sched_copy_attr()
 * or sched_setattr().
 */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
		unsigned int, flags)
{
	struct sched_attr attr;
	struct task_struct *p;
	int retval;

	if (!uattr || pid < 0 || flags)
		return -EINVAL;

	retval = sched_copy_attr(uattr, &attr);
	if (retval)
		return retval;

	if ((int)attr.sched_policy < 0)
		return -EINVAL;
	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
		attr.sched_policy = SETPARAM_POLICY;

	/* Pin the task so it can be used after leaving the RCU section. */
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);
	rcu_read_unlock();

	if (unlikely(!p))
		return -ESRCH;

	/* KEEP_PARAMS: start from the task's current parameters. */
	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
		get_params(p, &attr);
	retval = sched_setattr(p, &attr);
	put_task_struct(p);

	return retval;
}
/*
 * sys_sched_getscheduler - get the scheduling policy of a task.
 *
 * @pid: target pid, 0 for the caller
 *
 * Returns the policy, ORed with SCHED_RESET_ON_FORK if the task has
 * reset-on-fork set, or a negative errno (-EINVAL for a negative pid,
 * -ESRCH if no task is found, or the security hook's error).
 */
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
struct task_struct *p;
int retval;
if (pid < 0)
return -EINVAL;
retval = -ESRCH;
rcu_read_lock();
p = find_process_by_pid(pid);
if (p) {
retval = security_task_getscheduler(p);
if (!retval)
retval = p->policy
| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
}
rcu_read_unlock();
return retval;
}
/*
 * sys_sched_getparam - get the realtime priority of a task.
 *
 * @pid:   target pid, 0 for the caller
 * @param: user pointer that receives the parameters; the priority is 0
 *         for tasks without a realtime policy
 *
 * Returns 0 on success, -EINVAL for bad arguments, -ESRCH if no task is
 * found, the security hook's error, or -EFAULT if the copy fails.
 */
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
if (!param || pid < 0)
return -EINVAL;
rcu_read_lock();
p = find_process_by_pid(pid);
retval = -ESRCH;
if (!p)
goto out_unlock;
retval = security_task_getscheduler(p);
if (retval)
goto out_unlock;
if (task_has_rt_policy(p))
lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/* The user copy is done after leaving the RCU read-side section. */
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
return retval;
out_unlock:
rcu_read_unlock();
return retval;
}
/*
 * sched_attr_copy_to_user - copy a kernel sched_attr to user space.
 *
 * @uattr: user destination
 * @kattr: kernel source; its ->size field is set to the number of bytes
 *         copied
 * @usize: size of the user buffer
 *
 * At most min(@usize, sizeof(*@kattr)) bytes are copied. Returns 0 on
 * success or -EFAULT if the user buffer is not accessible.
 */
static int
sched_attr_copy_to_user(struct sched_attr __user *uattr,
struct sched_attr *kattr,
unsigned int usize)
{
unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/* Never copy more than either side can hold. */
kattr->size = min(usize, ksize);
if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
}
/*
 * sys_sched_getattr - get the extended scheduling attributes of a task.
 *
 * @pid:   target pid, 0 for the caller
 * @uattr: user pointer that receives the attributes
 * @usize: size of the user buffer; must be between SCHED_ATTR_SIZE_VER0
 *         and PAGE_SIZE
 * @flags: must be 0
 *
 * Returns 0 on success, -EINVAL for bad arguments, -ESRCH if no task is
 * found, the security hook's error, or -EFAULT if the copy fails.
 */
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, usize, unsigned int, flags)
{
struct sched_attr kattr = { };
struct task_struct *p;
int retval;
if (!uattr || pid < 0 || usize > PAGE_SIZE ||
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
p = find_process_by_pid(pid);
retval = -ESRCH;
if (!p)
goto out_unlock;
retval = security_task_getscheduler(p);
if (retval)
goto out_unlock;
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
get_params(p, &kattr);
/* Report only flags that are part of SCHED_FLAG_ALL. */
kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
/* The user copy is done after leaving the RCU read-side section. */
return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
return retval;
}
#ifdef CONFIG_SMP
/*
 * dl_task_check_affinity - may deadline task @p be restricted to @mask?
 *
 * Tasks without a deadline policy, or systems without deadline bandwidth
 * control, always pass. Otherwise @mask has to cover the whole span of the
 * root domain the task's runqueue belongs to.
 *
 * Returns 0 if the mask is acceptable, -EBUSY otherwise.
 */
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
	bool covered;

	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
		return 0;

	rcu_read_lock();
	covered = cpumask_subset(task_rq(p)->rd->span, mask);
	rcu_read_unlock();

	return covered ? 0 : -EBUSY;
}
#endif
/*
 * __sched_setaffinity - restrict the requested mask to the task's cpuset
 * and apply it.
 *
 * @p:   task to modify
 * @ctx: affinity request; ->new_mask is replaced by a temporary mask that
 *       is freed before returning, and SCA_CHECK is added to ->flags
 *
 * Returns 0 on success, -ENOMEM if a temporary mask cannot be allocated,
 * the error from dl_task_check_affinity() or __set_cpus_allowed_ptr(), or
 * -EINVAL if the cpuset changed underneath and the mask had to be reset.
 */
static int
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
int retval;
cpumask_var_t cpus_allowed, new_mask;
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
return -ENOMEM;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_free_cpus_allowed;
}
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
ctx->new_mask = new_mask;
ctx->flags |= SCA_CHECK;
retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
retval = __set_cpus_allowed_ptr(p, ctx);
if (retval)
goto out_free_new_mask;
/* Re-read the cpuset: it may have changed since the first lookup. */
cpuset_cpus_allowed(p, cpus_allowed);
if (!cpumask_subset(new_mask, cpus_allowed)) {
/* The applied mask is no longer allowed; fall back to the cpuset's mask. */
cpumask_copy(new_mask, cpus_allowed);
/*
 * If a user mask is present, narrow the fallback to it; should that
 * leave nothing (unexpected, hence the warning), use the cpuset's mask.
 */
if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
bool empty = !cpumask_and(new_mask, new_mask,
ctx->user_mask);
if (WARN_ON_ONCE(empty))
cpumask_copy(new_mask, cpus_allowed);
}
/* Result deliberately ignored: -EINVAL is reported either way. */
__set_cpus_allowed_ptr(p, ctx);
retval = -EINVAL;
}
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
return retval;
}
/*
 * sched_setaffinity - set the CPU affinity mask of a task.
 *
 * @pid:     target pid, 0 for the current task
 * @in_mask: requested mask
 *
 * Returns 0 on success, -ESRCH if no task is found, -EINVAL if the task
 * has PF_NO_SETAFFINITY set, -EPERM if the caller neither owns the task
 * nor has CAP_SYS_NICE in the task's user namespace, -ENOMEM on
 * allocation failure, or an error from the security hook or
 * __sched_setaffinity().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
struct affinity_context ac;
struct cpumask *user_mask;
struct task_struct *p;
int retval;
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
return -ESRCH;
}
/* Pin the task so it can be used after leaving the RCU section. */
get_task_struct(p);
rcu_read_unlock();
if (p->flags & PF_NO_SETAFFINITY) {
retval = -EINVAL;
goto out_put_task;
}
if (!check_same_owner(p)) {
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
retval = -EPERM;
goto out_put_task;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
goto out_put_task;
/* A NULL result is only treated as an allocation failure on SMP. */
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
if (user_mask) {
cpumask_copy(user_mask, in_mask);
} else if (IS_ENABLED(CONFIG_SMP)) {
retval = -ENOMEM;
goto out_put_task;
}
ac = (struct affinity_context){
.new_mask = in_mask,
.user_mask = user_mask,
.flags = SCA_USER,
};
retval = __sched_setaffinity(p, &ac);
/* Free whatever ac.user_mask points to after the call. */
kfree(ac.user_mask);
out_put_task:
put_task_struct(p);
return retval;
}
/*
 * get_user_cpu_mask - copy a CPU mask of @len bytes from user space.
 *
 * A user buffer shorter than cpumask_size() leaves the rest of @new_mask
 * cleared; a longer one is truncated to cpumask_size().
 *
 * Returns 0 on success or -EFAULT if the copy fails.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
struct cpumask *new_mask)
{
if (len < cpumask_size())
cpumask_clear(new_mask);
else if (len > cpumask_size())
len = cpumask_size();
return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}
/*
 * sys_sched_setaffinity - set the CPU affinity of a task.
 *
 * @pid:           target pid, 0 for the caller
 * @len:           length in bytes of the mask at @user_mask_ptr
 * @user_mask_ptr: user pointer to the new CPU mask
 *
 * Returns 0 on success, -ENOMEM if the temporary mask cannot be
 * allocated, -EFAULT if the user mask cannot be read, or the error
 * returned by sched_setaffinity().
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
if (retval == 0)
retval = sched_setaffinity(pid, new_mask);
free_cpumask_var(new_mask);
return retval;
}
/*
 * sched_getaffinity - read the CPU affinity mask of a task.
 *
 * @pid:  target pid, 0 for the current task
 * @mask: receives the task's cpus_mask restricted to cpu_active_mask
 *
 * Returns 0 on success, -ESRCH if no task is found, or the security
 * hook's error.
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
unsigned long flags;
int retval;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
if (!p)
goto out_unlock;
retval = security_task_getscheduler(p);
if (retval)
goto out_unlock;
/* Read cpus_mask with the task's pi_lock held. */
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
return retval;
}
/*
 * sys_sched_getaffinity - get the CPU affinity of a task.
 *
 * @pid:           target pid, 0 for the caller
 * @len:           length in bytes of the user buffer; must provide at
 *                 least nr_cpu_ids bits and be a multiple of
 *                 sizeof(unsigned long)
 * @user_mask_ptr: user pointer that receives the mask
 *
 * Returns the number of bytes copied on success, or a negative errno
 * (-EINVAL for a bad @len, -ENOMEM, -EFAULT, or the error returned by
 * sched_getaffinity()).
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
if ((len * BITS_PER_BYTE) < nr_cpu_ids)
return -EINVAL;
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
/* Copy no more than the kernel's mask size. */
unsigned int retlen = min(len, cpumask_size());
if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
}
free_cpumask_var(mask);
return ret;
}
/*
 * do_sched_yield - let the current task yield the CPU.
 *
 * With this CPU's runqueue locked, the yield counter is bumped and the
 * scheduling class's yield_task() hook is called; schedule() follows once
 * the lock has been dropped.
 */
static void do_sched_yield(void)
{
struct rq_flags rf;
struct rq *rq;
rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
/* Hold off preemption across the unlock; schedule() is called right after. */
preempt_disable();
rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
}
/*
 * sys_sched_yield - yield the CPU to other tasks.
 *
 * Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
do_sched_yield();
return 0;
}
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
/*
 * __cond_resched - reschedule if should_resched(0) says so.
 *
 * Returns 1 if the preemption loop was run, 0 otherwise. When nothing was
 * rescheduled and CONFIG_PREEMPT_RCU is off, rcu_all_qs() is called.
 */
int __sched __cond_resched(void)
{
if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
return 0;
}
EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
/*
 * Runtime-switchable cond_resched() and might_resched(): static calls with
 * CONFIG_HAVE_PREEMPT_DYNAMIC_CALL, static-key gated wrappers with
 * CONFIG_HAVE_PREEMPT_DYNAMIC_KEY.
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
/* Enabled: call __cond_resched(); disabled: the return-0 static call stub. */
#define cond_resched_dynamic_enabled __cond_resched
#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
#define might_resched_dynamic_enabled __cond_resched
#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
/* Both keys start out false, i.e. the wrappers return 0 until enabled. */
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
/* Runs regardless of the key state. */
klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_cond_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
int __sched dynamic_might_resched(void)
{
if (!static_branch_unlikely(&sk_dynamic_might_resched))
return 0;
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
#endif
#endif
/*
 * __cond_resched_lock - give others a chance while holding a spinlock.
 *
 * @lock: spinlock held by the caller
 *
 * If a reschedule is due or spin_needbreak() reports true, the lock is
 * dropped, _cond_resched() is tried (falling back to cpu_relax() if it
 * did nothing) and the lock is taken again.
 *
 * Returns 1 if the lock was dropped and re-acquired, 0 otherwise.
 */
int __cond_resched_lock(spinlock_t *lock)
{
/* Evaluated with the preempt offset of one held lock. */
int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held(lock);
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
}
return ret;
}
EXPORT_SYMBOL(__cond_resched_lock);
/*
 * __cond_resched_rwlock_read - give others a chance while holding an
 * rwlock for reading.
 *
 * @lock: rwlock read-held by the caller
 *
 * If a reschedule is due or rwlock_needbreak() reports true, the lock is
 * dropped, _cond_resched() is tried (falling back to cpu_relax() if it
 * did nothing) and the read lock is taken again.
 *
 * Returns 1 if the lock was dropped and re-acquired, 0 otherwise.
 */
int __cond_resched_rwlock_read(rwlock_t *lock)
{
int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held_read(lock);
if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
}
return ret;
}
EXPORT_SYMBOL(__cond_resched_rwlock_read);
/*
 * __cond_resched_rwlock_write - give others a chance while holding an
 * rwlock for writing.
 *
 * @lock: rwlock write-held by the caller
 *
 * If a reschedule is due or rwlock_needbreak() reports true, the lock is
 * dropped, _cond_resched() is tried (falling back to cpu_relax() if it
 * did nothing) and the write lock is taken again.
 *
 * Returns 1 if the lock was dropped and re-acquired, 0 otherwise.
 */
int __cond_resched_rwlock_write(rwlock_t *lock)
{
int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held_write(lock);
if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
}
return ret;
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
#ifdef CONFIG_PREEMPT_DYNAMIC
#ifdef CONFIG_GENERIC_ENTRY
#include <linux/entry-common.h>
#endif
/* Preemption models selectable at run time; "undefined" means none chosen yet. */
enum {
	preempt_dynamic_undefined = -1,
	preempt_dynamic_none,
	preempt_dynamic_voluntary,
	preempt_dynamic_full,
};

/* The preemption model currently in effect. */
int preempt_dynamic_mode = preempt_dynamic_undefined;

/*
 * sched_dynamic_mode - translate a model name into its preempt_dynamic_*
 * value.
 *
 * @str: "none", "voluntary" or "full" (exact match)
 *
 * Returns the matching mode, or -EINVAL for any other string.
 */
int sched_dynamic_mode(const char *str)
{
	int mode = -EINVAL;

	if (strcmp(str, "none") == 0)
		mode = preempt_dynamic_none;
	else if (strcmp(str, "voluntary") == 0)
		mode = preempt_dynamic_voluntary;
	else if (strcmp(str, "full") == 0)
		mode = preempt_dynamic_full;

	return mode;
}
/*
 * preempt_dynamic_enable(f) / preempt_dynamic_disable(f) switch hook @f on
 * or off: by retargeting its static call to f##_dynamic_enabled or
 * f##_dynamic_disabled, or by flipping the static key sk_dynamic_##f.
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
#else
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
/* Taken by the callers of __sched_dynamic_update() visible in this file. */
static DEFINE_MUTEX(sched_dynamic_mutex);
/* While set, the cond_resched hook is left untouched by mode updates. */
static bool klp_override;
/*
 * __sched_dynamic_update - switch all preemption hooks to @mode.
 *
 * @mode: preempt_dynamic_none, preempt_dynamic_voluntary or
 *        preempt_dynamic_full; any other value only enables every hook
 *
 * Logs the new model when it differs from the current one and records it
 * in preempt_dynamic_mode.
 */
static void __sched_dynamic_update(int mode)
{
/*
 * Start from "everything enabled" before applying the per-mode settings —
 * presumably so a transition never passes through a state with all hooks
 * disabled (NOTE(review): rationale not visible here, confirm).
 */
if (!klp_override)
preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
switch (mode) {
case preempt_dynamic_none:
/* Only cond_resched() stays active. */
if (!klp_override)
preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
/* cond_resched() and might_resched() stay active. */
if (!klp_override)
preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
/* The preempt_schedule*() and irqentry hooks replace the resched hooks. */
if (!klp_override)
preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: full\n");
break;
}
preempt_dynamic_mode = mode;
}
/*
 * sched_dynamic_update - switch the preemption model to @mode, serialized
 * by sched_dynamic_mutex.
 */
void sched_dynamic_update(int mode)
{
mutex_lock(&sched_dynamic_mutex);
__sched_dynamic_update(mode);
mutex_unlock(&sched_dynamic_mutex);
}
#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
/*
 * cond_resched() target installed by sched_dynamic_klp_enable(): calls
 * __klp_sched_try_switch() before doing the regular __cond_resched().
 */
static int klp_cond_resched(void)
{
__klp_sched_try_switch();
return __cond_resched();
}
/*
 * sched_dynamic_klp_enable - route cond_resched() through
 * klp_cond_resched() and set klp_override so that mode updates leave the
 * cond_resched hook alone.
 */
void sched_dynamic_klp_enable(void)
{
mutex_lock(&sched_dynamic_mutex);
klp_override = true;
static_call_update(cond_resched, klp_cond_resched);
mutex_unlock(&sched_dynamic_mutex);
}
/*
 * sched_dynamic_klp_disable - clear klp_override and re-apply the current
 * preemption mode, which resets the cond_resched hook accordingly.
 */
void sched_dynamic_klp_disable(void)
{
mutex_lock(&sched_dynamic_mutex);
klp_override = false;
__sched_dynamic_update(preempt_dynamic_mode);
mutex_unlock(&sched_dynamic_mutex);
}
#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
/*
 * Handler for the "preempt=" boot parameter.
 *
 * Returns 1 after switching to a recognised mode; returns 0 and logs a
 * warning for an unsupported value.
 */
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
if (mode < 0) {
pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
return 0;
}
sched_dynamic_update(mode);
return 1;
}
__setup("preempt=", setup_preempt_mode);
/*
 * Select the build-time default preemption model unless a mode has already
 * been chosen (preempt_dynamic_mode is no longer "undefined").
 */
static void __init preempt_dynamic_init(void)
{
if (preempt_dynamic_mode == preempt_dynamic_undefined) {
if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
sched_dynamic_update(preempt_dynamic_none);
} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
} else {
/* Full preemption: only the mode is recorded, no hooks are updated. */
WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
preempt_dynamic_mode = preempt_dynamic_full;
pr_info("Dynamic Preempt: full\n");
}
}
}
/*
 * PREEMPT_MODEL_ACCESSOR(mode) defines and exports preempt_model_<mode>(),
 * which returns true when the current dynamic preemption mode equals
 * preempt_dynamic_<mode>. Each accessor warns once if it is called while
 * the mode is still undefined.
 */
#define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
return preempt_dynamic_mode == preempt_dynamic_##mode; \
} \
EXPORT_SYMBOL_GPL(preempt_model_##mode)
PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
#else /* !CONFIG_PREEMPT_DYNAMIC */
/* Without CONFIG_PREEMPT_DYNAMIC there is nothing to initialise. */
static inline void preempt_dynamic_init(void) { }
#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
/*
 * yield - yield the current CPU to other tasks.
 *
 * The task state is set to TASK_RUNNING before do_sched_yield() is
 * called, so the caller stays runnable.
 */
void __sched yield(void)
{
set_current_state(TASK_RUNNING);
do_sched_yield();
}
EXPORT_SYMBOL(yield);
/*
 * yield_to - yield the current CPU in favour of task @p.
 *
 * @p:       target task
 * @preempt: when true and @p sits on another runqueue, make that
 *           runqueue's current task reschedule
 *
 * Returns the value of the class's yield_to_task() hook (schedule() is
 * called when it is positive), 0 if no yield was attempted (no hook,
 * different scheduling classes, @p already on a CPU or not in the running
 * state), or -ESRCH when both runqueues have exactly one runnable task.
 */
int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
int yielded = 0;
local_irq_save(flags);
rq = this_rq();
again:
p_rq = task_rq(p);
/* Nothing to gain if neither runqueue has another runnable task. */
if (rq->nr_running == 1 && p_rq->nr_running == 1) {
yielded = -ESRCH;
goto out_irq;
}
double_rq_lock(rq, p_rq);
/* @p moved to another runqueue before the locks were taken: retry. */
if (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
goto again;
}
if (!curr->sched_class->yield_to_task)
goto out_unlock;
if (curr->sched_class != p->sched_class)
goto out_unlock;
if (task_on_cpu(p_rq, p) || !task_is_running(p))
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
if (preempt && rq != p_rq)
resched_curr(p_rq);
}
out_unlock:
double_rq_unlock(rq, p_rq);
out_irq:
local_irq_restore(flags);
if (yielded > 0)
schedule();
return yielded;
}
EXPORT_SYMBOL_GPL(yield_to);
/*
 * io_schedule_prepare - mark the current task as waiting for I/O and flush
 * its block plug.
 *
 * Returns the previous in_iowait value, to be handed to
 * io_schedule_finish().
 */
int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;
current->in_iowait = 1;
blk_flush_plug(current->plug, true);
return old_iowait;
}
/*
 * io_schedule_finish - restore the in_iowait value saved by
 * io_schedule_prepare().
 *
 * @token: value returned by io_schedule_prepare()
 */
void io_schedule_finish(int token)
{
current->in_iowait = token;
}
/*
 * io_schedule_timeout - schedule_timeout() with the task marked as waiting
 * for I/O for the duration of the sleep.
 *
 * @timeout: passed to schedule_timeout()
 *
 * Returns the value returned by schedule_timeout().
 */
long __sched io_schedule_timeout(long timeout)
{
int token;
long ret;
token = io_schedule_prepare();
ret = schedule_timeout(timeout);
io_schedule_finish(token);
return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);
/*
 * io_schedule - schedule() with the task marked as waiting for I/O for the
 * duration of the call.
 */
void __sched io_schedule(void)
{
int token;
token = io_schedule_prepare();
schedule();
io_schedule_finish(token);
}
EXPORT_SYMBOL(io_schedule);
/*
 * sys_sched_get_priority_max - highest realtime priority of a policy.
 *
 * @policy: scheduling policy
 *
 * Returns MAX_RT_PRIO-1 for SCHED_FIFO and SCHED_RR, 0 for the other
 * known policies, and -EINVAL for an unknown policy.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return MAX_RT_PRIO-1;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		return 0;
	default:
		return -EINVAL;
	}
}
/*
 * sys_sched_get_priority_min - lowest realtime priority of a policy.
 *
 * @policy: scheduling policy
 *
 * Returns 1 for SCHED_FIFO and SCHED_RR, 0 for the other known policies,
 * and -EINVAL for an unknown policy.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return 1;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		return 0;
	default:
		return -EINVAL;
	}
}
/*
 * sched_rr_get_interval - look up a task's round-robin interval.
 * @pid: pid of the task to query
 * @t: receives the interval, converted from jiffies
 *
 * The interval comes from the task's sched_class->get_rr_interval() hook,
 * called under the task's runqueue lock; classes without the hook report 0.
 *
 * Return: 0 on success, -EINVAL for a negative @pid, -ESRCH when
 * find_process_by_pid() finds nothing, or the non-zero result of
 * security_task_getscheduler().
 */
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
	unsigned int slice = 0;
	struct task_struct *task;
	struct rq_flags rf;
	struct rq *rq;
	int err;

	if (pid < 0)
		return -EINVAL;

	rcu_read_lock();

	task = find_process_by_pid(pid);
	if (!task) {
		rcu_read_unlock();
		return -ESRCH;
	}

	err = security_task_getscheduler(task);
	if (err) {
		rcu_read_unlock();
		return err;
	}

	rq = task_rq_lock(task, &rf);
	if (task->sched_class->get_rr_interval)
		slice = task->sched_class->get_rr_interval(rq, task);
	task_rq_unlock(rq, task, &rf);

	rcu_read_unlock();

	jiffies_to_timespec64(slice, t);
	return 0;
}
/*
 * sys_sched_rr_get_interval - report a task's round-robin interval.
 * @pid: pid of the task to query
 * @interval: user-space destination for the interval
 *
 * Return: the error from sched_rr_get_interval() if it fails, otherwise
 * the result of put_timespec64().
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	struct timespec64 ts;
	int err;

	err = sched_rr_get_interval(pid, &ts);
	if (err != 0)
		return err;

	return put_timespec64(&ts, interval);
}
#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * sys_sched_rr_get_interval_time32 - variant of sched_rr_get_interval that
 * writes the result as a struct old_timespec32.
 * @pid: pid of the task to query
 * @interval: user-space destination for the interval
 *
 * Return: the error from sched_rr_get_interval() if it fails, otherwise
 * the result of put_old_timespec32().
 */
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 ts;
	int err;

	err = sched_rr_get_interval(pid, &ts);
	if (err != 0)
		return err;

	return put_old_timespec32(&ts, interval);
}
#endif
/*
 * sched_show_task - print a one-line summary and the stack of task @p.
 * @p: task to describe
 *
 * Prints comm, state character, unused stack (only computed with
 * CONFIG_DEBUG_STACK_USAGE, otherwise 0), pid, parent pid and thread flags,
 * followed by worker info, stopper info and a stack dump. Does nothing when
 * a reference on the task's stack cannot be taken.
 */
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
/* Pin the stack for the duration of the dump; released at the end. */
if (!try_get_task_stack(p))
return;
pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
if (task_is_running(p))
pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
ppid = 0;
/* real_parent is dereferenced under RCU and only while @p is pid_alive(). */
rcu_read_lock();
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
EXPORT_SYMBOL_GPL(sched_show_task);
/*
 * state_filter_match - decide whether task @p passes @state_filter.
 *
 * A zero filter matches every task. Otherwise the task's state must share
 * at least one bit with the filter, and a filter of exactly
 * TASK_UNINTERRUPTIBLE additionally rejects tasks that have TASK_NOLOAD set.
 */
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
{
	unsigned int task_state = READ_ONCE(p->__state);

	if (state_filter == 0)
		return true;

	if ((task_state & state_filter) == 0)
		return false;

	return !(state_filter == TASK_UNINTERRUPTIBLE &&
		 (task_state & TASK_NOLOAD));
}
/*
 * show_state_filter - dump every task that matches @state_filter.
 * @state_filter: task state mask; 0 selects all tasks
 *
 * With a zero filter the scheduler debug state (CONFIG_SCHED_DEBUG only)
 * and all held locks are dumped as well.
 */
void show_state_filter(unsigned int state_filter)
{
struct task_struct *g, *p;
rcu_read_lock();
for_each_process_thread(g, p) {
/* Poke the NMI and softlockup watchdogs on every iteration of the walk. */
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
if (state_filter_match(state_filter, p))
sched_show_task(p);
}
#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
#endif
rcu_read_unlock();
/* Done after leaving the RCU read-side section. */
if (!state_filter)
debug_show_all_locks();
}
/*
 * init_idle - set up @idle as the idle task of @cpu.
 * @idle: task to turn into the idle task
 * @cpu: CPU the task is bound to
 *
 * Marks the task TASK_RUNNING, flags it PF_KTHREAD | PF_NO_SETAFFINITY,
 * restricts its affinity to @cpu (SMP), installs it as both rq->idle and
 * rq->curr of that CPU's runqueue and switches it to idle_sched_class.
 */
void __init init_idle(struct task_struct *idle, int cpu)
{
#ifdef CONFIG_SMP
struct affinity_context ac = (struct affinity_context) {
.new_mask = cpumask_of(cpu),
.flags = 0,
};
#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
/* Lock order: the task's pi_lock first, then the runqueue lock. */
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_rq_lock(rq);
idle->__state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
#ifdef CONFIG_SMP
set_cpus_allowed_common(idle, &ac);
#endif
rcu_read_lock();
__set_task_cpu(idle, cpu);
rcu_read_unlock();
rq->idle = idle;
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* The preempt count is initialised only after both locks are released. */
init_idle_preempt_count(idle, cpu);
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
#ifdef CONFIG_SMP
/* Name the task "<INIT_TASK_COMM>/<cpu>". */
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
#ifdef CONFIG_SMP
/*
 * cpuset_cpumask_can_shrink - may a cpuset go from mask @cur to @trial?
 * @cur: current CPU mask
 * @trial: proposed CPU mask
 *
 * Return: 1 when @cur is empty, otherwise the verdict of
 * dl_cpuset_cpumask_can_shrink().
 */
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
			      const struct cpumask *trial)
{
	if (cpumask_empty(cur))
		return 1;

	return dl_cpuset_cpumask_can_shrink(cur, trial);
}
/*
 * task_can_attach - check whether task @p may be attached.
 * @p: task to check
 *
 * Return: -EINVAL when @p has PF_NO_SETAFFINITY set, 0 otherwise.
 */
int task_can_attach(struct task_struct *p)
{
	return (p->flags & PF_NO_SETAFFINITY) ? -EINVAL : 0;
}
/* Set to true as the last step of sched_init_smp(); hotplug callbacks test it. */
bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
/*
 * migrate_task_to - move task @p to @target_cpu via the CPU stopper.
 * @p: task to migrate
 * @target_cpu: destination CPU
 *
 * Return: 0 when @p is already on @target_cpu, -EINVAL when @target_cpu is
 * not in p->cpus_ptr, otherwise the result of stop_one_cpu().
 */
int migrate_task_to(struct task_struct *p, int target_cpu)
{
struct migration_arg arg = { p, target_cpu };
int curr_cpu = task_cpu(p);
if (curr_cpu == target_cpu)
return 0;
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
trace_sched_move_numa(p, curr_cpu, target_cpu);
/* Runs migration_cpu_stop() with &arg on the CPU the task was sampled on. */
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
/*
 * sched_setnuma - set the preferred NUMA node of task @p.
 * @p: task to update
 * @nid: new value for p->numa_preferred_nid
 *
 * The field is changed under the task's runqueue lock, with the task
 * dequeued / put while the change is made and restored afterwards.
 */
void sched_setnuma(struct task_struct *p, int nid)
{
bool queued, running;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
/* Take the task out of the scheduler's structures before the update ... */
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
p->numa_preferred_nid = nid;
/* ... and put it back in the same state afterwards. */
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
/*
 * idle_task_exit - switch the idle task of an offline CPU over to init_mm.
 *
 * Must run as the idle task of the current runqueue on a CPU that is no
 * longer online; both conditions are enforced with BUG_ON().
 */
void idle_task_exit(void)
{
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
BUG_ON(current != this_rq()->idle);
/* Nothing to switch when the borrowed mm already is init_mm. */
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
finish_arch_post_lock_switch();
}
}
/*
 * __balance_push_cpu_stop - stopper callback queued by balance_push().
 * @arg: the task to push away; its reference is dropped here
 *
 * If the task is still queued on this CPU's runqueue it is migrated to the
 * CPU chosen by select_fallback_rq().
 *
 * Return: always 0.
 */
static int __balance_push_cpu_stop(void *arg)
{
struct task_struct *p = arg;
struct rq *rq = this_rq();
struct rq_flags rf;
int cpu;
raw_spin_lock_irq(&p->pi_lock);
rq_lock(rq, &rf);
update_rq_clock(rq);
if (task_rq(p) == rq && task_on_rq_queued(p)) {
cpu = select_fallback_rq(rq->cpu, p);
/* __migrate_task() hands back the runqueue that must be unlocked below. */
rq = __migrate_task(rq, &rf, p, cpu);
}
rq_unlock(rq, &rf);
raw_spin_unlock_irq(&p->pi_lock);
/* Pairs with get_task_struct() in balance_push(). */
put_task_struct(p);
return 0;
}
/* Per-CPU stopper work item that balance_push() passes to stop_one_cpu_nowait(). */
static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
/*
 * balance_push - balance callback that pushes the current task off a dying CPU.
 * @rq: runqueue, must be locked by the caller
 *
 * Re-installs itself as rq->balance_callback on every invocation. It only
 * acts when @rq belongs to the local CPU and that CPU is dying. Per-CPU
 * kthreads and migration-disabled tasks are left alone; any other current
 * task is handed to the stopper (__balance_push_cpu_stop()).
 */
static void balance_push(struct rq *rq)
{
struct task_struct *push_task = rq->curr;
lockdep_assert_rq_held(rq);
/* Keep the callback installed until balance_push_set() clears it. */
rq->balance_callback = &balance_push_callback;
if (!cpu_dying(rq->cpu) || rq != this_rq())
return;
if (kthread_is_per_cpu(push_task) ||
is_migration_disabled(push_task)) {
/*
 * Empty runqueue without pinned tasks and someone waiting on
 * hotplug_wait: wake the waiter, dropping the rq lock around the wakeup.
 */
if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
rcuwait_active(&rq->hotplug_wait)) {
raw_spin_rq_unlock(rq);
rcuwait_wake_up(&rq->hotplug_wait);
raw_spin_rq_lock(rq);
}
return;
}
/* Reference is released by __balance_push_cpu_stop(). */
get_task_struct(push_task);
/* The rq lock is dropped while the stopper work is queued, then retaken. */
raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
raw_spin_rq_lock(rq);
}
/*
 * balance_push_set - install or remove the balance_push callback on a CPU.
 * @cpu: CPU whose runqueue is updated
 * @on: true installs balance_push_callback (warning once if some callback
 *      is already set); false clears it, but only if it is the one installed
 */
static void balance_push_set(int cpu, bool on)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (on) {
WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
} else if (rq->balance_callback == &balance_push_callback) {
rq->balance_callback = NULL;
}
rq_unlock_irqrestore(rq, &rf);
}
/*
 * balance_hotplug_wait - sleep (TASK_UNINTERRUPTIBLE) on the local
 * runqueue's hotplug_wait until exactly one task is left running on it and
 * it has no pinned tasks.
 */
static void balance_hotplug_wait(void)
{
struct rq *rq = this_rq();
rcuwait_wait_event(&rq->hotplug_wait,
rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
TASK_UNINTERRUPTIBLE);
}
#else
/* !CONFIG_HOTPLUG_CPU: no CPU can go away, so there is nothing to push. */
static inline void balance_push(struct rq *rq)
{
}
/* !CONFIG_HOTPLUG_CPU: empty stub so callers need no #ifdef. */
static inline void balance_push_set(int cpu, bool on)
{
}
/* !CONFIG_HOTPLUG_CPU: empty stub, nothing to wait for. */
static inline void balance_hotplug_wait(void)
{
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * set_rq_online - mark runqueue @rq online.
 * @rq: runqueue to bring online
 *
 * No-op when the runqueue is already online. Otherwise the CPU is set in
 * the root domain's online mask, rq->online becomes 1 and every scheduling
 * class that provides an rq_online() hook is notified.
 */
void set_rq_online(struct rq *rq)
{
	const struct sched_class *sc;

	if (rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->online);
	rq->online = 1;

	for_each_class(sc) {
		if (!sc->rq_online)
			continue;
		sc->rq_online(rq);
	}
}
/*
 * set_rq_offline - mark runqueue @rq offline.
 * @rq: runqueue to take offline
 *
 * No-op when the runqueue is already offline. Otherwise the rq clock is
 * updated, every scheduling class that provides an rq_offline() hook is
 * notified, and only then is the CPU cleared from the root domain's online
 * mask and rq->online reset to 0.
 */
void set_rq_offline(struct rq *rq)
{
	const struct sched_class *sc;

	if (!rq->online)
		return;

	update_rq_clock(rq);

	for_each_class(sc) {
		if (!sc->rq_offline)
			continue;
		sc->rq_offline(rq);
	}

	cpumask_clear_cpu(rq->cpu, rq->rd->online);
	rq->online = 0;
}
/*
 * Incremented by cpuset_cpu_inactive() and decremented by
 * cpuset_cpu_active() while cpuhp_tasks_frozen is set.
 */
static int num_cpus_frozen;
/*
 * cpuset_cpu_active - update sched domains / cpusets after a CPU came up.
 *
 * While tasks are frozen (cpuhp_tasks_frozen) a single sched domain is
 * built for each CPU coming back; only when the counter of frozen CPUs
 * drops to zero is a cpuset rebuild forced and the cpuset update performed.
 */
static void cpuset_cpu_active(void)
{
if (cpuhp_tasks_frozen) {
partition_sched_domains(1, NULL, NULL);
/* More frozen CPUs still to come back: stop here. */
if (--num_cpus_frozen)
return;
cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
/*
 * cpuset_cpu_inactive - update sched domains / cpusets before a CPU goes down.
 * @cpu: CPU being deactivated
 *
 * While tasks are frozen the CPU is only counted in num_cpus_frozen and a
 * single sched domain is built. Otherwise the deadline bandwidth check must
 * pass before the cpusets are updated.
 *
 * Return: 0 on success, or the non-zero result of dl_bw_check_overflow().
 */
static int cpuset_cpu_inactive(unsigned int cpu)
{
	int err;

	if (cpuhp_tasks_frozen) {
		num_cpus_frozen++;
		partition_sched_domains(1, NULL, NULL);
		return 0;
	}

	err = dl_bw_check_overflow(cpu);
	if (err)
		return err;

	cpuset_update_active_cpus();
	return 0;
}
/*
 * sched_cpu_activate - hotplug callback making @cpu available to the scheduler.
 * @cpu: CPU being activated
 *
 * Return: always 0.
 */
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
/* Stop pushing tasks away from this CPU. */
balance_push_set(cpu, false);
#ifdef CONFIG_SCHED_SMT
/* Bump sched_smt_present when the SMT mask of @cpu holds exactly two CPUs. */
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_inc_cpuslocked(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
/* Put the runqueue online if it is attached to a root domain. */
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
rq_unlock_irqrestore(rq, &rf);
return 0;
}
/*
 * sched_cpu_deactivate - hotplug callback removing @cpu from the scheduler.
 * @cpu: CPU being deactivated
 *
 * Clears the active bit, arms the balance_push callback, waits for an RCU
 * grace period, takes the runqueue offline, drops the SMT-present count and
 * finally updates NUMA / cpuset state.
 *
 * If cpuset_cpu_inactive() refuses the operation, every step taken so far
 * is rolled back: the SMT-present count is re-incremented and the runqueue
 * is put back online in addition to restoring balance_push, the active bit
 * and the NUMA state. Without that, a failed deactivation would leave the
 * sched_smt_present key decremented once too often and the runqueue marked
 * offline on a CPU that stays active.
 *
 * Return: 0 on success, or the error returned by cpuset_cpu_inactive().
 */
int sched_cpu_deactivate(unsigned int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;
	int ret;

	nohz_balance_exit_idle(rq);

	set_cpu_active(cpu, false);

	/* From here on tasks are pushed away from this CPU. */
	balance_push_set(cpu, true);

	/* Let existing RCU readers finish so new ones observe the state above. */
	synchronize_rcu();

	rq_lock_irqsave(rq, &rf);
	if (rq->rd) {
		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
		set_rq_offline(rq);
	}
	rq_unlock_irqrestore(rq, &rf);

#ifdef CONFIG_SCHED_SMT
	/* Mirror of the increment in sched_cpu_activate(). */
	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
		static_branch_dec_cpuslocked(&sched_smt_present);

	sched_core_cpu_deactivate(cpu);
#endif

	if (!sched_smp_initialized)
		return 0;

	sched_update_numa(cpu, false);
	ret = cpuset_cpu_inactive(cpu);
	if (ret) {
		/* Undo the SMT-present decrement done above. */
#ifdef CONFIG_SCHED_SMT
		if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
			static_branch_inc_cpuslocked(&sched_smt_present);
#endif
		/* Undo set_rq_offline(): the CPU stays active. */
		rq_lock_irqsave(rq, &rf);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_online(rq);
		}
		rq_unlock_irqrestore(rq, &rf);

		balance_push_set(cpu, false);
		set_cpu_active(cpu, true);
		sched_update_numa(cpu, true);
		return ret;
	}
	sched_domains_numa_masks_clear(cpu);
	return 0;
}
static void sched_rq_cpu_starting(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
update_max_interval();
}
/*
 * sched_cpu_starting - hotplug callback for a CPU that is starting up.
 * @cpu: the starting CPU
 *
 * Runs the core-scheduling, runqueue and scheduler-tick start hooks in turn.
 *
 * Return: always 0.
 */
int sched_cpu_starting(unsigned int cpu)
{
sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * sched_cpu_wait_empty - hotplug callback that blocks in
 * balance_hotplug_wait() until the local runqueue has drained.
 * @cpu: unused; the wait always applies to the runqueue of the calling CPU
 *
 * Return: always 0.
 */
int sched_cpu_wait_empty(unsigned int cpu)
{
balance_hotplug_wait();
return 0;
}
/*
 * calc_load_migrate - fold the active-task delta of @rq, as computed by
 * calc_load_fold_active(rq, 1), into the global calc_load_tasks counter.
 * A zero delta leaves the counter untouched.
 */
static void calc_load_migrate(struct rq *rq)
{
	long folded = calc_load_fold_active(rq, 1);

	if (!folded)
		return;

	atomic_long_add(folded, &calc_load_tasks);
}
/*
 * dump_rq_tasks - print every task that is queued on @rq's CPU.
 * @rq: runqueue to report on; its lock must be held
 * @loglvl: printk level prefix used for each line
 */
static void dump_rq_tasks(struct rq *rq, const char *loglvl)
{
	struct task_struct *leader, *t;
	int cpu = cpu_of(rq);

	lockdep_assert_rq_held(rq);

	printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
	for_each_process_thread(leader, t) {
		if (task_cpu(t) == cpu && task_on_rq_queued(t))
			printk("%s\tpid: %d, name: %s\n", loglvl, t->pid, t->comm);
	}
}
/*
 * sched_cpu_dying - hotplug callback for a CPU on its way down.
 * @cpu: the dying CPU
 *
 * Stops the scheduler tick, warns (and dumps the queued tasks) if the
 * runqueue still holds anything besides a single task or has pinned tasks,
 * then folds the load accounting and clears the hrtick.
 *
 * Return: always 0.
 */
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
WARN(true, "Dying CPU not properly vacated!");
dump_rq_tasks(rq, KERN_WARNING);
}
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
sched_core_cpu_dying(cpu);
return 0;
}
#endif
/*
 * sched_init_smp - SMP part of scheduler initialisation.
 *
 * Builds the NUMA topology data and the sched domains for the active CPUs,
 * restricts the calling task to the HK_TYPE_DOMAIN housekeeping CPUs and
 * initialises the RT and deadline classes. sched_smp_initialized is set
 * last.
 */
void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
mutex_unlock(&sched_domains_mutex);
/* Failing to move the current task onto a housekeeping CPU is fatal. */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
init_sched_rt_class();
init_sched_dl_class();
sched_smp_initialized = true;
}
/*
 * migration_init - early initcall that runs sched_cpu_starting() for the
 * CPU executing it.
 *
 * Return: always 0.
 */
static int __init migration_init(void)
{
sched_cpu_starting(smp_processor_id());
return 0;
}
early_initcall(migration_init);
#else
/* !CONFIG_SMP variant: only the granularity setup is needed. */
void __init sched_init_smp(void)
{
sched_init_granularity();
}
#endif /* CONFIG_SMP */
/*
 * in_sched_functions - does @addr point into scheduler or locking code?
 * @addr: address to classify
 *
 * Return: 1 when in_lock_functions() accepts @addr or when it lies in
 * [__sched_text_start, __sched_text_end), 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
	if (in_lock_functions(addr))
		return 1;

	return addr >= (unsigned long)__sched_text_start &&
	       addr < (unsigned long)__sched_text_end;
}
#ifdef CONFIG_CGROUP_SCHED
/* The root task group; returned by cpu_cgroup_css_alloc() for the top cgroup. */
struct task_group root_task_group;
/* List of all task groups; entries are added in sched_init() and sched_online_group(). */
LIST_HEAD(task_groups);
/* Slab cache backing struct task_group, created in sched_init(). */
static struct kmem_cache *task_group_cache __read_mostly;
#endif
/*
 * sched_init - boot-time initialisation of the scheduler.
 *
 * Checks the layout of the scheduling classes, allocates the per-CPU
 * pointer arrays of the root task group, initialises every possible CPU's
 * runqueue and finally turns the calling task into the idle task of the
 * boot CPU before setting scheduler_running.
 */
void __init sched_init(void)
{
/* First a byte count for the pointer arrays, then a cursor into the allocation. */
unsigned long ptr = 0;
int i;
/* The class descriptors must be laid out back to back in this exact order. */
BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
&fair_sched_class != &rt_sched_class + 1 ||
&rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();
/* Two arrays of nr_cpu_ids pointers per enabled group-scheduling flavour. */
#ifdef CONFIG_FAIR_GROUP_SCHED
ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
if (ptr) {
/* NOTE(review): the kzalloc() result is used without a NULL check. */
ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
}
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
task_group_cache = KMEM_CACHE(task_group, 0);
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
/* Per-runqueue setup for every possible CPU. */
for_each_possible_cpu(i) {
struct rq *rq;
rq = cpu_rq(i);
raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
/* Every runqueue starts with balance_push armed; the boot CPU clears it below. */
rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
rq->cpu = i;
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->wake_stamp = jiffies;
rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
#endif
#ifdef CONFIG_HOTPLUG_CPU
rcuwait_init(&rq->hotplug_wait);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle_count = 0;
rq->core_forceidle_occupation = 0;
rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif
zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
/* The calling task takes a lazy-TLB reference on init_mm. */
mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
WARN_ON(!set_kthread_struct(current));
/* The calling task becomes the idle task of the CPU it runs on. */
init_idle(current, smp_processor_id());
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
psi_init();
init_uclamp();
preempt_dynamic_init();
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
 * __might_sleep - debug check for code paths that may block.
 * @file: source file of the annotation, used in the report
 * @line: source line of the annotation, used in the report
 *
 * Warns once when the current task state is not TASK_RUNNING and a state
 * change location has been recorded, then runs the __might_resched() check
 * with an expected offset of 0.
 */
void __might_sleep(const char *file, int line)
{
unsigned int state = get_current_state();
WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
"state=%x set at [<%p>] %pS\n", state,
(void *)current->task_state_change,
(void *)current->task_state_change);
__might_resched(file, line, 0);
}
EXPORT_SYMBOL(__might_sleep);
/*
 * print_preempt_disable_ip - report where preemption was disabled.
 * @preempt_offset: preempt count the caller considers normal
 * @ip: address to print
 *
 * Prints nothing unless CONFIG_DEBUG_PREEMPT is enabled and the current
 * preempt count differs from @preempt_offset.
 */
static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
{
	if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT) ||
	    preempt_count() == preempt_offset)
		return;

	pr_err("Preemption disabled at:");
	print_ip_sym(KERN_ERR, ip);
}
static inline bool resched_offsets_ok(unsigned int offsets)
{
unsigned int nested = preempt_count();
nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
return nested == offsets;
}
/*
 * __might_resched - complain when a rescheduling point is hit in a context
 * that must not sleep.
 * @file: source file of the annotation, used in the report
 * @line: source line of the annotation, used in the report
 * @offsets: expected preempt count / RCU depth, see resched_offsets_ok()
 *
 * The report is printed at most once per HZ jiffies and taints the kernel
 * with TAINT_WARN.
 */
void __might_resched(const char *file, int line, unsigned int offsets)
{
/* Time stamp of the last report, for rate limiting. */
static unsigned long prev_jiffy;
unsigned long preempt_disable_ip;
rcu_sleep_check();
/*
 * Nothing to report when the context is fine, or while booting, after
 * SYSTEM_RUNNING, or during an oops.
 */
if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
!is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
/* Sampled before any of the printing below. */
preempt_disable_ip = get_preempt_disable_ip(current);
pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
in_atomic(), irqs_disabled(), current->non_block_count,
current->pid, current->comm);
pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
offsets & MIGHT_RESCHED_PREEMPT_MASK);
if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
pr_err("RCU nest depth: %d, expected: %u\n",
rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
}
if (task_stack_end_corrupted(current))
pr_emerg("Thread overran stack, or stack corrupted\n");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
preempt_disable_ip);
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(__might_resched);
/*
 * __cant_sleep - complain when code that assumes atomic context is not in one.
 * @file: source file of the annotation, used in the report
 * @line: source line of the annotation, used in the report
 * @preempt_offset: preempt count that still counts as "not atomic"
 *
 * Silent when interrupts are disabled, when CONFIG_PREEMPT_COUNT is off or
 * when the preempt count exceeds @preempt_offset. Reports are limited to
 * one per HZ jiffies and taint the kernel with TAINT_WARN.
 */
void __cant_sleep(const char *file, int line, int preempt_offset)
{
/* Time stamp of the last report, for rate limiting. */
static unsigned long prev_jiffy;
if (irqs_disabled())
return;
if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
return;
if (preempt_count() > preempt_offset)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
in_atomic(), irqs_disabled(),
current->pid, current->comm);
debug_show_held_locks(current);
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_sleep);
#ifdef CONFIG_SMP
/*
 * __cant_migrate - complain when code that assumes it cannot be migrated
 * runs in a migratable context.
 * @file: source file of the annotation, used in the report
 * @line: source line of the annotation, used in the report
 *
 * Silent when interrupts are disabled, migration is disabled for the
 * current task, CONFIG_PREEMPT_COUNT is off or the preempt count is
 * positive. Reports are limited to one per HZ jiffies and taint the kernel
 * with TAINT_WARN.
 */
void __cant_migrate(const char *file, int line)
{
/* Time stamp of the last report, for rate limiting. */
static unsigned long prev_jiffy;
if (irqs_disabled())
return;
if (is_migration_disabled(current))
return;
if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
return;
if (preempt_count() > 0)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
in_atomic(), irqs_disabled(), is_migration_disabled(current),
current->pid, current->comm);
debug_show_held_locks(current);
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_migrate);
#endif
#endif
#ifdef CONFIG_MAGIC_SYSRQ
/*
 * normalize_rt_tasks - demote all non-kernel-thread tasks to SCHED_NORMAL.
 *
 * Walks every thread under tasklist_lock. Kernel threads are skipped. For
 * the rest, exec_start and the wait/sleep/block schedstat stamps are reset;
 * tasks that are neither deadline nor RT only have a negative nice value
 * raised to 0, all others are switched to SCHED_NORMAL.
 */
void normalize_rt_tasks(void)
{
struct task_struct *g, *p;
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
};
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
if (p->flags & PF_KTHREAD)
continue;
p->se.exec_start = 0;
schedstat_set(p->stats.wait_start, 0);
schedstat_set(p->stats.sleep_start, 0);
schedstat_set(p->stats.block_start, 0);
if (!dl_task(p) && !rt_task(p)) {
/* Already a normal task: only undo a negative nice level. */
if (task_nice(p) < 0)
set_user_nice(p, 0);
continue;
}
__sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
#endif /* CONFIG_MAGIC_SYSRQ */
#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
/*
 * curr_task - return the task recorded as current on @cpu.
 * @cpu: CPU to query
 *
 * Plain read through cpu_curr(); no locking is done here.
 */
struct task_struct *curr_task(int cpu)
{
return cpu_curr(cpu);
}
#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
#ifdef CONFIG_IA64
/*
 * ia64_set_curr_task - overwrite the task recorded as current on @cpu.
 * @cpu: CPU to update
 * @p: task to install
 *
 * Plain store through cpu_curr(); no locking is done here.
 */
void ia64_set_curr_task(int cpu, struct task_struct *p)
{
cpu_curr(cpu) = p;
}
#endif
#ifdef CONFIG_CGROUP_SCHED
/*
 * Held by sched_online_group() and sched_release_group() while they modify
 * the task_groups list and the parent/children/siblings links.
 */
static DEFINE_SPINLOCK(task_group_lock);
/*
 * alloc_uclamp_sched_group - initialise the utilisation clamps of a new group.
 * @tg: group being created
 * @parent: its parent group
 *
 * For each clamp id the requested value is set to uclamp_none() and the
 * effective value is copied from @parent. Empty without
 * CONFIG_UCLAMP_TASK_GROUP.
 */
static inline void alloc_uclamp_sched_group(struct task_group *tg,
struct task_group *parent)
{
#ifdef CONFIG_UCLAMP_TASK_GROUP
enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id) {
uclamp_se_set(&tg->uclamp_req[clamp_id],
uclamp_none(clamp_id), false);
tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
}
#endif
}
/*
 * sched_free_group - release everything owned by @tg and the group itself:
 * the fair and RT per-group data, the autogroup link and finally the
 * task_group object, which goes back to task_group_cache.
 */
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
kmem_cache_free(task_group_cache, tg);
}
/*
 * sched_free_group_rcu - RCU callback wrapper around sched_free_group().
 * @rcu: the rcu member embedded in the task_group to free
 */
static void sched_free_group_rcu(struct rcu_head *rcu)
{
	struct task_group *tg = container_of(rcu, struct task_group, rcu);

	sched_free_group(tg);
}
/*
 * sched_unregister_group - unregister @tg from the fair and RT classes and
 * schedule it to be freed by sched_free_group_rcu() via call_rcu().
 */
static void sched_unregister_group(struct task_group *tg)
{
unregister_fair_sched_group(tg);
unregister_rt_sched_group(tg);
call_rcu(&tg->rcu, sched_free_group_rcu);
}
/*
 * sched_create_group - allocate a task group below @parent.
 * @parent: parent task group
 *
 * Allocates a zeroed task_group plus its fair and RT per-group data and
 * initialises its utilisation clamps from @parent.
 *
 * Return: the new group, or ERR_PTR(-ENOMEM) if any allocation fails (a
 * partially set up group is released with sched_free_group()).
 */
struct task_group *sched_create_group(struct task_group *parent)
{
	struct task_group *group;

	group = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
	if (!group)
		return ERR_PTR(-ENOMEM);

	if (!alloc_fair_sched_group(group, parent) ||
	    !alloc_rt_sched_group(group, parent)) {
		sched_free_group(group);
		return ERR_PTR(-ENOMEM);
	}

	alloc_uclamp_sched_group(group, parent);

	return group;
}
/*
 * sched_online_group - link a freshly created group into the hierarchy.
 * @tg: group to bring online
 * @parent: its parent; a NULL parent triggers a WARN_ON()
 *
 * Under task_group_lock the group is added to task_groups and to the
 * parent's children list; afterwards the fair class is told the group is
 * online.
 */
void sched_online_group(struct task_group *tg, struct task_group *parent)
{
unsigned long flags;
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
WARN_ON(!parent);
tg->parent = parent;
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
online_fair_sched_group(tg);
}
/*
 * sched_unregister_group_rcu - RCU callback wrapper around
 * sched_unregister_group().
 * @rhp: the rcu member embedded in the task_group to unregister
 */
static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
	struct task_group *tg = container_of(rhp, struct task_group, rcu);

	sched_unregister_group(tg);
}
/*
 * sched_destroy_group - queue @tg for unregistration via call_rcu(); the
 * work itself is done by sched_unregister_group_rcu().
 */
void sched_destroy_group(struct task_group *tg)
{
call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
/*
 * sched_release_group - unlink @tg from the task_groups list and from its
 * parent's children list, under task_group_lock.
 */
void sched_release_group(struct task_group *tg)
{
unsigned long flags;
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
}
/*
 * sched_get_task_group - find the task group @tsk should belong to.
 * @tsk: task to look up
 *
 * Starts from the group behind the task's cpu cgroup css (fetched with the
 * check condition forced to true) and lets autogroup_task_group() make the
 * final pick.
 */
static struct task_group *sched_get_task_group(struct task_struct *tsk)
{
	struct task_group *cgrp_tg;

	cgrp_tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
			       struct task_group, css);

	return autogroup_task_group(tsk, cgrp_tg);
}
/*
 * sched_change_group - switch @tsk over to task group @group.
 * @tsk: task being moved
 * @group: its new task group
 *
 * With CONFIG_FAIR_GROUP_SCHED the class' task_change_group() hook is used
 * when present; in every other case set_task_rq() is called for the task's
 * current CPU.
 */
static void sched_change_group(struct task_struct *tsk, struct task_group *group)
{
tsk->sched_task_group = group;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
tsk->sched_class->task_change_group(tsk);
else
#endif
/* Body of the "else" above, or unconditional without fair group scheduling. */
set_task_rq(tsk, task_cpu(tsk));
}
/*
 * sched_move_task - move @tsk to the task group it now belongs to.
 * @tsk: task whose group membership may have changed
 *
 * Does nothing when the task already is in the right group. Otherwise the
 * task is dequeued / put, switched to the new group and restored, all under
 * its runqueue lock. A task that was running gets a reschedule request.
 */
void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct task_group *group;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
group = sched_get_task_group(tsk);
if (group == tsk->sched_task_group)
goto unlock;
/* Updated once here; queue_flags carries DEQUEUE_NOCLOCK for the calls below. */
update_rq_clock(rq);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, queue_flags);
if (running)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running) {
set_next_task(rq, tsk);
resched_curr(rq);
}
unlock:
task_rq_unlock(rq, tsk, &rf);
}
/*
 * css_tg - map a cgroup subsystem state to its enclosing task_group.
 * @css: cgroup subsystem state, may be NULL
 *
 * Return: the task_group that embeds @css, or NULL when @css is NULL.
 */
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
	if (!css)
		return NULL;

	return container_of(css, struct task_group, css);
}
static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct task_group *parent = css_tg(parent_css);
struct task_group *tg;
if (!parent) {
return &root_task_group.css;
}
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
return &tg->css;
}
/*
 * cpu_cgroup_css_online - bring the task group behind @css online.
 * @css: cgroup subsystem state being onlined
 *
 * Groups with a parent are linked into the hierarchy. With
 * CONFIG_UCLAMP_TASK_GROUP the effective clamp values are then recomputed
 * for @css and its descendants.
 *
 * Return: always 0.
 */
static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent);
if (parent)
sched_online_group(tg, parent);
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* cpu_util_update_eff() expects uclamp_mutex and the RCU read lock held. */
mutex_lock(&uclamp_mutex);
rcu_read_lock();
cpu_util_update_eff(css);
rcu_read_unlock();
mutex_unlock(&uclamp_mutex);
#endif
return 0;
}
/*
 * cpu_cgroup_css_released - unlink the task group behind @css from the
 * group hierarchy (see sched_release_group()).
 */
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
	sched_release_group(css_tg(css));
}
/*
 * cpu_cgroup_css_free - unregister the task group behind @css; the memory
 * is freed later from an RCU callback (see sched_unregister_group()).
 */
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
	sched_unregister_group(css_tg(css));
}
#ifdef CONFIG_RT_GROUP_SCHED
/*
 * cpu_cgroup_can_attach - veto an attach if any task cannot join its
 * destination group.
 * @tset: set of tasks being migrated
 *
 * Return: -EINVAL as soon as sched_rt_can_attach() rejects one task of the
 * set, 0 when all are accepted.
 */
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset) {
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
}
return 0;
}
#endif
/*
 * cpu_cgroup_attach - call sched_move_task() for every task in @tset.
 * @tset: set of tasks that were migrated
 */
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
/* Only needed as the iterator's second cursor; not otherwise used here. */
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
/*
 * cpu_util_update_eff - recompute effective utilisation clamps for @css and
 * its descendants.
 * @css: root of the subtree to update
 *
 * Must be called with uclamp_mutex held and inside an RCU read-side
 * section. For every group visited in pre-order the effective clamp is the
 * requested value capped by the parent's effective value, with the minimum
 * additionally capped by the maximum. Subtrees below a group whose
 * effective values did not change are skipped.
 */
static void cpu_util_update_eff(struct cgroup_subsys_state *css)
{
struct cgroup_subsys_state *top_css = css;
struct uclamp_se *uc_parent = NULL;
struct uclamp_se *uc_se = NULL;
unsigned int eff[UCLAMP_CNT];
enum uclamp_id clamp_id;
unsigned int clamps;
lockdep_assert_held(&uclamp_mutex);
SCHED_WARN_ON(!rcu_read_lock_held());
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
? css_tg(css)->parent->uclamp : NULL;
for_each_clamp_id(clamp_id) {
/* Start from the requested value, then cap it by the parent's. */
eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
if (uc_parent &&
eff[clamp_id] > uc_parent[clamp_id].value) {
eff[clamp_id] = uc_parent[clamp_id].value;
}
}
/* The minimum clamp never exceeds the maximum clamp. */
eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
/* Bitmask of the clamp ids whose effective value changed. */
clamps = 0x0;
uc_se = css_tg(css)->uclamp;
for_each_clamp_id(clamp_id) {
if (eff[clamp_id] == uc_se[clamp_id].value)
continue;
uc_se[clamp_id].value = eff[clamp_id];
uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
clamps |= (0x1 << clamp_id);
}
if (!clamps) {
/* Nothing changed here: jump past this group's whole subtree. */
css = css_rightmost_descendant(css);
continue;
}
uclamp_update_active_tasks(css);
}
}
/*
 * POW10(exp) evaluates to 10^exp as an unsigned int: _POW10() pastes its
 * argument onto "1e" to form a floating constant and casts it. The extra
 * macro level lets a macro argument (e.g. UCLAMP_PERCENT_SHIFT) be expanded
 * before the paste.
 */
#define _POW10(exp) ((unsigned int)1e##exp)
#define POW10(exp) _POW10(exp)
/*
 * struct uclamp_request - result of parsing a utilisation clamp written by
 * user space (see capacity_from_percent()).
 * @percent: the value as a fixed-point percentage with UCLAMP_PERCENT_SHIFT
 *           decimal digits, i.e. 100% == UCLAMP_PERCENT_SCALE
 * @util: the same value scaled to the 0..SCHED_CAPACITY_SCALE range
 * @ret: 0 on success or a negative error code
 */
struct uclamp_request {
#define UCLAMP_PERCENT_SHIFT 2
#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
s64 percent;
u64 util;
int ret;
};
/*
 * capacity_from_percent - parse a clamp value written by user space.
 * @buf: input string; surrounding whitespace is stripped in place
 *
 * "max" yields the defaults (UCLAMP_PERCENT_SCALE / SCHED_CAPACITY_SCALE).
 * Anything else is parsed as a decimal percentage with UCLAMP_PERCENT_SHIFT
 * fractional digits and converted to a capacity value, rounded to nearest.
 *
 * Return: the filled-in request; .ret carries the parser's error, or
 * -ERANGE when the percentage exceeds UCLAMP_PERCENT_SCALE.
 */
static inline struct uclamp_request
capacity_from_percent(char *buf)
{
struct uclamp_request req = {
.percent = UCLAMP_PERCENT_SCALE,
.util = SCHED_CAPACITY_SCALE,
.ret = 0,
};
buf = strim(buf);
if (strcmp(buf, "max")) {
req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
&req.percent);
if (req.ret)
return req;
/* The unsigned compare also rejects negative percentages. */
if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
req.ret = -ERANGE;
return req;
}
/* util = percent * 2^SCHED_CAPACITY_SHIFT / UCLAMP_PERCENT_SCALE */
req.util = req.percent << SCHED_CAPACITY_SHIFT;
req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
}
return req;
}
/*
 * cpu_uclamp_write - common write handler for the uclamp min/max files.
 * @of: open kernfs file, identifies the cgroup
 * @buf: user input, parsed by capacity_from_percent()
 * @nbytes: length of the input
 * @off: file offset (unused)
 * @clamp_id: which clamp (UCLAMP_MIN or UCLAMP_MAX) is being written
 *
 * Return: @nbytes on success or the parse error.
 */
static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off,
enum uclamp_id clamp_id)
{
struct uclamp_request req;
struct task_group *tg;
req = capacity_from_percent(buf);
if (req.ret)
return req.ret;
static_branch_enable(&sched_uclamp_used);
mutex_lock(&uclamp_mutex);
rcu_read_lock();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
/* Keep the percentage as written; cpu_uclamp_print() shows this value. */
tg->uclamp_pct[clamp_id] = req.percent;
cpu_util_update_eff(of_css(of));
rcu_read_unlock();
mutex_unlock(&uclamp_mutex);
return nbytes;
}
/* Write handler for the minimum clamp: cpu_uclamp_write() with UCLAMP_MIN. */
static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
}
/* Write handler for the maximum clamp: cpu_uclamp_write() with UCLAMP_MAX. */
static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
}
/*
 * cpu_uclamp_print - common show handler for the uclamp min/max files.
 * @sf: seq_file to print into, identifies the cgroup
 * @clamp_id: which clamp (UCLAMP_MIN or UCLAMP_MAX) to show
 *
 * Prints "max" when the requested clamp equals SCHED_CAPACITY_SCALE,
 * otherwise the stored percentage with UCLAMP_PERCENT_SHIFT decimals.
 */
static inline void cpu_uclamp_print(struct seq_file *sf,
enum uclamp_id clamp_id)
{
struct task_group *tg;
u64 util_clamp;
u64 percent;
u32 rem;
rcu_read_lock();
tg = css_tg(seq_css(sf));
util_clamp = tg->uclamp_req[clamp_id].value;
rcu_read_unlock();
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
return;
}
/* Split the fixed-point percentage into integer part and remainder. */
percent = tg->uclamp_pct[clamp_id];
percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
}
/* Show handler for the minimum clamp; always returns 0. */
static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
{
cpu_uclamp_print(sf, UCLAMP_MIN);
return 0;
}
/* Show handler for the maximum clamp; always returns 0. */
static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
{
cpu_uclamp_print(sf, UCLAMP_MAX);
return 0;
}
#endif /* CONFIG_UCLAMP_TASK_GROUP */
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * cpu_shares_write_u64 - set the CPU shares of the group behind @css.
 * @css: cgroup subsystem state
 * @cftype: control file descriptor (unused)
 * @shareval: requested shares
 *
 * Values above scale_load_down(ULONG_MAX) are replaced by MAX_SHARES before
 * being scaled and handed to sched_group_set_shares().
 *
 * Return: the result of sched_group_set_shares().
 */
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
if (shareval > scale_load_down(ULONG_MAX))
shareval = MAX_SHARES;
return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
/*
 * cpu_shares_read_u64 - report the CPU shares of the group behind @css.
 * @css: cgroup subsystem state
 * @cft: control file descriptor (unused)
 *
 * Return: the group's shares passed through scale_load_down().
 */
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return (u64) scale_load_down(css_tg(css)->shares);
}
#ifdef CONFIG_CFS_BANDWIDTH
/* Held by tg_set_cfs_bandwidth() around the schedulability check and the update. */
static DEFINE_MUTEX(cfs_constraints_mutex);
/* Upper bound for a CFS bandwidth period: 1 s, in nanoseconds. */
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;
/* Lower bound for both period and quota: 1 ms, in nanoseconds. */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC;
/* Upper bound for a finite quota (and for burst + quota), in nanoseconds. */
static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota);
/*
 * tg_set_cfs_bandwidth - apply new CFS bandwidth parameters to a task group.
 * @tg: group to update; the root group is rejected
 * @period: enforcement period in ns
 * @quota: runtime allowed per period in ns, or RUNTIME_INF for no limit
 * @burst: extra runtime that may be accumulated, in ns
 *
 * Return: 0 on success, -EINVAL when a parameter is out of range, or the
 * error from __cfs_schedulable().
 */
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
u64 burst)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
if (tg == &root_task_group)
return -EINVAL;
/* Neither quota nor period may be below min_cfs_quota_period ... */
if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
return -EINVAL;
/* ... and the period may not exceed max_cfs_quota_period. */
if (period > max_cfs_quota_period)
return -EINVAL;
if (quota != RUNTIME_INF && quota > max_cfs_runtime)
return -EINVAL;
/* With a finite quota, burst is bounded by the quota and by the runtime cap. */
if (quota != RUNTIME_INF && (burst > quota ||
burst + quota > max_cfs_runtime))
return -EINVAL;
cpus_read_lock();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
goto out_unlock;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
/* off -> on: bump the usage count before touching the settings. */
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
cfs_b->burst = burst;
__refill_cfs_bandwidth_runtime(cfs_b);
if (runtime_enabled)
start_cfs_bandwidth(cfs_b);
raw_spin_unlock_irq(&cfs_b->lock);
/* Propagate the new state to the group's cfs_rq on every online CPU. */
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
struct rq_flags rf;
rq_lock_irq(rq, &rf);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
rq_unlock_irq(rq, &rf);
}
/* on -> off: drop the usage count only after everything was updated. */
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
cpus_read_unlock();
return ret;
}
/*
 * tg_set_cfs_quota - change only the quota of @tg.
 * @tg: group to update
 * @cfs_quota_us: new quota in microseconds; negative means unlimited
 *
 * Period and burst are taken from the group's current settings.
 *
 * Return: -EINVAL when the value would overflow once converted to
 * nanoseconds, otherwise the result of tg_set_cfs_bandwidth().
 */
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
u64 quota, period, burst;
period = ktime_to_ns(tg->cfs_bandwidth.period);
burst = tg->cfs_bandwidth.burst;
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
quota = (u64)cfs_quota_us * NSEC_PER_USEC;
else
return -EINVAL;
return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
/*
 * tg_get_cfs_quota - read the quota of @tg.
 *
 * Return: the quota in microseconds, or -1 when it is RUNTIME_INF.
 */
static long tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
return -1;
quota_us = tg->cfs_bandwidth.quota;
/* do_div() divides quota_us in place. */
do_div(quota_us, NSEC_PER_USEC);
return quota_us;
}
/*
 * tg_set_cfs_period - change only the period of @tg.
 * @tg: group to update
 * @cfs_period_us: new period in microseconds
 *
 * Quota and burst are taken from the group's current settings.
 *
 * Return: -EINVAL when the value (cast to u64) would overflow once
 * converted to nanoseconds, otherwise the result of tg_set_cfs_bandwidth().
 */
static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
u64 quota, period, burst;
if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
return -EINVAL;
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
burst = tg->cfs_bandwidth.burst;
return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
/*
 * tg_get_cfs_period - read the period of @tg.
 *
 * Return: the period in microseconds.
 */
static long tg_get_cfs_period(struct task_group *tg)
{
u64 cfs_period_us;
cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
/* do_div() divides cfs_period_us in place. */
do_div(cfs_period_us, NSEC_PER_USEC);
return cfs_period_us;
}
/*
 * tg_set_cfs_burst - change only the burst of @tg.
 * @tg: group to update
 * @cfs_burst_us: new burst in microseconds
 *
 * Period and quota are taken from the group's current settings.
 *
 * Return: -EINVAL when the value (cast to u64) would overflow once
 * converted to nanoseconds, otherwise the result of tg_set_cfs_bandwidth().
 */
static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
{
u64 quota, period, burst;
if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
return -EINVAL;
burst = (u64)cfs_burst_us * NSEC_PER_USEC;
period = ktime_to_ns(tg->cfs_bandwidth.period);
quota = tg->cfs_bandwidth.quota;
return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
/*
 * tg_get_cfs_burst - read the burst of @tg.
 *
 * Return: the burst in microseconds.
 */
static long tg_get_cfs_burst(struct task_group *tg)
{
u64 burst_us;
burst_us = tg->cfs_bandwidth.burst;
/* do_div() divides burst_us in place. */
do_div(burst_us, NSEC_PER_USEC);
return burst_us;
}
/* cgroup read handler: quota of the group behind @css in us, -1 if unlimited. */
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return tg_get_cfs_quota(css_tg(css));
}
/* cgroup write handler: forwards @cfs_quota_us to tg_set_cfs_quota(). */
static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
struct cftype *cftype, s64 cfs_quota_us)
{
return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
/* cftype read_u64 handler: report the group's CFS period via tg_get_cfs_period(). */
static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return tg_get_cfs_period(tg);
}
/*
 * cftype write_u64 handler: forward the written value to tg_set_cfs_period().
 * The u64 argument is implicitly converted to that function's long parameter.
 */
static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	struct task_group *tg = css_tg(css);

	return tg_set_cfs_period(tg, cfs_period_us);
}
/* cftype read_u64 handler: report the group's CFS burst via tg_get_cfs_burst(). */
static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return tg_get_cfs_burst(tg);
}
/*
 * cftype write_u64 handler: forward the written value to tg_set_cfs_burst().
 * The u64 argument is implicitly converted to that function's long parameter.
 */
static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, u64 cfs_burst_us)
{
	struct task_group *tg = css_tg(css);

	return tg_set_cfs_burst(tg, cfs_burst_us);
}
/*
 * Argument block handed to tg_cfs_schedulable_down() through walk_tg_tree().
 * For the group matching @tg, normalize_cfs_quota() uses @period and @quota
 * instead of the values currently stored in the group.
 */
struct cfs_schedulable_data {
	/* Group whose stored settings are overridden during the walk. */
	struct task_group *tg;
	/* Period to use for @tg. */
	u64 period;
	/* Quota to use for @tg. */
	u64 quota;
};
/*
 * Express the quota of @tg as a ratio of its period via to_ratio().
 *
 * For the group named in @d the period/quota carried by @d are used;
 * every other group contributes its currently stored values (read through
 * the microsecond getters). An unlimited quota, seen either as RUNTIME_INF
 * or as the getter's -1, yields RUNTIME_INF.
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 period, quota;

	if (tg != d->tg) {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	} else {
		period = d->period;
		quota = d->quota;
	}

	if (quota != RUNTIME_INF && quota != -1)
		return to_ratio(period, quota);

	return RUNTIME_INF;
}
/*
 * walk_tg_tree() "down" callback: compute and store hierarchical_quota for
 * @tg from its own normalized quota and its parent's hierarchical_quota.
 *
 * - A group without a parent gets RUNTIME_INF.
 * - A group whose own quota is RUNTIME_INF inherits the parent's value.
 * - Otherwise, when the parent is limited: on the default hierarchy the
 *   smaller of the two values is stored; on a legacy hierarchy a child quota
 *   larger than the parent's is rejected with -EINVAL.
 *
 * Returns 0 on success.
 */
static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	struct cfs_schedulable_data *d = data;
	s64 quota, parent_quota;

	if (!tg->parent) {
		quota = RUNTIME_INF;
		goto store;
	}

	quota = normalize_cfs_quota(tg, d);
	parent_quota = tg->parent->cfs_bandwidth.hierarchical_quota;

	if (quota == RUNTIME_INF) {
		/* No limit of our own: take whatever the parent has. */
		quota = parent_quota;
	} else if (parent_quota != RUNTIME_INF) {
		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys))
			quota = min(quota, parent_quota);
		else if (quota > parent_quota)
			return -EINVAL;
	}

store:
	cfs_b->hierarchical_quota = quota;

	return 0;
}
/*
 * Walk the task-group tree under rcu_read_lock() with
 * tg_cfs_schedulable_down(), substituting @period/@quota for the values
 * stored in @tg. Unless @quota is RUNTIME_INF, both values are first divided
 * by NSEC_PER_USEC. Returns the result of walk_tg_tree().
 */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	struct cfs_schedulable_data data = { .tg = tg };
	int ret;

	if (quota != RUNTIME_INF) {
		do_div(period, NSEC_PER_USEC);
		do_div(quota, NSEC_PER_USEC);
	}
	data.period = period;
	data.quota = quota;

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}
/*
 * seq_show handler for the legacy "stat" file: print the group's bandwidth
 * counters. When schedstats are enabled and the group is not the root task
 * group, also print the wait_sum accumulated over the group's per-CPU
 * entities on all possible CPUs.
 */
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	if (schedstat_enabled() && tg != &root_task_group) {
		u64 wait_total = 0;
		int cpu;

		for_each_possible_cpu(cpu) {
			struct sched_statistics *st;

			st = __schedstats_from_se(tg->se[cpu]);
			wait_total += schedstat_val(st->wait_sum);
		}

		seq_printf(sf, "wait_sum %llu\n", wait_total);
	}

	seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
	seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);

	return 0;
}
static u64 throttled_time_self(struct task_group *tg)
{
int i;
u64 total = 0;
for_each_possible_cpu(i) {
total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
}
return total;
}
static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
/* cftype write_s64 handler: forward @val to sched_group_set_rt_runtime(). */
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	struct task_group *tg = css_tg(css);

	return sched_group_set_rt_runtime(tg, val);
}
/* cftype read_s64 handler: report sched_group_rt_runtime() for the group. */
static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return sched_group_rt_runtime(tg);
}
/* cftype write_u64 handler: forward @rt_period_us to sched_group_set_rt_period(). */
static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	struct task_group *tg = css_tg(css);

	return sched_group_set_rt_period(tg, rt_period_us);
}
/* cftype read_u64 handler: report sched_group_rt_period() for the group. */
static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return sched_group_rt_period(tg);
}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return css_tg(css)->idle;
}
/* cftype write_s64 handler: forward @idle to sched_group_set_idle(). */
static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
			      struct cftype *cft, s64 idle)
{
	struct task_group *tg = css_tg(css);

	return sched_group_set_idle(tg, idle);
}
#endif
/*
 * Control files installed as cpu_cgrp_subsys.legacy_cftypes. Which entries
 * exist depends on the scheduler configuration options below. The table is
 * terminated by an empty entry.
 */
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Fair group scheduling: group weight ("shares") and idle flag. */
{
.name = "shares",
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
{
.name = "idle",
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
/* CFS bandwidth control: quota/period/burst knobs and read-only stats. */
{
.name = "cfs_quota_us",
.read_s64 = cpu_cfs_quota_read_s64,
.write_s64 = cpu_cfs_quota_write_s64,
},
{
.name = "cfs_period_us",
.read_u64 = cpu_cfs_period_read_u64,
.write_u64 = cpu_cfs_period_write_u64,
},
{
.name = "cfs_burst_us",
.read_u64 = cpu_cfs_burst_read_u64,
.write_u64 = cpu_cfs_burst_write_u64,
},
{
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
{
.name = "stat.local",
.seq_show = cpu_cfs_local_stat_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
/* RT group scheduling: runtime and period. */
{
.name = "rt_runtime_us",
.read_s64 = cpu_rt_runtime_read,
.write_s64 = cpu_rt_runtime_write,
},
{
.name = "rt_period_us",
.read_u64 = cpu_rt_period_read_uint,
.write_u64 = cpu_rt_period_write_uint,
},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Utilization clamping; flagged CFTYPE_NOT_ON_ROOT. */
{
.name = "uclamp.min",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_uclamp_min_show,
.write = cpu_uclamp_min_write,
},
{
.name = "uclamp.max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
#endif
/* Terminating empty entry. */
{ }
};
/*
 * css_extra_stat_show callback (see cpu_cgrp_subsys). With
 * CONFIG_CFS_BANDWIDTH, prints the group's period/throttle/burst counters,
 * with throttled_time and burst_time divided by NSEC_PER_USEC. Without it,
 * prints nothing. Always returns 0.
 */
static int cpu_extra_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		struct cfs_bandwidth *cfs_b = &css_tg(css)->cfs_bandwidth;
		u64 throttled_us = cfs_b->throttled_time;
		u64 burst_us = cfs_b->burst_time;

		do_div(throttled_us, NSEC_PER_USEC);
		do_div(burst_us, NSEC_PER_USEC);

		seq_printf(sf, "nr_periods %d\n"
			   "nr_throttled %d\n"
			   "throttled_usec %llu\n"
			   "nr_bursts %d\n"
			   "burst_usec %llu\n",
			   cfs_b->nr_periods, cfs_b->nr_throttled,
			   throttled_us, cfs_b->nr_burst, burst_us);
	}
#endif
	return 0;
}
/*
 * css_local_stat_show callback (see cpu_cgrp_subsys). With
 * CONFIG_CFS_BANDWIDTH, prints throttled_time_self() for the group divided by
 * NSEC_PER_USEC. Without it, prints nothing. Always returns 0.
 */
static int cpu_local_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		u64 self_us = throttled_time_self(css_tg(css));

		do_div(self_us, NSEC_PER_USEC);
		seq_printf(sf, "throttled_usec %llu\n",
			   self_us);
	}
#endif
	return 0;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct task_group *tg = css_tg(css);
u64 weight = scale_load_down(tg->shares);
return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
}
/*
 * cftype write_u64 handler for "weight".
 *
 * Rejects values outside [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX] with -ERANGE.
 * Otherwise converts the weight to shares (weight * 1024 / CGROUP_WEIGHT_DFL,
 * rounded to closest) and applies it with sched_group_set_shares().
 */
static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 weight)
{
	u64 shares;

	if (!(weight >= CGROUP_WEIGHT_MIN && weight <= CGROUP_WEIGHT_MAX))
		return -ERANGE;

	shares = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(shares));
}
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
unsigned long weight = scale_load_down(css_tg(css)->shares);
int last_delta = INT_MAX;
int prio, delta;
for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
delta = abs(sched_prio_to_weight[prio] - weight);
if (delta >= last_delta)
break;
last_delta = delta;
}
return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}
/*
 * cftype write_s64 handler for "weight.nice".
 *
 * Rejects @nice outside [MIN_NICE, MAX_NICE] with -ERANGE. Otherwise looks up
 * the matching sched_prio_to_weight[] entry and applies it with
 * sched_group_set_shares().
 */
static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft, s64 nice)
{
	unsigned long weight;
	int slot;

	if (!(nice >= MIN_NICE && nice <= MAX_NICE))
		return -ERANGE;

	slot = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
	/* Clamp the table index under speculation (table has 40 entries). */
	slot = array_index_nospec(slot, 40);
	weight = sched_prio_to_weight[slot];

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}
#endif
/*
 * Print "<quota> <period>\n" to @sf, where a negative @quota is rendered as
 * the literal "max".
 */
static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota >= 0)
		seq_printf(sf, "%ld", quota);
	else
		seq_puts(sf, "max");

	seq_printf(sf, " %ld\n", period);
}
/*
 * Parse "<quota|max> [<period>]" from @buf.
 *
 * On entry *@periodp holds the value to use when @buf carries no period. On
 * return it has been multiplied by NSEC_PER_USEC in either case. *@quotap
 * receives the quota multiplied by NSEC_PER_USEC, or RUNTIME_INF for "max".
 *
 * Returns 0 on success, -EINVAL when no first token could be read or when it
 * is neither a number nor "max".
 */
static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];	/* up to 20 characters plus the terminating NUL */
	int nr_parsed;

	nr_parsed = sscanf(buf, "%20s %llu", tok, periodp);
	if (nr_parsed < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap)) {
		*quotap *= NSEC_PER_USEC;
		return 0;
	}

	if (strcmp(tok, "max"))
		return -EINVAL;

	*quotap = RUNTIME_INF;

	return 0;
}
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * seq_show handler for the "max" file: print the group's quota and period as
 * returned by the microsecond getters, using cpu_period_quota_print().
 */
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	long period_us = tg_get_cfs_period(tg);
	long quota_us = tg_get_cfs_quota(tg);

	cpu_period_quota_print(sf, period_us, quota_us);

	return 0;
}
/*
 * Write handler for the "max" file: parse "<quota|max> [<period>]" from @buf
 * and apply it with tg_set_cfs_bandwidth(), keeping the group's current burst.
 *
 * Returns @nbytes on success, or the negative error returned by
 * cpu_period_quota_parse() or tg_set_cfs_bandwidth().
 */
static ssize_t cpu_max_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct task_group *tg = css_tg(of_css(of));
	/*
	 * Default period in microseconds: cpu_period_quota_parse() multiplies
	 * *periodp by NSEC_PER_USEC whether or not the input carried a period.
	 */
	u64 period = tg_get_cfs_period(tg);
	/*
	 * tg_set_cfs_bandwidth() takes the burst in nanoseconds, so pass the
	 * stored value as is. Using tg_get_cfs_burst() here would hand over
	 * microseconds and shrink the burst by NSEC_PER_USEC on every write.
	 */
	u64 burst = tg->cfs_bandwidth.burst;
	u64 quota;
	int ret;

	ret = cpu_period_quota_parse(buf, &period, &quota);
	if (!ret)
		ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
	return ret ?: nbytes;
}
#endif
/*
 * Control files installed as cpu_cgrp_subsys.dfl_cftypes. Every entry is
 * flagged CFTYPE_NOT_ON_ROOT. Which entries exist depends on the scheduler
 * configuration options below. The table is terminated by an empty entry.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Fair group scheduling: weight (numeric and nice-based) and idle flag. */
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_weight_read_u64,
.write_u64 = cpu_weight_write_u64,
},
{
.name = "weight.nice",
.flags = CFTYPE_NOT_ON_ROOT,
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
{
.name = "idle",
.flags = CFTYPE_NOT_ON_ROOT,
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
/* CFS bandwidth control: combined quota/period file and burst. */
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
{
.name = "max.burst",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_cfs_burst_read_u64,
.write_u64 = cpu_cfs_burst_write_u64,
},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Utilization clamping. */
{
.name = "uclamp.min",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_uclamp_min_show,
.write = cpu_uclamp_min_write,
},
{
.name = "uclamp.max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
#endif
/* Terminating empty entry. */
{ }
};
/*
 * cgroup subsystem descriptor for the cpu controller: wires the css lifecycle
 * callbacks, the stat callbacks and the two control-file tables defined in
 * this file. The can_attach hook is only installed with
 * CONFIG_RT_GROUP_SCHED.
 */
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
.css_local_stat_show = cpu_local_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
#endif
.attach = cpu_cgroup_attach,
/* Files for legacy hierarchies vs. the default hierarchy. */
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
.early_init = true,
.threaded = true,
};
#endif /* CONFIG_CGROUP_SCHED */
void dump_cpu_task(int cpu)
{
if (cpu == smp_processor_id() && in_hardirq()) {
struct pt_regs *regs;
regs = get_irq_regs();
if (regs) {
show_regs(regs);
return;
}
}
if (trigger_single_cpu_backtrace(cpu))
return;
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
/*
 * Load weight per priority slot. cpu_weight_nice_write_s64() indexes this
 * table with NICE_TO_PRIO(nice) - MAX_RT_PRIO, so index 0 corresponds to the
 * lowest accepted nice value and index 39 to the highest. Weights decrease
 * monotonically; index 20 holds 1024.
 */
const int sched_prio_to_weight[40] = {
/* index  0..4  */
88761, 71755, 56483, 46273, 36291,
/* index  5..9  */
29154, 23254, 18705, 14949, 11916,
/* index 10..14 */
9548, 7620, 6100, 4904, 3906,
/* index 15..19 */
3121, 2501, 1991, 1586, 1277,
/* index 20..24 */
1024, 820, 655, 526, 423,
/* index 25..29 */
335, 272, 215, 172, 137,
/* index 30..34 */
110, 87, 70, 56, 45,
/* index 35..39 */
36, 29, 23, 18, 15,
};
/*
 * Precomputed inverse of sched_prio_to_weight[]: each entry is approximately
 * 2^32 / sched_prio_to_weight[i] (e.g. index 20: 4194304 == 2^32 / 1024).
 * NOTE(review): presumably used to replace divisions by the weight with
 * multiplications — confirm against the users of this table.
 */
const u32 sched_prio_to_wmult[40] = {
/* index  0..4  */
48388, 59856, 76040, 92818, 118348,
/* index  5..9  */
147320, 184698, 229616, 287308, 360437,
/* index 10..14 */
449829, 563644, 704093, 875809, 1099582,
/* index 15..19 */
1376151, 1717300, 2157191, 2708050, 3363326,
/* index 20..24 */
4194304, 5237765, 6557202, 8165337, 10153587,
/* index 25..29 */
12820798, 15790321, 19976592, 24970740, 31350126,
/* index 30..34 */
39045157, 49367440, 61356676, 76695844, 95443717,
/* index 35..39 */
119304647, 148102320, 186737708, 238609294, 286331153,
};
/*
 * Out-of-line wrapper that fires the sched_update_nr_running_tp tracepoint
 * for @rq with @count.
 */
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
#ifdef CONFIG_SCHED_MM_CID
/*
 * Global raw spinlock and flag for the mm_cid code. Neither is referenced by
 * the mm_cid functions in this part of the file.
 * NOTE(review): presumably used by the cid allocation helpers defined
 * elsewhere (e.g. the mm_cid_get() path) — confirm.
 */
DEFINE_RAW_SPINLOCK(cid_lock);
int use_cid_lock;
/*
 * Record the CPU @t is currently assigned to in t->migrate_from_cpu, where
 * sched_mm_cid_migrate_to() later reads it as the source CPU.
 */
void sched_mm_cid_migrate_from(struct task_struct *t)
{
	int src_cpu = task_cpu(t);

	t->migrate_from_cpu = src_cpu;
}
/*
 * First step of moving a concurrency ID along with a migrating task: decide
 * whether the cid stored in @src_pcpu_cid is a candidate to follow @t.
 *
 * Returns the source cid when @t has an mm, t->last_mm_cid is set and equals
 * the valid cid read from @src_pcpu_cid, and the task currently running on
 * @src_rq is not an active cid user of the same mm. Returns -1 otherwise.
 */
static
int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
struct task_struct *t,
struct mm_cid *src_pcpu_cid)
{
struct mm_struct *mm = t->mm;
struct task_struct *src_task;
int src_cid, last_mm_cid;
if (!mm)
return -1;
last_mm_cid = t->last_mm_cid;
/* The task has no recorded cid: nothing to move. */
if (last_mm_cid == -1)
return -1;
src_cid = READ_ONCE(src_pcpu_cid->cid);
/* Only a valid source cid that matches the task's last cid qualifies. */
if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
return -1;
/*
 * If the task currently running on the source rq is an active cid user of
 * the same mm, leave the cid where it is and forget the task's last cid.
 */
rcu_read_lock();
src_task = rcu_dereference(src_rq->curr);
if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
rcu_read_unlock();
t->last_mm_cid = -1;
return -1;
}
rcu_read_unlock();
return src_cid;
}
/*
 * Second step of moving a concurrency ID: try to take @src_cid out of the
 * source CPU's per-cpu slot.
 *
 * Returns @src_cid when the slot was successfully switched to MM_CID_UNSET.
 * Returns -1 when @src_cid is -1, when either cmpxchg fails, or when the task
 * currently running on @src_rq is an active cid user of the same mm.
 */
static
int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
struct task_struct *t,
struct mm_cid *src_pcpu_cid,
int src_cid)
{
struct task_struct *src_task;
struct mm_struct *mm = t->mm;
int lazy_cid;
if (src_cid == -1)
return -1;
/*
 * Replace the slot's cid with its lazy-put encoding; this fails if the slot
 * no longer holds src_cid.
 */
lazy_cid = mm_cid_set_lazy_put(src_cid);
if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
return -1;
/*
 * An active user of this mm is running on the source rq: give up and forget
 * the task's last cid. The slot is left holding the lazy-put value on this
 * path. NOTE(review): presumably the source CPU's scheduler path finishes the
 * put — confirm.
 */
rcu_read_lock();
src_task = rcu_dereference(src_rq->curr);
if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
rcu_read_unlock();
t->last_mm_cid = -1;
return -1;
}
rcu_read_unlock();
/* No active user observed: clear the slot if it still holds lazy_cid. */
if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
return -1;
return src_cid;
}
/*
 * Try to move the concurrency ID of @t's mm from the CPU recorded in
 * t->migrate_from_cpu to the per-cpu slot of @dst_rq's CPU.
 *
 * Must be called with @dst_rq held (asserted through lockdep). Does nothing
 * for tasks without an mm.
 */
void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
{
struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
struct mm_struct *mm = t->mm;
int src_cid, dst_cid, src_cpu;
struct rq *src_rq;
lockdep_assert_rq_held(dst_rq);
if (!mm)
return;
src_cpu = t->migrate_from_cpu;
/* No source CPU recorded: just forget the task's last cid. */
if (src_cpu == -1) {
t->last_mm_cid = -1;
return;
}
dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
dst_cid = READ_ONCE(dst_pcpu_cid->cid);
/*
 * The destination slot is already set and the mm has at least as many users
 * as @t has allowed CPUs: leave both slots alone.
 * NOTE(review): presumably because moving cids cannot compact the cid space
 * in that case — confirm.
 */
if (!mm_cid_is_unset(dst_cid) &&
atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
return;
src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
src_rq = cpu_rq(src_cpu);
src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
if (src_cid == -1)
return;
src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
src_cid);
if (src_cid == -1)
return;
/* Destination already has a cid: release the stolen one instead. */
if (!mm_cid_is_unset(dst_cid)) {
__mm_cid_put(mm, src_cid);
return;
}
/* Install the stolen cid in the destination slot. */
mm_cid_snapshot_time(dst_rq, mm);
WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
}
/*
 * Try to clear the cid held in @pcpu_cid (the per-cpu slot of @mm for @cpu)
 * from another context and release it with __mm_cid_put().
 *
 * Nothing is released when the slot holds no valid cid, when either cmpxchg
 * fails, or when the task currently running on @cpu is an active cid user of
 * @mm.
 */
static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
unsigned long flags;
int cid, lazy_cid;
cid = READ_ONCE(pcpu_cid->cid);
if (!mm_cid_is_valid(cid))
return;
/* Switch the slot to the lazy-put encoding of the cid we just read. */
lazy_cid = mm_cid_set_lazy_put(cid);
if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
return;
/*
 * An active user of this mm is running on the CPU: leave the slot in its
 * lazy-put state and return.
 */
rcu_read_lock();
t = rcu_dereference(rq->curr);
if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
rcu_read_unlock();
return;
}
rcu_read_unlock();
/*
 * Unset the slot and release the cid with local interrupts disabled.
 * NOTE(review): presumably irqs are off to keep the window between the
 * cmpxchg and the put short — confirm.
 */
local_irq_save(flags);
if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
__mm_cid_put(mm, cid);
local_irq_restore(flags);
}
/*
 * Clear the cid of @mm on @cpu if it has not been in use recently.
 *
 * If the task currently running on @cpu is an active cid user of @mm, the
 * slot's timestamp is refreshed with the rq clock value sampled on entry and
 * nothing is cleared. Otherwise the slot is cleared through
 * sched_mm_cid_remote_clear() once at least SCHED_MM_CID_PERIOD_NS has
 * elapsed since the slot's timestamp.
 */
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct mm_cid *pcpu_cid;
struct task_struct *curr;
u64 rq_clock;
/* Sampled without holding the rq lock. */
rq_clock = READ_ONCE(rq->clock);
pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
rcu_read_lock();
curr = rcu_dereference(rq->curr);
if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
WRITE_ONCE(pcpu_cid->time, rq_clock);
rcu_read_unlock();
return;
}
rcu_read_unlock();
/* Used within the last period: keep it. */
if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
return;
sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}
/*
 * Clear the cid of @mm on @cpu through sched_mm_cid_remote_clear() when the
 * slot holds a valid cid that is greater than or equal to @weight.
 */
static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
					     int weight)
{
	struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
	int cid = READ_ONCE(pcpu_cid->cid);

	if (mm_cid_is_valid(cid) && cid >= weight)
		sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}
/*
 * Task-work callback (installed by init_sched_mm_cid(), queued by
 * task_tick_mm_cid()) that periodically scans all possible CPUs and clears
 * per-cpu cids of the current task's mm that are either old or not below the
 * weight of the mm's cid mask.
 *
 * mm->mm_cid_next_scan rate-limits the scan. Only the caller that wins the
 * try_cmpxchg advancing it performs the scan.
 */
static void task_mm_cid_work(struct callback_head *work)
{
unsigned long now = jiffies, old_scan, next_scan;
struct task_struct *t = current;
struct cpumask *cidmask;
struct mm_struct *mm;
int weight, cpu;
SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
/*
 * Point the work at itself again: task_tick_mm_cid() only queues the work
 * while work->next == work.
 */
work->next = work;
if (t->flags & PF_EXITING)
return;
mm = t->mm;
if (!mm)
return;
old_scan = READ_ONCE(mm->mm_cid_next_scan);
next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
/*
 * No scan time set yet: try to install one. Either way old_scan ends up as
 * the value now stored (ours, or the one another caller installed first).
 */
if (!old_scan) {
unsigned long res;
res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
if (res != old_scan)
old_scan = res;
else
old_scan = next_scan;
}
/* Not due yet. */
if (time_before(now, old_scan))
return;
/* Claim this scan; bail out if someone else advanced the deadline. */
if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
return;
cidmask = mm_cidmask(mm);
/* Pass 1: clear cids that were not used recently. */
for_each_possible_cpu(cpu)
sched_mm_cid_remote_clear_old(mm, cpu);
/* Pass 2: clear cids that are >= the number of bits set in the cid mask. */
weight = cpumask_weight(cidmask);
for_each_possible_cpu(cpu)
sched_mm_cid_remote_clear_weight(mm, cpu, weight);
}
/*
 * Initialize the mm_cid scan state of task @t.
 *
 * If @t has an mm with exactly one user, the mm's next scan time is set to
 * MM_CID_SCAN_DELAY milliseconds from now. In all cases the task's cid_work
 * is pointed at itself (the "not queued" state checked by task_tick_mm_cid())
 * and bound to task_mm_cid_work().
 */
void init_sched_mm_cid(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;

	if (mm && atomic_read(&mm->mm_users) == 1)
		mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);

	t->cid_work.next = &t->cid_work;
	init_task_work(&t->cid_work, task_mm_cid_work);
}
/*
 * Queue the mm_cid scan work for @curr (to run via TWA_RESUME) when a scan is
 * due.
 *
 * Nothing is queued for tasks without an mm, for exiting tasks or kernel
 * threads, while the work is already queued (work->next != work), or before
 * the mm's next scan time.
 */
void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->cid_work;
	unsigned long now = jiffies;

	if (!curr->mm)
		return;
	if (curr->flags & (PF_EXITING | PF_KTHREAD))
		return;
	if (work->next != work)
		return;
	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
		return;

	task_work_add(curr, work, TWA_RESUME);
}
/*
 * Deactivate concurrency-ID use for @t and drop its cid: under this CPU's rq
 * lock, clear t->mm_cid_active, put the mm's cid and reset both t->mm_cid and
 * t->last_mm_cid to -1. Does nothing for tasks without an mm.
 */
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
struct rq_flags rf;
struct rq *rq;
if (!mm)
return;
/* Stay on this CPU while looking up and locking its runqueue. */
preempt_disable();
rq = this_rq();
rq_lock_irqsave(rq, &rf);
/* The rq lock is held with irqs saved from here on. */
preempt_enable_no_resched();
WRITE_ONCE(t->mm_cid_active, 0);
/*
 * Full barrier between the mm_cid_active store and the cid put.
 * NOTE(review): presumably pairs with the remote-clear/migrate paths, which
 * read mm_cid_active after updating the per-cpu cid — confirm.
 */
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
rq_unlock_irqrestore(rq, &rf);
}
/*
 * Deactivate concurrency-ID use for @t and drop its cid: under this CPU's rq
 * lock, clear t->mm_cid_active, put the mm's cid and reset both t->mm_cid and
 * t->last_mm_cid to -1. Does nothing for tasks without an mm. The body is the
 * same sequence as sched_mm_cid_exit_signals().
 */
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
struct rq_flags rf;
struct rq *rq;
if (!mm)
return;
/* Stay on this CPU while looking up and locking its runqueue. */
preempt_disable();
rq = this_rq();
rq_lock_irqsave(rq, &rf);
/* The rq lock is held with irqs saved from here on. */
preempt_enable_no_resched();
WRITE_ONCE(t->mm_cid_active, 0);
/*
 * Full barrier between the mm_cid_active store and the cid put.
 * NOTE(review): presumably pairs with the remote-clear/migrate paths, which
 * read mm_cid_active after updating the per-cpu cid — confirm.
 */
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
rq_unlock_irqrestore(rq, &rf);
}
/*
 * (Re)activate concurrency-ID use for @t: under this CPU's rq lock, set
 * t->mm_cid_active, obtain a cid for the mm with mm_cid_get() and store it in
 * both t->mm_cid and t->last_mm_cid. After dropping the lock,
 * rseq_set_notify_resume() is called for @t. Does nothing for tasks without
 * an mm.
 */
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
struct rq_flags rf;
struct rq *rq;
if (!mm)
return;
/* Stay on this CPU while looking up and locking its runqueue. */
preempt_disable();
rq = this_rq();
rq_lock_irqsave(rq, &rf);
/* The rq lock is held with irqs saved from here on. */
preempt_enable_no_resched();
WRITE_ONCE(t->mm_cid_active, 1);
/*
 * Full barrier between the mm_cid_active store and the cid lookup.
 * NOTE(review): presumably pairs with the remote-clear/migrate paths, which
 * read mm_cid_active after updating the per-cpu cid — confirm.
 */
smp_mb();
t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
rq_unlock_irqrestore(rq, &rf);
rseq_set_notify_resume(t);
}
/*
 * Mark @t as an active concurrency-ID user. Warns once if @t has no mm or
 * already carries a cid (t->mm_cid != -1).
 */
void sched_mm_cid_fork(struct task_struct *t)
{
WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
t->mm_cid_active = 1;
}
#endif