#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
#include <linux/sched/signal.h>
#include <linux/syscalls_api.h>
#include <linux/debug_locks.h>
#include <linux/prefetch.h>
#include <linux/capability.h>
#include <linux/pgtable_api.h>
#include <linux/wait_bit.h>
#include <linux/jiffies.h>
#include <linux/spinlock_api.h>
#include <linux/cpumask_api.h>
#include <linux/lockdep_api.h>
#include <linux/hardirq.h>
#include <linux/softirq.h>
#include <linux/refcount_api.h>
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/rt.h>
#include <linux/blkdev.h>
#include <linux/context_tracking.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/interrupt.h>
#include <linux/ioprio.h>
#include <linux/kallsyms.h>
#include <linux/kcov.h>
#include <linux/kprobes.h>
#include <linux/llist_api.h>
#include <linux/mmu_context.h>
#include <linux/mmzone.h>
#include <linux/mutex_api.h>
#include <linux/nmi.h>
#include <linux/nospec.h>
#include <linux/perf_event_api.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vtime.h>
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_ENTRY
# include <linux/entry-common.h>
# endif
#endif
#include <uapi/linux/sched/types.h>
#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
#include "stats.h"
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
#endif /* CONFIG_SCHED_DEBUG */
const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
#ifdef CONFIG_SCHED_CORE
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
static inline int __task_prio(const struct task_struct *p)
{
if (p->sched_class == &stop_sched_class)
return -2;
if (rt_prio(p->prio))
return p->prio;
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH;
return MAX_RT_PRIO + MAX_NICE;
}
static inline bool prio_less(const struct task_struct *a,
const struct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
if (-pa < -pb)
return true;
if (-pb < -pa)
return false;
if (pa == -1)
return !dl_time_before(a->dl.deadline, b->dl.deadline);
if (pa == MAX_RT_PRIO + MAX_NICE)
return cfs_prio_less(a, b, in_fi);
return false;
}
static inline bool __sched_core_less(const struct task_struct *a,
const struct task_struct *b)
{
if (a->core_cookie < b->core_cookie)
return true;
if (a->core_cookie > b->core_cookie)
return false;
if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
}
#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
{
return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
}
static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
{
const struct task_struct *p = __node_2_sc(node);
unsigned long cookie = (unsigned long)key;
if (cookie < p->core_cookie)
return -1;
if (cookie > p->core_cookie)
return 1;
return 0;
}
void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
rq->core->core_task_seq++;
if (!p->core_cookie)
return;
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
if (sched_core_enqueued(p)) {
rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
}
if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
rq->core->core_forceidle_count && rq->curr == rq->idle)
resched_curr(rq);
}
static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
if (p->sched_class->task_is_throttled)
return p->sched_class->task_is_throttled(p, cpu);
return 0;
}
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
int cpu = task_cpu(p);
do {
node = rb_next(node);
if (!node)
return NULL;
p = __node_2_sc(node);
if (p->core_cookie != cookie)
return NULL;
} while (sched_task_is_throttled(p, cpu));
return p;
}
static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
struct task_struct *p;
struct rb_node *node;
node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
if (!node)
return NULL;
p = __node_2_sc(node);
if (!sched_task_is_throttled(p, rq->cpu))
return p;
return sched_core_next(p, cookie);
}
static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;
static void sched_core_lock(int cpu, unsigned long *flags)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
int t, i = 0;
local_irq_save(*flags);
for_each_cpu(t, smt_mask)
raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}
static void sched_core_unlock(int cpu, unsigned long *flags)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
int t;
for_each_cpu(t, smt_mask)
raw_spin_unlock(&cpu_rq(t)->__lock);
local_irq_restore(*flags);
}
static void __sched_core_flip(bool enabled)
{
unsigned long flags;
int cpu, t;
cpus_read_lock();
cpumask_copy(&sched_core_mask, cpu_online_mask);
for_each_cpu(cpu, &sched_core_mask) {
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
sched_core_lock(cpu, &flags);
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
cpu_rq(cpu)->core->core_forceidle_start = 0;
sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
}
for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
cpu_rq(cpu)->core_enabled = enabled;
cpus_read_unlock();
}
static void sched_core_assert_empty(void)
{
int cpu;
for_each_possible_cpu(cpu)
WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
}
static void __sched_core_enable(void)
{
static_branch_enable(&__sched_core_enabled);
synchronize_rcu();
__sched_core_flip(true);
sched_core_assert_empty();
}
static void __sched_core_disable(void)
{
sched_core_assert_empty();
__sched_core_flip(false);
static_branch_disable(&__sched_core_enabled);
}
void sched_core_get(void)
{
if (atomic_inc_not_zero(&sched_core_count))
return;
mutex_lock(&sched_core_mutex);
if (!atomic_read(&sched_core_count))
__sched_core_enable();
smp_mb__before_atomic();
atomic_inc(&sched_core_count);
mutex_unlock(&sched_core_mutex);
}
static void __sched_core_put(struct work_struct *work)
{
if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
__sched_core_disable();
mutex_unlock(&sched_core_mutex);
}
}
void sched_core_put(void)
{
static DECLARE_WORK(_work, __sched_core_put);
if (!atomic_add_unless(&sched_core_count, -1, 1))
schedule_work(&_work);
}
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
raw_spinlock_t *lock;
preempt_disable();
if (sched_core_disabled()) {
raw_spin_lock_nested(&rq->__lock, subclass);
preempt_enable_no_resched();
return;
}
for (;;) {
lock = __rq_lockp(rq);
raw_spin_lock_nested(lock, subclass);
if (likely(lock == __rq_lockp(rq))) {
preempt_enable_no_resched();
return;
}
raw_spin_unlock(lock);
}
}
bool raw_spin_rq_trylock(struct rq *rq)
{
raw_spinlock_t *lock;
bool ret;
preempt_disable();
if (sched_core_disabled()) {
ret = raw_spin_trylock(&rq->__lock);
preempt_enable();
return ret;
}
for (;;) {
lock = __rq_lockp(rq);
ret = raw_spin_trylock(lock);
if (!ret || (likely(lock == __rq_lockp(rq)))) {
preempt_enable();
return ret;
}
raw_spin_unlock(lock);
}
}
void raw_spin_rq_unlock(struct rq *rq)
{
raw_spin_unlock(rq_lockp(rq));
}
#ifdef CONFIG_SMP
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
lockdep_assert_irqs_disabled();
if (rq_order_less(rq2, rq1))
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
if (__rq_lockp(rq1) != __rq_lockp(rq2))
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
double_rq_clock_clear_update(rq1, rq2);
}
#endif
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock)
{
struct rq *rq;
lockdep_assert_held(&p->pi_lock);
for (;;) {
rq = task_rq(p);
raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
if (irq_delta > delta)
irq_delta = delta;
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
rq->prev_steal_time_rq += steal;
delta -= steal;
}
#endif
rq->clock_task += delta;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
{
s64 delta;
lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
#ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_clear(struct rq *rq)
{
if (hrtimer_active(&rq->hrtick_timer))
hrtimer_cancel(&rq->hrtick_timer);
}
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
#ifdef CONFIG_SMP
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = rq->hrtick_time;
hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
struct rq_flags rf;
rq_lock(rq, &rf);
__hrtick_restart(rq);
rq_unlock(rq, &rf);
}
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
s64 delta;
delta = max_t(s64, delay, 10000LL);
rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
#else
void hrtick_start(struct rq *rq, u64 delay)
{
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */
#define fetch_or(ptr, mask) \
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
typeof(*_ptr) _val = *_ptr; \
\
do { \
} while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
_val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
}
return true;
}
#else
static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
#endif
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
smp_mb__before_atomic();
if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
return false;
*head->lastp = node;
head->lastp = &node->next;
return true;
}
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
if (__wake_q_add(head, task))
get_task_struct(task);
}
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
if (!__wake_q_add(head, task))
put_task_struct(task);
}
void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;
while (node != WAKE_Q_TAIL) {
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
node = node->next;
task->wake_q.next = NULL;
wake_up_process(task);
put_task_struct(task);
}
}
void resched_curr(struct rq *rq)
{
struct task_struct *curr = rq->curr;
int cpu;
lockdep_assert_rq_held(rq);
if (test_tsk_need_resched(curr))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
guard(rcu)();
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
if (!idle_cpu(i))
return i;
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
return default_cpu;
}
static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (cpu == smp_processor_id())
return;
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
{
if (cpu_is_offline(cpu))
return true;
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
tick_nohz_full_kick_cpu(cpu);
return true;
}
return false;
}
void wake_up_nohz_cpu(int cpu)
{
if (!wake_up_full_nohz_cpu(cpu))
wake_up_idle_cpu(cpu);
}
static void nohz_csd_func(void *info)
{
struct rq *rq = info;
int cpu = cpu_of(rq);
unsigned int flags;
flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
if (rq->idle_balance && !need_resched()) {
rq->nohz_idle_balance = flags;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
}
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
{
if (rq->nr_running != 1)
return false;
if (p->sched_class != &fair_sched_class)
return false;
if (!task_on_rq_queued(p))
return false;
return true;
}
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
if (rq->dl.dl_nr_running)
return false;
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
return true;
else
return false;
}
fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
if (fifo_nr_running)
return true;
if (rq->nr_running > 1)
return false;
if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
if (cfs_task_bw_constrained(rq->curr))
return false;
}
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
int walk_tg_tree_from(struct task_group *from,
tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
parent = from;
down:
ret = (*down)(parent, data);
if (ret)
goto out;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
ret = (*up)(parent, data);
if (ret || parent == from)
goto out;
child = parent;
parent = parent->parent;
if (parent)
goto up;
out:
return ret;
}
int tg_nop(struct task_group *tg, void *data)
{
return 0;
}
#endif
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
}
if (update_load && p->sched_class == &fair_sched_class) {
reweight_task(p, prio);
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}
}
#ifdef CONFIG_UCLAMP_TASK
static DEFINE_MUTEX(uclamp_mutex);
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
static struct uclamp_se uclamp_default[UCLAMP_CNT];
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
return SCHED_CAPACITY_SCALE;
}
static inline void uclamp_se_set(struct uclamp_se *uc_se,
unsigned int value, bool user_defined)
{
uc_se->value = value;
uc_se->bucket_id = uclamp_bucket_id(value);
uc_se->user_defined = user_defined;
}
static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
if (clamp_id == UCLAMP_MAX) {
rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
return clamp_value;
}
return uclamp_none(UCLAMP_MIN);
}
static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;
for ( ; bucket_id >= 0; bucket_id--) {
if (!bucket[bucket_id].tasks)
continue;
return bucket[bucket_id].value;
}
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
unsigned int default_util_min;
struct uclamp_se *uc_se;
lockdep_assert_held(&p->pi_lock);
uc_se = &p->uclamp_req[UCLAMP_MIN];
if (uc_se->user_defined)
return;
default_util_min = sysctl_sched_uclamp_util_min_rt_default;
uclamp_se_set(uc_se, default_util_min, false);
}
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
if (!rt_task(p))
return;
rq = task_rq_lock(p, &rf);
__uclamp_update_util_min_rt_default(p);
task_rq_unlock(rq, p, &rf);
}
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
unsigned int tg_min, tg_max, value;
if (task_group_is_autogroup(task_group(p)))
return uc_req;
if (task_group(p) == &root_task_group)
return uc_req;
tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
value = uc_req.value;
value = clamp(value, tg_min, tg_max);
uclamp_se_set(&uc_req, value, false);
#endif
return uc_req;
}
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
if (unlikely(uc_req.value > uc_max.value))
return uc_max;
return uc_req;
}
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
if (p->uclamp[clamp_id].active)
return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
return (unsigned long)uc_eff.value;
}
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
lockdep_assert_rq_held(rq);
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
bucket = &uc_rq->bucket[uc_se->bucket_id];
bucket->tasks++;
uc_se->active = true;
uclamp_idle_reset(rq, clamp_id, uc_se->value);
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
if (uc_se->value > uclamp_rq_get(rq, clamp_id))
uclamp_rq_set(rq, clamp_id, uc_se->value);
}
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
unsigned int bkt_clamp;
unsigned int rq_clamp;
lockdep_assert_rq_held(rq);
if (unlikely(!uc_se->active))
return;
bucket = &uc_rq->bucket[uc_se->bucket_id];
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
uc_se->active = false;
if (likely(bucket->tasks))
return;
rq_clamp = uclamp_rq_get(rq, clamp_id);
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
if (!p->uclamp[clamp_id].active)
return;
uclamp_rq_dec_id(rq, p, clamp_id);
uclamp_rq_inc_id(rq, p, clamp_id);
if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
static inline void
uclamp_update_active(struct task_struct *p)
{
enum uclamp_id clamp_id;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
for_each_clamp_id(clamp_id)
uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
while ((p = css_task_iter_next(&it)))
uclamp_update_active(p);
css_task_iter_end(&it);
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
#endif
#ifdef CONFIG_SYSCTL
#ifdef CONFIG_UCLAMP_TASK
#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
rcu_read_lock();
cpu_util_update_eff(&root_task_group.css);
rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
#endif
static void uclamp_sync_util_min_rt_default(void)
{
struct task_struct *g, *p;
read_lock(&tasklist_lock);
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
rcu_read_lock();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
rcu_read_unlock();
}
static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max, old_min_rt;
int result;
guard(mutex)(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
goto undo;
if (!write)
return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
result = -EINVAL;
goto undo;
}
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
update_root_tg = true;
}
if (update_root_tg) {
static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
static_branch_enable(&sched_uclamp_used);
uclamp_sync_util_min_rt_default();
}
return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
#endif
#endif
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
int util_min = p->uclamp_req[UCLAMP_MIN].value;
int util_max = p->uclamp_req[UCLAMP_MAX].value;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
util_min = attr->sched_util_min;
if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
return -EINVAL;
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
util_max = attr->sched_util_max;
if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
return -EINVAL;
}
if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;
static_branch_enable(&sched_uclamp_used);
return 0;
}
static bool uclamp_reset(const struct sched_attr *attr,
enum uclamp_id clamp_id,
struct uclamp_se *uc_se)
{
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
!uc_se->user_defined)
return true;
if (clamp_id == UCLAMP_MIN &&
attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
attr->sched_util_min == -1) {
return true;
}
if (clamp_id == UCLAMP_MAX &&
attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
attr->sched_util_max == -1) {
return true;
}
return false;
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
unsigned int value;
if (!uclamp_reset(attr, clamp_id, uc_se))
continue;
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
value = sysctl_sched_uclamp_util_min_rt_default;
else
value = uclamp_none(clamp_id);
uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
return;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
attr->sched_util_min != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
attr->sched_util_min, true);
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
attr->sched_util_max != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
attr->sched_util_max, true);
}
}
static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
if (likely(!p->sched_reset_on_fork))
return;
for_each_clamp_id(clamp_id) {
uclamp_se_set(&p->uclamp_req[clamp_id],
uclamp_none(clamp_id), false);
}
}
static void uclamp_post_fork(struct task_struct *p)
{
uclamp_update_util_min_rt_default(p);
}
static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
struct uclamp_rq *uc_rq = rq->uclamp;
for_each_clamp_id(clamp_id) {
uc_rq[clamp_id] = (struct uclamp_rq) {
.value = uclamp_none(clamp_id)
};
}
rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
enum uclamp_id clamp_id;
int cpu;
for_each_possible_cpu(cpu)
init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
uclamp_none(clamp_id), false);
}
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
#ifdef CONFIG_UCLAMP_TASK_GROUP
root_task_group.uclamp_req[clamp_id] = uc_max;
root_task_group.uclamp[clamp_id] = uc_max;
#endif
}
}
#else /* CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
bool sched_task_on_rq(struct task_struct *p)
{
return task_on_rq_queued(p);
}
unsigned long get_wchan(struct task_struct *p)
{
unsigned long ip = 0;
unsigned int state;
if (!p || p == current)
return 0;
raw_spin_lock_irq(&p->pi_lock);
state = READ_ONCE(p->__state);
smp_rmb();
if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
ip = __get_wchan(p);
raw_spin_unlock_irq(&p->pi_lock);
return ip;
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
if (flags & ENQUEUE_MIGRATED)
sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, flags);
}
static inline int __normal_prio(int policy, int rt_prio, int nice)
{
int prio;
if (dl_policy(policy))
prio = MAX_DL_PRIO - 1;
else if (rt_policy(policy))
prio = MAX_RT_PRIO - 1 - rt_prio;
else
prio = NICE_TO_PRIO(nice);
return prio;
}
static inline int normal_prio(struct task_struct *p)
{
return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
static int effective_prio(struct task_struct *p)
{
p->normal_prio = normal_prio(p);
if (!rt_prio(p->prio))
return p->normal_prio;
return p->prio;
}
inline int task_curr(const struct task_struct *p)
{
return cpu_curr(task_cpu(p)) == p;
}
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
static __always_inline
int __task_state_match(struct task_struct *p, unsigned int state)
{
if (READ_ONCE(p->__state) & state)
return 1;
#ifdef CONFIG_PREEMPT_RT
if (READ_ONCE(p->saved_state) & state)
return -1;
#endif
return 0;
}
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
#ifdef CONFIG_PREEMPT_RT
int match;
raw_spin_lock_irq(&p->pi_lock);
match = __task_state_match(p, state);
raw_spin_unlock_irq(&p->pi_lock);
return match;
#else
return __task_state_match(p, state);
#endif
}
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
int running, queued, match;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
for (;;) {
rq = task_rq(p);
while (task_on_cpu(rq, p)) {
if (!task_state_match(p, match_state))
return 0;
cpu_relax();
}
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
if ((match = __task_state_match(p, match_state))) {
if (match < 0)
queued = 1;
ncsw = p->nvcsw | LONG_MIN;
}
task_rq_unlock(rq, p, &rf);
if (unlikely(!ncsw))
break;
if (unlikely(running)) {
cpu_relax();
continue;
}
if (unlikely(queued)) {
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
break;
}
return ncsw;
}
#ifdef CONFIG_SMP
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
struct affinity_context ac = {
.new_mask = cpumask_of(rq->cpu),
.flags = SCA_MIGRATE_DISABLE,
};
if (likely(!p->migration_disabled))
return;
if (p->cpus_ptr != &p->cpus_mask)
return;
__do_set_cpus_allowed(p, &ac);
}
void migrate_disable(void)
{
struct task_struct *p = current;
if (p->migration_disabled) {
p->migration_disabled++;
return;
}
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
.new_mask = &p->cpus_mask,
.flags = SCA_MIGRATE_ENABLE,
};
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
if (WARN_ON_ONCE(!p->migration_disabled))
return;
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return rq->nr_pinned;
}
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_migration_disabled(p))
return cpu_online(cpu);
if (!(p->flags & PF_KTHREAD))
return cpu_active(cpu) && task_cpu_possible(cpu, p);
if (kthread_is_per_cpu(p))
return cpu_online(cpu);
if (cpu_dying(cpu))
return false;
return cpu_online(cpu);
}
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
lockdep_assert_rq_held(rq);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
return rq;
}
struct migration_arg {
struct task_struct *task;
int dest_cpu;
struct set_affinity_pending *pending;
};
struct set_affinity_pending {
refcount_t refs;
unsigned int stop_pending;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
};
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
if (!is_cpu_allowed(p, dest_cpu))
return rq;
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
}
static int migration_cpu_stop(void *data)
{
struct migration_arg *arg = data;
struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
bool complete = false;
struct rq_flags rf;
local_irq_save(rf.flags);
flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
WARN_ON_ONCE(pending && pending != p->migration_pending);
if (task_rq(p) == rq) {
if (is_migration_disabled(p))
goto out;
if (pending) {
p->migration_pending = NULL;
complete = true;
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
}
if (task_on_rq_queued(p)) {
update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
} else {
p->wake_cpu = arg->dest_cpu;
}
} else if (pending) {
if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
p->migration_pending = NULL;
complete = true;
goto out;
}
WARN_ON_ONCE(!pending->stop_pending);
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
if (pending)
pending->stop_pending = false;
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
return 0;
}
int push_cpu_stop(void *arg)
{
struct rq *lowest_rq = NULL, *rq = this_rq();
struct task_struct *p = arg;
raw_spin_lock_irq(&p->pi_lock);
raw_spin_rq_lock(rq);
if (task_rq(p) != rq)
goto out_unlock;
if (is_migration_disabled(p)) {
p->migration_flags |= MDF_PUSH;
goto out_unlock;
}
p->migration_flags &= ~MDF_PUSH;
if (p->sched_class->find_lock_rq)
lowest_rq = p->sched_class->find_lock_rq(p, rq);
if (!lowest_rq)
goto out_unlock;
if (task_rq(p) == rq) {
deactivate_task(rq, p, 0);
set_task_cpu(p, lowest_rq->cpu);
activate_task(lowest_rq, p, 0);
resched_curr(lowest_rq);
}
double_unlock_balance(rq, lowest_rq);
out_unlock:
rq->push_busy = false;
raw_spin_rq_unlock(rq);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
return 0;
}
void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
p->cpus_ptr = ctx->new_mask;
return;
}
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
if (ctx->flags & SCA_USER)
swap(p->user_cpus_ptr, ctx->user_mask);
}
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
if (ctx->flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) {
lockdep_assert_rq_held(rq);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, ctx);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
}
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
.user_mask = NULL,
.flags = SCA_USER,
};
union cpumask_rcuhead {
cpumask_t cpumask;
struct rcu_head rcu;
};
__do_set_cpus_allowed(p, &ac);
kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
}
static cpumask_t *alloc_user_cpus_ptr(int node)
{
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
return kmalloc_node(size, GFP_KERNEL, node);
}
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
cpumask_t *user_mask;
unsigned long flags;
dst->user_cpus_ptr = NULL;
if (data_race(!src->user_cpus_ptr))
return 0;
user_mask = alloc_user_cpus_ptr(node);
if (!user_mask)
return -ENOMEM;
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
swap(dst->user_cpus_ptr, user_mask);
cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
}
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
if (unlikely(user_mask))
kfree(user_mask);
return 0;
}
static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
{
struct cpumask *user_mask = NULL;
swap(p->user_cpus_ptr, user_mask);
return user_mask;
}
void release_user_cpus_ptr(struct task_struct *p)
{
kfree(clear_user_cpus_ptr(p));
}
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
__releases(rq->lock)
__releases(p->pi_lock)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
(p->migration_flags & MDF_PUSH) && !rq->push_busy) {
rq->push_busy = true;
push_task = get_task_struct(p);
}
pending = p->migration_pending;
if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
task_rq_unlock(rq, p, rf);
if (push_task) {
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
p, &rq->push_work);
}
if (complete)
complete_all(&pending->done);
return 0;
}
if (!(flags & SCA_MIGRATE_ENABLE)) {
if (!p->migration_pending) {
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
my_pending.arg = (struct migration_arg) {
.task = p,
.dest_cpu = dest_cpu,
.pending = &my_pending,
};
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
refcount_inc(&pending->refs);
pending->arg.dest_cpu = dest_cpu;
}
}
pending = p->migration_pending;
if (WARN_ON_ONCE(!pending)) {
task_rq_unlock(rq, p, rf);
return -EINVAL;
}
if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
stop_pending = pending->stop_pending;
if (!stop_pending)
pending->stop_pending = true;
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);
if (!stop_pending) {
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
}
if (flags & SCA_MIGRATE_ENABLE)
return 0;
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
if (!pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
}
task_rq_unlock(rq, p, rf);
if (complete)
complete_all(&pending->done);
}
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
WARN_ON_ONCE(my_pending.stop_pending);
return 0;
}
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
struct affinity_context *ctx,
struct rq *rq,
struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool kthread = p->flags & PF_KTHREAD;
unsigned int dest_cpu;
int ret = 0;
update_rq_clock(rq);
if (kthread || is_migration_disabled(p)) {
cpu_valid_mask = cpu_online_mask;
}
if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
ret = -EINVAL;
goto out;
}
if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
if (ctx->flags & SCA_USER)
swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
}
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
!cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
ret = -EBUSY;
goto out;
}
}
dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
__do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
out:
task_rq_unlock(rq, p, rf);
return ret;
}
static int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx)
{
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
if (p->user_cpus_ptr &&
!(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
ctx->new_mask = rq->scratch_mask;
return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
.flags = 0,
};
return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
static int restrict_cpus_allowed_ptr(struct task_struct *p,
struct cpumask *new_mask,
const struct cpumask *subset_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
.flags = 0,
};
struct rq_flags rf;
struct rq *rq;
int err;
rq = task_rq_lock(p, &rf);
if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
err = -EPERM;
goto err_unlock;
}
if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
err = -EINVAL;
goto err_unlock;
}
return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
err_unlock:
task_rq_unlock(rq, p, &rf);
return err;
}
void force_compatible_cpus_allowed_ptr(struct task_struct *p)
{
cpumask_var_t new_mask;
const struct cpumask *override_mask = task_cpu_possible_mask(p);
alloc_cpumask_var(&new_mask, GFP_KERNEL);
cpus_read_lock();
if (!cpumask_available(new_mask))
goto out_set_mask;
if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
goto out_free_mask;
cpuset_cpus_allowed(p, new_mask);
override_mask = new_mask;
out_set_mask:
if (printk_ratelimit()) {
printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
task_pid_nr(p), p->comm,
cpumask_pr_args(override_mask));
}
WARN_ON(set_cpus_allowed_ptr(p, override_mask));
out_free_mask:
cpus_read_unlock();
free_cpumask_var(new_mask);
}
static int
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
struct affinity_context ac = {
.new_mask = task_user_cpus(p),
.flags = 0,
};
int ret;
ret = __sched_setaffinity(p, &ac);
WARN_ON_ONCE(ret);
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
unsigned int state = READ_ONCE(p->__state);
WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
WARN_ON_ONCE(state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));
#ifdef CONFIG_LOCKDEP
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
check_preempt_curr(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
} else {
p->wake_cpu = cpu;
}
}
struct migration_swap_arg {
struct task_struct *src_task, *dst_task;
int src_cpu, dst_cpu;
};
static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
return 0;
}
int migrate_swap(struct task_struct *cur, struct task_struct *p,
int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = curr_cpu,
.dst_task = p,
.dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
goto out;
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
void kick_process(struct task_struct *p)
{
int cpu;
preempt_disable();
cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
if (nid != -1) {
nodemask = cpumask_of_node(nid);
for_each_cpu(dest_cpu, nodemask) {
if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
for (;;) {
for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
goto out;
}
switch (state) {
case cpuset:
if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
fallthrough;
case possible:
do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
case fail:
BUG();
break;
}
}
out:
if (state != cpuset) {
if (p->mm && printk_ratelimit()) {
printk_deferred("process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
}
return dest_cpu;
}
static inline
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
}
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
if (stop) {
sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
stop->sched_class = &stop_sched_class;
lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
if (old_stop) {
old_stop->sched_class = &rt_sched_class;
}
}
#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx)
{
return set_cpus_allowed_ptr(p, ctx->new_mask);
}
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return false;
}
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
return NULL;
}
#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq;
if (!schedstat_enabled())
return;
rq = this_rq();
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
__schedstat_inc(p->stats.nr_wakeups_remote);
guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
}
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
__schedstat_inc(p->stats.nr_wakeups_sync);
}
static inline void ttwu_do_wakeup(struct task_struct *p)
{
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
}
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_rq_held(rq);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
activate_task(rq, p, en_flags);
check_preempt_curr(rq, p, wake_flags);
ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, rf);
}
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
u64 max = 2*rq->max_idle_balance_cost;
update_avg(&rq->avg_idle, delta);
if (rq->avg_idle > max)
rq->avg_idle = max;
rq->wake_stamp = jiffies;
rq->wake_avg_idle = rq->avg_idle / 2;
rq->idle_stamp = 0;
}
#endif
}
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
int ret = 0;
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
if (!task_on_cpu(rq, p)) {
update_rq_clock(rq);
check_preempt_curr(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
return ret;
}
#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
struct rq *rq = this_rq();
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
if (WARN_ON_ONCE(p->on_cpu))
smp_cond_load_acquire(&p->on_cpu, !VAL);
if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
set_task_cpu(p, cpu_of(rq));
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
bool call_function_single_prep_ipi(int cpu)
{
if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
return false;
}
return true;
}
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
__smp_call_single_queue(cpu, &p->wake_entry.llist);
}
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
guard(rcu)();
if (is_idle_task(rcu_dereference(rq->curr))) {
guard(rq_lock_irqsave)(rq);
if (is_idle_task(rq->curr))
resched_curr(rq);
}
}
bool cpus_share_cache(int this_cpu, int that_cpu)
{
if (this_cpu == that_cpu)
return true;
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
if (!cpu_active(cpu))
return false;
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
if (cpu == smp_processor_id())
return false;
if (!cpu_rq(cpu)->nr_running)
return true;
return false;
}
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
sched_clock_cpu(cpu);
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
}
return false;
}
#else /* !CONFIG_SMP */
static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
return false;
}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
rq_lock(rq, &rf);
update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf);
rq_unlock(rq, &rf);
}
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
int match;
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
*success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
if (match < 0)
p->saved_state = TASK_RUNNING;
#endif
return match > 0;
}
int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
guard(preempt)();
int cpu, success = 0;
if (p == current) {
if (!ttwu_state_match(p, state, &success))
goto out;
trace_sched_waking(p);
ttwu_do_wakeup(p);
goto out;
}
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
smp_mb__after_spinlock();
if (!ttwu_state_match(p, state, &success))
break;
trace_sched_waking(p);
smp_rmb();
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
#ifdef CONFIG_SMP
smp_acquire__after_ctrl_dep();
WRITE_ONCE(p->__state, TASK_WAKING);
if (smp_load_acquire(&p->on_cpu) &&
ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
break;
smp_cond_load_acquire(&p->on_cpu, !VAL);
cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
#else
cpu = task_cpu(p);
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
}
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
return success;
}
static bool __task_needs_rq_lock(struct task_struct *p)
{
unsigned int state = READ_ONCE(p->__state);
if (state == TASK_RUNNING || state == TASK_WAKING)
return true;
smp_rmb();
if (p->on_rq)
return true;
#ifdef CONFIG_SMP
smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
#endif
return false;
}
int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
struct rq *rq = NULL;
struct rq_flags rf;
int ret;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (__task_needs_rq_lock(p))
rq = __task_rq_lock(p, &rf);
ret = func(p, arg);
if (rq)
rq_unlock(rq, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
struct task_struct *cpu_curr_snapshot(int cpu)
{
struct task_struct *t;
smp_mb();
t = rcu_dereference(cpu_curr(cpu));
smp_mb();
return t;
}
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
}
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->stats, 0, sizeof(p->stats));
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
#ifdef CONFIG_COMPACTION
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
int sysctl_numa_balancing_mode;
static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
else
static_branch_disable(&sched_numa_balancing);
}
void set_numabalancing_state(bool enabled)
{
if (enabled)
sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
else
sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
__set_numabalancing_state(enabled);
}
#ifdef CONFIG_PROC_SYSCTL
static void reset_memory_tiering(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
pgdat->nbp_threshold = 0;
pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
}
}
static int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
t = *table;
t.data = &state;
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
if (write) {
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
(state & NUMA_BALANCING_MEMORY_TIERING))
reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
return err;
}
#endif
#endif
#ifdef CONFIG_SCHEDSTATS
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
static void set_schedstats(bool enabled)
{
if (enabled)
static_branch_enable(&sched_schedstats);
else
static_branch_disable(&sched_schedstats);
}
void force_schedstat_enabled(void)
{
if (!schedstat_enabled()) {
pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
static_branch_enable(&sched_schedstats);
}
}
static int __init setup_schedstats(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
set_schedstats(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
set_schedstats(false);
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse schedstats=\n");
return ret;
}
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
int state = static_branch_likely(&sched_schedstats);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
t = *table;
t.data = &state;
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
if (write)
set_schedstats(state);
return err;
}
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_core_sysctls[] = {
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_UCLAMP_TASK
{
.procname = "sched_util_clamp_min",
.data = &sysctl_sched_uclamp_util_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_max",
.data = &sysctl_sched_uclamp_util_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_min_rt_default",
.data = &sysctl_sched_uclamp_util_min_rt_default,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif /* CONFIG_UCLAMP_TASK */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
{}
};
static int __init sched_core_sysctl_init(void)
{
register_sysctl_init("kernel", sched_core_sysctls);
return 0;
}
late_initcall(sched_core_sysctl_init);
#endif /* CONFIG_SYSCTL */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
__sched_fork(clone_flags, p);
p->__state = TASK_NEW;
p->prio = current->normal_prio;
uclamp_fork(p);
if (unlikely(p->sched_reset_on_fork)) {
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
p->sched_reset_on_fork = 0;
}
if (dl_prio(p->prio))
return -EAGAIN;
else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
return 0;
}
void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
if (1) {
struct task_group *tg;
tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
struct task_group, css);
tg = autogroup_task_group(p, tg);
p->sched_task_group = tg;
}
#endif
rseq_migrate(p);
__set_task_cpu(p, smp_processor_id());
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
void sched_post_fork(struct task_struct *p)
{
uclamp_post_fork(p);
}
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
return BW_UNIT;
if (period == 0)
return 0;
return div64_u64(runtime << BW_SHIFT, period);
}
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
void preempt_notifier_inc(void)
{
static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);
void preempt_notifier_dec(void)
{
static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);
void preempt_notifier_register(struct preempt_notifier *notifier)
{
if (!static_branch_unlikely(&preempt_notifier_key))
WARN(1, "registering preempt_notifier while notifiers disabled\n");
hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
hlist_del(¬ifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_in_preempt_notifiers(curr);
}
static void
__fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_out_preempt_notifiers(curr, next);
}
#else /* !CONFIG_PREEMPT_NOTIFIERS */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
}
#endif /* CONFIG_PREEMPT_NOTIFIERS */
static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
WRITE_ONCE(next->on_cpu, 1);
#endif
}
static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
smp_store_release(&prev->on_cpu, 0);
#endif
}
#ifdef CONFIG_SMP
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
struct balance_callback *next;
lockdep_assert_rq_held(rq);
while (head) {
func = (void (*)(struct rq *))head->func;
next = head->next;
head->next = NULL;
head = next;
func(rq);
}
}
static void balance_push(struct rq *rq);
struct balance_callback balance_push_callback = {
.next = NULL,
.func = balance_push,
};
static inline struct balance_callback *
__splice_balance_callbacks(struct rq *rq, bool split)
{
struct balance_callback *head = rq->balance_callback;
if (likely(!head))
return NULL;
lockdep_assert_rq_held(rq);
if (split && head == &balance_push_callback)
head = NULL;
else
rq->balance_callback = NULL;
return head;
}
static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return __splice_balance_callbacks(rq, true);
}
static void __balance_callbacks(struct rq *rq)
{
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
unsigned long flags;
if (unlikely(head)) {
raw_spin_rq_lock_irqsave(rq, flags);
do_balance_callbacks(rq, head);
raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
#else
static inline void __balance_callbacks(struct rq *rq)
{
}
static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return NULL;
}
static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
}
#endif
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
rq_unpin_lock(rq, rf);
spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
rq_lockp(rq)->owner = next;
#endif
}
static inline void finish_lock_switch(struct rq *rq)
{
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
static inline void kmap_local_sched_out(void)
{
#ifdef CONFIG_KMAP_LOCAL
if (unlikely(current->kmap_ctrl.idx))
__kmap_local_sched_out();
#endif
}
static inline void kmap_local_sched_in(void)
{
#ifdef CONFIG_KMAP_LOCAL
if (unlikely(current->kmap_ctrl.idx))
__kmap_local_sched_in();
#endif
}
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
}
static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
unsigned int prev_state;
if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
"corrupted preempt_count: %s/%d/0x%x\n",
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
rq->prev_mm = NULL;
prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
mmdrop_lazy_tlb_sched(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
put_task_stack(prev);
put_task_struct_rcu_user(prev);
}
return rq;
}
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
finish_task_switch(prev);
preempt_enable();
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
calculate_sigpending();
}
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
prepare_task_switch(rq, prev, next);
arch_start_context_switch(prev);
if (!next->mm) {
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm)
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else {
membarrier_switch_mm(rq, prev->active_mm, next->mm);
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
if (!prev->mm) {
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
switch_mm_cid(rq, prev, next);
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
switch_to(prev, next, prev);
barrier();
return finish_task_switch(prev);
}
unsigned int nr_running(void)
{
unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
return sum;
}
bool single_task_running(void)
{
return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);
unsigned long long nr_context_switches_cpu(int cpu)
{
return cpu_rq(cpu)->nr_switches;
}
unsigned long long nr_context_switches(void)
{
int i;
unsigned long long sum = 0;
for_each_possible_cpu(i)
sum += cpu_rq(i)->nr_switches;
return sum;
}
unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
unsigned int nr_iowait(void)
{
unsigned int i, sum = 0;
for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
return sum;
}
#ifdef CONFIG_SMP
void sched_exec(void)
{
struct task_struct *p = current;
struct migration_arg arg;
int dest_cpu;
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
if (dest_cpu == smp_processor_id())
return;
if (unlikely(!cpu_active(dest_cpu)))
return;
arg = (struct migration_arg){ p, dest_cpu };
}
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *curr = (&p->se)->cfs_rq->curr;
#else
struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
#endif
prefetch(curr);
prefetch(&curr->exec_start);
}
unsigned long long task_sched_runtime(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
if (!p->on_cpu || !task_on_rq_queued(p))
return p->se.sum_exec_runtime;
#endif
rq = task_rq_lock(p, &rf);
if (task_current(rq, p) && task_on_rq_queued(p)) {
prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &rf);
return ns;
}
#ifdef CONFIG_SCHED_DEBUG
static u64 cpu_resched_latency(struct rq *rq)
{
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
u64 resched_latency, now = rq_clock(rq);
static bool warned_once;
if (sysctl_resched_latency_warn_once && warned_once)
return 0;
if (!need_resched() || !latency_warn_ms)
return 0;
if (system_state == SYSTEM_BOOTING)
return 0;
if (!rq->last_seen_need_resched_ns) {
rq->last_seen_need_resched_ns = now;
rq->ticks_without_resched = 0;
return 0;
}
rq->ticks_without_resched++;
resched_latency = now - rq->last_seen_need_resched_ns;
if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
return 0;
warned_once = true;
return resched_latency;
}
static int __init setup_resched_latency_warn_ms(char *str)
{
long val;
if ((kstrtol(str, 0, &val))) {
pr_warn("Unable to set resched_latency_warn_ms\n");
return 1;
}
sysctl_resched_latency_warn_ms = val;
return 1;
}
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
#else
static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
#endif /* CONFIG_SCHED_DEBUG */
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
u64 resched_latency;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
arch_scale_freq_tick();
sched_clock_tick();
rq_lock(rq, &rf);
update_rq_clock(rq);
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
task_tick_mm_cid(rq, curr);
rq_unlock(rq, &rf);
if (sched_feat(LATENCY_WARN) && resched_latency)
resched_latency_warn(cpu, resched_latency);
perf_event_task_tick();
if (curr->flags & PF_WQ_WORKER)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
}
#ifdef CONFIG_NO_HZ_FULL
struct tick_work {
int cpu;
atomic_t state;
struct delayed_work work;
};
#define TICK_SCHED_REMOTE_OFFLINE 0
#define TICK_SCHED_REMOTE_OFFLINING 1
#define TICK_SCHED_REMOTE_RUNNING 2
static struct tick_work __percpu *tick_work_cpu;
static void sched_tick_remote(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
int os;
if (tick_nohz_tick_stopped_cpu(cpu)) {
guard(rq_lock_irq)(rq);
struct task_struct *curr = rq->curr;
if (cpu_online(cpu)) {
update_rq_clock(rq);
if (!is_idle_task(curr)) {
u64 delta = rq_clock_task(rq) - curr->se.exec_start;
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
}
curr->sched_class->task_tick(rq, curr, 0);
calc_load_nohz_remote(rq);
}
}
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
if (os == TICK_SCHED_REMOTE_RUNNING)
queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
if (os == TICK_SCHED_REMOTE_OFFLINE) {
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
queue_delayed_work(system_unbound_wq, &twork->work, HZ);
}
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
int os;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
}
#endif /* CONFIG_HOTPLUG_CPU */
int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
return 0;
}
#else /* !CONFIG_NO_HZ_FULL */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
static inline void preempt_latency_start(int val)
{
if (preempt_count() == val) {
unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
trace_preempt_off(CALLER_ADDR0, ip);
}
}
void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
static inline void preempt_latency_stop(int val)
{
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
}
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
return;
if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
!(preempt_count() & PREEMPT_MASK)))
return;
#endif
preempt_latency_stop(val);
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
#else
static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
#endif
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
return p->preempt_disable_ip;
#else
return 0;
#endif
}
static noinline void __schedule_bug(struct task_struct *prev)
{
unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
if (oops_in_progress)
return;
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
prev->comm, prev->pid, preempt_count());
debug_show_held_locks(prev);
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
if (task_scs_end_corrupted(prev))
panic("corrupted shadow stack detected inside scheduler\n");
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
prev->comm, prev->pid, prev->non_block_count);
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
#endif
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
SCHED_WARN_ON(ct_state() == CONTEXT_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq()->sched_count);
}
static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
{
#ifdef CONFIG_SMP
const struct sched_class *class;
for_class_range(class, prev->sched_class, &idle_sched_class) {
if (class->balance(rq, prev, rf))
break;
}
#endif
put_prev_task(rq, prev);
}
static inline struct task_struct *
__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
}
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
}
BUG();
}
#ifdef CONFIG_SCHED_CORE
static inline bool is_task_rq_idle(struct task_struct *t)
{
return (task_rq(t)->idle == t);
}
static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
{
return is_task_rq_idle(a) || (a->core_cookie == cookie);
}
static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
{
if (is_task_rq_idle(a) || is_task_rq_idle(b))
return true;
return a->core_cookie == b->core_cookie;
}
static inline struct task_struct *pick_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
for_each_class(class) {
p = class->pick_task(rq);
if (p)
return p;
}
BUG();
}
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
static void queue_core_balance(struct rq *rq);
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
bool need_sync;
if (!sched_core_enabled(rq))
return __pick_next_task(rq, prev, rf);
cpu = cpu_of(rq);
if (cpu_is_offline(cpu)) {
rq->core_pick = NULL;
return __pick_next_task(rq, prev, rf);
}
if (rq->core->core_pick_seq == rq->core->core_task_seq &&
rq->core->core_pick_seq != rq->core_sched_seq &&
rq->core_pick) {
WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
next = rq->core_pick;
if (next != prev) {
put_prev_task(rq, prev);
set_next_task(rq, next);
}
rq->core_pick = NULL;
goto out;
}
put_prev_task_balance(rq, prev, rf);
smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle_count) {
if (!core_clock_updated) {
update_rq_clock(rq->core);
core_clock_updated = true;
}
sched_core_account_forceidle(rq);
rq->core->core_forceidle_start = 0;
rq->core->core_forceidle_count = 0;
rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
}
rq->core->core_task_seq++;
if (!need_sync) {
next = pick_task(rq);
if (!next->core_cookie) {
rq->core_pick = NULL;
WARN_ON_ONCE(fi_before);
task_vruntime_update(rq, next, false);
goto out_set_next;
}
}
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
p = rq_i->core_pick = pick_task(rq_i);
if (!max || prio_less(max, p, fi_before))
max = p;
}
cookie = rq->core->core_cookie = max->core_cookie;
for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
p = rq_i->core_pick;
if (!cookie_equals(p, cookie)) {
p = NULL;
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
p = idle_sched_class.pick_task(rq_i);
}
rq_i->core_pick = p;
if (p == rq_i->idle) {
if (rq_i->nr_running) {
rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
} else {
occ++;
}
}
if (schedstat_enabled() && rq->core->core_forceidle_count) {
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
WARN_ON_ONCE(!next);
for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
if (!rq_i->core_pick)
continue;
if (!(fi_before && rq->core->core_forceidle_count))
task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
if (i == cpu) {
rq_i->core_pick = NULL;
continue;
}
WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
if (rq_i->curr == rq_i->core_pick) {
rq_i->core_pick = NULL;
continue;
}
resched_curr(rq_i);
}
out_set_next:
set_next_task(rq, next);
out:
if (rq->core->core_forceidle_count && next == rq->idle)
queue_core_balance(rq);
return next;
}
static bool try_steal_cookie(int this, int that)
{
struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
struct task_struct *p;
unsigned long cookie;
bool success = false;
guard(irq)();
guard(double_rq_lock)(dst, src);
cookie = dst->core->core_cookie;
if (!cookie)
return false;
if (dst->curr != dst->idle)
return false;
p = sched_core_find(src, cookie);
if (!p)
return false;
do {
if (p == src->core_pick || p == src->curr)
goto next;
if (!is_cpu_allowed(p, this))
goto next;
if (p->core_occupation > dst->idle->core_occupation)
goto next;
if (sched_task_is_throttled(p, this))
goto next;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
activate_task(dst, p, 0);
resched_curr(dst);
success = true;
break;
next:
p = sched_core_next(p, cookie);
} while (p);
return success;
}
static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
if (need_resched())
break;
if (try_steal_cookie(cpu, i))
return true;
}
return false;
}
static void sched_core_balance(struct rq *rq)
{
struct sched_domain *sd;
int cpu = cpu_of(rq);
preempt_disable();
rcu_read_lock();
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
break;
if (steal_cookie_task(cpu, sd))
break;
}
raw_spin_rq_lock_irq(rq);
rcu_read_unlock();
preempt_enable();
}
static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
static void queue_core_balance(struct rq *rq)
{
if (!sched_core_enabled(rq))
return;
if (!rq->core->core_cookie)
return;
if (!rq->nr_running)
return;
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
DEFINE_LOCK_GUARD_1(core_lock, int,
sched_core_lock(*_T->lock, &_T->flags),
sched_core_unlock(*_T->lock, &_T->flags),
unsigned long flags)
static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
int t;
guard(core_lock)(&cpu);
WARN_ON_ONCE(rq->core != rq);
if (cpumask_weight(smt_mask) == 1)
return;
for_each_cpu(t, smt_mask) {
if (t == cpu)
continue;
rq = cpu_rq(t);
if (rq->core == rq) {
core_rq = rq;
break;
}
}
if (WARN_ON_ONCE(!core_rq))
return;
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);
if (t == cpu)
rq->core = core_rq;
WARN_ON_ONCE(rq->core != core_rq);
}
}
static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
int t;
guard(core_lock)(&cpu);
if (cpumask_weight(smt_mask) == 1) {
WARN_ON_ONCE(rq->core != rq);
return;
}
if (rq->core != rq)
return;
for_each_cpu(t, smt_mask) {
if (t == cpu)
continue;
core_rq = cpu_rq(t);
break;
}
if (WARN_ON_ONCE(!core_rq))
return;
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle_count = rq->core_forceidle_count;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
core_rq->core_forceidle_start = 0;
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);
rq->core = core_rq;
}
}
static inline void sched_core_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (rq->core != rq)
rq->core = rq;
}
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return __pick_next_task(rq, prev, rf);
}
#endif /* CONFIG_SCHED_CORE */
#define SM_NONE 0x0
#define SM_PREEMPT 0x1
#define SM_RTLOCK_WAIT 0x2
#ifndef CONFIG_PREEMPT_RT
# define SM_MASK_PREEMPT (~0U)
#else
# define SM_MASK_PREEMPT SM_PREEMPT
#endif
static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
schedule_debug(prev, !!sched_mode);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
rcu_note_context_switch(!!sched_mode);
rq_lock(rq, &rf);
smp_mb__after_spinlock();
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
switch_count = &prev->nivcsw;
prev_state = READ_ONCE(prev->__state);
if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
#endif
if (likely(prev != next)) {
rq->nr_switches++;
RCU_INIT_POINTER(rq->curr, next);
++*switch_count;
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
}
void __noreturn do_task_dead(void)
{
set_special_state(TASK_DEAD);
current->flags |= PF_NOFREEZE;
__schedule(SM_NONE);
BUG();
for (;;)
cpu_relax();
}
static inline void sched_submit_work(struct task_struct *tsk)
{
unsigned int task_flags;
if (task_is_running(tsk))
return;
task_flags = tsk->flags;
if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
else
io_wq_worker_sleeping(tsk);
}
SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
{
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
else
io_wq_worker_running(tsk);
}
}
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
do {
preempt_disable();
__schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
void __sched schedule_idle(void)
{
WARN_ON_ONCE(current->__state);
do {
__schedule(SM_NONE);
} while (need_resched());
}
#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
enum ctx_state prev_state = exception_enter();
schedule();
exception_exit(prev_state);
}
#endif
void __sched schedule_preempt_disabled(void)
{
sched_preempt_enable_no_resched();
schedule();
preempt_disable();
}
#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
do {
preempt_disable();
__schedule(SM_RTLOCK_WAIT);
sched_preempt_enable_no_resched();
} while (need_resched());
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
static void __sched notrace preempt_schedule_common(void)
{
do {
preempt_disable_notrace();
preempt_latency_start(1);
__schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
}
#ifdef CONFIG_PREEMPTION
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
if (likely(!preemptible()))
return;
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#ifndef preempt_schedule_dynamic_enabled
#define preempt_schedule_dynamic_enabled preempt_schedule
#define preempt_schedule_dynamic_disabled NULL
#endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
return;
preempt_schedule();
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
#endif
#endif
asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
if (likely(!preemptible()))
return;
do {
preempt_disable_notrace();
preempt_latency_start(1);
prev_ctx = exception_enter();
__schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#ifndef preempt_schedule_notrace_dynamic_enabled
#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
#define preempt_schedule_notrace_dynamic_disabled NULL
#endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
return;
preempt_schedule_notrace();
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
#endif
#endif
#endif /* CONFIG_PREEMPTION */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
BUG_ON(preempt_count() || !irqs_disabled());
prev_state = exception_enter();
do {
preempt_disable();
local_irq_enable();
__schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
}
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
static void __setscheduler_prio(struct task_struct *p, int prio)
{
if (dl_prio(prio))
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
p->prio = prio;
}
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
if (pi_task)
prio = min(prio, pi_task->prio);
return prio;
}
static inline int rt_effective_prio(struct task_struct *p, int prio)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
return __rt_effective_prio(pi_task, prio);
}
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
prio = __rt_effective_prio(pi_task, p->normal_prio);
if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
return;
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
p->pi_top_task = pi_task;
if (prio == p->prio && !dl_prio(prio))
goto out_unlock;
if (unlikely(p == rq->idle)) {
WARN_ON(p != rq->curr);
WARN_ON(p->pi_blocked_on);
goto out_unlock;
}
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
if (oldprio == prio)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
} else {
p->dl.pi_se = &p->dl;
}
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
}
__setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
preempt_disable();
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock(rq);
preempt_enable();
}
#else
static inline int rt_effective_prio(struct task_struct *p, int prio)
{
return prio;
}
#endif
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
int old_prio;
struct rq_flags rf;
struct rq *rq;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
p->sched_class->prio_changed(rq, p, old_prio);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
}
int can_nice(const struct task_struct *p, const int nice)
{
return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
SYSCALL_DEFINE1(nice, int, increment)
{
long nice, retval;
increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
retval = security_task_setnice(current, nice);
if (retval)
return retval;
set_user_nice(current, nice);
return 0;
}
#endif
int task_prio(const struct task_struct *p)
{
return p->prio - MAX_RT_PRIO;
}
int idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (rq->curr != rq->idle)
return 0;
if (rq->nr_running)
return 0;
#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
#endif
return 1;
}
int available_idle_cpu(int cpu)
{
if (!idle_cpu(cpu))
return 0;
if (vcpu_is_preempted(cpu))
return 0;
return 1;
}
struct task_struct *idle_task(int cpu)
{
return cpu_rq(cpu)->idle;
}
#ifdef CONFIG_SCHED_CORE
int sched_core_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (sched_core_enabled(rq) && rq->curr == rq->idle)
return 1;
return idle_cpu(cpu);
}
#endif
#ifdef CONFIG_SMP
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
enum cpu_util_type type,
struct task_struct *p)
{
unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
max = arch_scale_cpu_capacity(cpu);
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return max;
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
if (util + dl_util >= max)
return max;
if (type == ENERGY_UTIL)
util += dl_util;
util = scale_irq_capacity(util, irq, max);
util += irq;
if (type == FREQUENCY_UTIL)
util += cpu_bw_dl(rq);
return min(max, util);
}
unsigned long sched_cpu_util(int cpu)
{
return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
static struct task_struct *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_vpid(pid) : current;
}
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
const struct sched_attr *attr)
{
int policy = attr->sched_policy;
if (policy == SETPARAM_POLICY)
policy = p->policy;
p->policy = policy;
if (dl_policy(policy))
__setparam_dl(p, attr);
else if (fair_policy(policy))
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
set_load_weight(p, true);
}
static bool check_same_owner(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred;
bool match;
rcu_read_lock();
pcred = __task_cred(p);
match = (uid_eq(cred->euid, pcred->euid) ||
uid_eq(cred->euid, pcred->uid));
rcu_read_unlock();
return match;
}
static int user_check_sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
int policy, int reset_on_fork)
{
if (fair_policy(policy)) {
if (attr->sched_nice < task_nice(p) &&
!is_nice_reduction(p, attr->sched_nice))
goto req_priv;
}
if (rt_policy(policy)) {
unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
if (policy != p->policy && !rlim_rtprio)
goto req_priv;
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
goto req_priv;
}
if (dl_policy(policy))
goto req_priv;
if (task_has_idle_policy(p) && !idle_policy(policy)) {
if (!is_nice_reduction(p, task_nice(p)))
goto req_priv;
}
if (!check_same_owner(p))
goto req_priv;
if (p->sched_reset_on_fork && !reset_on_fork)
goto req_priv;
return 0;
req_priv:
if (!capable(CAP_SYS_NICE))
return -EPERM;
return 0;
}
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
bool cpuset_locked = false;
BUG_ON(pi && in_interrupt());
recheck:
if (policy < 0) {
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
} else {
reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
if (!valid_policy(policy))
return -EINVAL;
}
if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
return -EINVAL;
if (attr->sched_priority > MAX_RT_PRIO-1)
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
if (user) {
retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
if (retval)
return retval;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
retval = security_task_setscheduler(p);
if (retval)
return retval;
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
if (retval)
return retval;
}
if (dl_policy(policy) || dl_policy(p->policy)) {
cpuset_locked = true;
cpuset_lock();
}
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
if (p == rq->stop) {
retval = -EINVAL;
goto unlock;
}
if (unlikely(policy == p->policy)) {
if (fair_policy(policy) && attr->sched_nice != task_nice(p))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
goto unlock;
}
change:
if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
retval = -EPERM;
goto unlock;
}
#endif
#ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
retval = -EPERM;
goto unlock;
}
}
#endif
}
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
if (cpuset_locked)
cpuset_unlock();
goto recheck;
}
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
retval = -EBUSY;
goto unlock;
}
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
newprio = rt_effective_prio(p, newprio);
if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
prev_class = p->sched_class;
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
__setscheduler_uclamp(p, attr);
if (queued) {
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;
enqueue_task(rq, p, queue_flags);
}
if (running)
set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
preempt_disable();
head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);
if (pi) {
if (cpuset_locked)
cpuset_unlock();
rt_mutex_adjust_pi(p);
}
balance_callbacks(rq, head);
preempt_enable();
return 0;
unlock:
task_rq_unlock(rq, p, &rf);
if (cpuset_locked)
cpuset_unlock();
return retval;
}
static int _sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param, bool check)
{
struct sched_attr attr = {
.sched_policy = policy,
.sched_priority = param->sched_priority,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
policy &= ~SCHED_RESET_ON_FORK;
attr.sched_policy = policy;
}
return __sched_setscheduler(p, &attr, check, true);
}
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
return _sched_setscheduler(p, policy, param, true);
}
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
return _sched_setscheduler(p, policy, param, false);
}
void sched_set_fifo(struct task_struct *p)
{
struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);
void sched_set_fifo_low(struct task_struct *p)
{
struct sched_param sp = { .sched_priority = 1 };
WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);
void sched_set_normal(struct task_struct *p, int nice)
{
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
.sched_nice = nice,
};
WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
struct task_struct *p;
int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
if (likely(p))
get_task_struct(p);
rcu_read_unlock();
if (likely(p)) {
retval = sched_setscheduler(p, policy, &lparam);
put_task_struct(p);
}
return retval;
}
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
u32 size;
int ret;
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
if (!size)
size = SCHED_ATTR_SIZE_VER0;
if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
if (ret) {
if (ret == -E2BIG)
goto err_size;
return ret;
}
if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
return -E2BIG;
}
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
if (task_has_dl_policy(p))
__getparam_dl(p, attr);
else if (task_has_rt_policy(p))
attr->sched_priority = p->rt_priority;
else
attr->sched_nice = task_nice(p);
}
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
if (policy < 0)
return -EINVAL;
return do_sched_setscheduler(pid, policy, param);
}
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
{
struct sched_attr attr;
struct task_struct *p;
int retval;
if (!uattr || pid < 0 || flags)
return -EINVAL;
retval = sched_copy_attr(uattr, &attr);
if (retval)
return retval;
if ((int)attr.sched_policy < 0)
return -EINVAL;
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
if (likely(p))
get_task_struct(p);
rcu_read_unlock();
if (likely(p)) {
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
get_params(p, &attr);
retval = sched_setattr(p, &attr);
put_task_struct(p);
}
return retval;
}
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
struct task_struct *p;
int retval;
if (pid < 0)
return -EINVAL;
retval = -ESRCH;
rcu_read_lock();
p = find_process_by_pid(pid);
if (p) {
retval = security_task_getscheduler(p);
if (!retval)
retval = p->policy
| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
}
rcu_read_unlock();
return retval;
}
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
if (!param || pid < 0)
return -EINVAL;
rcu_read_lock();
p = find_process_by_pid(pid);
retval = -ESRCH;
if (!p)
goto out_unlock;
retval = security_task_getscheduler(p);
if (retval)
goto out_unlock;
if (task_has_rt_policy(p))
lp.sched_priority = p->rt_priority;
rcu_read_unlock();
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
return retval;
out_unlock:
rcu_read_unlock();
return retval;
}
static int
sched_attr_copy_to_user(struct sched_attr __user *uattr,
struct sched_attr *kattr,
unsigned int usize)
{
unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
kattr->size = min(usize, ksize);
if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, usize, unsigned int, flags)
{
struct sched_attr kattr = { };
struct task_struct *p;
int retval;
if (!uattr || pid < 0 || usize > PAGE_SIZE ||
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
p = find_process_by_pid(pid);
retval = -ESRCH;
if (!p)
goto out_unlock;
retval = security_task_getscheduler(p);
if (retval)
goto out_unlock;
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
return retval;
}
#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
int ret = 0;
if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
return 0;
rcu_read_lock();
if (!cpumask_subset(task_rq(p)->rd->span, mask))
ret = -EBUSY;
rcu_read_unlock();
return ret;
}
#endif
static int
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
int retval;
cpumask_var_t cpus_allowed, new_mask;
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
return -ENOMEM;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_free_cpus_allowed;
}
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
ctx->new_mask = new_mask;
ctx->flags |= SCA_CHECK;
retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
retval = __set_cpus_allowed_ptr(p, ctx);
if (retval)
goto out_free_new_mask;
cpuset_cpus_allowed(p, cpus_allowed);
if (!cpumask_subset(new_mask, cpus_allowed)) {
cpumask_copy(new_mask, cpus_allowed);
if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
bool empty = !cpumask_and(new_mask, new_mask,
ctx->user_mask);
if (WARN_ON_ONCE(empty))
cpumask_copy(new_mask, cpus_allowed);
}
__set_cpus_allowed_ptr(p, ctx);
retval = -EINVAL;
}
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
return retval;
}
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
struct affinity_context ac;
struct cpumask *user_mask;
struct task_struct *p;
int retval;
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
return -ESRCH;
}
get_task_struct(p);
rcu_read_unlock();
if (p->flags & PF_NO_SETAFFINITY) {
retval = -EINVAL;
goto out_put_task;
}
if (!check_same_owner(p)) {
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
retval = -EPERM;
goto out_put_task;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
goto out_put_task;
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
if (user_mask) {
cpumask_copy(user_mask, in_mask);
} else if (IS_ENABLED(CONFIG_SMP)) {
retval = -ENOMEM;
goto out_put_task;
}
ac = (struct affinity_context){
.new_mask = in_mask,
.user_mask = user_mask,
.flags = SCA_USER,
};
retval = __sched_setaffinity(p, &ac);
kfree(ac.user_mask);
out_put_task:
put_task_struct(p);
return retval;
}
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
struct cpumask *new_mask)
{
if (len < cpumask_size())
cpumask_clear(new_mask);
else if (len > cpumask_size())
len = cpumask_size();
return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
if (retval == 0)
retval = sched_setaffinity(pid, new_mask);
free_cpumask_var(new_mask);
return retval;
}
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
unsigned long flags;
int retval;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
if (!p)
goto out_unlock;
retval = security_task_getscheduler(p);
if (retval)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
return retval;
}
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
if ((len * BITS_PER_BYTE) < nr_cpu_ids)
return -EINVAL;
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
unsigned int retlen = min(len, cpumask_size());
if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
}
free_cpumask_var(mask);
return ret;
}
static void do_sched_yield(void)
{
struct rq_flags rf;
struct rq *rq;
rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
preempt_disable();
rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
}
SYSCALL_DEFINE0(sched_yield)
{
do_sched_yield();
return 0;
}
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
return 0;
}
EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define cond_resched_dynamic_enabled __cond_resched
#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
#define might_resched_dynamic_enabled __cond_resched
#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_cond_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
int __sched dynamic_might_resched(void)
{
if (!static_branch_unlikely(&sk_dynamic_might_resched))
return 0;
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
#endif
#endif
int __cond_resched_lock(spinlock_t *lock)
{
int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held(lock);
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
}
return ret;
}
EXPORT_SYMBOL(__cond_resched_lock);
int __cond_resched_rwlock_read(rwlock_t *lock)
{
int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held_read(lock);
if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
}
return ret;
}
EXPORT_SYMBOL(__cond_resched_rwlock_read);
int __cond_resched_rwlock_write(rwlock_t *lock)
{
int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held_write(lock);
if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
}
return ret;
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
#ifdef CONFIG_PREEMPT_DYNAMIC
#ifdef CONFIG_GENERIC_ENTRY
#include <linux/entry-common.h>
#endif
enum {
preempt_dynamic_undefined = -1,
preempt_dynamic_none,
preempt_dynamic_voluntary,
preempt_dynamic_full,
};
int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
if (!strcmp(str, "full"))
return preempt_dynamic_full;
return -EINVAL;
}
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
#else
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
static DEFINE_MUTEX(sched_dynamic_mutex);
static bool klp_override;
static void __sched_dynamic_update(int mode)
{
if (!klp_override)
preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
switch (mode) {
case preempt_dynamic_none:
if (!klp_override)
preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
if (!klp_override)
preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
if (!klp_override)
preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: full\n");
break;
}
preempt_dynamic_mode = mode;
}
void sched_dynamic_update(int mode)
{
mutex_lock(&sched_dynamic_mutex);
__sched_dynamic_update(mode);
mutex_unlock(&sched_dynamic_mutex);
}
#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
static int klp_cond_resched(void)
{
__klp_sched_try_switch();
return __cond_resched();
}
void sched_dynamic_klp_enable(void)
{
mutex_lock(&sched_dynamic_mutex);
klp_override = true;
static_call_update(cond_resched, klp_cond_resched);
mutex_unlock(&sched_dynamic_mutex);
}
void sched_dynamic_klp_disable(void)
{
mutex_lock(&sched_dynamic_mutex);
klp_override = false;
__sched_dynamic_update(preempt_dynamic_mode);
mutex_unlock(&sched_dynamic_mutex);
}
#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
if (mode < 0) {
pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
return 0;
}
sched_dynamic_update(mode);
return 1;
}
__setup("preempt=", setup_preempt_mode);
static void __init preempt_dynamic_init(void)
{
if (preempt_dynamic_mode == preempt_dynamic_undefined) {
if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
sched_dynamic_update(preempt_dynamic_none);
} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
} else {
WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
preempt_dynamic_mode = preempt_dynamic_full;
pr_info("Dynamic Preempt: full\n");
}
}
}
#define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
return preempt_dynamic_mode == preempt_dynamic_##mode; \
} \
EXPORT_SYMBOL_GPL(preempt_model_##mode)
PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
#else /* !CONFIG_PREEMPT_DYNAMIC */
static inline void preempt_dynamic_init(void) { }
#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */