Commit 46e0d28b authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - NUMA balancing improvements (Mel Gorman)

   - Further load tracking improvements (Patrick Bellasi)

   - Various NOHZ balancing cleanups and optimizations (Peter Zijlstra)

   - Improve blocked load handling; in particular, we can now reduce and
     eventually stop periodic load updates on 'very idle' CPUs (Vincent
     Guittot)

   - On isolated CPUs offload the final 1Hz scheduler tick as well, plus
     related cleanups and reorganization. (Frederic Weisbecker)

   - Core scheduler code cleanups (Ingo Molnar)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched/core: Update preempt_notifier_key to modern API
  sched/cpufreq: Rate limits for SCHED_DEADLINE
  sched/fair: Update util_est only on util_avg updates
  sched/cpufreq/schedutil: Use util_est for OPP selection
  sched/fair: Use util_est in LB and WU paths
  sched/fair: Add util_est on top of PELT
  sched/core: Remove TASK_ALL
  sched/completions: Use bool in try_wait_for_completion()
  sched/fair: Update blocked load when newly idle
  sched/fair: Move idle_balance()
  sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks
  sched/fair: Move rebalance_domains()
  sched/nohz: Optimize nohz_idle_balance()
  sched/fair: Reduce the periodic update duration
  sched/nohz: Stop NOHZ stats when decayed
  sched/cpufreq: Provide migration hint
  sched/nohz: Clean up nohz enter/exit
  sched/fair: Update blocked load from NEWIDLE
  sched/fair: Add NOHZ stats balancing
  sched/fair: Restructure nohz_balance_kick()
  ...
parents 86bbbeba b7203428
......@@ -1766,6 +1766,17 @@
nohz
Disable the tick when a single task runs.
A residual 1Hz tick is offloaded to workqueues, which you
need to affine to the housekeeping CPUs, either through the
global workqueue's affinity (configured via the
/sys/devices/virtual/workqueue/cpumask sysfs file) or by
using the 'domain' flag described below.
NOTE: by default the global workqueue runs on all CPUs,
so to protect individual CPUs the 'cpumask' file has to
be configured manually after bootup.
domain
Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way
......
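For example (hypothetical 4-CPU machine, with CPU 0 kept as the
housekeeping CPU): booting with isolcpus=nohz,domain,1-3 and then
writing the hex mask '1' to /sys/devices/virtual/workqueue/cpumask
confines unbound workqueues, including the offloaded 1Hz tick work,
to CPU 0.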
......@@ -93,7 +93,6 @@ struct task_group;
/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
/* get_task_state(): */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
......@@ -275,6 +274,34 @@ struct load_weight {
u32 inv_weight;
};
/**
* struct util_est - Estimated utilization of FAIR tasks
* @enqueued: instantaneous estimated utilization of a task/cpu
* @ewma: the Exponential Weighted Moving Average (EWMA)
* utilization of a task
*
* Support data structure to track an Exponential Weighted Moving Average
* (EWMA) of a FAIR task's utilization. New samples are added to the moving
* average each time a task completes an activation. The sample weight is
* chosen so that the EWMA is relatively insensitive to transient changes in
* the task's workload.
*
* The enqueued attribute has a slightly different meaning for tasks and CPUs:
* - task: the task's util_avg at last task dequeue time
* - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
* Thus, the util_est.enqueued of a task represents the contribution to the
* estimated utilization of the CPU where that task is currently enqueued.
*
* Only for tasks do we track a moving average of the past instantaneous
* estimated utilization. This allows sporadic drops in the utilization of an
* otherwise almost periodic task to be absorbed.
*/
struct util_est {
unsigned int enqueued;
unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
};
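To make the meaning of UTIL_EST_WEIGHT_SHIFT concrete, here is a minimal,
self-contained userspace sketch of the EWMA update rule described above.
The structure, helper name and demo loop are made up for illustration; the
in-tree update lives in kernel/sched/fair.c and is not part of this hunk.

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT 2

struct util_est_demo {
	unsigned int enqueued;
	unsigned int ewma;
};

/*
 * EWMA update with a 1/2^UTIL_EST_WEIGHT_SHIFT sample weight:
 * for a shift of 2, ewma = 3/4 * ewma + 1/4 * sample.
 */
static void util_est_demo_update(struct util_est_demo *ue, unsigned int sample)
{
	ue->enqueued = sample;
	ue->ewma = (((1U << UTIL_EST_WEIGHT_SHIFT) - 1) * ue->ewma + sample)
			>> UTIL_EST_WEIGHT_SHIFT;
}

int main(void)
{
	struct util_est_demo ue = { .enqueued = 0, .ewma = 0 };
	unsigned int samples[] = { 400, 400, 50, 400, 400 };	/* transient drop */
	unsigned int i;

	for (i = 0; i < 5; i++) {
		util_est_demo_update(&ue, samples[i]);
		printf("sample=%u ewma=%u\n", samples[i], ue.ewma);
	}
	return 0;
}

The transient drop to 50 only dents the moving average, which is exactly the
"insensitive to transient changes" behaviour the comment describes.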
/*
* The load_avg/util_avg accumulates an infinite geometric series
* (see __update_load_avg() in kernel/sched/fair.c).
......@@ -336,6 +363,7 @@ struct sched_avg {
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long util_avg;
struct util_est util_est;
};
struct sched_statistics {
......
......@@ -8,9 +8,8 @@
* Interface between cpufreq drivers and the scheduler:
*/
#define SCHED_CPUFREQ_RT (1U << 0)
#define SCHED_CPUFREQ_DL (1U << 1)
#define SCHED_CPUFREQ_IOWAIT (1U << 2)
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
#define SCHED_CPUFREQ_MIGRATION (1U << 1)
#ifdef CONFIG_CPU_FREQ
struct update_util_data {
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_DEADLINE_H
#define _LINUX_SCHED_DEADLINE_H
#include <linux/sched.h>
/*
* SCHED_DEADLINE tasks have negative priorities, reflecting
......@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
#endif /* _LINUX_SCHED_DEADLINE_H */
......@@ -12,6 +12,7 @@ enum hk_flags {
HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5),
HK_FLAG_WQ = (1 << 6),
};
#ifdef CONFIG_CPU_ISOLATION
......
......@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
extern int get_nohz_timer_target(void);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
......@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu);
static inline void wake_up_nohz_cpu(int cpu) { }
#endif
#ifdef CONFIG_NO_HZ_FULL
extern u64 scheduler_tick_max_deferment(void);
#endif
#endif /* _LINUX_SCHED_NOHZ_H */
......@@ -113,7 +113,8 @@ enum tick_dep_bits {
#ifdef CONFIG_NO_HZ_COMMON
extern bool tick_nohz_enabled;
extern int tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped_cpu(int cpu);
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
......@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0)
static inline int tick_nohz_tick_stopped(void) { return 0; }
static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
static inline void tick_nohz_idle_enter(void) { }
static inline void tick_nohz_idle_exit(void) { }
......
......@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o idle.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
#include <linux/security.h>
#include <linux/export.h>
/*
* Auto-group scheduling implementation:
*/
#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
......@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
autogroup_kref_put(prev);
}
/* Allocates GFP_KERNEL, cannot be called under any spinlock */
/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
/* drop extra reference added by autogroup_create() */
/* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
/* Cannot be called under siglock. Currently has no users */
/* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
......@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
return 1;
}
__setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
......@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
if (nice < 0 && !can_nice(current, nice))
return -EPERM;
/* this is a heavy operation taking global locks.. */
/* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;
......@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHED_AUTOGROUP
#include <linux/kref.h>
#include <linux/rwsem.h>
#include <linux/sched/autogroup.h>
struct autogroup {
/*
* reference doesn't mean how many thread attach to this
* autogroup now. It just stands for the number of task
* could use this autogroup.
* Reference doesn't mean how many threads attach to this
* autogroup now. It just stands for the number of tasks
* which could use this autogroup.
*/
struct kref kref;
struct task_group *tg;
......@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
return tg;
}
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return 0;
}
#endif
#endif /* CONFIG_SCHED_AUTOGROUP */
/*
* sched_clock for unstable cpu clocks
* sched_clock() for unstable CPU clocks
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
*
......@@ -11,7 +11,7 @@
* Guillaume Chazarain <guichaz@gmail.com>
*
*
* What:
* What this file implements:
*
* cpu_clock(i) provides a fast (execution time) high resolution
* clock with bounded drift between CPUs. The value of cpu_clock(i)
......@@ -26,11 +26,11 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
* local_clock() -- is cpu_clock() on the current cpu.
* local_clock() -- is cpu_clock() on the current CPU.
*
* sched_clock_cpu(i)
*
* How:
* How it is implemented:
*
* The implementation either uses sched_clock() when
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
......@@ -52,19 +52,7 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
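To make the API summary above concrete, a hedged kernel-context sketch of how
callers typically consume these clocks; the helper name is made up:

#include <linux/sched/clock.h>

/*
 * Hypothetical helper: time a short section in nanoseconds using
 * local_clock(), i.e. cpu_clock() on the current CPU (NMI-safe, may
 * drift from other CPUs within the documented bounds).
 */
static u64 my_time_section(void (*fn)(void))
{
	u64 t0 = local_clock();

	fn();

	return local_clock() - t0;	/* elapsed nanoseconds */
}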
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
#include <linux/init.h>
#include "sched.h"
/*
* Scheduler clock - returns current time in nanosec units.
......@@ -302,21 +290,21 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
* cmpxchg64 below only protects one readout.
*
* We must reread via sched_clock_local() in the retry case on
* 32bit as an NMI could use sched_clock_local() via the
* 32-bit kernels as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of
* the low32bit and the high 32bit portion.
* the low 32-bit and the high 32-bit portion.
*/
this_clock = sched_clock_local(my_scd);
/*
* We must enforce atomic readout on 32bit, otherwise the
* update on the remote cpu can hit inbetween the readout of
* the low32bit and the high 32bit portion.
* We must enforce atomic readout on 32-bit, otherwise the
* update on the remote CPU can hit in between the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
/*
* On 64bit the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32bit dance.
* On 64-bit kernels the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32-bit dance.
*/
sched_clock_local(my_scd);
again:
......
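The trick the comment above relies on can be shown in isolation. A hedged
kernel-context sketch (helper name made up), assuming cmpxchg64() is
available as on the architectures this code targets:

/*
 * Hypothetical helper: atomic 64-bit readout on a 32-bit kernel, as used
 * above. cmpxchg64(p, 0, 0) returns the current value atomically; in the
 * unlikely case *p == 0 it "succeeds" and rewrites 0, which is harmless.
 */
static u64 my_read_u64_atomic(u64 *p)
{
	return cmpxchg64(p, 0, 0);
}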
......@@ -11,10 +11,7 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is typically a sync point, but not an exclusion point.
*/
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/completion.h>
#include "sched.h"
/**
* complete: - signals a single thread waiting on this completion
......@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
int ret = 1;
bool ret = true;
/*
* Since x->done will need to be locked only
......@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
* return early in the blocking case.
*/
if (!READ_ONCE(x->done))
return 0;
return false;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
ret = false;
else if (x->done != UINT_MAX)
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
......
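For context on how the now-bool try_wait_for_completion() is consumed, a
hedged kernel-context sketch; all names below are made up:

#include <linux/completion.h>

static DECLARE_COMPLETION(my_event_done);

/* Producer side: signal one waiter/poller. */
static void my_event_finished(void)
{
	complete(&my_event_done);
}

/* Consumer side that must not sleep: true only if already completed. */
static bool my_event_poll(void)
{
	return try_wait_for_completion(&my_event_done);
}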
......@@ -5,37 +5,11 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
#include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/sched/isolation.h>
#include "sched.h"
#include <asm/switch_to.h>
#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
......@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
* If we observe the old cpu in task_rq_lock, the acquire of
* If we observe the old CPU in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores.
*
* If we observe the new CPU in task_rq_lock, the acquire will
......@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
}
#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
rq->hrtick_csd_pending = 0;
......@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_rq_hrtick(struct rq *rq)
static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */
......@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
return false;
if (idle_cpu(cpu) && !need_resched())
......@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
* We can't run Idle Load Balance on this CPU this time, so we
* cancel it and clear NOHZ_BALANCE_KICK
*/
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
return false;
}
......@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* - cpu_active must be a subset of cpu_online
*
* - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
* - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
* see __set_cpus_allowed_ptr(). At this point the newly online
* CPU isn't yet part of the sched domains, and balancing will not
* see it.
......@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
void preempt_notifier_inc(void)
{
static_key_slow_inc(&preempt_notifier_key);
static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);
void preempt_notifier_dec(void)
{
static_key_slow_dec(&preempt_notifier_key);
static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);
......@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
if (!static_key_false(&preempt_notifier_key))
if (!static_branch_unlikely(&preempt_notifier_key))
WARN(1, "registering preempt_notifier while notifiers disabled\n");
hlist_add_head(&notifier->link, &current->preempt_notifiers);
......@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
if (static_key_false(&preempt_notifier_key))
if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_in_preempt_notifiers(curr);
}
......@@ -2555,7 +2529,7 @@ static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
if (static_key_false(&preempt_notifier_key))
if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_out_preempt_notifiers(curr, next);
}
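The "modern API" the hunks above switch to follows a common pattern:
DEFINE_STATIC_KEY_FALSE() plus static_branch_inc()/static_branch_dec() on the
slow path and static_branch_unlikely() on the hot path. A hedged sketch with
made-up names:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(my_feature_key);

static void my_feature_slow_path(void)
{
	/* ... only reached while at least one user holds the key ... */
}

void my_feature_user_register(void)
{
	static_branch_inc(&my_feature_key);	/* patch the branch in */
}

void my_feature_user_unregister(void)
{
	static_branch_dec(&my_feature_key);	/* patch it back out */
}

void my_hot_path(void)
{
	/* A runtime-patched branch; essentially free while disabled: */
	if (static_branch_unlikely(&my_feature_key))
		my_feature_slow_path();
}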
......@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
raw_spin_unlock_irq(&rq->lock);
}
/*
* NOP if the arch has not defined these:
*/
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
......@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
* 64-bit doesn't need locks to atomically read a 64bit value.
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have an optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok.
*
......@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
}
#ifdef CONFIG_NO_HZ_FULL
/**
* scheduler_tick_max_deferment
*
* Keep at least one tick per second when a single
* active task is running because the scheduler doesn't
* yet completely support full dynticks environment.
*
* This makes sure that uptime, CFS vruntime, load
* balancing, etc... continue to move forward, even
* with a very low granularity.
*
* Return: Maximum deferment in nanoseconds.
*/
u64 scheduler_tick_max_deferment(void)
struct tick_work {
int cpu;
struct delayed_work work;
};
static struct tick_work __percpu *tick_work_cpu;
static void sched_tick_remote(struct work_struct *work)
{
struct rq *rq = this_rq();
unsigned long next, now = READ_ONCE(jiffies);
struct delayed_work *dwork = to_delayed_work(work);
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
next = rq->last_sched_tick + HZ;
/*
* Handle the tick only if it appears the remote CPU is running in full
* dynticks mode. The check is racy by nature, but missing a tick or
* having one too many is no big deal, because the scheduler tick updates
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
struct task_struct *curr;
u64 delta;
if (time_before_eq(next, now))
return 0;
rq_lock_irq(rq, &rf);
update_rq_clock(rq);
curr = rq->curr;
delta = rq_clock_task(rq) - curr->se.exec_start;
return jiffies_to_nsecs(next - now);
/*
* Make sure the next tick runs within a reasonable
* amount of time.
*/
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
curr->sched_class->task_tick(rq, curr, 0);
rq_unlock_irq(rq, &rf);
}
/*
* Run the remote tick once per second (1Hz). This arbitrary frequency is
* low enough to avoid overload and high enough to keep scheduler-internal
* stats reasonably up to date.
*/
queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
queue_delayed_work(system_unbound_wq, &twork->work, HZ);
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
cancel_delayed_work_sync(&twork->work);
}