Commit a0fa1dd3 authored by Linus Torvalds
Browse files

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:

 - Add the initial implementation of SCHED_DEADLINE support: a real-time
   scheduling policy under which tasks that periodically execute their
   instances in less than their runtime quota receive real-time
   scheduling and won't miss any of their deadlines, while tasks that
   go over their quota get delayed. (Available to privileged users
   for now.)

 - Clean up and fix preempt_enable_no_resched() abuse all around the
   tree

 - Do sched_clock() performance optimizations on x86 and elsewhere

 - Fix and improve auto-NUMA balancing

 - Fix and clean up the idle loop

 - Apply various cleanups and fixes

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  sched: Fix __sched_setscheduler() nice test
  sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
  sched: Fix up attr::sched_priority warning
  sched: Fix up scheduler syscall LTP fails
  sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
  sched/core: Fix htmldocs warnings
  sched/deadline: No need to check p if dl_se is valid
  sched/deadline: Remove unused variables
  sched/deadline: Fix sparse static warnings
  m68k: Fix build warning in mac_via.h
  sched, thermal: Clean up preempt_enable_no_resched() abuse
  sched, net: Fixup busy_loop_us_clock()
  sched, net: Clean up preempt_enable_no_resched() abuse
  sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
  sched/preempt, locking: Rework local_bh_{dis,en}able()
  sched/clock, x86: Avoid a runtime condition in native_sched_clock()
  sched/clock: Fix up clear_sched_clock_stable()
  sched/clock, x86: Use a static_key for sched_clock_stable
  sched/clock: Remove local_irq_disable() from the clocks
  sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
  ...
parents 9326657a eaad4513
......@@ -428,11 +428,6 @@ rate for each task.
numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.
numa_balancing_settle_count is how many scan periods must complete before
the schedule balancer stops pushing the task towards a preferred node. This
gives the scheduler a chance to place the task on an alternative node if the
preferred node is overloaded.
numa_balancing_migrate_deferred is how many page migrations get skipped
unconditionally, after a page migration is skipped because a page is shared
with other tasks. This reduces page migration overhead, and determines
......
......@@ -15,7 +15,7 @@
#include <uapi/asm/unistd.h>
#define __NR_syscalls (380)
#define __NR_syscalls (384)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)
#define __ARCH_WANT_STAT64
......
......@@ -406,6 +406,8 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)
/*
* This may need to be greater than __NR_last_syscall+1 in order to
......
......@@ -389,6 +389,8 @@
CALL(sys_process_vm_writev)
CALL(sys_kcmp)
CALL(sys_finit_module)
/* 380 */ CALL(sys_sched_setattr)
CALL(sys_sched_getattr)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
......
......@@ -254,6 +254,8 @@
extern volatile __u8 *via1,*via2;
extern int rbv_present,via_alt_mapping;
struct irq_desc;
extern void via_register_interrupts(void);
extern void via_irq_enable(int);
extern void via_irq_disable(int);
......
#ifndef _ASM_X86_MWAIT_H
#define _ASM_X86_MWAIT_H
#include <linux/sched.h>
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4
......@@ -13,4 +15,45 @@
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
/*
* Issue the MONITOR instruction, emitted as raw opcode bytes (0f 01 c8).
*
* @eax, @ecx and @edx are placed in the registers of the same name through
* the "a", "c" and "d" input constraints. The asm has no outputs.
*/
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}
/*
* Issue the MWAIT instruction, emitted as raw opcode bytes (0f 01 c9).
*
* @eax and @ecx are placed in the registers of the same name through the
* "a" and "c" input constraints. The asm has no outputs.
*/
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*
* @eax and @ecx are handed unchanged to __mwait() as the hint values.
* current_clr_polling() is always called before returning, whether or not
* MWAIT was executed.
*/
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
/*
* NOTE(review): presumably marks this CPU as polling and returns non-zero
* when a reschedule is already pending -- confirm against the definition
* of current_set_polling_and_test().
*/
if (!current_set_polling_and_test()) {
if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
/* Flush the monitored line, fenced on both sides, before arming. */
mb();
clflush((void *)&current_thread_info()->flags);
mb();
}
/* Arm the monitor on the current thread_info flags word. */
__monitor((void *)&current_thread_info()->flags, 0, 0);
/* Re-check after arming; skip MWAIT if a reschedule is now needed. */
if (!need_resched())
__mwait(eax, ecx);
}
current_clr_polling();
}
#endif /* _ASM_X86_MWAIT_H */
......@@ -700,29 +700,6 @@ static inline void sync_core(void)
#endif
}
/*
* Issue the MONITOR instruction, emitted as raw opcode bytes (0f 01 c8).
*
* @eax, @ecx and @edx are placed in the registers of the same name through
* the "a", "c" and "d" input constraints. The asm has no outputs.
*/
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}
/*
* Issue the MWAIT instruction, emitted as raw opcode bytes (0f 01 c9).
*
* @eax and @ecx are placed in the registers of the same name through the
* "a" and "c" input constraints. The asm has no outputs.
*/
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
/*
* Variant of the raw MWAIT helper that executes STI immediately before the
* MWAIT opcode bytes, inside the same asm statement.
*
* trace_hardirqs_on() is called first, ahead of the STI itself.
* @eax and @ecx are placed in the registers of the same name.
*/
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
trace_hardirqs_on();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void init_amd_e400_c1e_mask(void);
......
......@@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/percpu.h>
#include <linux/interrupt.h>
#include <linux/math64.h>
#define TICK_SIZE (tick_nsec / 1000)
......@@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void);
extern int no_timer_check;
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
/*
* We use the full linear equation: f(x) = a + b*x, in order to allow
* a continuous function in the face of dynamic freq changes.
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
* Continuity means that when our frequency changes our slope (b); we want to
* ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
* Without an offset (a) the above would not be possible.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*
* In:
*
* ns = cycles * cyc2ns_scale / SC
*
* Although we may still have enough bits to store the value of ns,
* in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
* leading to an incorrect result.
*
* To avoid this, we can decompose 'cycles' into quotient and remainder
* of division by SC. Then,
*
* ns = (quot * SC + rem) * cyc2ns_scale / SC
* = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
*
* - sqazi@google.com
* See the comment near cycles_2_ns() for details on how we compute (b).
*/
DECLARE_PER_CPU(unsigned long, cyc2ns);
DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
/*
 * Convert a cycle count to nanoseconds with the calling CPU's per-cpu
 * parameters:
 *
 *	ns = cyc2ns_offset + cyc * cyc2ns / 2^CYC2NS_SCALE_FACTOR
 *
 * The scaling goes through mult_frac() rather than a plain multiply and
 * divide. No IRQ or preemption protection is taken here.
 */
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
	const int this_cpu = smp_processor_id();
	const unsigned long long base = per_cpu(cyc2ns_offset, this_cpu);

	return base + mult_frac(cyc, per_cpu(cyc2ns, this_cpu),
				(1UL << CYC2NS_SCALE_FACTOR));
}
/*
 * Wrapper around __cycles_2_ns() that performs the conversion with local
 * interrupts disabled, then restores the previous interrupt state.
 *
 * Returns the nanosecond value computed for @cyc.
 */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	unsigned long irq_state;
	unsigned long long result;

	local_irq_save(irq_state);
	result = __cycles_2_ns(cyc);
	local_irq_restore(irq_state);

	return result;
}
/*
* One set of cycles -> nanoseconds conversion parameters, as handed out by
* cyc2ns_read_begin() and released with cyc2ns_read_end().
*
* The size is deliberately fixed (see the trailing comment); do not add
* fields.
*/
struct cyc2ns_data {
/* Multiplier applied to the cycle count before shifting. */
u32 cyc2ns_mul;
/* Right-shift that goes with cyc2ns_mul. */
u32 cyc2ns_shift;
/* Nanosecond offset added to the scaled cycle count. */
u64 cyc2ns_offset;
/* Internal bookkeeping field; not part of the conversion parameters. */
u32 __count;
/* u32 hole */
}; /* 24 bytes -- do not grow */
extern struct cyc2ns_data *cyc2ns_read_begin(void);
extern void cyc2ns_read_end(struct cyc2ns_data *);
#endif /* _ASM_X86_TIMER_H */
......@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*
* @ax and @cx are handed unchanged to __mwait() as the hint values.
* Nothing is done at all when need_resched() is already true on entry.
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
if (!need_resched()) {
/* Flush the monitored line first on CPUs flagged CLFLUSH_MONITOR. */
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
/* Arm the monitor on the current thread_info flags word. */
__monitor((void *)&current_thread_info()->flags, 0, 0);
/* Full barrier between arming the monitor and the re-check below. */
smp_mb();
/* Re-check after arming; skip MWAIT if a reschedule is now needed. */
if (!need_resched())
__mwait(ax, cx);
}
}
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
......
......@@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
sched_clock_stable = 1;
set_sched_clock_stable();
}
#ifdef CONFIG_X86_64
......
......@@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
sched_clock_stable = 1;
set_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
......
......@@ -1883,21 +1883,27 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
userpg->pmc_width = x86_pmu.cntval_bits;
if (!sched_clock_stable)
if (!sched_clock_stable())
return;
data = cyc2ns_read_begin();
userpg->cap_user_time = 1;
userpg->time_mult = this_cpu_read(cyc2ns);
userpg->time_shift = CYC2NS_SCALE_FACTOR;
userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
userpg->cap_user_time_zero = 1;
userpg->time_zero = this_cpu_read(cyc2ns_offset);
userpg->time_zero = data->cyc2ns_offset;
cyc2ns_read_end(data);
}
/*
......
......@@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
* The WBINVD is insufficient due to the spurious-wakeup
* case where we return around the loop.
*/
mb();
clflush(mwait_ptr);
mb();
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);
......
......@@ -11,6 +11,7 @@
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
#include <asm/hpet.h>
#include <asm/timer.h>
......@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
static struct static_key __use_tsc = STATIC_KEY_INIT;
int tsc_clocksource_reliable;
/*
* Use a ring-buffer like data structure, where a writer advances the head by
* writing a new data entry and a reader advances the tail when it observes a
* new entry.
*
* Writers are made to wait on readers until there's space to write a new
* entry.
*
* This means that we can always use an {offset, mul} pair to compute a ns
* value that is 'roughly' in the right direction, even if we're writing a new
* {offset, mul} pair during the clock read.
*
* The down-side is that we can no longer guarantee strict monotonicity
* (assuming the TSC was that to begin with), because while we compute the
* intersection point of the two clock slopes and make sure the time is
* continuous at the point of switching; we can no longer guarantee a reader is
* strictly before or after the switch point.
*
* It does mean a reader no longer needs to disable IRQs in order to avoid
* CPU-Freq updates messing with its times, and similarly an NMI reader will
* no longer run the risk of hitting half-written state.
*/
struct cyc2ns {
/* The two entries: one published, one available for the next write. */
struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
/* Entry most recently published by the writer. */
struct cyc2ns_data *head; /* 48 + 8 = 56 */
/* Entry most recently acknowledged by a reader. */
struct cyc2ns_data *tail; /* 56 + 8 = 64 */
}; /* exactly fits one cacheline */
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
/*
* Begin a read-side section on this CPU's cyc2ns data.
*
* Disables preemption, increments the nesting count of the current head
* entry and returns that entry. The returned pointer must be passed to
* cyc2ns_read_end(), which re-enables preemption.
*/
struct cyc2ns_data *cyc2ns_read_begin(void)
{
struct cyc2ns_data *head;
preempt_disable();
head = this_cpu_read(cyc2ns.head);
/*
* Ensure we observe the entry when we observe the pointer to it.
* Matches the wmb from cyc2ns_write_end().
*/
smp_read_barrier_depends();
head->__count++;
/* Compiler barrier: keep the count update ahead of the caller's reads. */
barrier();
return head;
}
/*
* End a read-side section started with cyc2ns_read_begin().
*
* @head: the entry returned by the matching cyc2ns_read_begin().
*
* Decrements the entry's nesting count; when it drops to zero the per-cpu
* tail pointer is set to @head. Re-enables preemption before returning.
*/
void cyc2ns_read_end(struct cyc2ns_data *head)
{
/* Compiler barrier: keep the caller's reads ahead of the count update. */
barrier();
/*
* If we're the outer most nested read; update the tail pointer
* when we're done. This notifies possible pending writers
* that we've observed the head pointer and that the other
* entry is now free.
*/
if (!--head->__count) {
/*
* x86-TSO does not reorder writes with older reads;
* therefore once this write becomes visible to another
* cpu, we must be finished reading the cyc2ns_data.
*
* matches with cyc2ns_write_begin().
*/
this_cpu_write(cyc2ns.tail, head);
}
preempt_enable();
}
/*
* Begin writing a new @data entry for @cpu.
*
* Assumes some sort of write side lock; currently 'provided' by the assumption
* that cpufreq will call its notifiers sequentially.
*
* Returns the entry of @cpu's data[] pair that is not the current head, after
* busy-waiting until the tail pointer no longer refers to it. The caller
* fills the entry in and publishes it with cyc2ns_write_end().
*/
static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
struct cyc2ns_data *data = c2n->data;
/* Start at data[0]; step to data[1] if data[0] is the live head. */
if (data == c2n->head)
data++;
/* XXX send an IPI to @cpu in order to guarantee a read? */
/*
* When we observe the tail write from cyc2ns_read_end(),
* the cpu must be done with that entry and it's safe
* to start writing to it.
*/
while (c2n->tail == data)
cpu_relax();
return data;
}
/*
* Publish @data as the new head entry for @cpu.
*
* @data: the entry obtained from cyc2ns_write_begin() for the same @cpu,
*        with its fields already filled in.
*/
static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
/*
* Ensure the @data writes are visible before we publish the
* entry. Matches the data-dependency in cyc2ns_read_begin().
*/
smp_wmb();
ACCESS_ONCE(c2n->head) = data;
}
/*
* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
/*
 * Reset @data to its initial state: no readers, zero offset, and a
 * multiplier of 2^CYC2NS_SCALE_FACTOR paired with a shift of
 * CYC2NS_SCALE_FACTOR (i.e. a scale of exactly one).
 */
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
	data->__count = 0;
	data->cyc2ns_offset = 0;
	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
	data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
}
/*
 * Initialise the cyc2ns state of @cpu: both data entries get their default
 * values, and head and tail both point at the first entry.
 */
static void cyc2ns_init(int cpu)
{
	struct cyc2ns *ring = &per_cpu(cyc2ns, cpu);
	struct cyc2ns_data *first = &ring->data[0];
	struct cyc2ns_data *second = &ring->data[1];

	cyc2ns_data_init(first);
	cyc2ns_data_init(second);

	ring->head = first;
	ring->tail = first;
}
/*
* Convert the cycle count @cyc to nanoseconds using this CPU's current head
* entry: the entry's cyc2ns_offset plus @cyc scaled by cyc2ns_mul and shifted
* right by CYC2NS_SCALE_FACTOR.
*
* The shift uses the CYC2NS_SCALE_FACTOR constant rather than the entry's
* cyc2ns_shift field. Runs with preemption disabled for the duration.
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data *data, *tail;
unsigned long long ns;
/*
* See cyc2ns_read_*() for details; replicated in order to avoid
* an extra few instructions that came with the abstraction.
* Notably, it allows us to only do the __count and tail update
* dance when it's actually needed.
*/
preempt_disable();
data = this_cpu_read(cyc2ns.head);
tail = this_cpu_read(cyc2ns.tail);
if (likely(data == tail)) {
/* Fast path: head == tail, so no __count or tail update is done. */
ns = data->cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
} else {
/* Slow path: same steps as cyc2ns_read_begin()/cyc2ns_read_end(). */
data->__count++;
barrier();
ns = data->cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
barrier();
if (!--data->__count)
this_cpu_write(cyc2ns.tail, data);
}
preempt_enable();
return ns;
}
/*
 * Integer division of @n by @d, rounded to the nearest integer instead of
 * truncated: half of the divisor is added to the dividend first.
 * @d is evaluated twice, so it must be free of side effects.
 *
 * XXX surely we already have this someplace in the kernel?!
 * NOTE(review): DIV_ROUND_CLOSEST() looks like the generic equivalent --
 * confirm before switching.
 */
#define DIV_ROUND(n, d) (((d) / 2 + (n)) / (d))
/*
* Install a new cycles -> ns conversion for @cpu, derived from @cpu_khz.
*
* Runs with local interrupts disabled, bracketed by
* sched_clock_idle_sleep_event() and sched_clock_idle_wakeup_event(0).
* When @cpu_khz is 0 the conversion data is left untouched.
*
* NOTE(review): ns_now comes from cycles_2_ns(), which reads the *calling*
* CPU's conversion data, while the entry being written belongs to @cpu --
* confirm callers only pass the current CPU or that this is intended.
*/
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now;
struct cyc2ns_data *data;
unsigned long flags;
local_irq_save(flags);
sched_clock_idle_sleep_event();
if (!cpu_khz)
goto done;
data = cyc2ns_write_begin(cpu);
/* Sample the switch-over point: current TSC and its ns value. */
rdtscll(tsc_now);
ns_now = cycles_2_ns(tsc_now);
/*
* Compute a new multiplier as per the above comment and ensure our
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
/* Choose the offset so the new conversion also yields ns_now at tsc_now. */
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
cyc2ns_write_end(cpu, data);
done:
sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
}
/*
* Scheduler clock - returns current time in nanosec units.
*/
u64 native_sched_clock(void)
{
u64 this_offset;
u64 tsc_now;
/*
* Fall back to jiffies if there's no TSC available:
......@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
if (unlikely(tsc_disabled)) {
if (!static_key_false(&__use_tsc)) {
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
/* read the Time Stamp Counter: */
rdtscll(this_offset);
rdtscll(tsc_now);
/* return the value in ns */
return __cycles_2_ns(this_offset);
return cycles_2_ns(tsc_now);
}
/* We need to define a real function for sched_clock, to override the
......@@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void)
EXPORT_SYMBOL(recalibrate_cpu_khz);
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)