/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
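
/*
 * Worked example (assuming MAX_RT_PRIO == 100, as defined in the stock
 * headers): nice -20 maps to static priority 100, nice 0 to 120 and
 * nice +19 to 139; the corresponding user priorities are 0, 20 and 39.
 */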

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF	((u64)~0ULL)

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
	/* nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;
	ktime_t			rt_period;
	u64			rt_runtime;
	struct hrtimer		rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	ktime_t now;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		unsigned long delta;
		ktime_t soft, hard;

		if (hrtimer_active(&rt_b->rt_period_timer))
			break;

		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
				HRTIMER_MODE_ABS_PINNED, 0);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;
};

#define root_task_group init_task_group

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
	return list_empty(&root_task_group.children);
}
#endif

# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on it, so the weight of an entity should not be too large,
 * and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES	2
#define MAX_SHARES	(1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 *	Every task in the system belongs to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct task_group *tg;

#ifdef CONFIG_CGROUP_SCHED
	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
				struct task_group, css);
#else
	tg = &init_task_group;
#endif
	return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif	/* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last;

	unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * the part of load.weight contributed by tasks
	 */
	unsigned long task_weight;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * this cpu's part of tg->shares
	 */
	unsigned long shares;

	/*
	 * load.weight at the time we set shares
	 */
	unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t refcount;
	cpumask_var_t span;
	cpumask_var_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	atomic_t rto_count;
#ifdef CONFIG_SMP
	struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned char idle_at_tick;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	struct task_struct *migration_thread;
	struct list_head migration_queue;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* BKL stats */
	unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		(&__raw_get_cpu_var(runqueues))

inline void update_rq_clock(struct rq *rq)
{
	rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
	return raw_spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
	NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; sched_feat_names[i]; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp = buf;
	int neg = 0;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;

	if (strncmp(buf, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; sched_feat_names[i]; i++) {
		int len = strlen(sched_feat_names[i]);

		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
			if (neg)
				sysctl_sched_features &= ~(1UL << i);
			else
				sysctl_sched_features |= (1UL << i);
			break;
		}
	}

	if (!sched_feat_names[i])
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

858
	raw_spin_unlock_irq(&rq->lock);
859 860 861
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	raw_spin_unlock_irq(&rq->lock);
#else
	raw_spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * Check whether the task is waking; we use this to synchronize against
 * ttwu() so that task_cpu() reports a stable number.
 *
 * We need to make an exception for PF_STARTING tasks because the fork
 * path might require task_rq_lock() to work, eg. it can call
 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
 */
static inline int task_is_waking(struct task_struct *p)
{
	return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
}

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		while (task_is_waking(p))
			cpu_relax();
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_is_waking(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		while (task_is_waking(p))
			cpu_relax();
		local_irq_save(*flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_is_waking(p)))
			return rq;
		raw_spin_unlock_irqrestore(&rq->lock, *flags);
	}
}

void task_rq_unlock_wait(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
	raw_spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	raw_spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * It's all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

static void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_NO_HZ
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}
#endif /* CONFIG_NO_HZ */

static u64 sched_avg_period(void)
{
	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}

static void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
	rq->rt_avg += rt_delta;
	sched_avg_update(rq);
}

#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}

static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
#endif /* CONFIG_SMP */

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

/*
 * delta *= weight / lw
 */
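/*
 * Note: lw->inv_weight caches an approximation of WMULT_CONST/lw->weight
 * (i.e. 2^32 / weight), so the division is carried out below as a 64-bit
 * multiply followed by a WMULT_SHIFT right shift; SRR() splits that shift
 * in two halves when the intermediate product could overflow 64 bits.
 */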
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (!lw->inv_weight) {
		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
				/ (lw->weight+1);
	}

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

#define WEIGHT_IDLEPRIO                3
#define WMULT_IDLEPRIO         1431655765

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
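
/*
 * For illustration: with two CPU-bound tasks, CFS distributes CPU time in
 * proportion to these weights, so nice 0 vs nice 0 (1024 vs 1024) yields
 * about a 50%/50% split, while nice 0 vs nice 5 (1024 vs 335) yields
 * roughly 75%/25%.
 */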

/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* ... user mode */
	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

	CPUACCT_STAT_NSTATS,
};

#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static void cpuacct_update_stats(struct task_struct *tsk,
		enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_update_stats(struct task_struct *tsk,
		enum cpuacct_stat_index idx, cputime_t val) {}