Commit 22e2c507 authored by Jens Axboe's avatar Jens Axboe Committed by Linus Torvalds
Browse files

[PATCH] Update cfq io scheduler to time sliced design



This updates the CFQ io scheduler to the new time sliced design (cfq
v3).  It provides full process fairness, while giving excellent
aggregate system throughput even for many competing processes.  It
supports io priorities, either inherited from the cpu nice value or set
directly with the ioprio_get/set syscalls.  The latter closely mimic
set/getpriority.

This import is based on my latest from -mm.
Signed-off-by: default avatarJens Axboe <axboe@suse.de>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 020f46a3
......@@ -289,3 +289,5 @@ ENTRY(sys_call_table)
.long sys_add_key
.long sys_request_key
.long sys_keyctl
.long sys_ioprio_set
.long sys_ioprio_get /* 290 */
......@@ -1577,8 +1577,8 @@ sys_call_table:
data8 sys_add_key
data8 sys_request_key
data8 sys_keyctl
data8 sys_ni_syscall
data8 sys_ni_syscall // 1275
data8 sys_ioprio_set
data8 sys_ioprio_get // 1275
data8 sys_set_zone_reclaim
data8 sys_ni_syscall
data8 sys_ni_syscall
......
......@@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table)
.long sys_request_key /* 270 */
.long sys_keyctl
.long sys_waitid
.long sys_ioprio_set
.long sys_ioprio_get
......@@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq)
rq->elevator_private = NULL;
}
static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
static int as_set_request(request_queue_t *q, struct request *rq,
struct bio *bio, int gfp_mask)
{
struct as_data *ad = q->elevator->elevator_data;
struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
......@@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
return 1;
}
static int as_may_queue(request_queue_t *q, int rw)
static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
{
int ret = ELV_MQUEUE_MAY;
struct as_data *ad = q->elevator->elevator_data;
......
......@@ -21,22 +21,33 @@
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/mempool.h>
static unsigned long max_elapsed_crq;
static unsigned long max_elapsed_dispatch;
#include <linux/ioprio.h>
#include <linux/writeback.h>
/*
* tunables
*/
static int cfq_quantum = 4; /* max queue in one round of service */
static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/
static int cfq_service = HZ; /* period over which service is avg */
static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */
static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */
static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */
static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
static int cfq_back_penalty = 2; /* penalty of a backwards seek */
static int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 50;
static int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 50;
#define CFQ_IDLE_GRACE (HZ / 10)
#define CFQ_SLICE_SCALE (5)
#define CFQ_KEY_ASYNC (0)
/*
* disable queueing at the driver/hardware level
*/
static int cfq_max_depth = 1;
/*
* for the hash of cfqq inside the cfqd
*/
......@@ -55,6 +66,7 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */
#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define RQ_DATA(rq) (rq)->elevator_private
......@@ -75,78 +87,101 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */
#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node)
#define rq_rb_key(rq) (rq)->sector
/*
* threshold for switching off non-tag accounting
*/
#define CFQ_MAX_TAG (4)
/*
* sort key types and names
*/
enum {
CFQ_KEY_PGID,
CFQ_KEY_TGID,
CFQ_KEY_UID,
CFQ_KEY_GID,
CFQ_KEY_LAST,
};
static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
static kmem_cache_t *crq_pool;
static kmem_cache_t *cfq_pool;
static kmem_cache_t *cfq_ioc_pool;
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC)
/*
* Per block device queue structure
*/
struct cfq_data {
struct list_head rr_list;
atomic_t ref;
request_queue_t *queue;
/*
* rr list of queues with requests and the count of them
*/
struct list_head rr_list[CFQ_PRIO_LISTS];
struct list_head busy_rr;
struct list_head cur_rr;
struct list_head idle_rr;
unsigned int busy_queues;
/*
* non-ordered list of empty cfqq's
*/
struct list_head empty_list;
/*
* cfqq lookup hash
*/
struct hlist_head *cfq_hash;
struct hlist_head *crq_hash;
/* queues on rr_list (ie they have pending requests */
unsigned int busy_queues;
/*
* global crq hash for all queues
*/
struct hlist_head *crq_hash;
unsigned int max_queued;
atomic_t ref;
mempool_t *crq_pool;
int key_type;
int rq_in_driver;
mempool_t *crq_pool;
/*
* schedule slice state info
*/
/*
* idle window management
*/
struct timer_list idle_slice_timer;
struct work_struct unplug_work;
request_queue_t *queue;
struct cfq_queue *active_queue;
struct cfq_io_context *active_cic;
int cur_prio, cur_end_prio;
unsigned int dispatch_slice;
struct timer_list idle_class_timer;
sector_t last_sector;
unsigned long last_end_request;
int rq_in_driver;
unsigned int rq_starved;
/*
* tunables, see top of file
*/
unsigned int cfq_quantum;
unsigned int cfq_queued;
unsigned int cfq_fifo_expire_r;
unsigned int cfq_fifo_expire_w;
unsigned int cfq_fifo_batch_expire;
unsigned int cfq_fifo_expire[2];
unsigned int cfq_back_penalty;
unsigned int cfq_back_max;
unsigned int find_best_crq;
unsigned int cfq_tagged;
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
unsigned int cfq_max_depth;
};
/*
* Per process-grouping structure
*/
struct cfq_queue {
/* reference count */
atomic_t ref;
/* parent cfq_data */
struct cfq_data *cfqd;
/* hash of mergeable requests */
/* cfqq lookup hash */
struct hlist_node cfq_hash;
/* hash key */
unsigned long key;
/* whether queue is on rr (or empty) list */
int on_rr;
unsigned int key;
/* on either rr or empty list of cfqd */
struct list_head cfq_list;
/* sorted list of pending requests */
......@@ -158,21 +193,35 @@ struct cfq_queue {
/* currently allocated requests */
int allocated[2];
/* fifo list of requests in sort_list */
struct list_head fifo[2];
/* last time fifo expired */
unsigned long last_fifo_expire;
int key_type;
unsigned long service_start;
unsigned long service_used;
struct list_head fifo;
unsigned int max_rate;
unsigned long slice_start;
unsigned long slice_end;
unsigned long slice_left;
unsigned long service_last;
/* number of requests that have been handed to the driver */
int in_flight;
/* number of currently allocated requests */
int alloc_limit[2];
/* io prio of this group */
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
/* whether queue is on rr (or empty) list */
unsigned on_rr : 1;
/* idle slice, waiting for new request submission */
unsigned wait_request : 1;
/* set when wait_request gets set, reset on first rq alloc */
unsigned must_alloc : 1;
/* only gets one must_alloc per slice */
unsigned must_alloc_slice : 1;
/* idle slice, request added, now waiting to dispatch it */
unsigned must_dispatch : 1;
/* fifo expire per-slice */
unsigned fifo_expire : 1;
unsigned idle_window : 1;
unsigned prio_changed : 1;
};
struct cfq_rq {
......@@ -184,42 +233,17 @@ struct cfq_rq {
struct cfq_queue *cfq_queue;
struct cfq_io_context *io_context;
unsigned long service_start;
unsigned long queue_start;
unsigned int in_flight : 1;
unsigned int accounted : 1;
unsigned int is_sync : 1;
unsigned int is_write : 1;
unsigned in_flight : 1;
unsigned accounted : 1;
unsigned is_sync : 1;
unsigned requeued : 1;
};
static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long);
static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int);
static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
static void cfq_update_next_crq(struct cfq_rq *);
static void cfq_put_cfqd(struct cfq_data *cfqd);
/*
* what the fairness is based on (ie how processes are grouped and
* differentiated)
*/
static inline unsigned long
cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
{
/*
* optimize this so that ->key_type is the offset into the struct
*/
switch (cfqd->key_type) {
case CFQ_KEY_PGID:
return process_group(tsk);
default:
case CFQ_KEY_TGID:
return tsk->tgid;
case CFQ_KEY_UID:
return tsk->uid;
case CFQ_KEY_GID:
return tsk->gid;
}
}
#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
/*
* lots of deadline iosched dupes, can be abstracted later...
......@@ -235,16 +259,12 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
if (q->last_merge == crq->request)
q->last_merge = NULL;
cfq_update_next_crq(crq);
}
static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
{
const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
BUG_ON(!hlist_unhashed(&crq->hash));
hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
}
......@@ -257,8 +277,6 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
struct cfq_rq *crq = list_entry_hash(entry);
struct request *__rq = crq->request;
BUG_ON(hlist_unhashed(&crq->hash));
if (!rq_mergeable(__rq)) {
cfq_del_crq_hash(crq);
continue;
......@@ -287,36 +305,16 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
return crq2;
if (crq2 == NULL)
return crq1;
if (crq1->requeued)
return crq1;
if (crq2->requeued)
return crq2;
s1 = crq1->request->sector;
s2 = crq2->request->sector;
last = cfqd->last_sector;
#if 0
if (!list_empty(&cfqd->queue->queue_head)) {
struct list_head *entry = &cfqd->queue->queue_head;
unsigned long distance = ~0UL;
struct request *rq;
while ((entry = entry->prev) != &cfqd->queue->queue_head) {
rq = list_entry_rq(entry);
if (blk_barrier_rq(rq))
break;
if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
distance = abs(s1 - rq->sector +rq->nr_sectors);
last = rq->sector + rq->nr_sectors;
}
if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
distance = abs(s2 - rq->sector +rq->nr_sectors);
last = rq->sector + rq->nr_sectors;
}
}
}
#endif
/*
* by definition, 1KiB is 2 sectors
*/
......@@ -377,11 +375,13 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
struct rb_node *rbnext, *rbprev;
if (!ON_RB(&last->rb_node))
return NULL;
if ((rbnext = rb_next(&last->rb_node)) == NULL)
if (ON_RB(&last->rb_node))
rbnext = rb_next(&last->rb_node);
else {
rbnext = rb_first(&cfqq->sort_list);
if (rbnext == &last->rb_node)
rbnext = NULL;
}
rbprev = rb_prev(&last->rb_node);
......@@ -401,67 +401,53 @@ static void cfq_update_next_crq(struct cfq_rq *crq)
cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
}
static int cfq_check_sort_rr_list(struct cfq_queue *cfqq)
static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
{
struct list_head *head = &cfqq->cfqd->rr_list;
struct list_head *next, *prev;
/*
* list might still be ordered
*/
next = cfqq->cfq_list.next;
if (next != head) {
struct cfq_queue *cnext = list_entry_cfqq(next);
struct cfq_data *cfqd = cfqq->cfqd;
struct list_head *list, *entry;
if (cfqq->service_used > cnext->service_used)
return 1;
}
BUG_ON(!cfqq->on_rr);
prev = cfqq->cfq_list.prev;
if (prev != head) {
struct cfq_queue *cprev = list_entry_cfqq(prev);
list_del(&cfqq->cfq_list);
if (cfqq->service_used < cprev->service_used)
return 1;
if (cfq_class_rt(cfqq))
list = &cfqd->cur_rr;
else if (cfq_class_idle(cfqq))
list = &cfqd->idle_rr;
else {
/*
* if cfqq has requests in flight, don't allow it to be
* found in cfq_set_active_queue before it has finished them.
* this is done to increase fairness between a process that
* has lots of io pending vs one that only generates one
* sporadically or synchronously
*/
if (cfqq->in_flight)
list = &cfqd->busy_rr;
else
list = &cfqd->rr_list[cfqq->ioprio];
}
return 0;
}
static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
{
struct list_head *entry = &cfqq->cfqd->rr_list;
if (!cfqq->on_rr)
return;
if (!new_queue && !cfq_check_sort_rr_list(cfqq))
/*
* if queue was preempted, just add to front to be fair. busy_rr
* isn't sorted.
*/
if (preempted || list == &cfqd->busy_rr) {
list_add(&cfqq->cfq_list, list);
return;
list_del(&cfqq->cfq_list);
}
/*
* sort by our mean service_used, sub-sort by in-flight requests
* sort by when queue was last serviced
*/
while ((entry = entry->prev) != &cfqq->cfqd->rr_list) {
entry = list;
while ((entry = entry->prev) != list) {
struct cfq_queue *__cfqq = list_entry_cfqq(entry);
if (cfqq->service_used > __cfqq->service_used)
if (!__cfqq->service_last)
break;
if (time_before(__cfqq->service_last, cfqq->service_last))
break;
else if (cfqq->service_used == __cfqq->service_used) {
struct list_head *prv;
while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
__cfqq = list_entry_cfqq(prv);
WARN_ON(__cfqq->service_used > cfqq->service_used);
if (cfqq->service_used != __cfqq->service_used)
break;
if (cfqq->in_flight > __cfqq->in_flight)
break;
entry = prv;
}
}
}
list_add(&cfqq->cfq_list, entry);
......@@ -469,28 +455,24 @@ static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
/*
* add to busy list of queues for service, trying to be fair in ordering
* the pending list according to requests serviced
* the pending list according to last request service
*/
static inline void
cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue)
{
/*
* it's currently on the empty list
*/
BUG_ON(cfqq->on_rr);
cfqq->on_rr = 1;
cfqd->busy_queues++;
if (time_after(jiffies, cfqq->service_start + cfq_service))
cfqq->service_used >>= 3;
cfq_sort_rr_list(cfqq, 1);
cfq_resort_rr_list(cfqq, requeue);
}
static inline void
cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
list_move(&cfqq->cfq_list, &cfqd->empty_list);
BUG_ON(!cfqq->on_rr);
cfqq->on_rr = 0;
list_move(&cfqq->cfq_list, &cfqd->empty_list);
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
......@@ -505,16 +487,17 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq)
if (ON_RB(&crq->rb_node)) {
struct cfq_data *cfqd = cfqq->cfqd;
const int sync = crq->is_sync;
BUG_ON(!cfqq->queued[crq->is_sync]);
BUG_ON(!cfqq->queued[sync]);
cfqq->queued[sync]--;
cfq_update_next_crq(crq);
cfqq->queued[crq->is_sync]--;
rb_erase(&crq->rb_node, &cfqq->sort_list);
RB_CLEAR_COLOR(&crq->rb_node);
if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr)
if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list))
cfq_del_cfqq_rr(cfqd, cfqq);
}
}
......@@ -562,7 +545,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
rb_insert_color(&crq->rb_node, &cfqq->sort_list);
if (!cfqq->on_rr)
cfq_add_cfqq_rr(cfqd, cfqq);
cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued);
/*
* check if this request is a better next-serve candidate
......@@ -581,11 +564,10 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
cfq_add_crq_rb(crq);
}
static struct request *
cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
{
const unsigned long key = cfq_hash_key(cfqd, current);
struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid);
struct rb_node *n;
if (!cfqq)
......@@ -609,20 +591,23 @@ cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
struct cfq_rq *crq = RQ_DATA(rq);
if (crq) {
struct cfq_queue *cfqq = crq->cfq_queue;
if (cfqq->cfqd->cfq_tagged) {
cfqq->service_used--;
cfq_sort_rr_list(cfqq, 0);
}
if (crq->accounted) {
crq->accounted = 0;
cfqq->cfqd->rq_in_driver--;
WARN_ON(!cfqd->rq_in_driver);
cfqd->rq_in_driver--;
}
if (crq->in_flight) {
crq->in_flight = 0;
WARN_ON(!cfqq->in_flight);
cfqq->in_flight--;
}
crq->requeued = 1;
}
}