Newer
Older
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
__releases(wqe->lock)
struct io_wq_work *work;
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
raw_spin_unlock(&wqe->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
return false;
}
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
struct io_cb_cancel_data *match)
{
int i;
raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
if (match->cancel_all)
goto retry;
return;
raw_spin_unlock(&wqe->lock);
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
struct io_cb_cancel_data *match)
{
io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
rcu_read_unlock();
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all)
struct io_cb_cancel_data match = {
.fn = cancel,
.data = data,
.cancel_all = cancel_all,
/*
* First check pending list, if we're lucky we can just remove it
* from there. CANCEL_OK means that the work is returned as-new,
* no completion will be posted for it.
*/
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
io_wqe_cancel_pending_work(wqe, &match);
if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
/*
* Now check if a free (going busy) or busy worker has the work
* currently running. If we find it there, we'll return CANCEL_RUNNING
* as an indication that we attempt to signal cancellation. The
* completion will run normally in this case.
*/
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
io_wqe_cancel_running_work(wqe, &match);
if (match.nr_running && !match.cancel_all)
return IO_WQ_CANCEL_RUNNING;
}
if (match.nr_running)
return IO_WQ_CANCEL_RUNNING;
if (match.nr_pending)
return IO_WQ_CANCEL_OK;
return IO_WQ_CANCEL_NOTFOUND;
static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
list_del_init(&wait->entry);
rcu_read_lock();
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = &wqe->acct[i];
if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
io_wqe_activate_free_worker(wqe, acct);
}
rcu_read_unlock();
return 1;
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret, node, i;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work || !data->do_work))
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
if (ret)
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
int alloc_node = node;
if (!node_online(alloc_node))
alloc_node = NUMA_NO_NODE;
wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
goto err;
cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
INIT_LIST_HEAD(&wqe->wait.entry);
wqe->wait.func = io_wqe_hash_wake;
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = &wqe->acct[i];
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
}
raw_spin_lock_init(&wqe->lock);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_LIST_HEAD(&wqe->all_list);
wq->task = get_task_struct(data->task);
atomic_set(&wq->worker_refs, 1);
init_completion(&wq->worker_done);
return wq;
io_wq_put_hash(data->hash);
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node) {
if (!wq->wqes[node])
continue;
free_cpumask_var(wq->wqes[node]->cpu_mask);
return ERR_PTR(ret);
}
static bool io_task_work_match(struct callback_head *cb, void *data)
{
struct io_worker *worker;
if (cb->func != create_worker_cb && cb->func != create_worker_cont)
return false;
worker = container_of(cb, struct io_worker, create_work);
return worker->wqe->wq == data;
void io_wq_exit_start(struct io_wq *wq)
{
set_bit(IO_WQ_BIT_EXIT, &wq->state);
}
static void io_wq_exit_workers(struct io_wq *wq)
struct callback_head *cb;
int node;
if (!wq->task)
return;
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
worker = container_of(cb, struct io_worker, create_work);
io_worker_cancel_cb(worker);
}
rcu_read_lock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
rcu_read_unlock();
io_worker_ref_put(wq);
wait_for_completion(&wq->worker_done);
for_each_node(node) {
spin_lock_irq(&wq->hash->wait.lock);
list_del_init(&wq->wqes[node]->wait.entry);
spin_unlock_irq(&wq->hash->wait.lock);
}
put_task_struct(wq->task);
wq->task = NULL;
static void io_wq_destroy(struct io_wq *wq)
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
struct io_cb_cancel_data match = {
.fn = io_wq_work_match_all,
.cancel_all = true,
};
io_wqe_cancel_pending_work(wqe, &match);
kfree(wqe);
}
io_wq_put_hash(wq->hash);
void io_wq_put_and_exit(struct io_wq *wq)
{
WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
struct online_data {
unsigned int cpu;
bool online;
};
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
if (od->online)
cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
else
cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
return false;
}
static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
struct online_data od = {
.cpu = cpu,
.online = online
};
int i;
rcu_read_lock();
for_each_node(i)
io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
rcu_read_unlock();
return 0;
}
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
return __io_wq_cpu_online(wq, cpu, true);
}
static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
return __io_wq_cpu_online(wq, cpu, false);
}
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
{
int i;
rcu_read_lock();
for_each_node(i) {
struct io_wqe *wqe = wq->wqes[i];
if (mask)
cpumask_copy(wqe->cpu_mask, mask);
else
cpumask_copy(wqe->cpu_mask, cpumask_of_node(i));
}
rcu_read_unlock();
return 0;
}
/*
* Set max number of unbounded workers, returns old value. If new_count is 0,
* then just return the old value.
*/
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
int prev[IO_WQ_ACCT_NR];
bool first_node = true;
int i, node;
Eugene Syromiatnikov
committed
BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
for (i = 0; i < 2; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
for (i = 0; i < IO_WQ_ACCT_NR; i++)
prev[i] = 0;
rcu_read_lock();
for_each_node(node) {
struct io_wqe_acct *acct;
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (first_node)
prev[i] = max_t(int, acct->max_workers, prev[i]);
if (new_count[i])
acct->max_workers = new_count[i];
}
first_node = false;
}
rcu_read_unlock();
for (i = 0; i < IO_WQ_ACCT_NR; i++)
new_count[i] = prev[i];
static __init int io_wq_init(void)
{
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
if (ret < 0)
return ret;
io_wq_online = ret;
return 0;
}
subsys_initcall(io_wq_init);