Commit 912afc36 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'dm-3.5-changes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm

Pull device-mapper updates from Alasdair G Kergon:
 "Improve multipath's retrying mechanism in some defined circumstances
  and provide a simple reserve/release mechanism for userspace tools to
  access thin provisioning metadata while the pool is in use."

* tag 'dm-3.5-changes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm:
  dm thin: provide userspace access to pool metadata
  dm thin: use slab mempools
  dm mpath: allow ioctls to trigger pg init
  dm mpath: delay retry of bypassed pg
  dm mpath: reduce size of struct multipath
parents 4fc3acf2 cc8394d8
......@@ -287,6 +287,17 @@ iii) Messages
the current transaction id is when you change it with this
compare-and-swap message.
reserve_metadata_snap
Reserve a copy of the data mapping btree for use by userland.
This allows userland to inspect the mappings as they were when
this message was executed. Use the pool's status command to
get the root block associated with the metadata snapshot.
release_metadata_snap
Release a previously reserved copy of the data mapping btree.
'thin' target
-------------
......
......@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
......@@ -61,11 +62,11 @@ struct multipath {
struct list_head list;
struct dm_target *ti;
spinlock_t lock;
const char *hw_handler_name;
char *hw_handler_params;
spinlock_t lock;
unsigned nr_priority_groups;
struct list_head priority_groups;
......@@ -81,16 +82,17 @@ struct multipath {
struct priority_group *next_pg; /* Switch to this PG if set */
unsigned repeat_count; /* I/Os left before calling PS again */
unsigned queue_io; /* Must we queue all I/O? */
unsigned queue_if_no_path; /* Queue I/O if last path fails? */
unsigned saved_queue_if_no_path;/* Saved state during suspension */
unsigned queue_io:1; /* Must we queue all I/O? */
unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
unsigned queue_size;
struct work_struct process_queued_ios;
struct list_head queued_ios;
unsigned queue_size;
struct work_struct trigger_event;
......@@ -328,14 +330,18 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
/*
* Loop through priority groups until we find a valid path.
* First time we skip PGs marked 'bypassed'.
* Second time we only try the ones we skipped.
* Second time we only try the ones we skipped, but set
* pg_init_delay_retry so we do not hammer controllers.
*/
do {
list_for_each_entry(pg, &m->priority_groups, list) {
if (pg->bypassed == bypassed)
continue;
if (!__choose_path_in_pg(m, pg, nr_bytes))
if (!__choose_path_in_pg(m, pg, nr_bytes)) {
if (!bypassed)
m->pg_init_delay_retry = 1;
return;
}
}
} while (bypassed--);
......@@ -481,9 +487,6 @@ static void process_queued_ios(struct work_struct *work)
spin_lock_irqsave(&m->lock, flags);
if (!m->queue_size)
goto out;
if (!m->current_pgpath)
__choose_pgpath(m, 0);
......@@ -496,7 +499,6 @@ static void process_queued_ios(struct work_struct *work)
if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
__pg_init_all_paths(m);
out:
spin_unlock_irqrestore(&m->lock, flags);
if (!must_queue)
dispatch_queued_ios(m);
......@@ -1517,11 +1519,16 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long arg)
{
struct multipath *m = (struct multipath *) ti->private;
struct block_device *bdev = NULL;
fmode_t mode = 0;
struct multipath *m = ti->private;
struct block_device *bdev;
fmode_t mode;
unsigned long flags;
int r = 0;
int r;
again:
bdev = NULL;
mode = 0;
r = 0;
spin_lock_irqsave(&m->lock, flags);
......@@ -1546,6 +1553,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
r = scsi_verify_blk_ioctl(NULL, cmd);
if (r == -EAGAIN && !fatal_signal_pending(current)) {
queue_work(kmultipathd, &m->process_queued_ios);
msleep(10);
goto again;
}
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
......@@ -1643,7 +1656,7 @@ static int multipath_busy(struct dm_target *ti)
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 3, 0},
.version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
......
......@@ -1082,31 +1082,155 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
return 0;
}
static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
dm_block_t *result)
static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
int r, inc;
struct thin_disk_superblock *disk_super;
struct dm_block *copy, *sblock;
dm_block_t held_root;
/*
* Copy the superblock.
*/
dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
&sb_validator, &copy, &inc);
if (r)
return r;
BUG_ON(!inc);
held_root = dm_block_location(copy);
disk_super = dm_block_data(copy);
if (le64_to_cpu(disk_super->held_root)) {
DMWARN("Pool metadata snapshot already exists: release this before taking another.");
dm_tm_dec(pmd->tm, held_root);
dm_tm_unlock(pmd->tm, copy);
pmd->need_commit = 1;
return -EBUSY;
}
/*
* Wipe the spacemap since we're not publishing this.
*/
memset(&disk_super->data_space_map_root, 0,
sizeof(disk_super->data_space_map_root));
memset(&disk_super->metadata_space_map_root, 0,
sizeof(disk_super->metadata_space_map_root));
/*
* Increment the data structures that need to be preserved.
*/
dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
dm_tm_unlock(pmd->tm, copy);
/*
* Write the held root into the superblock.
*/
r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
&sb_validator, &sblock);
if (r) {
dm_tm_dec(pmd->tm, held_root);
pmd->need_commit = 1;
return r;
}
disk_super = dm_block_data(sblock);
disk_super->held_root = cpu_to_le64(held_root);
dm_bm_unlock(sblock);
pmd->need_commit = 1;
return 0;
}
int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
int r;
down_write(&pmd->root_lock);
r = __reserve_metadata_snap(pmd);
up_write(&pmd->root_lock);
return r;
}
static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
int r;
struct thin_disk_superblock *disk_super;
struct dm_block *sblock;
struct dm_block *sblock, *copy;
dm_block_t held_root;
r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
&sb_validator, &sblock);
if (r)
return r;
disk_super = dm_block_data(sblock);
held_root = le64_to_cpu(disk_super->held_root);
disk_super->held_root = cpu_to_le64(0);
pmd->need_commit = 1;
dm_bm_unlock(sblock);
if (!held_root) {
DMWARN("No pool metadata snapshot found: nothing to release.");
return -EINVAL;
}
r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
if (r)
return r;
disk_super = dm_block_data(copy);
dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
dm_sm_dec_block(pmd->metadata_sm, held_root);
return dm_tm_unlock(pmd->tm, copy);
}
int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
int r;
down_write(&pmd->root_lock);
r = __release_metadata_snap(pmd);
up_write(&pmd->root_lock);
return r;
}
static int __get_metadata_snap(struct dm_pool_metadata *pmd,
dm_block_t *result)
{
int r;
struct thin_disk_superblock *disk_super;
struct dm_block *sblock;
r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
&sb_validator, &sblock);
if (r)
return r;
disk_super = dm_block_data(sblock);
*result = le64_to_cpu(disk_super->held_root);
return dm_bm_unlock(sblock);
}
int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
dm_block_t *result)
int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
dm_block_t *result)
{
int r;
down_read(&pmd->root_lock);
r = __get_held_metadata_root(pmd, result);
r = __get_metadata_snap(pmd, result);
up_read(&pmd->root_lock);
return r;
......
......@@ -90,11 +90,18 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
/*
* Hold/get root for userspace transaction.
*
* The metadata snapshot is a copy of the current superblock (minus the
* space maps). Userland can access the data structures for READ
* operations only. A small performance hit is incurred by providing this
* copy of the metadata to userland due to extra copy-on-write operations
* on the metadata nodes. Release this as soon as you finish with it.
*/
int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd);
int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd);
int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd);
int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
dm_block_t *result);
int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
dm_block_t *result);
/*
* Actions on a single virtual device.
......
......@@ -111,7 +111,7 @@ struct cell_key {
dm_block_t block;
};
struct cell {
struct dm_bio_prison_cell {
struct hlist_node list;
struct bio_prison *prison;
struct cell_key key;
......@@ -141,6 +141,8 @@ static uint32_t calc_nr_buckets(unsigned nr_cells)
return n;
}
static struct kmem_cache *_cell_cache;
/*
* @nr_cells should be the number of cells you want in use _concurrently_.
* Don't confuse it with the number of distinct keys.
......@@ -157,8 +159,7 @@ static struct bio_prison *prison_create(unsigned nr_cells)
return NULL;
spin_lock_init(&prison->lock);
prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
sizeof(struct cell));
prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
if (!prison->cell_pool) {
kfree(prison);
return NULL;
......@@ -194,10 +195,10 @@ static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
(lhs->block == rhs->block);
}
static struct cell *__search_bucket(struct hlist_head *bucket,
struct cell_key *key)
static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
struct cell_key *key)
{
struct cell *cell;
struct dm_bio_prison_cell *cell;
struct hlist_node *tmp;
hlist_for_each_entry(cell, tmp, bucket, list)
......@@ -214,12 +215,12 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
struct bio *inmate, struct cell **ref)
struct bio *inmate, struct dm_bio_prison_cell **ref)
{
int r = 1;
unsigned long flags;
uint32_t hash = hash_key(prison, key);
struct cell *cell, *cell2;
struct dm_bio_prison_cell *cell, *cell2;
BUG_ON(hash > prison->nr_buckets);
......@@ -273,7 +274,7 @@ static int bio_detain(struct bio_prison *prison, struct cell_key *key,
/*
* @inmates must have been initialised prior to this call
*/
static void __cell_release(struct cell *cell, struct bio_list *inmates)
static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
{
struct bio_prison *prison = cell->prison;
......@@ -287,7 +288,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
mempool_free(cell, prison->cell_pool);
}
static void cell_release(struct cell *cell, struct bio_list *bios)
static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
{
unsigned long flags;
struct bio_prison *prison = cell->prison;
......@@ -303,7 +304,7 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
* bio may be in the cell. This function releases the cell, and also does
* a sanity check.
*/
static void __cell_release_singleton(struct cell *cell, struct bio *bio)
static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
BUG_ON(cell->holder != bio);
BUG_ON(!bio_list_empty(&cell->bios));
......@@ -311,7 +312,7 @@ static void __cell_release_singleton(struct cell *cell, struct bio *bio)
__cell_release(cell, NULL);
}
static void cell_release_singleton(struct cell *cell, struct bio *bio)
static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
unsigned long flags;
struct bio_prison *prison = cell->prison;
......@@ -324,7 +325,8 @@ static void cell_release_singleton(struct cell *cell, struct bio *bio)
/*
* Sometimes we don't want the holder, just the additional bios.
*/
static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
struct bio_prison *prison = cell->prison;
......@@ -334,7 +336,8 @@ static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates
mempool_free(cell, prison->cell_pool);
}
static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
unsigned long flags;
struct bio_prison *prison = cell->prison;
......@@ -344,7 +347,7 @@ static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
spin_unlock_irqrestore(&prison->lock, flags);
}
static void cell_error(struct cell *cell)
static void cell_error(struct dm_bio_prison_cell *cell)
{
struct bio_prison *prison = cell->prison;
struct bio_list bios;
......@@ -491,7 +494,7 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
* also provides the interface for creating and destroying internal
* devices.
*/
struct new_mapping;
struct dm_thin_new_mapping;
struct pool_features {
unsigned zero_new_blocks:1;
......@@ -537,7 +540,7 @@ struct pool {
struct deferred_set shared_read_ds;
struct deferred_set all_io_ds;
struct new_mapping *next_mapping;
struct dm_thin_new_mapping *next_mapping;
mempool_t *mapping_pool;
mempool_t *endio_hook_pool;
};
......@@ -630,11 +633,11 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
/*----------------------------------------------------------------*/
struct endio_hook {
struct dm_thin_endio_hook {
struct thin_c *tc;
struct deferred_entry *shared_read_entry;
struct deferred_entry *all_io_entry;
struct new_mapping *overwrite_mapping;
struct dm_thin_new_mapping *overwrite_mapping;
};
static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
......@@ -647,7 +650,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
bio_list_init(master);
while ((bio = bio_list_pop(&bios))) {
struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
if (h->tc == tc)
bio_endio(bio, DM_ENDIO_REQUEUE);
else
......@@ -736,7 +740,7 @@ static void wake_worker(struct pool *pool)
/*
* Bio endio functions.
*/
struct new_mapping {
struct dm_thin_new_mapping {
struct list_head list;
unsigned quiesced:1;
......@@ -746,7 +750,7 @@ struct new_mapping {
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
struct cell *cell, *cell2;
struct dm_bio_prison_cell *cell, *cell2;
int err;
/*
......@@ -759,7 +763,7 @@ struct new_mapping {
bio_end_io_t *saved_bi_end_io;
};
static void __maybe_add_mapping(struct new_mapping *m)
static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
struct pool *pool = m->tc->pool;
......@@ -772,7 +776,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
static void copy_complete(int read_err, unsigned long write_err, void *context)
{
unsigned long flags;
struct new_mapping *m = context;
struct dm_thin_new_mapping *m = context;
struct pool *pool = m->tc->pool;
m->err = read_err || write_err ? -EIO : 0;
......@@ -786,8 +790,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
struct new_mapping *m = h->overwrite_mapping;
struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
struct dm_thin_new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
m->err = err;
......@@ -811,7 +815,7 @@ static void overwrite_endio(struct bio *bio, int err)
/*
* This sends the bios in the cell back to the deferred_bios list.
*/
static void cell_defer(struct thin_c *tc, struct cell *cell,
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
dm_block_t data_block)
{
struct pool *pool = tc->pool;
......@@ -828,7 +832,7 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
* Same as cell_defer above, except it omits one particular detainee,
* a write bio that covers the block and has already been processed.
*/
static void cell_defer_except(struct thin_c *tc, struct cell *cell)
static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
struct bio_list bios;
struct pool *pool = tc->pool;
......@@ -843,7 +847,7 @@ static void cell_defer_except(struct thin_c *tc, struct cell *cell)
wake_worker(pool);
}
static void process_prepared_mapping(struct new_mapping *m)
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
struct bio *bio;
......@@ -886,7 +890,7 @@ static void process_prepared_mapping(struct new_mapping *m)
mempool_free(m, tc->pool->mapping_pool);
}
static void process_prepared_discard(struct new_mapping *m)
static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
int r;
struct thin_c *tc = m->tc;
......@@ -909,11 +913,11 @@ static void process_prepared_discard(struct new_mapping *m)
}
static void process_prepared(struct pool *pool, struct list_head *head,
void (*fn)(struct new_mapping *))
void (*fn)(struct dm_thin_new_mapping *))
{
unsigned long flags;
struct list_head maps;
struct new_mapping *m, *tmp;
struct dm_thin_new_mapping *m, *tmp;
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
......@@ -957,9 +961,9 @@ static int ensure_next_mapping(struct pool *pool)
return pool->next_mapping ? 0 : -ENOMEM;
}
static struct new_mapping *get_next_mapping(struct pool *pool)
static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
struct new_mapping *r = pool->next_mapping;
struct dm_thin_new_mapping *r = pool->next_mapping;
BUG_ON(!pool->next_mapping);
......@@ -971,11 +975,11 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_dev *origin, dm_block_t data_origin,
dm_block_t data_dest,
struct cell *cell, struct bio *bio)
struct dm_bio_prison_cell *cell, struct bio *bio)
{
int r;
struct pool *pool = tc->pool;
struct new_mapping *m = get_next_mapping(pool);
struct dm_thin_new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
m->quiesced = 0;
......@@ -997,7 +1001,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
if (io_overwrites_block(pool, bio)) {
struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
......@@ -1025,7 +1030,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_origin, dm_block_t data_dest,
struct cell *cell, struct bio *bio)
struct dm_bio_prison_cell *cell, struct bio *bio)
{
schedule_copy(tc, virt_block, tc->pool_dev,
data_origin, data_dest, cell, bio);
......@@ -1033,18 +1038,18 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_dest,
struct cell *cell, struct bio *bio)
struct dm_bio_prison_cell *cell, struct bio *bio)
{
schedule_copy(tc, virt_block, tc->origin_dev,
virt_block, data_dest, cell, bio);
}