Commit f758eeab authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Wu Fengguang

writeback: split inode_wb_list_lock into bdi_writeback.list_lock

Split the global inode_wb_list_lock into a per-bdi_writeback list_lock,
as it's currently the most contended lock in the system for metadata
heavy workloads.  It won't help for single-filesystem workloads for
which we'll need the I/O-less balance_dirty_pages, but at least we
can dedicate a cpu to spinning on each bdi now for larger systems.

Based on earlier patches from Nick Piggin and Dave Chinner.

It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
                      inode_wb_list_lock:         42590          44433           0.12         147.74      144127.35         252274         886792           0.08         121.34      917211.23
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             34          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock          12893          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock          10702          [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             19          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock           5550          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock           8511          [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157

2.6.39-rc3 + patch:
                &(&wb->list_lock)->rlock:         11383          11657           0.14         151.69       40429.51          90825         527918           0.11         145.90      556843.37
                ------------------------
                &(&wb->list_lock)->rlock             10          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           1493          [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
                &(&wb->list_lock)->rlock           3652          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
                &(&wb->list_lock)->rlock           1412          [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
                ------------------------
                &(&wb->list_lock)->rlock              3          [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
                &(&wb->list_lock)->rlock              6          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           2061          [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
                &(&wb->list_lock)->rlock           2629          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f

hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarHugh Dickins <hughd@google.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarWu Fengguang <fengguang.wu@intel.com>
parent 424b351f
...@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) ...@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
{ {
return &BDEV_I(inode)->bdev; return &BDEV_I(inode)->bdev;
} }
EXPORT_SYMBOL(I_BDEV); EXPORT_SYMBOL(I_BDEV);
/* /*
* move the inode from it's current bdi to the a new bdi. if the inode is dirty * Move the inode from its current bdi to a new bdi. If the inode is dirty we
* we need to move it onto the dirty list of @dst so that the inode is always * need to move it onto the dirty list of @dst so that the inode is always on
* on the right list. * the right list.
*/ */
static void bdev_inode_switch_bdi(struct inode *inode, static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst) struct backing_dev_info *dst)
{ {
spin_lock(&inode_wb_list_lock); struct backing_dev_info *old = inode->i_data.backing_dev_info;
if (unlikely(dst == old)) /* deadlock avoidance */
return;
bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst; inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY) if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty); list_move(&inode->i_wb_list, &dst->wb.b_dirty);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock); spin_unlock(&old->wb.list_lock);
spin_unlock(&dst->wb.list_lock);
} }
static sector_t max_block(struct block_device *bdev) static sector_t max_block(struct block_device *bdev)
......
This diff is collapsed.
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
* inode_lru, inode->i_lru * inode_lru, inode->i_lru
* inode_sb_list_lock protects: * inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list * sb->s_inodes, inode->i_sb_list
* inode_wb_list_lock protects: * bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
* inode_hash_lock protects: * inode_hash_lock protects:
* inode_hashtable, inode->i_hash * inode_hashtable, inode->i_hash
...@@ -48,7 +48,7 @@ ...@@ -48,7 +48,7 @@
* inode->i_lock * inode->i_lock
* inode_lru_lock * inode_lru_lock
* *
* inode_wb_list_lock * bdi->wb.list_lock
* inode->i_lock * inode->i_lock
* *
* inode_hash_lock * inode_hash_lock
...@@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru); ...@@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock); static DEFINE_SPINLOCK(inode_lru_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
/* /*
* iprune_sem provides exclusion between the icache shrinking and the * iprune_sem provides exclusion between the icache shrinking and the
......
...@@ -57,6 +57,7 @@ struct bdi_writeback { ...@@ -57,6 +57,7 @@ struct bdi_writeback {
struct list_head b_dirty; /* dirty inodes */ struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */ struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */ struct list_head b_more_io; /* parked for more writeback */
spinlock_t list_lock; /* protects the b_* lists */
}; };
struct backing_dev_info { struct backing_dev_info {
...@@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data); ...@@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data);
int bdi_has_dirty_io(struct backing_dev_info *bdi); int bdi_has_dirty_io(struct backing_dev_info *bdi);
void bdi_arm_supers_timer(void); void bdi_arm_supers_timer(void);
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
extern spinlock_t bdi_lock; extern spinlock_t bdi_lock;
extern struct list_head bdi_list; extern struct list_head bdi_list;
......
...@@ -9,8 +9,6 @@ ...@@ -9,8 +9,6 @@
struct backing_dev_info; struct backing_dev_info;
extern spinlock_t inode_wb_list_lock;
/* /*
* fs/fs-writeback.c * fs/fs-writeback.c
*/ */
......
...@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; ...@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *); static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long); static void sync_supers_timer_fn(unsigned long);
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
if (wb1 < wb2) {
spin_lock(&wb1->list_lock);
spin_lock_nested(&wb2->list_lock, 1);
} else {
spin_lock(&wb2->list_lock);
spin_lock_nested(&wb1->list_lock, 1);
}
}
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
...@@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) ...@@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode; struct inode *inode;
nr_dirty = nr_io = nr_more_io = 0; nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_wb_list_lock); spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list) list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++; nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list) list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++; nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list) list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++; nr_more_io++;
spin_unlock(&inode_wb_list_lock); spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh); global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
...@@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) ...@@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_more_io);
spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
} }
...@@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi) ...@@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) { if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb; struct bdi_writeback *dst = &default_backing_dev_info.wb;
spin_lock(&inode_wb_list_lock); bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
spin_unlock(&inode_wb_list_lock); spin_unlock(&bdi->wb.list_lock);
spin_unlock(&dst->list_lock);
} }
bdi_unregister(bdi); bdi_unregister(bdi);
......
...@@ -81,7 +81,7 @@ ...@@ -81,7 +81,7 @@
* ->i_mutex * ->i_mutex
* ->i_alloc_sem (various) * ->i_alloc_sem (various)
* *
* inode_wb_list_lock * bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c) * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode) * ->mapping->tree_lock (__sync_single_inode)
* *
...@@ -99,9 +99,9 @@ ...@@ -99,9 +99,9 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty) * ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty)
* inode_wb_list_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty)
* inode_wb_list_lock (zap_pte_range->set_page_dirty) * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers) * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
* *
......
...@@ -32,11 +32,11 @@ ...@@ -32,11 +32,11 @@
* mmlist_lock (in mmput, drain_mmlist and others) * mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers) * mapping->private_lock (in __set_page_dirty_buffers)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty) * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c) * sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty, * mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock, * in arch-dependent flush_dcache_mmap_lock,
* within inode_wb_list_lock in __sync_single_inode) * within bdi.wb->list_lock in __sync_single_inode)
* *
* (code doesn't rely on that order so it could be switched around) * (code doesn't rely on that order so it could be switched around)
* ->tasklist_lock * ->tasklist_lock
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment