Commit e893123c authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (49 commits)
  ext4: Avoid corrupting the uninitialized bit in the extent during truncate
  ext4: Don't treat a truncation of a zero-length file as replace-via-truncate
  ext4: fix dx_map_entry to support 256k directory blocks
  ext4: truncate the file properly if we fail to copy data from userspace
  ext4: Avoid leaking blocks after a block allocation failure
  ext4: Change all super.c messages to print the device
  ext4: Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle()
  ext4: super.c whitespace cleanup
  jbd2: Fix minor typos in comments in fs/jbd2/journal.c
  ext4: Clean up calls to ext4_get_group_desc()
  ext4: remove unused function __ext4_write_dirty_metadata
  ext2: Fix memory leak in ext2_fill_super() in case of a failed mount
  ext3: Fix memory leak in ext3_fill_super() in case of a failed mount
  ext4: Fix memory leak in ext4_fill_super() in case of a failed mount
  ext4: down i_data_sem only for read when walking tree for fiemap
  ext4: Add a comprehensive block validity check to ext4_get_blocks()
  ext4: Clean up ext4_get_blocks() so it does not depend on bh_result->b_state
  ext4: Merge ext4_da_get_block_write() into mpage_da_map_blocks()
  ext4: Add BUG_ON debugging checks to noalloc_get_block_write()
  ext4: Add documentation to the ext4_*get_block* functions
  ...
parents 27951daa a41f2071
......@@ -2935,6 +2935,8 @@ int submit_bh(int rw, struct buffer_head * bh)
BUG_ON(!buffer_locked(bh));
BUG_ON(!buffer_mapped(bh));
BUG_ON(!bh->b_end_io);
BUG_ON(buffer_delay(bh));
BUG_ON(buffer_unwritten(bh));
/*
* Mask in barrier bit for a write (could be either a WRITE or a
......
......@@ -1093,6 +1093,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
failed_sbi:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
return ret;
}
......
......@@ -2021,6 +2021,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
brelse(bh);
out_fail:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
lock_kernel();
return ret;
......
......@@ -5,8 +5,8 @@
obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
......
......@@ -19,7 +19,6 @@
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "group.h"
#include "mballoc.h"
/*
......@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
ext4_group_t block_group, struct ext4_group_desc *gdp)
{
int bit, bit_max;
ext4_group_t ngroups = ext4_get_groups_count(sb);
unsigned free_blocks, group_blocks;
struct ext4_sb_info *sbi = EXT4_SB(sb);
......@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
if (block_group == ngroups - 1) {
/*
* Even though mke2fs always initialize first and last group
* if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
......@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
group_blocks = ext4_blocks_count(sbi->s_es) -
le32_to_cpu(sbi->s_es->s_first_data_block) -
(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
} else {
group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
}
......@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
{
unsigned int group_desc;
unsigned int offset;
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (block_group >= sbi->s_groups_count) {
if (block_group >= ngroups) {
ext4_error(sb, "ext4_get_group_desc",
"block_group >= groups_count - "
"block_group = %u, groups_count = %u",
block_group, sbi->s_groups_count);
block_group, ngroups);
return NULL;
}
smp_rmb();
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
......@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
unlock_buffer(bh);
return bh;
}
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
/*
* if not uninit if bh is uptodate,
......@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
down_write(&grp->alloc_sem);
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
bit + i, bitmap_bh->b_data)) {
ext4_error(sb, __func__,
"bit already cleared for block %llu",
......@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
blocks_freed++;
}
}
spin_lock(sb_bgl_lock(sbi, block_group));
ext4_lock_group(sb, block_group);
blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
spin_unlock(sb_bgl_lock(sbi, block_group));
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
if (sbi->s_log_groups_per_flex) {
......@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
ext4_group_t i;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
......@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
bitmap_count = 0;
gdp = NULL;
smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
......@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
return bitmap_count;
#else
desc_count = 0;
smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
......
/*
* linux/fs/ext4/block_validity.c
*
* Copyright (C) 2009
* Theodore Ts'o (tytso@mit.edu)
*
* Track which blocks in the filesystem are metadata blocks that
* should never be used as data blocks by files or directories.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "ext4.h"
struct ext4_system_zone {
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
};
static struct kmem_cache *ext4_system_zone_cachep;
int __init init_ext4_system_zone(void)
{
ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
SLAB_RECLAIM_ACCOUNT);
if (ext4_system_zone_cachep == NULL)
return -ENOMEM;
return 0;
}
void exit_ext4_system_zone(void)
{
kmem_cache_destroy(ext4_system_zone_cachep);
}
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
if ((entry1->start_blk + entry1->count) == entry2->start_blk)
return 1;
return 0;
}
/*
* Mark a range of blocks as belonging to the "system zone" --- that
* is, filesystem metadata blocks which should never be used by
* inodes.
*/
static int add_system_zone(struct ext4_sb_info *sbi,
ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *new_entry = NULL, *entry;
struct rb_node **n = &sbi->system_blks.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
while (*n) {
parent = *n;
entry = rb_entry(parent, struct ext4_system_zone, node);
if (start_blk < entry->start_blk)
n = &(*n)->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
if (start_blk + count > (entry->start_blk +
entry->count))
entry->count = (start_blk + count -
entry->start_blk);
new_node = *n;
new_entry = rb_entry(new_node, struct ext4_system_zone,
node);
break;
}
}
if (!new_entry) {
new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
GFP_KERNEL);
if (!new_entry)
return -ENOMEM;
new_entry->start_blk = start_blk;
new_entry->count = count;
new_node = &new_entry->node;
rb_link_node(new_node, parent, n);
rb_insert_color(new_node, &sbi->system_blks);
}
/* Can we merge to the left? */
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(entry, new_entry)) {
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
rb_erase(node, &sbi->system_blks);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
/* Can we merge to the right? */
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
rb_erase(node, &sbi->system_blks);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
return 0;
}
static void debug_print_tree(struct ext4_sb_info *sbi)
{
struct rb_node *node;
struct ext4_system_zone *entry;
int first = 1;
printk(KERN_INFO "System zones: ");
node = rb_first(&sbi->system_blks);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk("%s%llu-%llu", first ? "" : ", ",
entry->start_blk, entry->start_blk + entry->count - 1);
first = 0;
node = rb_next(node);
}
printk("\n");
}
int ext4_setup_system_zone(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp;
ext4_group_t i;
int flex_size = ext4_flex_bg_size(sbi);
int ret;
if (!test_opt(sb, BLOCK_VALIDITY)) {
if (EXT4_SB(sb)->system_blks.rb_node)
ext4_release_system_zone(sb);
return 0;
}
if (EXT4_SB(sb)->system_blks.rb_node)
return 0;
for (i=0; i < ngroups; i++) {
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
sbi->s_gdb_count + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
if (ret)
return ret;
ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
if (ret)
return ret;
ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
sbi->s_itb_per_group);
if (ret)
return ret;
}
if (test_opt(sb, DEBUG))
debug_print_tree(EXT4_SB(sb));
return 0;
}
/* Called when the filesystem is unmounted */
void ext4_release_system_zone(struct super_block *sb)
{
struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
struct rb_node *parent;
struct ext4_system_zone *entry;
while (n) {
/* Do the node's children first */
if (n->rb_left) {
n = n->rb_left;
continue;
}
if (n->rb_right) {
n = n->rb_right;
continue;
}
/*
* The node has no children; free it, and then zero
* out parent's link to it. Finally go to the
* beginning of the loop and try to free the parent
* node.
*/
parent = rb_parent(n);
entry = rb_entry(n, struct ext4_system_zone, node);
kmem_cache_free(ext4_system_zone_cachep, entry);
if (!parent)
EXT4_SB(sb)->system_blks.rb_node = NULL;
else if (parent->rb_left == n)
parent->rb_left = NULL;
else if (parent->rb_right == n)
parent->rb_right = NULL;
n = parent;
}
EXT4_SB(sb)->system_blks.rb_node = NULL;
}
/*
* Returns 1 if the passed-in block region (start_blk,
* start_blk+count) is valid; 0 if some part of the block region
* overlaps with filesystem metadata blocks.
*/
int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *entry;
struct rb_node *n = sbi->system_blks.rb_node;
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
(start_blk + count > ext4_blocks_count(sbi->s_es)))
return 0;
while (n) {
entry = rb_entry(n, struct ext4_system_zone, node);
if (start_blk + count - 1 < entry->start_blk)
n = n->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
else
return 0;
}
return 1;
}
......@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
0, 0, 0);
err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
......
......@@ -21,7 +21,14 @@
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include "ext4_i.h"
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
/*
* The fourth extended filesystem constants/structures
......@@ -46,6 +53,19 @@
#define ext4_debug(f, a...) do {} while (0)
#endif
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
/* data type for filesystem-wide blocks number */
typedef unsigned long long ext4_fsblk_t;
/* data type for file logical block number */
typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 1
/* blocks already reserved */
......@@ -179,9 +199,6 @@ struct flex_groups {
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
#ifdef __KERNEL__
#include "ext4_sb.h"
#endif
/*
* Macro-instructions used to manage group descriptors
*/
......@@ -297,10 +314,23 @@ struct ext4_new_group_data {
};
/*
* Following is used by preallocation code to tell get_blocks() that we
* want uninitialzed extents.
* Flags used by ext4_get_blocks()
*/
#define EXT4_CREATE_UNINITIALIZED_EXT 2
/* Allocate any needed blocks and/or convert an unitialized
extent to be an initialized ext4 */
#define EXT4_GET_BLOCKS_CREATE 0x0001
/* Request the creation of an unitialized extent */
#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
/* Caller is from the delayed allocation writeout path,
so set the magic i_delalloc_reserve_flag after taking the
inode allocation semaphore for */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
/*
* ioctl commands
......@@ -515,6 +545,110 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
/*
* storage for cached extent
*/
struct ext4_ext_cache {
ext4_fsblk_t ec_start;
ext4_lblk_t ec_block;
__u32 ec_len; /* must be 32bit to return holes */
__u32 ec_type;
};
/*
* fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
__u32 i_flags;
ext4_fsblk_t i_file_acl;
__u32 i_dtime;
/*
* i_block_group is the number of the block group which contains
* this file's inode. Constant across the lifetime of the inode,
* it is ued for making block allocation decisions - we try to
* place a file's data blocks near its inode block, and new inodes
* near to their parent directory's inode.
*/
ext4_group_t i_block_group;
__u32 i_state; /* Dynamic state flags for ext4 */
ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
struct list_head i_orphan; /* unlinked but open inodes */
/*
* i_disksize keeps track of what the inode size is ON DISK, not
* in memory. During truncate, i_size is set to the new size by
* the VFS prior to calling ext4_truncate(), but the filesystem won't
* set i_disksize to 0 until the truncate is actually under way.
*
* The intent is that i_disksize always represents the blocks which
* are used by this file. This allows recovery to restart truncate
* on orphans if we crash during truncate. We actually write i_disksize
* into the on-disk inode when writing inodes out, instead of i_size.
*
* The only time when i_disksize and i_size may be different is when
* a truncate is in progress. The only things which change i_disksize
* are ext4_get_block (growth) and ext4_truncate (shrinkth).
*/
loff_t i_disksize;
/*
* i_data_sem is for serialising ext4_truncate() against
* ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
* data tree are chopped off during truncate. We can't do that in
* ext4 because whenever we perform intermediate commits during
* truncate, the inode and all the metadata blocks *must* be in a
* consistent state which allows truncation of the orphans to restart
* during recovery. Hence we must fix the get_block-vs-truncate race
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
struct jbd2_inode jinode;
struct ext4_ext_cache i_cached_extent;
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
*/
struct timespec i_crtime;
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
/* ialloc */
ext4_group_t i_last_alloc_group;
/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
unsigned int i_allocated_meta_blocks;
unsigned short i_delalloc_reserved_flag;
/* on-disk additional length */
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
};
/*
* File system states
*/
......@@ -560,6 +694,7 @@ do { \
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
......@@ -689,6 +824,137 @@ struct ext4_super_block {
};
#ifdef __KERNEL__
/*
* fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
unsigned long s_inodes_per_block;/* Number of inodes per block */
unsigned long s_blocks_per_group;/* Number of blocks in a group */
unsigned long s_inodes_per_group;/* Number of inodes in a group */
unsigned long s_itb_per_group; /* Number of inode table blocks per group */
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */