// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

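/*
 * State carried through the direct IO write path: the outstanding space
 * reservation and the part of the ordered extent range that has not yet
 * been submitted, so error paths know what to clean up.
 */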
struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
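/* Map (mode & S_IFMT) to the BTRFS_FT_* type stored in directory entries. */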
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: caller must ensure that when an error happens, it cannot call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 const u64 offset,
						 const u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}
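	/*
	 * The caller already ran the writepage_end_io_hook() cleanup for the
	 * first page of the range (see the comment above), so start the
	 * ordered extent accounting one page further in.
	 */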
	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
					    bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

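/*
 * Initialize the security bits of a new inode: inherit POSIX ACLs from the
 * parent directory and set up the security xattr.
 */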
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

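	/* Copy the (possibly compressed) data into the inline extent item. */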
	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

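	/*
	 * Inline extents must start at file offset 0, end within the first
	 * sector, cover the tail of the file and fit the inline size limits;
	 * anything else falls back to a regular extent (return 1).
	 */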
	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space: an inline extent won't
	 * count as a data extent, so free it directly here.  At reserve
	 * time the space is always aligned to the page size, so just free
	 * one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

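/*
 * One (possibly compressed) extent produced by the first phase of
 * compressed writeback, queued for submission in the second phase.
 */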
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

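/*
 * One delalloc range handed to the async compression worker, together
 * with the async_extents carved out of it.
 */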
struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct btrfs_work work;
};

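/* Queue a range (compressed or not) on the async_cow list for phase two. */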
static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

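/*
 * Decide whether a range should go through compression, based on mount
 * options, inode flags and, in the common case, the compression heuristic.
 */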
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

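	/* Compress at most BTRFS_MAX_UNCOMPRESSED bytes of input per pass. */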
	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* let's try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(inode, start, end, 0,
						    BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so that the allocator
		 * does sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  Redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

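/* Drop the page references an async_extent holds and free its page array. */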
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
					 async_extent->start +
					 async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fall back to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_cow->write_flags)) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;