// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	if (tree->ops && tree->ops->check_extent_io_range)
		tree->ops->check_extent_io_range(tree->private_data, caller,
						 start, end);
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

static void flush_write_bio(struct extent_page_data *epd);

static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	if (tree->ops)
		return tree->ops->tree_fs_info(tree->private_data);
	return NULL;
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}

void __cold extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_state_cache);
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 void *private_data)
{
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
	if (!ret)
		return prev;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->private_data, new, other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY in
 * their state field are not merged because the end_io handlers need to
 * be able to do operations on them without sleeping (or doing
 * allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
		        struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}
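
/*
 * Illustrative sketch (hypothetical helper, not used above): the adjacency
 * test merge_state() applies against the previous and next tree nodes.  Two
 * states are only merged when they touch exactly and carry identical bits:
 *
 *	static inline bool states_mergeable(const struct extent_state *prev,
 *					    const struct extent_state *next)
 *	{
 *		return prev->end + 1 == next->start &&
 *		       prev->state == next->state;
 *	}
 */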

static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, unsigned *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->private_data, state, bits);
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->private_data, state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is not inserted and the caller is responsible for freeing it.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start)
		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
		       end, start);
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}
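
/*
 * Hypothetical caller sketch (not part of this file's call graph): insert a
 * preallocated state for [start, end] while holding the tree lock, and treat
 * -EEXIST as tree corruption because the caller is expected to have searched
 * for an overlapping entry first:
 *
 *	err = insert_state(tree, prealloc, start, end, &p, &parent,
 *			   &bits, changeset);
 *	if (err)
 *		extent_io_tree_panic(tree, err);
 *	prealloc = NULL;
 */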

static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->private_data, orig, split);
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}
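
/*
 * Illustrative sketch (hypothetical values): splitting [0, 8191] at 4096
 * under the tree lock leaves two states behind, with 'prealloc' taking the
 * front half and 'orig' keeping the tail:
 *
 *	err = split_state(tree, orig, prealloc, 4096);
 *	// on success: prealloc covers [0, 4095], orig covers [4096, 8191],
 *	// and both carry the bits orig had before the split.
 */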

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}
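
/*
 * Illustrative sketch (hypothetical loop): because clear_state_bit() returns
 * the state that follows the one it just handled (or NULL), callers such as
 * __clear_extent_bit() can walk a range without re-searching the tree:
 *
 *	while (state && state->start <= end)
 *		state = clear_state_bit(tree, state, &bits, wake, changeset);
 */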

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err,
		    "Locking error: Extent tree was modified by another thread while locked.");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			      unsigned bits, int wake, int delete,
			      struct extent_state **cached_state,
			      gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care about allocation failure here because we might
		 * end up not needing the pre-allocated extent state at all,
		 * which is the case if the extent states already in the tree
		 * cover our input range and don't extend beyond it.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}
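
/*
 * Hypothetical usage sketch: drop EXTENT_DELALLOC from a byte range, waking
 * any waiters and letting a cached state short-circuit the tree search.
 * GFP_NOFS keeps the optional preallocation from recursing into the fs:
 *
 *	ret = __clear_extent_bit(tree, start, end, EXTENT_DELALLOC,
 *				 1, 0, &cached_state, GFP_NOFS, NULL);
 */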

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}
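
/*
 * Hypothetical usage sketch: block until no state in [start, end] still has
 * EXTENT_LOCKED set.  The tree lock is dropped while sleeping, so the range
 * must be revalidated by the caller if it needs a stable view afterwards:
 *
 *	wait_extent_bit(tree, start, end, EXTENT_LOCKED);
 */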

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_IOBITS | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */
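/*
 * Hypothetical caller sketch: ask for EXTENT_LOCKED as an exclusive bit so
 * that an already-locked subrange makes the call fail with -EEXIST and
 * report where the conflict starts:
 *
 *	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 *			       &failed_start, cached_state, GFP_NOFS, NULL);
 *	if (err == -EEXIST)
 *		// wait on failed_start to be unlocked, then retry
 */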

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

	bits |= EXTENT_FIRST_DELALLOC;
again: