    /*
     * Copyright (C) 2007 Oracle.  All rights reserved.
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public
     * License v2 as published by the Free Software Foundation.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     * General Public License for more details.
     *
     * You should have received a copy of the GNU General Public
     * License along with this program; if not, write to the
     * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     * Boston, MA 021110-1307, USA.
     */
    
    
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/swap.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
    #include <linux/freezer.h>
    
#include <linux/crc32c.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <asm/unaligned.h>
    #include "compat.h"
    
    #include "ctree.h"
    #include "disk-io.h"
    
    #include "transaction.h"
    
    #include "btrfs_inode.h"
    
    #include "print-tree.h"
    
    #include "locking.h"
    
    #include "free-space-cache.h"
    
    #include "inode-map.h"
    
    static struct extent_io_ops btree_extent_io_ops;
    
    static void end_workqueue_fn(struct btrfs_work *work);
    
    static void free_fs_root(struct btrfs_root *root);
    
    static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
    				    int read_only);
    static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
    static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
    static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
    				      struct btrfs_root *root);
    static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
    static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
    static int btrfs_destroy_marked_extents(struct btrfs_root *root,
    					struct extent_io_tree *dirty_pages,
    					int mark);
    static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
    				       struct extent_io_tree *pinned_extents);
    static int btrfs_cleanup_transaction(struct btrfs_root *root);
    
    /*
     * end_io_wq structs are used to do processing in task context when an IO is
     * complete.  This is used during reads to verify checksums, and it is used
     * by writes to insert metadata for new file extents after IO is complete.
     */
    
struct end_io_wq {
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
	int error;
	int metadata;
	struct list_head list;
	struct btrfs_work work;
};
    
    /*
     * async submit bios are used to offload expensive checksumming
     * onto the worker threads.  They checksum file and metadata bios
     * just before they are sent down the IO stack.
     */
    
    struct async_submit_bio {
    	struct inode *inode;
    	struct bio *bio;
    	struct list_head list;
    
    	extent_submit_bio_hook_t *submit_bio_start;
    	extent_submit_bio_hook_t *submit_bio_done;
    
    	int rw;
    	int mirror_num;
    
    	unsigned long bio_flags;
    
    	/*
    	 * bio_offset is optional, can be used if the pages in the bio
    	 * can't tell us where in the file the bio should go
    	 */
	u64 bio_offset;
	struct btrfs_work work;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->objectid.  This ensures that all special purpose roots
 * have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
    #ifdef CONFIG_DEBUG_LOCK_ALLOC
    # if BTRFS_MAX_LEVEL != 8
    #  error
    # endif
    
    
    static struct btrfs_lockdep_keyset {
    	u64			id;		/* root objectid */
    	const char		*name_stem;	/* lock name stem */
    	char			names[BTRFS_MAX_LEVEL + 1][20];
    	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];
    } btrfs_lockdep_keysets[] = {
    	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
    	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
    	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
    	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
    	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
    	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
    	{ .id = BTRFS_ORPHAN_OBJECTID,		.name_stem = "orphan"	},
    	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
    	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
    	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
	{ .id = 0,				.name_stem = "tree"	},
};
    
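/*
 * fill in the per-level lock class names, e.g. "btrfs-extent-02" for
 * level 2 of the extent tree
 */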
    void __init btrfs_init_lockdep(void)
    {
    	int i, j;
    
    	/* initialize lockdep class names */
    	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
    		struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
    
    		for (j = 0; j < ARRAY_SIZE(ks->names); j++)
    			snprintf(ks->names[j], sizeof(ks->names[j]),
    				 "btrfs-%s-%02d", ks->name_stem, j);
    	}
    }
    
    void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
    				    int level)
    {
    	struct btrfs_lockdep_keyset *ks;
    
    	BUG_ON(level >= ARRAY_SIZE(ks->keys));
    
    	/* find the matching keyset, id 0 is the default entry */
    	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
    		if (ks->id == objectid)
    			break;
    
    	lockdep_set_class_and_name(&eb->lock,
    				   &ks->keys[level], ks->names[level]);
}
#endif

    
    /*
     * extents on the btree inode are pretty simple, there's one extent
     * that covers the entire device
     */
    
    static struct extent_map *btree_get_extent(struct inode *inode,
    
		struct page *page, size_t pg_offset, u64 start, u64 len,
		int create)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
    	struct extent_map *em;
    	int ret;
    
    
    	read_lock(&em_tree->lock);
    
    	em = lookup_extent_mapping(em_tree, start, len);
    
    	if (em) {
    		em->bdev =
    			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
    
		read_unlock(&em_tree->lock);
		goto out;
	}
    	read_unlock(&em_tree->lock);
    
    	em = alloc_extent_map();
    
    	if (!em) {
    		em = ERR_PTR(-ENOMEM);
    		goto out;
    	}
    	em->start = 0;
    
    	em->len = (u64)-1;
    
    	em->block_len = (u64)-1;
    
    	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
    
    	write_lock(&em_tree->lock);
    
    	ret = add_extent_mapping(em_tree, em);
    	if (ret == -EEXIST) {
    
		u64 failed_start = em->start;
		u64 failed_len = em->len;

		free_extent_map(em);
		em = lookup_extent_mapping(em_tree, start, len);
		if (em) {
			ret = 0;
		} else {
			em = lookup_extent_mapping(em_tree, failed_start,
						   failed_len);
			ret = -EIO;
		}
	} else if (ret) {
		free_extent_map(em);
		em = NULL;
	}
	write_unlock(&em_tree->lock);

	if (ret)
		em = ERR_PTR(ret);
out:
	return em;
}

    u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
    {
    
    	return crc32c(seed, data, len);
    
    }
    
    void btrfs_csum_final(u32 crc, char *result)
    {
    
	put_unaligned_le32(~crc, result);
}

    /*
     * compute the csum for a btree block, and either verify it or write it
     * into the csum field of the block.
     */
    
    static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
    			   int verify)
    {
    
    	u16 csum_size =
    		btrfs_super_csum_size(&root->fs_info->super_copy);
    	char *result = NULL;
    
    	unsigned long len;
    	unsigned long cur_len;
    	unsigned long offset = BTRFS_CSUM_SIZE;
    	char *kaddr;
    	unsigned long map_start;
    	unsigned long map_len;
    	int err;
    	u32 crc = ~(u32)0;
    
    	unsigned long inline_result;
    
    
    	len = buf->len - offset;
    
    	while (len > 0) {
    
    		err = map_private_extent_buffer(buf, offset, 32,
    
    					&kaddr, &map_start, &map_len);
    
    		if (err)
    
    			return 1;
    		cur_len = min(len, map_len - (offset - map_start));
    		crc = btrfs_csum_data(root, kaddr + offset - map_start,
    				      crc, cur_len);
    		len -= cur_len;
    		offset += cur_len;
    	}
    
    	if (csum_size > sizeof(inline_result)) {
    		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
    		if (!result)
    			return 1;
    	} else {
    		result = (char *)&inline_result;
    	}
    
    
    	btrfs_csum_final(crc, result);
    
    	if (verify) {
    
		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
			u32 val;
			u32 found = 0;
			memcpy(&found, result, csum_size);
			read_extent_buffer(buf, &val, 0, csum_size);
    
    			printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
    
    				       "failed on %llu wanted %X found %X "
    				       "level %d\n",
    				       root->fs_info->sb->s_id,
    				       (unsigned long long)buf->start, val, found,
    				       btrfs_header_level(buf));
    
    			if (result != (char *)&inline_result)
    				kfree(result);
    
    			return 1;
    		}
    	} else {
    
		write_extent_buffer(buf, result, 0, csum_size);
	}
	if (result != (char *)&inline_result)
		kfree(result);
	return 0;
}

    /*
     * we can't consider a given block up to date unless the transid of the
     * block matches the transid in the parent node's pointer.  This is how we
     * detect blocks that either didn't get written at all or got written
     * in the wrong place.
     */
    
    static int verify_parent_transid(struct extent_io_tree *io_tree,
    				 struct extent_buffer *eb, u64 parent_transid)
    {
    
    	struct extent_state *cached_state = NULL;
    
    	int ret;
    
    	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
    		return 0;
    
    
    	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
    			 0, &cached_state, GFP_NOFS);
    	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
    
    	    btrfs_header_generation(eb) == parent_transid) {
    		ret = 0;
    		goto out;
    	}
    
    	printk_ratelimited("parent transid verify failed on %llu wanted %llu "
    
    		       "found %llu\n",
    		       (unsigned long long)eb->start,
    		       (unsigned long long)parent_transid,
    		       (unsigned long long)btrfs_header_generation(eb));
    
	ret = 1;
	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state, GFP_NOFS);
	return ret;
}

    /*
     * helper to read a given tree block, doing retries as required when
     * the checksums don't match and we have alternate mirrors to try.
     */
    
    static int btree_read_extent_buffer_pages(struct btrfs_root *root,
					  struct extent_buffer *eb,
					  u64 start, u64 parent_transid)
    {
    	struct extent_io_tree *io_tree;
    	int ret;
    	int num_copies = 0;
    	int mirror_num = 0;
    
    
    	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
    
    	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
    	while (1) {
    
    		ret = read_extent_buffer_pages(io_tree, eb, start,
    					       WAIT_COMPLETE,
    
    					       btree_get_extent, mirror_num);
    
    		if (!ret &&
		    !verify_parent_transid(io_tree, eb, parent_transid))
			return ret;

    		/*
    		 * This buffer's crc is fine, but its contents are corrupted, so
    		 * there is no reason to read the other copies, they won't be
    		 * any less wrong.
    		 */
    		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
    			return ret;
    
    
    		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
    					      eb->start, eb->len);
    
		if (num_copies == 1)
			return ret;

		mirror_num++;
		if (mirror_num > num_copies)
			return ret;
	}
	return -EIO;
}

/*
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 */
static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
	struct extent_io_tree *tree;
    
    	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
    
    	u64 found_start;
    	unsigned long len;
	struct extent_buffer *eb;
	int ret;

    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    
	if (page->private == EXTENT_PAGE_PRIVATE) {
		WARN_ON(1);
		goto out;
	}
	if (!page->private) {
		WARN_ON(1);
		goto out;
	}
    	len = page->private >> 2;
    
    	WARN_ON(len == 0);
    
    
    	eb = alloc_extent_buffer(tree, start, len, page);
    
    	if (eb == NULL) {
    		WARN_ON(1);
    		goto out;
    	}
    
    	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
    					     btrfs_header_generation(eb));
    
    	WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
    
    
    	found_start = btrfs_header_bytenr(eb);
    	if (found_start != start) {
    
    		WARN_ON(1);
    		goto err;
    	}
    	if (eb->first_page != page) {
    		WARN_ON(1);
    		goto err;
    	}
    	if (!PageUptodate(page)) {
    		WARN_ON(1);
    		goto err;
    
    	}
	csum_tree_block(root, eb, 0);
err:
	free_extent_buffer(eb);
    out:
    	return 0;
    }
    
    
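/*
 * return 0 if the fsid stored in the extent buffer matches this filesystem
 * or one of its seed devices, nonzero otherwise
 */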
    static int check_tree_block_fsid(struct btrfs_root *root,
    				 struct extent_buffer *eb)
    {
    	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
    	u8 fsid[BTRFS_UUID_SIZE];
    	int ret = 1;
    
    	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
    			   BTRFS_FSID_SIZE);
    	while (fs_devices) {
    		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
    			ret = 0;
    			break;
    		}
    		fs_devices = fs_devices->seed;
    	}
    	return ret;
    }
    
    
    #define CORRUPT(reason, eb, root, slot)				\
    	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
    	       "root=%llu, slot=%d\n", reason,			\
    	       (unsigned long long)btrfs_header_bytenr(eb),	\
    	       (unsigned long long)root->objectid, slot)
    
    static noinline int check_leaf(struct btrfs_root *root,
    			       struct extent_buffer *leaf)
    {
    	struct btrfs_key key;
    	struct btrfs_key leaf_key;
    	u32 nritems = btrfs_header_nritems(leaf);
    	int slot;
    
    	if (nritems == 0)
    		return 0;
    
    	/* Check the 0 item */
    	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
    	    BTRFS_LEAF_DATA_SIZE(root)) {
    		CORRUPT("invalid item offset size pair", leaf, root, 0);
    		return -EIO;
    	}
    
	/*
	 * Check to make sure each item's keys are in the correct order and
	 * their offsets make sense.  We only have to loop through nritems-1
	 * because we check the current slot against the next slot, which
	 * verifies that the next slot's offset+size makes sense and that the
	 * current slot's offset is correct.
	 */
    	for (slot = 0; slot < nritems - 1; slot++) {
    		btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
    		btrfs_item_key_to_cpu(leaf, &key, slot + 1);
    
    		/* Make sure the keys are in the right order */
    		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
    			CORRUPT("bad key order", leaf, root, slot);
    			return -EIO;
    		}
    
    		/*
    		 * Make sure the offset and ends are right, remember that the
    		 * item data starts at the end of the leaf and grows towards the
    		 * front.
    		 */
    		if (btrfs_item_offset_nr(leaf, slot) !=
    			btrfs_item_end_nr(leaf, slot + 1)) {
    			CORRUPT("slot offset bad", leaf, root, slot);
    			return -EIO;
    		}
    
    		/*
    		 * Check to make sure that we don't point outside of the leaf,
		 * just in case all the items are consistent with each other, but
    		 * all point outside of the leaf.
    		 */
    		if (btrfs_item_end_nr(leaf, slot) >
    		    BTRFS_LEAF_DATA_SIZE(root)) {
    			CORRUPT("slot end outside of leaf", leaf, root, slot);
    			return -EIO;
    		}
    	}
    
    	return 0;
    }
    
    
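/*
 * end_io hook for btree page reads: verify the block's bytenr, fsid and
 * checksum, and run basic sanity checks on leaf blocks, before the buffer
 * can be considered uptodate
 */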
    static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
    
    			       struct extent_state *state)
    {
    	struct extent_io_tree *tree;
    	u64 found_start;
    	int found_level;
    	unsigned long len;
    	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	int ret = 0;

    
    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    	if (page->private == EXTENT_PAGE_PRIVATE)
    		goto out;
    	if (!page->private)
    		goto out;
    
    	len = page->private >> 2;
    
    	WARN_ON(len == 0);
    
    
    	eb = alloc_extent_buffer(tree, start, len, page);
    
    	if (eb == NULL) {
    		ret = -EIO;
    		goto out;
    	}
    
    	found_start = btrfs_header_bytenr(eb);
    
    	if (found_start != start) {
    
    		printk_ratelimited(KERN_INFO "btrfs bad tree block start "
    
    			       "%llu %llu\n",
    			       (unsigned long long)found_start,
    			       (unsigned long long)eb->start);
    
		ret = -EIO;
		goto err;
	}
    	if (eb->first_page != page) {
    
    		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
    		       eb->first_page->index, page->index);
    
    	if (check_tree_block_fsid(root, eb)) {
    
    		printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
    
    			       (unsigned long long)eb->start);
    
    	found_level = btrfs_header_level(eb);
    
    
    	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
    				       eb, found_level);
    
	ret = csum_tree_block(root, eb, 1);
	if (ret) {
		ret = -EIO;
		goto err;
	}
    
    	/*
    	 * If this is a leaf block and it is corrupt, set the corrupt bit so
    	 * that we don't try and read the other copies of this block, just
    	 * return -EIO.
    	 */
    	if (found_level == 0 && check_leaf(root, eb)) {
    		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
    		ret = -EIO;
    	}
    
    
    	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
    	end = eb->start + end - 1;
    err:
    
    	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
    		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
    		btree_readahead_hook(root, eb, eb->start, ret);
    	}
    
    
    	free_extent_buffer(eb);
out:
	return ret;
}

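/*
 * a read of a btree page failed; clear the readahead state for the buffer,
 * notify the readahead code, and return -EIO since nothing is repaired here
 */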
    static int btree_io_failed_hook(struct bio *failed_bio,
    			 struct page *page, u64 start, u64 end,
    			 struct extent_state *state)
    {
    	struct extent_io_tree *tree;
    	unsigned long len;
    	struct extent_buffer *eb;
    	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
    
    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    	if (page->private == EXTENT_PAGE_PRIVATE)
    		goto out;
    	if (!page->private)
    		goto out;
    
    	len = page->private >> 2;
    	WARN_ON(len == 0);
    
    	eb = alloc_extent_buffer(tree, start, len, page);
    	if (eb == NULL)
    		goto out;
    
    	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
    		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
    		btree_readahead_hook(root, eb, eb->start, -EIO);
    	}
    
    out:
    	return -EIO;	/* we fixed nothing */
    }
    
    
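/*
 * called when a bio we submitted completes; record the error and punt the
 * rest of the end_io processing to the appropriate worker thread queue
 */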
    static void end_workqueue_bio(struct bio *bio, int err)
    {
    	struct end_io_wq *end_io_wq = bio->bi_private;
    	struct btrfs_fs_info *fs_info;
    
    	fs_info = end_io_wq->info;
    	end_io_wq->error = err;
    
    	end_io_wq->work.func = end_workqueue_fn;
    	end_io_wq->work.flags = 0;
    
    	if (bio->bi_rw & REQ_WRITE) {
    
    		if (end_io_wq->metadata == 1)
    
    			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
    					   &end_io_wq->work);
    
    		else if (end_io_wq->metadata == 2)
    			btrfs_queue_worker(&fs_info->endio_freespace_worker,
    					   &end_io_wq->work);
    
    		else
    			btrfs_queue_worker(&fs_info->endio_write_workers,
    					   &end_io_wq->work);
    
    	} else {
    		if (end_io_wq->metadata)
    			btrfs_queue_worker(&fs_info->endio_meta_workers,
    					   &end_io_wq->work);
    		else
    			btrfs_queue_worker(&fs_info->endio_workers,
    					   &end_io_wq->work);
	}
}

    /*
     * For the metadata arg you want
     *
     * 0 - if data
 * 1 - if normal metadata
     * 2 - if writing to the free space cache area
     */
    
    int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata)
{
	struct end_io_wq *end_io_wq;
    	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
    	if (!end_io_wq)
    		return -ENOMEM;
    
    	end_io_wq->private = bio->bi_private;
    	end_io_wq->end_io = bio->bi_end_io;
    
    	end_io_wq->error = 0;
    	end_io_wq->bio = bio;
    
    	end_io_wq->metadata = metadata;
    
    
    	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;
}

unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
    	unsigned long limit = min_t(unsigned long,
    				    info->workers.max_workers,
    				    info->fs_devices->open_devices);
    	return 256 * limit;
    }
    
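/*
 * async_submit_bio work runs in stages on the worker threads:
 * run_one_async_start checksums the bio, run_one_async_done submits it and
 * wakes anyone throttled on the async submit limit, and run_one_async_free
 * releases the async_submit_bio struct
 */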
    static void run_one_async_start(struct btrfs_work *work)
    {
    	struct async_submit_bio *async;
    
    	async = container_of(work, struct  async_submit_bio, work);
    	async->submit_bio_start(async->inode, async->rw, async->bio,
    
    			       async->mirror_num, async->bio_flags,
    			       async->bio_offset);
    
    }
    
    static void run_one_async_done(struct btrfs_work *work)
    
    {
    	struct btrfs_fs_info *fs_info;
	struct async_submit_bio *async;
	int limit;

    
    	async = container_of(work, struct  async_submit_bio, work);
    	fs_info = BTRFS_I(async->inode)->root->fs_info;
    
    	limit = btrfs_async_submit_limit(fs_info);
    
    	atomic_dec(&fs_info->nr_async_submits);
    
    	if (atomic_read(&fs_info->nr_async_submits) < limit &&
    	    waitqueue_active(&fs_info->async_submit_wait))
    
    		wake_up(&fs_info->async_submit_wait);
    
    
    	async->submit_bio_done(async->inode, async->rw, async->bio,
    
    			       async->mirror_num, async->bio_flags,
    			       async->bio_offset);
    
    }
    
    static void run_one_async_free(struct btrfs_work *work)
    {
    	struct async_submit_bio *async;
    
	async = container_of(work, struct  async_submit_bio, work);
	kfree(async);
}

    int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
    			int rw, struct bio *bio, int mirror_num,
    
			unsigned long bio_flags,
			u64 bio_offset,
    			extent_submit_bio_hook_t *submit_bio_start,
    			extent_submit_bio_hook_t *submit_bio_done)
    
    {
    	struct async_submit_bio *async;
    
    	async = kmalloc(sizeof(*async), GFP_NOFS);
    	if (!async)
    		return -ENOMEM;
    
    	async->inode = inode;
    	async->rw = rw;
    	async->bio = bio;
    	async->mirror_num = mirror_num;
    
    	async->submit_bio_start = submit_bio_start;
    	async->submit_bio_done = submit_bio_done;
    
    	async->work.func = run_one_async_start;
    	async->work.ordered_func = run_one_async_done;
    	async->work.ordered_free = run_one_async_free;
    
    
    	async->bio_flags = bio_flags;
    
	async->bio_offset = bio_offset;

	atomic_inc(&fs_info->nr_async_submits);

	if (rw & REQ_SYNC)
		btrfs_set_work_high_prio(&async->work);
    
    
    	btrfs_queue_worker(&fs_info->workers, &async->work);
    
    	while (atomic_read(&fs_info->async_submit_draining) &&
    
    	      atomic_read(&fs_info->nr_async_submits)) {
    		wait_event(fs_info->async_submit_wait,
    			   (atomic_read(&fs_info->nr_async_submits) == 0));
	}
	return 0;
}
    
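/* checksum every metadata page attached to a write bio before submission */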
    static int btree_csum_one_bio(struct bio *bio)
    {
    	struct bio_vec *bvec = bio->bi_io_vec;
    	int bio_index = 0;
    	struct btrfs_root *root;
    
    	WARN_ON(bio->bi_vcnt <= 0);
    
    	while (bio_index < bio->bi_vcnt) {
    
    		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
    		csum_dirty_buffer(root, bvec->bv_page);
    		bio_index++;
    		bvec++;
    	}
    	return 0;
    }
    
    
    static int __btree_submit_bio_start(struct inode *inode, int rw,
    				    struct bio *bio, int mirror_num,
    
    				    unsigned long bio_flags,
				    u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
    	btree_csum_one_bio(bio);
    	return 0;
    }
    
    static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
    
    				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
}

    static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
    
    				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
    	int ret;
    
    	ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
    					  bio, 1);
    	BUG_ON(ret);
    
    
    	if (!(rw & REQ_WRITE)) {
    
    		/*
    		 * called for a read, do the setup so that checksum validation
    		 * can happen in the async kernel threads
    		 */
    		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
    
				     mirror_num, 0);
	}

    	/*
    	 * kthread helpers are used to submit writes so that checksumming
    	 * can happen in parallel across all CPUs
    	 */
    
    	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
    
    				   inode, rw, bio, mirror_num, 0,
    
    				   __btree_submit_bio_start,
				   __btree_submit_bio_done);
}

    #ifdef CONFIG_MIGRATION
    
    static int btree_migratepage(struct address_space *mapping,
    			struct page *newpage, struct page *page)
    {
    	/*
    	 * we can't safely write a btree page from here,
    	 * we haven't done the locking hook
    	 */
    	if (PageDirty(page))
    		return -EAGAIN;
    	/*
    	 * Buffers may be managed in a filesystem specific way.
    	 * We must have no buffers or drop them.
    	 */
    	if (page_has_private(page) &&
    	    !try_to_release_page(page, GFP_KERNEL))
    		return -EAGAIN;
    	return migrate_page(mapping, newpage, page);
    }
    
    #endif
    
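/*
 * write a dirty btree page.  When called from memory reclaim (PF_MEMALLOC)
 * we can't take the tree locks, so the page is redirtied and left for
 * regular writeback to handle later.
 */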
    static int btree_writepage(struct page *page, struct writeback_control *wbc)
    {
    
    	struct extent_io_tree *tree;
    
    	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
    	struct extent_buffer *eb;
    	int was_dirty;
    
    
    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    
    	if (!(current->flags & PF_MEMALLOC)) {
    		return extent_write_full_page(tree, page,
    					      btree_get_extent, wbc);
    	}
    
    	redirty_page_for_writepage(wbc, page);
    
    	eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
    
    	WARN_ON(!eb);
    
    	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
    	if (!was_dirty) {
    		spin_lock(&root->fs_info->delalloc_lock);
    		root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	free_extent_buffer(eb);
    
    	unlock_page(page);
	return 0;
}

    
    static int btree_writepages(struct address_space *mapping,
    			    struct writeback_control *wbc)
    {
    
    	struct extent_io_tree *tree;
    	tree = &BTRFS_I(mapping->host)->io_tree;
    
    	if (wbc->sync_mode == WB_SYNC_NONE) {
    
		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
		u64 num_dirty;
		unsigned long thresh = 32 * 1024 * 1024;

		/* this is a bit racy, but that's ok */
		num_dirty = root->fs_info->dirty_metadata_bytes;
		if (num_dirty < thresh)
			return 0;
	}
    	return extent_writepages(tree, mapping, btree_get_extent, wbc);
    }
    
    
static int btree_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    
    	return extent_read_full_page(tree, page, btree_get_extent);
    }
    
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct extent_io_tree *tree;
	struct extent_map_tree *map;
	int ret;
    
    	if (PageWriteback(page) || PageDirty(page))
    
    		return 0;
    
    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    	map = &BTRFS_I(page->mapping->host)->extent_tree;
    
    	ret = try_release_extent_state(map, tree, page, gfp_flags);
    
    	if (!ret)
    
    		return 0;
    
    	ret = try_release_extent_buffer(tree, page);
    
    	if (ret == 1) {
    		ClearPagePrivate(page);
    		set_page_private(page, 0);
    		page_cache_release(page);
	}

	return ret;
}

static void btree_invalidatepage(struct page *page, unsigned long offset)
{
	struct extent_io_tree *tree;
    	tree = &BTRFS_I(page->mapping->host)->io_tree;
    
    	extent_invalidatepage(tree, page, offset);
    	btree_releasepage(page, GFP_NOFS);
    
    	if (PagePrivate(page)) {
    
    		printk(KERN_WARNING "btrfs warning page private not zero "
    		       "on page %llu\n", (unsigned long long)page_offset(page));
    
    		ClearPagePrivate(page);
    		set_page_private(page, 0);
    		page_cache_release(page);
	}
}

    static const struct address_space_operations btree_aops = {
    
    	.readpage	= btree_readpage,
    	.writepage	= btree_writepage,
    
    	.writepages	= btree_writepages,
    
    	.releasepage	= btree_releasepage,
    	.invalidatepage = btree_invalidatepage,
    
    #ifdef CONFIG_MIGRATION
    
    	.migratepage	= btree_migratepage,