    /*
     * Copyright (C) 2007 Oracle.  All rights reserved.
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public
     * License v2 as published by the Free Software Foundation.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     * General Public License for more details.
     *
     * You should have received a copy of the GNU General Public
     * License along with this program; if not, write to the
     * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
     */
    
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
    #include "compat.h"
    
    #include "ctree.h"
    #include "disk-io.h"
    #include "print-tree.h"
    
    #include "transaction.h"
    
    #include "locking.h"
    
    #include "free-space-cache.h"
    
    static int update_reserved_extents(struct btrfs_root *root,
    				   u64 bytenr, u64 num, int reserve);
    
    static int update_block_group(struct btrfs_trans_handle *trans,
    			      struct btrfs_root *root,
    			      u64 bytenr, u64 num_bytes, int alloc,
    			      int mark_free);
    
    static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
    				struct btrfs_root *root,
    				u64 bytenr, u64 num_bytes, u64 parent,
    				u64 root_objectid, u64 owner_objectid,
    				u64 owner_offset, int refs_to_drop,
    				struct btrfs_delayed_extent_op *extra_op);
    static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
    				    struct extent_buffer *leaf,
    				    struct btrfs_extent_item *ei);
    static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
    				      struct btrfs_root *root,
    				      u64 parent, u64 root_objectid,
    				      u64 flags, u64 owner, u64 offset,
    				      struct btrfs_key *ins, int ref_mod);
    static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
    				     struct btrfs_root *root,
    				     u64 parent, u64 root_objectid,
    				     u64 flags, struct btrfs_disk_key *key,
    				     int level, struct btrfs_key *ins);
    
    static int do_chunk_alloc(struct btrfs_trans_handle *trans,
    			  struct btrfs_root *extent_root, u64 alloc_bytes,
    			  u64 flags, int force);
    
    
    static noinline int
    block_group_cache_done(struct btrfs_block_group_cache *cache)
    {
    	smp_mb();
    	return cache->cached == BTRFS_CACHE_FINISHED;
    }
    
    
    static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
    {
    	return (cache->flags & bits) == bits;
    }
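
/*
 * Illustrative note (not part of the original file): because this masks
 * against all of the requested bits, a call such as
 *
 *     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA |
 *                      BTRFS_BLOCK_GROUP_DUP)
 *
 * is nonzero only when the block group has *every* one of those flags
 * set, not merely one of them.
 */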
    
    /*
     * this adds the block group to the fs_info rb tree for the block group
     * cache
     */
    
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
    	struct rb_node **p;
    	struct rb_node *parent = NULL;
    	struct btrfs_block_group_cache *cache;
    
    	spin_lock(&info->block_group_cache_lock);
    	p = &info->block_group_cache_tree.rb_node;
    
    	while (*p) {
    		parent = *p;
    		cache = rb_entry(parent, struct btrfs_block_group_cache,
    				 cache_node);
    		if (block_group->key.objectid < cache->key.objectid) {
    			p = &(*p)->rb_left;
    		} else if (block_group->key.objectid > cache->key.objectid) {
    			p = &(*p)->rb_right;
    		} else {
    			spin_unlock(&info->block_group_cache_lock);
    			return -EEXIST;
    		}
    	}
    
    	rb_link_node(&block_group->cache_node, parent, p);
    	rb_insert_color(&block_group->cache_node,
    			&info->block_group_cache_tree);
    	spin_unlock(&info->block_group_cache_lock);
    
    	return 0;
    }
    
    /*
     * This will return the block group at or after bytenr if contains is 0, else
     * it will return the block group that contains the bytenr
     */
    static struct btrfs_block_group_cache *
    block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
    			      int contains)
    {
    	struct btrfs_block_group_cache *cache, *ret = NULL;
    	struct rb_node *n;
    	u64 end, start;
    
    	spin_lock(&info->block_group_cache_lock);
    	n = info->block_group_cache_tree.rb_node;
    
    	while (n) {
    		cache = rb_entry(n, struct btrfs_block_group_cache,
    				 cache_node);
    		end = cache->key.objectid + cache->key.offset - 1;
    		start = cache->key.objectid;
    
    		if (bytenr < start) {
    			if (!contains && (!ret || start < ret->key.objectid))
    				ret = cache;
    			n = n->rb_left;
    		} else if (bytenr > start) {
    			if (contains && bytenr <= end) {
    				ret = cache;
    				break;
    			}
    			n = n->rb_right;
    		} else {
    			ret = cache;
    			break;
    		}
    	}
    
    	if (ret)
    		atomic_inc(&ret->count);
    
    	spin_unlock(&info->block_group_cache_lock);
    
    	return ret;
    }
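
/*
 * Note that the search above bumps the reference count on the block group
 * it returns; callers (including the lookup helpers further below) are
 * expected to release it with btrfs_put_block_group().
 */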
    
    
    /*
     * We always set EXTENT_LOCKED for the super mirror extents so we don't
     * overwrite them, so those bits need to be unset.  Also, if we are unmounting
 * with pinned extents still sitting there because a block group was still
 * caching, we need to clear those now, since we are done.
     */
void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
{
    	u64 start, end, last = 0;
    	int ret;
    
    	while (1) {
		ret = find_first_extent_bit(&info->pinned_extents, last,
					    &start, &end,
					    EXTENT_LOCKED|EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_bits(&info->pinned_extents, start, end,
				  EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
		last = end + 1;
    	}
    }
    
    static int remove_sb_from_cache(struct btrfs_root *root,
    				struct btrfs_block_group_cache *cache)
    {
    	struct btrfs_fs_info *fs_info = root->fs_info;
    	u64 bytenr;
    	u64 *logical;
    	int stripe_len;
    	int i, nr, ret;
    
    	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    		bytenr = btrfs_sb_offset(i);
    		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
    				       cache->key.objectid, bytenr,
    				       0, &logical, &nr, &stripe_len);
    		BUG_ON(ret);
    		while (nr--) {
    			try_lock_extent(&fs_info->pinned_extents,
    					logical[nr],
    					logical[nr] + stripe_len - 1, GFP_NOFS);
    		}
    		kfree(logical);
    	}
    
    	return 0;
    }
    
    
    /*
     * this is only called by cache_block_group, since we could have freed extents
     * we need to check the pinned_extents for any extents that can't be used yet
     * since their free space will be released as soon as the transaction commits.
     */
    
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;
    
    	while (start < end) {
		ret = find_first_extent_bit(&info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY|EXTENT_LOCKED);
		if (ret)
			break;

		if (extent_start == start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret);
			start = extent_end + 1;
		} else {
			break;
		}
    	}
    
	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret);
	}

	return total_added;
}

static int caching_kthread(void *data)
{
	struct btrfs_block_group_cache *block_group = data;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 last = 0;
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	int slot;
	u64 total_found = 0;
    
    	BUG_ON(!fs_info);
    
    	path = btrfs_alloc_path();
    	if (!path)
    		return -ENOMEM;
    
    	atomic_inc(&block_group->space_info->caching_threads);
    	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
    again:
    	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_root->commit_root_sem);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
    	path->reada = 2;
    
    
	key.objectid = last;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);

	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

    	while (1) {
    
		smp_mb();
		if (block_group->fs_info->closing > 1) {
			last = (u64)-1;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
    		if (slot >= btrfs_header_nritems(leaf)) {
    
			ret = btrfs_next_leaf(fs_info->extent_root, path);
			if (ret < 0)
				goto err;
			else if (ret)
				break;
    			if (need_resched()) {
    				btrfs_release_path(fs_info->extent_root, path);
    				up_read(&fs_info->extent_root->commit_root_sem);
    				cond_resched();
    				goto again;
    			}
    
			continue;
		}
    		btrfs_item_key_to_cpu(leaf, &key, slot);
    
		if (key.objectid < block_group->key.objectid)
			goto next;

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;
    		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
    
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			last = key.objectid + key.offset;
		}
    
    		if (total_found > (1024 * 1024 * 2)) {
    			total_found = 0;
    			wake_up(&block_group->caching_q);
    		}
    
next:
		path->slots[0]++;
    	}
    
    	ret = 0;
    
    	total_found += add_new_free_space(block_group, fs_info, last,
    					  block_group->key.objectid +
    					  block_group->key.offset);
    
    	spin_lock(&block_group->lock);
    	block_group->cached = BTRFS_CACHE_FINISHED;
    	spin_unlock(&block_group->lock);
    
err:
	btrfs_free_path(path);
    
    	up_read(&fs_info->extent_root->commit_root_sem);
    	atomic_dec(&block_group->space_info->caching_threads);
    	wake_up(&block_group->caching_q);
    
    	return 0;
    }
    
    static int cache_block_group(struct btrfs_block_group_cache *cache)
    {
    	struct task_struct *tsk;
    	int ret = 0;
    
    	spin_lock(&cache->lock);
    	if (cache->cached != BTRFS_CACHE_NO) {
    		spin_unlock(&cache->lock);
    		return ret;
    	}
    	cache->cached = BTRFS_CACHE_STARTED;
    	spin_unlock(&cache->lock);
    
    	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
    			  cache->key.objectid);
    	if (IS_ERR(tsk)) {
    		ret = PTR_ERR(tsk);
    		printk(KERN_ERR "error running thread %d\n", ret);
    		BUG();
	}

	return ret;
}
    /*
     * return the block group that starts at or after bytenr
     */
    
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}
    
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
    	if (atomic_dec_and_test(&cache->count))
    		kfree(cache);
    }
    
    
static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags == flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}
    
    /*
     * after adding space to the filesystem, we need to clear the full flags
     * on all the space infos.
     */
    void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
    {
    	struct list_head *head = &info->space_info;
    	struct btrfs_space_info *found;
    
    	rcu_read_lock();
    	list_for_each_entry_rcu(found, head, list)
    		found->full = 0;
    	rcu_read_unlock();
    }
    
    
    static u64 div_factor(u64 num, int factor)
    {
    	if (factor == 10)
    		return num;
    	num *= factor;
    	do_div(num, 10);
    	return num;
    }
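
/*
 * Worked example (illustrative): div_factor(1000, 9) returns 900, i.e.
 * 90% of num; a factor of 10 returns num unchanged.
 */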
    
    
u64 btrfs_find_block_group(struct btrfs_root *root,
			   u64 search_start, u64 search_hint, int owner)
{
	struct btrfs_block_group_cache *cache;
	u64 used;
	u64 last = max(search_hint, search_start);
	u64 group_start = 0;
	int full_search = 0;
	int factor = 9;
	int wrapped = 0;
again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		if (!cache)
			break;

		spin_lock(&cache->lock);
		last = cache->key.objectid + cache->key.offset;
		used = btrfs_block_group_used(&cache->item);

		if ((full_search || !cache->ro) &&
		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
			if (used + cache->pinned + cache->reserved <
			    div_factor(cache->key.offset, factor)) {
				group_start = cache->key.objectid;
				spin_unlock(&cache->lock);
				btrfs_put_block_group(cache);
				goto found;
			}
		}
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		cond_resched();
	}
	if (!wrapped) {
		last = search_start;
		wrapped = 1;
		goto again;
	}
	if (!full_search && factor < 10) {
		last = search_start;
		full_search = 1;
		factor = 10;
		goto again;
	}
found:
	return group_start;
}
    
    /* simple helper to search for an existing extent at a given offset */
    
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	key.objectid = start;
	key.offset = len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs are generic, and
 * can be used in all cases where the implicit back refs are used. The
 * major shortcoming of the full back refs is their overhead. Every time a
 * tree block gets COWed, we have to update the back refs entry for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent, and
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */
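
/*
 * A sketch of the key composition described above, using hypothetical
 * values: a data extent starting at disk byte 136708096, referenced by
 * inode 257 at file offset 0 in the tree of subvolume root 5, gets an
 * implicit back ref keyed roughly as
 *
 *     key.objectid = 136708096;
 *     key.type = BTRFS_EXTENT_DATA_REF_KEY;
 *     key.offset = hash_extent_data_ref(5, 257, 0);
 *
 * while a full (shared) back ref for the same extent would use
 * BTRFS_SHARED_DATA_REF_KEY with key.offset set to the start byte of the
 * parent block holding the pointer.
 */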
    
    #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
    static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
    				  struct btrfs_root *root,
    				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;
    
    	leaf = path->nodes[0];
    	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
    
    	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    	ei0 = btrfs_item_ptr(leaf, path->slots[0],
    			     struct btrfs_extent_item_v0);
    	refs = btrfs_extent_refs_v0(leaf, ei0);
    
    	if (owner == (u64)-1) {
    		while (1) {
    			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    				ret = btrfs_next_leaf(root, path);
    				if (ret < 0)
    					return ret;
    				BUG_ON(ret > 0);
    				leaf = path->nodes[0];
    			}
    			btrfs_item_key_to_cpu(leaf, &found_key,
    					      path->slots[0]);
    			BUG_ON(key.objectid != found_key.objectid);
    			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
    				path->slots[0]++;
    				continue;
    			}
    			ref0 = btrfs_item_ptr(leaf, path->slots[0],
    					      struct btrfs_extent_ref_v0);
    			owner = btrfs_ref_objectid_v0(leaf, ref0);
    			break;
    		}
    	}
    	btrfs_release_path(root, path);
    
    	if (owner < BTRFS_FIRST_FREE_OBJECTID)
    		new_size += sizeof(*bi);
    
    	new_size -= sizeof(*ei0);
    	ret = btrfs_search_slot(trans, root, &key, path,
    				new_size + extra_size, 1);
    	if (ret < 0)
    		return ret;
    	BUG_ON(ret);
    
    	ret = btrfs_extend_item(trans, root, path, new_size);
    	BUG_ON(ret);
    
    	leaf = path->nodes[0];
    	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
    	btrfs_set_extent_refs(leaf, item, refs);
    	/* FIXME: get real generation */
    	btrfs_set_extent_generation(leaf, item, 0);
    	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
    		btrfs_set_extent_flags(leaf, item,
    				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
    				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
    		bi = (struct btrfs_tree_block_info *)(item + 1);
    		/* FIXME: get first key of the block */
    		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
    		btrfs_set_tree_block_level(leaf, bi, (int)owner);
    	} else {
    		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
    	}
    	btrfs_mark_buffer_dirty(leaf);
    	return 0;
    }
    #endif
    
    static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
    {
    	u32 high_crc = ~(u32)0;
    	u32 low_crc = ~(u32)0;
    	__le64 lenum;
    
	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
    
    	return ((u64)high_crc << 31) ^ (u64)low_crc;
    }
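
/*
 * Descriptive note: the resulting offset places a CRC of the root
 * objectid in the upper bits and a CRC of the owner and offset in the
 * lower bits, so refs from the same root share the high bits of the key
 * offset and therefore sort near each other.
 */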
    
    static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
    				     struct btrfs_extent_data_ref *ref)
    {
    	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
    				    btrfs_extent_data_ref_objectid(leaf, ref),
    				    btrfs_extent_data_ref_offset(leaf, ref));
    }
    
    static int match_extent_data_ref(struct extent_buffer *leaf,
    				 struct btrfs_extent_data_ref *ref,
    				 u64 root_objectid, u64 owner, u64 offset)
    {
    	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
    	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
    	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
    		return 0;
    	return 1;
    }
    
    static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
    					   struct btrfs_root *root,
    					   struct btrfs_path *path,
    					   u64 bytenr, u64 parent,
    					   u64 root_objectid,
    					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;
    
    	key.objectid = bytenr;
    
    	if (parent) {
    		key.type = BTRFS_SHARED_DATA_REF_KEY;
    		key.offset = parent;
    	} else {
    		key.type = BTRFS_EXTENT_DATA_REF_KEY;
    		key.offset = hash_extent_data_ref(root_objectid,
    						  owner, offset);
    	}
    again:
    	recow = 0;
    	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    	if (ret < 0) {
    		err = ret;
    		goto fail;
    	}
    
    	if (parent) {
    		if (!ret)
    			return 0;
    #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
    		key.type = BTRFS_EXTENT_REF_V0_KEY;
    		btrfs_release_path(root, path);
    		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    		if (ret < 0) {
    			err = ret;
    			goto fail;
    		}
    		if (!ret)
    			return 0;
    #endif
    		goto fail;
    
    	}
    
	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
    	while (1) {
    		if (path->slots[0] >= nritems) {
    			ret = btrfs_next_leaf(root, path);
    			if (ret < 0)
    				err = ret;
    			if (ret)
    				goto fail;
    
    			leaf = path->nodes[0];
    			nritems = btrfs_header_nritems(leaf);
    			recow = 1;
    		}
    
    		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    		if (key.objectid != bytenr ||
    		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
    			goto fail;
    
    		ref = btrfs_item_ptr(leaf, path->slots[0],
    				     struct btrfs_extent_data_ref);
    
    		if (match_extent_data_ref(leaf, ref, root_objectid,
    					  owner, offset)) {
    			if (recow) {
    				btrfs_release_path(root, path);
    				goto again;
    			}
    			err = 0;
    			break;
    		}
		path->slots[0]++;
	}
fail:
	return err;
}

    static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
    					   struct btrfs_root *root,
    					   struct btrfs_path *path,
    					   u64 bytenr, u64 parent,
    					   u64 root_objectid, u64 owner,
    					   u64 offset, int refs_to_add)
    
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;
    
	key.objectid = bytenr;
	if (parent) {
    		key.type = BTRFS_SHARED_DATA_REF_KEY;
    		key.offset = parent;
    		size = sizeof(struct btrfs_shared_data_ref);
    	} else {
    		key.type = BTRFS_EXTENT_DATA_REF_KEY;
    		key.offset = hash_extent_data_ref(root_objectid,
    						  owner, offset);
    		size = sizeof(struct btrfs_extent_data_ref);
    	}
    
    	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
    	if (ret && ret != -EEXIST)
    		goto fail;
    
    	leaf = path->nodes[0];
    	if (parent) {
    		struct btrfs_shared_data_ref *ref;
    
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
    		if (ret == 0) {
    			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
    		} else {
    			num_refs = btrfs_shared_data_ref_count(leaf, ref);
    			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
    	} else {
    		struct btrfs_extent_data_ref *ref;
    		while (ret == -EEXIST) {
    			ref = btrfs_item_ptr(leaf, path->slots[0],
    					     struct btrfs_extent_data_ref);
    			if (match_extent_data_ref(leaf, ref, root_objectid,
    						  owner, offset))
    				break;
    			btrfs_release_path(root, path);
    			key.offset++;
    			ret = btrfs_insert_empty_item(trans, root, path, &key,
    						      size);
    			if (ret && ret != -EEXIST)
    				goto fail;
    
    			leaf = path->nodes[0];
    		}
    		ref = btrfs_item_ptr(leaf, path->slots[0],
    				     struct btrfs_extent_data_ref);
    		if (ret == 0) {
    			btrfs_set_extent_data_ref_root(leaf, ref,
    						       root_objectid);
    			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
    			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
    			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
    		} else {
    			num_refs = btrfs_extent_data_ref_count(leaf, ref);
    			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
    	btrfs_mark_buffer_dirty(leaf);
    	ret = 0;
    fail:
    
	btrfs_release_path(root, path);
	return ret;
}

    static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
    					   struct btrfs_root *root,
    					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;
    
    	leaf = path->nodes[0];
    
    	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    
    	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
    		ref1 = btrfs_item_ptr(leaf, path->slots[0],
    				      struct btrfs_extent_data_ref);
    		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
    	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
    		ref2 = btrfs_item_ptr(leaf, path->slots[0],
    				      struct btrfs_shared_data_ref);
    		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
    #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
    	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
    		struct btrfs_extent_ref_v0 *ref0;
    		ref0 = btrfs_item_ptr(leaf, path->slots[0],
    				      struct btrfs_extent_ref_v0);
    		num_refs = btrfs_ref_count_v0(leaf, ref0);
    #endif
    	} else {
    		BUG();
    	}
    
    
    	BUG_ON(num_refs < refs_to_drop);
    	num_refs -= refs_to_drop;
    
    	if (num_refs == 0) {
    		ret = btrfs_del_item(trans, root, path);
    	} else {
    
    		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
    			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
    		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
    			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
    #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
    		else {
    			struct btrfs_extent_ref_v0 *ref0;
    			ref0 = btrfs_item_ptr(leaf, path->slots[0],
    					struct btrfs_extent_ref_v0);
    			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
    		}
    #endif
    
    		btrfs_mark_buffer_dirty(leaf);
    	}
    	return ret;
    }
    
    
    static noinline u32 extent_data_ref_count(struct btrfs_root *root,
    					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
    	struct extent_buffer *leaf;
    	struct btrfs_extent_data_ref *ref1;
    	struct btrfs_shared_data_ref *ref2;
    	u32 num_refs = 0;
    
    	leaf = path->nodes[0];
    	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    	if (iref) {
    		if (btrfs_extent_inline_ref_type(leaf, iref) ==
    		    BTRFS_EXTENT_DATA_REF_KEY) {
    			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
    			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
    		} else {
    			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
    			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
    		}
    	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
    		ref1 = btrfs_item_ptr(leaf, path->slots[0],
    				      struct btrfs_extent_data_ref);
    		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
    	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
    		ref2 = btrfs_item_ptr(leaf, path->slots[0],
    				      struct btrfs_shared_data_ref);
    		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
    #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
    	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
    		struct btrfs_extent_ref_v0 *ref0;
    		ref0 = btrfs_item_ptr(leaf, path->slots[0],
    				      struct btrfs_extent_ref_v0);