Skip to content
Snippets Groups Projects
alloc.c 179 KiB
Newer Older
  • Learn to ignore specific revisions
  • /* -*- mode: c; c-basic-offset: 8; -*-
     * vim: noexpandtab sw=8 ts=8 sts=0:
     *
     * alloc.c
     *
     * Extent allocs and frees
     *
     * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public
     * License as published by the Free Software Foundation; either
     * version 2 of the License, or (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     * General Public License for more details.
     *
     * You should have received a copy of the GNU General Public
     * License along with this program; if not, write to the
     * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     * Boston, MA 021110-1307, USA.
     */
    
    #include <linux/fs.h>
    #include <linux/types.h>
    #include <linux/slab.h>
    #include <linux/highmem.h>
    
    #include <linux/swap.h>
    
    
    #define MLOG_MASK_PREFIX ML_DISK_ALLOC
    #include <cluster/masklog.h>
    
    #include "ocfs2.h"
    
    #include "alloc.h"
    
    #include "aops.h"
    
    #include "dlmglue.h"
    #include "extent_map.h"
    #include "inode.h"
    #include "journal.h"
    #include "localalloc.h"
    #include "suballoc.h"
    #include "sysfile.h"
    #include "file.h"
    #include "super.h"
    #include "uptodate.h"
    
    #include "buffer_head_io.h"
    
    
    
    struct ocfs2_extent_tree_operations {
    
    	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
    				   u64 blkno);
    	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
    	void (*eo_update_clusters)(struct inode *inode,
    				   struct ocfs2_extent_tree *et,
    				   u32 new_clusters);
    
    	int (*eo_insert_check)(struct inode *inode,
    			       struct ocfs2_extent_tree *et,
    			       struct ocfs2_extent_rec *rec);
    
    	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
    
    
    	/* These are internal to ocfs2_extent_tree and don't have
    	 * accessor functions */
    	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
    
    	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
    					  struct ocfs2_extent_tree *et);
    
    /*
     * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
     * in the methods.
     */
    static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
    static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
    					 u64 blkno);
    static void ocfs2_dinode_update_clusters(struct inode *inode,
    					 struct ocfs2_extent_tree *et,
    					 u32 clusters);
    static int ocfs2_dinode_insert_check(struct inode *inode,
    				     struct ocfs2_extent_tree *et,
    				     struct ocfs2_extent_rec *rec);
    static int ocfs2_dinode_sanity_check(struct inode *inode,
    				     struct ocfs2_extent_tree *et);
    static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
    static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
    	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
    	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
    	.eo_update_clusters	= ocfs2_dinode_update_clusters,
    	.eo_insert_check	= ocfs2_dinode_insert_check,
    	.eo_sanity_check	= ocfs2_dinode_sanity_check,
    	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
    };
    
    static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
    					 u64 blkno)
    {
    
    	struct ocfs2_dinode *di = et->et_object;
    
    	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
    
    	di->i_last_eb_blk = cpu_to_le64(blkno);
    }
    
    static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
    {
    
    	struct ocfs2_dinode *di = et->et_object;
    
    	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
    
    	return le64_to_cpu(di->i_last_eb_blk);
    }
    
    static void ocfs2_dinode_update_clusters(struct inode *inode,
    					 struct ocfs2_extent_tree *et,
    					 u32 clusters)
    {
    
    	struct ocfs2_dinode *di = et->et_object;
    
    
    	le32_add_cpu(&di->i_clusters, clusters);
    	spin_lock(&OCFS2_I(inode)->ip_lock);
    	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
    	spin_unlock(&OCFS2_I(inode)->ip_lock);
    }
    
    
    static int ocfs2_dinode_insert_check(struct inode *inode,
    				     struct ocfs2_extent_tree *et,
    				     struct ocfs2_extent_rec *rec)
    {
    	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    
    	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
    	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
    			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
    			"Device %s, asking for sparse allocation: inode %llu, "
    			"cpos %u, clusters %u\n",
    			osb->dev_str,
    			(unsigned long long)OCFS2_I(inode)->ip_blkno,
    			rec->e_cpos,
    			OCFS2_I(inode)->ip_clusters);
    
    	return 0;
    }
    
    
    static int ocfs2_dinode_sanity_check(struct inode *inode,
    				     struct ocfs2_extent_tree *et)
    {
    	int ret = 0;
    	struct ocfs2_dinode *di;
    
    
    	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
    
    	if (!OCFS2_IS_VALID_DINODE(di)) {
    		ret = -EIO;
    		ocfs2_error(inode->i_sb,
    			"Inode %llu has invalid path root",
    			(unsigned long long)OCFS2_I(inode)->ip_blkno);
    	}
    
    	return ret;
    }
    
    
    static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
    {
    	struct ocfs2_dinode *di = et->et_object;
    
    	et->et_root_el = &di->id2.i_list;
    }
    
    
    static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
    {
    	struct ocfs2_xattr_value_root *xv = et->et_object;
    
    	et->et_root_el = &xv->xr_list;
    }
    
    
    static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
    					      u64 blkno)
    {
    	struct ocfs2_xattr_value_root *xv =
    
    		(struct ocfs2_xattr_value_root *)et->et_object;
    
    
    	xv->xr_last_eb_blk = cpu_to_le64(blkno);
    }
    
    static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
    {
    	struct ocfs2_xattr_value_root *xv =
    
    		(struct ocfs2_xattr_value_root *) et->et_object;
    
    
    	return le64_to_cpu(xv->xr_last_eb_blk);
    }
    
    static void ocfs2_xattr_value_update_clusters(struct inode *inode,
    					      struct ocfs2_extent_tree *et,
    					      u32 clusters)
    {
    	struct ocfs2_xattr_value_root *xv =
    
    		(struct ocfs2_xattr_value_root *)et->et_object;
    
    
    	le32_add_cpu(&xv->xr_clusters, clusters);
    }
    
    
    static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
    
    	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
    	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
    	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
    
    	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
    
    static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
    {
    	struct ocfs2_xattr_block *xb = et->et_object;
    
    	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
    }
    
    
    static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
    						    struct ocfs2_extent_tree *et)
    {
    	et->et_max_leaf_clusters =
    		ocfs2_clusters_for_bytes(inode->i_sb,
    					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
    }
    
    
    static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
    					     u64 blkno)
    {
    
    	struct ocfs2_xattr_block *xb = et->et_object;
    
    	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
    
    	xt->xt_last_eb_blk = cpu_to_le64(blkno);
    }
    
    static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
    {
    
    	struct ocfs2_xattr_block *xb = et->et_object;
    
    	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
    
    	return le64_to_cpu(xt->xt_last_eb_blk);
    }
    
    static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
    					     struct ocfs2_extent_tree *et,
    					     u32 clusters)
    {
    
    	struct ocfs2_xattr_block *xb = et->et_object;
    
    
    	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
    }
    
    static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
    
    	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
    	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
    	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
    
    	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
    
    	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
    
    static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
    				    struct inode *inode,
    				    struct buffer_head *bh,
    				    void *obj,
    				    struct ocfs2_extent_tree_operations *ops)
    
    	et->et_root_bh = bh;
    
    	if (!obj)
    		obj = (void *)bh->b_data;
    	et->et_object = obj;
    
    	if (!et->et_ops->eo_fill_max_leaf_clusters)
    		et->et_max_leaf_clusters = 0;
    	else
    		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
    
    void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
    				  struct inode *inode,
    				  struct buffer_head *bh)
    
    	__ocfs2_get_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
    
    void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
    				      struct inode *inode,
    				      struct buffer_head *bh)
    
    {
    	__ocfs2_get_extent_tree(et, inode, bh, NULL,
    				&ocfs2_xattr_tree_et_ops);
    }
    
    
    void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
    				       struct inode *inode,
    				       struct buffer_head *bh,
    				       struct ocfs2_xattr_value_root *xv)
    
    {
    	__ocfs2_get_extent_tree(et, inode, bh, xv,
    				&ocfs2_xattr_value_et_ops);
    }
    
    
    void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
    
    	brelse(et->et_root_bh);
    
    static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
    					    u64 new_last_eb_blk)
    
    	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
    
    static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
    
    	return et->et_ops->eo_get_last_eb_blk(et);
    
    static inline void ocfs2_et_update_clusters(struct inode *inode,
    					    struct ocfs2_extent_tree *et,
    					    u32 clusters)
    {
    
    	et->et_ops->eo_update_clusters(inode, et, clusters);
    
    static inline int ocfs2_et_insert_check(struct inode *inode,
    					struct ocfs2_extent_tree *et,
    					struct ocfs2_extent_rec *rec)
    {
    	int ret = 0;
    
    	if (et->et_ops->eo_insert_check)
    		ret = et->et_ops->eo_insert_check(inode, et, rec);
    	return ret;
    }
    
    
    static inline int ocfs2_et_sanity_check(struct inode *inode,
    					struct ocfs2_extent_tree *et)
    
    	int ret = 0;
    
    	if (et->et_ops->eo_sanity_check)
    		ret = et->et_ops->eo_sanity_check(inode, et);
    	return ret;
    
    static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
    
    static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
    					 struct ocfs2_extent_block *eb);
    
    /*
     * Structures which describe a path through a btree, and functions to
     * manipulate them.
     *
     * The idea here is to be as generic as possible with the tree
     * manipulation code.
     */
    struct ocfs2_path_item {
    	struct buffer_head		*bh;
    	struct ocfs2_extent_list	*el;
    };
    
    #define OCFS2_MAX_PATH_DEPTH	5
    
    struct ocfs2_path {
    	int			p_tree_depth;
    	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
    };
    
    #define path_root_bh(_path) ((_path)->p_node[0].bh)
    #define path_root_el(_path) ((_path)->p_node[0].el)
    #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
    #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
    #define path_num_items(_path) ((_path)->p_tree_depth + 1)
    
    /*
     * Reset the actual path elements so that we can re-use the structure
     * to build another path. Generally, this involves freeing the buffer
     * heads.
     */
    static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
    {
    	int i, start = 0, depth = 0;
    	struct ocfs2_path_item *node;
    
    	if (keep_root)
    		start = 1;
    
    	for(i = start; i < path_num_items(path); i++) {
    		node = &path->p_node[i];
    
    		brelse(node->bh);
    		node->bh = NULL;
    		node->el = NULL;
    	}
    
    	/*
    	 * Tree depth may change during truncate, or insert. If we're
    	 * keeping the root extent list, then make sure that our path
    	 * structure reflects the proper depth.
    	 */
    	if (keep_root)
    		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
    
    	path->p_tree_depth = depth;
    }
    
    static void ocfs2_free_path(struct ocfs2_path *path)
    {
    	if (path) {
    		ocfs2_reinit_path(path, 0);
    		kfree(path);
    	}
    }
    
    
    /*
     * All the elements of src into dest. After this call, src could be freed
     * without affecting dest.
     *
     * Both paths should have the same root. Any non-root elements of dest
     * will be freed.
     */
    static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
    {
    	int i;
    
    	BUG_ON(path_root_bh(dest) != path_root_bh(src));
    	BUG_ON(path_root_el(dest) != path_root_el(src));
    
    	ocfs2_reinit_path(dest, 1);
    
    	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
    		dest->p_node[i].bh = src->p_node[i].bh;
    		dest->p_node[i].el = src->p_node[i].el;
    
    		if (dest->p_node[i].bh)
    			get_bh(dest->p_node[i].bh);
    	}
    }
    
    
    /*
     * Make the *dest path the same as src and re-initialize src path to
     * have a root only.
     */
    static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
    {
    	int i;
    
    	BUG_ON(path_root_bh(dest) != path_root_bh(src));
    
    	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
    		brelse(dest->p_node[i].bh);
    
    		dest->p_node[i].bh = src->p_node[i].bh;
    		dest->p_node[i].el = src->p_node[i].el;
    
    		src->p_node[i].bh = NULL;
    		src->p_node[i].el = NULL;
    	}
    }
    
    /*
     * Insert an extent block at given index.
     *
     * This will not take an additional reference on eb_bh.
     */
    static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
    					struct buffer_head *eb_bh)
    {
    	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
    
    	/*
    	 * Right now, no root bh is an extent block, so this helps
    	 * catch code errors with dinode trees. The assertion can be
    	 * safely removed if we ever need to insert extent block
    	 * structures at the root.
    	 */
    	BUG_ON(index == 0);
    
    	path->p_node[index].bh = eb_bh;
    	path->p_node[index].el = &eb->h_list;
    }
    
    static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
    					 struct ocfs2_extent_list *root_el)
    {
    	struct ocfs2_path *path;
    
    	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
    
    	path = kzalloc(sizeof(*path), GFP_NOFS);
    	if (path) {
    		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
    		get_bh(root_bh);
    		path_root_bh(path) = root_bh;
    		path_root_el(path) = root_el;
    	}
    
    	return path;
    }
    
    /*
     * Convenience function to journal all components in a path.
     */
    static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
    				     struct ocfs2_path *path)
    {
    	int i, ret = 0;
    
    	if (!path)
    		goto out;
    
    	for(i = 0; i < path_num_items(path); i++) {
    		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
    					   OCFS2_JOURNAL_ACCESS_WRITE);
    		if (ret < 0) {
    			mlog_errno(ret);
    			goto out;
    		}
    	}
    
    out:
    	return ret;
    }
    
    
    /*
     * Return the index of the extent record which contains cluster #v_cluster.
     * -1 is returned if it was not found.
     *
     * Should work fine on interior and exterior nodes.
     */
    int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
    {
    	int ret = -1;
    	int i;
    	struct ocfs2_extent_rec *rec;
    	u32 rec_end, rec_start, clusters;
    
    	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
    		rec = &el->l_recs[i];
    
    		rec_start = le32_to_cpu(rec->e_cpos);
    		clusters = ocfs2_rec_clusters(el, rec);
    
    		rec_end = rec_start + clusters;
    
    		if (v_cluster >= rec_start && v_cluster < rec_end) {
    			ret = i;
    			break;
    		}
    	}
    
    	return ret;
    }
    
    
    enum ocfs2_contig_type {
    	CONTIG_NONE = 0,
    	CONTIG_LEFT,
    
    	CONTIG_RIGHT,
    	CONTIG_LEFTRIGHT,
    
    
    /*
     * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
     * ocfs2_extent_contig only work properly against leaf nodes!
     */
    
    static int ocfs2_block_extent_contig(struct super_block *sb,
    				     struct ocfs2_extent_rec *ext,
    				     u64 blkno)
    
    	u64 blk_end = le64_to_cpu(ext->e_blkno);
    
    	blk_end += ocfs2_clusters_to_blocks(sb,
    				    le16_to_cpu(ext->e_leaf_clusters));
    
    	return blkno == blk_end;
    
    static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
    				  struct ocfs2_extent_rec *right)
    {
    
    	u32 left_range;
    
    	left_range = le32_to_cpu(left->e_cpos) +
    		le16_to_cpu(left->e_leaf_clusters);
    
    	return (left_range == le32_to_cpu(right->e_cpos));
    
    }
    
    static enum ocfs2_contig_type
    	ocfs2_extent_contig(struct inode *inode,
    			    struct ocfs2_extent_rec *ext,
    			    struct ocfs2_extent_rec *insert_rec)
    {
    	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
    
    
    	/*
    	 * Refuse to coalesce extent records with different flag
    	 * fields - we don't want to mix unwritten extents with user
    	 * data.
    	 */
    	if (ext->e_flags != insert_rec->e_flags)
    		return CONTIG_NONE;
    
    
    	if (ocfs2_extents_adjacent(ext, insert_rec) &&
    	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
    			return CONTIG_RIGHT;
    
    	blkno = le64_to_cpu(ext->e_blkno);
    	if (ocfs2_extents_adjacent(insert_rec, ext) &&
    	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
    		return CONTIG_LEFT;
    
    	return CONTIG_NONE;
    }
    
    /*
     * NOTE: We can have pretty much any combination of contiguousness and
     * appending.
     *
     * The usefulness of APPEND_TAIL is more in that it lets us know that
     * we'll have to update the path to that leaf.
     */
    enum ocfs2_append_type {
    	APPEND_NONE = 0,
    	APPEND_TAIL,
    };
    
    
    enum ocfs2_split_type {
    	SPLIT_NONE = 0,
    	SPLIT_LEFT,
    	SPLIT_RIGHT,
    };
    
    
    struct ocfs2_insert_type {
    
    	enum ocfs2_split_type	ins_split;
    
    	enum ocfs2_append_type	ins_appending;
    	enum ocfs2_contig_type	ins_contig;
    	int			ins_contig_index;
    	int			ins_tree_depth;
    };
    
    
    struct ocfs2_merge_ctxt {
    	enum ocfs2_contig_type	c_contig_type;
    	int			c_has_empty_extent;
    	int			c_split_covers_rec;
    };
    
    
    /*
     * How many free extents have we got before we need more meta data?
     */
    int ocfs2_num_free_extents(struct ocfs2_super *osb,
    			   struct inode *inode,
    
    	struct ocfs2_extent_list *el = NULL;
    
    	struct ocfs2_extent_block *eb;
    	struct buffer_head *eb_bh = NULL;
    
    	u64 last_eb_blk = 0;
    
    	el = et->et_root_el;
    	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
    
    	if (last_eb_blk) {
    		retval = ocfs2_read_block(osb, last_eb_blk,
    
    					  &eb_bh, OCFS2_BH_CACHED, inode);
    		if (retval < 0) {
    			mlog_errno(retval);
    			goto bail;
    		}
    		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
    		el = &eb->h_list;
    
    
    	BUG_ON(el->l_tree_depth != 0);
    
    	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
    bail:
    	if (eb_bh)
    		brelse(eb_bh);
    
    	mlog_exit(retval);
    	return retval;
    }
    
    /* expects array to already be allocated
     *
     * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
     * l_count for you
     */
    static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
    
    				     struct inode *inode,
    				     int wanted,
    				     struct ocfs2_alloc_context *meta_ac,
    				     struct buffer_head *bhs[])
    {
    	int count, status, i;
    	u16 suballoc_bit_start;
    	u32 num_got;
    	u64 first_blkno;
    	struct ocfs2_extent_block *eb;
    
    	mlog_entry_void();
    
    	count = 0;
    	while (count < wanted) {
    		status = ocfs2_claim_metadata(osb,
    					      handle,
    					      meta_ac,
    					      wanted - count,
    					      &suballoc_bit_start,
    					      &num_got,
    					      &first_blkno);
    		if (status < 0) {
    			mlog_errno(status);
    			goto bail;
    		}
    
    		for(i = count;  i < (num_got + count); i++) {
    			bhs[i] = sb_getblk(osb->sb, first_blkno);
    			if (bhs[i] == NULL) {
    				status = -EIO;
    				mlog_errno(status);
    				goto bail;
    			}
    			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
    
    			status = ocfs2_journal_access(handle, inode, bhs[i],
    						      OCFS2_JOURNAL_ACCESS_CREATE);
    			if (status < 0) {
    				mlog_errno(status);
    				goto bail;
    			}
    
    			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
    			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
    			/* Ok, setup the minimal stuff here. */
    			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
    			eb->h_blkno = cpu_to_le64(first_blkno);
    			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
    			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
    			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
    			eb->h_list.l_count =
    				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
    
    			suballoc_bit_start++;
    			first_blkno++;
    
    			/* We'll also be dirtied by the caller, so
    			 * this isn't absolutely necessary. */
    			status = ocfs2_journal_dirty(handle, bhs[i]);
    			if (status < 0) {
    				mlog_errno(status);
    				goto bail;
    			}
    		}
    
    		count += num_got;
    	}
    
    	status = 0;
    bail:
    	if (status < 0) {
    		for(i = 0; i < wanted; i++) {
    			if (bhs[i])
    				brelse(bhs[i]);
    			bhs[i] = NULL;
    		}
    	}
    	mlog_exit(status);
    	return status;
    }
    
    
    /*
     * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
     *
     * Returns the sum of the rightmost extent rec logical offset and
     * cluster count.
     *
     * ocfs2_add_branch() uses this to determine what logical cluster
     * value should be populated into the leftmost new branch records.
     *
     * ocfs2_shift_tree_depth() uses this to determine the # clusters
     * value for the new topmost tree record.
     */
    static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
    {
    	int i;
    
    	i = le16_to_cpu(el->l_next_free_rec) - 1;
    
    	return le32_to_cpu(el->l_recs[i].e_cpos) +
    
    		ocfs2_rec_clusters(el, &el->l_recs[i]);
    
    /*
     * Add an entire tree branch to our inode. eb_bh is the extent block
     * to start at, if we don't want to start the branch at the dinode
     * structure.
     *
     * last_eb_bh is required as we have to update it's next_leaf pointer
     * for the new last extent block.
     *
     * the new branch will be 'empty' in the sense that every block will
    
     * contain a single record with cluster count == 0.
    
     */
    static int ocfs2_add_branch(struct ocfs2_super *osb,
    
    			    struct inode *inode,
    
    			    struct ocfs2_extent_tree *et,
    
    			    struct buffer_head *eb_bh,
    
    			    struct buffer_head **last_eb_bh,
    
    			    struct ocfs2_alloc_context *meta_ac)
    {
    	int status, new_blocks, i;
    	u64 next_blkno, new_last_eb_blk;
    	struct buffer_head *bh;
    	struct buffer_head **new_eb_bhs = NULL;
    	struct ocfs2_extent_block *eb;
    	struct ocfs2_extent_list  *eb_el;
    	struct ocfs2_extent_list  *el;
    
    	u32 new_cpos;
    
    	BUG_ON(!last_eb_bh || !*last_eb_bh);
    
    
    	if (eb_bh) {
    		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
    		el = &eb->h_list;
    	} else
    
    		el = et->et_root_el;
    
    
    	/* we never add a branch to a leaf. */
    	BUG_ON(!el->l_tree_depth);
    
    	new_blocks = le16_to_cpu(el->l_tree_depth);
    
    	/* allocate the number of new eb blocks we need */
    	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
    			     GFP_KERNEL);
    	if (!new_eb_bhs) {
    		status = -ENOMEM;
    		mlog_errno(status);
    		goto bail;
    	}
    
    	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
    					   meta_ac, new_eb_bhs);
    	if (status < 0) {
    		mlog_errno(status);
    		goto bail;
    	}
    
    
    	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
    
    	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
    
    
    	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
    	 * linked with the rest of the tree.
    	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
    	 *
    	 * when we leave the loop, new_last_eb_blk will point to the
    	 * newest leaf, and next_blkno will point to the topmost extent
    	 * block. */
    	next_blkno = new_last_eb_blk = 0;
    	for(i = 0; i < new_blocks; i++) {
    		bh = new_eb_bhs[i];
    		eb = (struct ocfs2_extent_block *) bh->b_data;
    		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
    			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
    			status = -EIO;
    			goto bail;
    		}
    		eb_el = &eb->h_list;
    
    		status = ocfs2_journal_access(handle, inode, bh,
    					      OCFS2_JOURNAL_ACCESS_CREATE);
    		if (status < 0) {
    			mlog_errno(status);
    			goto bail;
    		}
    
    		eb->h_next_leaf_blk = 0;
    		eb_el->l_tree_depth = cpu_to_le16(i);
    		eb_el->l_next_free_rec = cpu_to_le16(1);
    
    		/*
    		 * This actually counts as an empty extent as
    		 * c_clusters == 0
    		 */
    		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
    
    		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
    
    		/*
    		 * eb_el isn't always an interior node, but even leaf
    		 * nodes want a zero'd flags and reserved field so
    		 * this gets the whole 32 bits regardless of use.
    		 */
    		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
    
    		if (!eb_el->l_tree_depth)
    			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
    
    		status = ocfs2_journal_dirty(handle, bh);
    		if (status < 0) {
    			mlog_errno(status);
    			goto bail;
    		}
    
    		next_blkno = le64_to_cpu(eb->h_blkno);
    	}
    
    	/* This is a bit hairy. We want to update up to three blocks
    	 * here without leaving any of them in an inconsistent state
    	 * in case of error. We don't have to worry about
    	 * journal_dirty erroring as it won't unless we've aborted the
    	 * handle (in which case we would never be here) so reserving
    	 * the write with journal_access is all we need to do. */
    
    	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
    
    				      OCFS2_JOURNAL_ACCESS_WRITE);
    	if (status < 0) {
    		mlog_errno(status);
    		goto bail;
    	}
    
    	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
    
    				      OCFS2_JOURNAL_ACCESS_WRITE);
    	if (status < 0) {
    		mlog_errno(status);
    		goto bail;
    	}
    	if (eb_bh) {
    		status = ocfs2_journal_access(handle, inode, eb_bh,
    					      OCFS2_JOURNAL_ACCESS_WRITE);
    		if (status < 0) {
    			mlog_errno(status);
    			goto bail;
    		}
    	}
    
    	/* Link the new branch into the rest of the tree (el will
    
    	 * either be on the root_bh, or the extent block passed in. */
    
    	i = le16_to_cpu(el->l_next_free_rec);
    	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
    
    	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
    
    	el->l_recs[i].e_int_clusters = 0;
    
    	le16_add_cpu(&el->l_next_free_rec, 1);
    
    	/* fe needs a new last extent block pointer, as does the
    	 * next_leaf on the previously last-extent-block. */
    
    	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
    
    	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
    
    	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
    
    
    	status = ocfs2_journal_dirty(handle, *last_eb_bh);
    
    	if (status < 0)
    		mlog_errno(status);
    
    	status = ocfs2_journal_dirty(handle, et->et_root_bh);
    
    	if (status < 0)
    		mlog_errno(status);
    	if (eb_bh) {
    		status = ocfs2_journal_dirty(handle, eb_bh);
    		if (status < 0)
    			mlog_errno(status);
    	}
    
    
    	/*
    	 * Some callers want to track the rightmost leaf so pass it
    	 * back here.
    	 */
    	brelse(*last_eb_bh);
    	get_bh(new_eb_bhs[0]);
    	*last_eb_bh = new_eb_bhs[0];
    
    
    	status = 0;
    bail:
    	if (new_eb_bhs) {
    		for (i = 0; i < new_blocks; i++)
    			if (new_eb_bhs[i])
    				brelse(new_eb_bhs[i]);
    		kfree(new_eb_bhs);
    	}
    
    	mlog_exit(status);
    	return status;
    }
    
    /*
     * adds another level to the allocation tree.
     * returns back the new extent block so you can add a branch to it
     * after this call.
     */
    static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,