		btrfs_release_path(path);
    
    		goto out;
    	}
    
    	if (IS_ERR(inode)) {
    		BUG_ON(retries);
    		retries++;
    
    		if (block_group->ro)
    			goto out_free;
    
    		ret = create_free_space_inode(root, trans, block_group, path);
    		if (ret)
    			goto out_free;
    		goto again;
    	}
    
    
    	/* We've already setup this transaction, go ahead and exit */
    	if (block_group->cache_generation == trans->transid &&
    	    i_size_read(inode)) {
    		dcs = BTRFS_DC_SETUP;
    		goto out_put;
    	}
    
    
    	/*
    	 * We want to set the generation to 0, that way if anything goes wrong
    	 * from here on out we know not to trust this cache when we load up next
    	 * time.
    	 */
    	BTRFS_I(inode)->generation = 0;
    	ret = btrfs_update_inode(trans, root, inode);
    	WARN_ON(ret);
    
    	if (i_size_read(inode) > 0) {
    		ret = btrfs_truncate_free_space_cache(root, trans, path,
    						      inode);
    		if (ret)
    			goto out_put;
    	}
    
    	spin_lock(&block_group->lock);
    
    	if (block_group->cached != BTRFS_CACHE_FINISHED ||
    	    !btrfs_test_opt(root, SPACE_CACHE)) {
    		/*
    		 * don't bother trying to write stuff out _if_
    		 * a) we're not cached,
    		 * b) we're with nospace_cache mount option.
    		 */
    
    		dcs = BTRFS_DC_WRITTEN;
    
    		spin_unlock(&block_group->lock);
    		goto out_put;
    	}
    	spin_unlock(&block_group->lock);
    
    
    	/*
    	 * Try to preallocate enough space based on how big the block group is.
    	 * Keep in mind this has to include any pinned space which could end up
    	 * taking up quite a bit since it's not folded into the other space
    	 * cache.
    	 */
    	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
    
    	if (!num_pages)
    		num_pages = 1;
    
    	num_pages *= 16;
    	num_pages *= PAGE_CACHE_SIZE;
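
	/*
	 * Worked example, assuming 4 KiB pages: a 1 GiB block group gives
	 * div64_u64(1 GiB, 256 MiB) = 4, then 4 * 16 = 64 pages, i.e. a
	 * 256 KiB preallocation for the free space cache file.
	 */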
    
    	ret = btrfs_check_data_free_space(inode, num_pages);
    	if (ret)
    		goto out_put;
    
    	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
    					      num_pages, num_pages,
    					      &alloc_hint);
    
    	if (!ret)
    		dcs = BTRFS_DC_SETUP;
    
    	btrfs_free_reserved_data_space(inode, num_pages);
    
    out_put:
    	iput(inode);
    out_free:
    
    	btrfs_release_path(path);
    
out:
	spin_lock(&block_group->lock);
	if (!ret && dcs == BTRFS_DC_SETUP)
		block_group->cache_generation = trans->transid;
	block_group->disk_cache_state = dcs;
	spin_unlock(&block_group->lock);

	return ret;
}
    
    
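/*
 * This makes three passes over the block groups: set up free space
 * cache inodes for groups still marked BTRFS_DC_CLEAR, write out the
 * dirty block group items themselves, and finally write the free space
 * caches that were marked BTRFS_DC_NEED_WRITE.
 */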
    int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache;
	int err = 0;
	struct btrfs_path *path;
	u64 last = 0;

    	path = btrfs_alloc_path();
    	if (!path)
    		return -ENOMEM;
    
    
    again:
    	while (1) {
    		cache = btrfs_lookup_first_block_group(root->fs_info, last);
    		while (cache) {
    			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
    				break;
    			cache = next_block_group(root, cache);
    		}
    		if (!cache) {
    			if (last == 0)
    				break;
    			last = 0;
    			continue;
    		}
    		err = cache_save_setup(cache, trans, path);
    		last = cache->key.objectid + cache->key.offset;
    		btrfs_put_block_group(cache);
    	}
    
    
    	while (1) {
    
    		if (last == 0) {
    			err = btrfs_run_delayed_refs(trans, root,
    						     (unsigned long)-1);
    
			if (err) /* File system offline */
				goto out;
		}

		cache = btrfs_lookup_first_block_group(root->fs_info, last);
    		while (cache) {
    
    			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
    				btrfs_put_block_group(cache);
    				goto again;
    			}
    
    
    			if (cache->dirty)
    				break;
    			cache = next_block_group(root, cache);
    		}
    		if (!cache) {
    			if (last == 0)
    				break;
    			last = 0;
    			continue;
    		}
    
    		if (cache->disk_cache_state == BTRFS_DC_SETUP)
    			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
    
    		cache->dirty = 0;
    
    		last = cache->key.objectid + cache->key.offset;
    
    		err = write_one_cache_group(trans, root, path, cache);
    
    		if (err) /* File system offline */
    			goto out;
    
    
		btrfs_put_block_group(cache);
	}

    	while (1) {
    		/*
    		 * I don't think this is needed since we're just marking our
    		 * preallocated extent as written, but just in case it can't
    		 * hurt.
    		 */
    		if (last == 0) {
    			err = btrfs_run_delayed_refs(trans, root,
    						     (unsigned long)-1);
    
    			if (err) /* File system offline */
    				goto out;
    
    		}
    
    		cache = btrfs_lookup_first_block_group(root->fs_info, last);
    		while (cache) {
    			/*
    			 * Really this shouldn't happen, but it could if we
    			 * couldn't write the entire preallocated extent and
    			 * splitting the extent resulted in a new block.
    			 */
    			if (cache->dirty) {
    				btrfs_put_block_group(cache);
    				goto again;
    			}
    			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
    				break;
    			cache = next_block_group(root, cache);
    		}
    		if (!cache) {
    			if (last == 0)
    				break;
    			last = 0;
    			continue;
    		}
    
    
    		err = btrfs_write_out_cache(root, trans, cache, path);
    
    
    		/*
    		 * If we didn't have an error then the cache state is still
    		 * NEED_WRITE, so we can set it to WRITTEN.
    		 */
    
    		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
    
    			cache->disk_cache_state = BTRFS_DC_WRITTEN;
    		last = cache->key.objectid + cache->key.offset;
    		btrfs_put_block_group(cache);
    	}
    
out:
	btrfs_free_path(path);
	return err;
}

    int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
    {
    	struct btrfs_block_group_cache *block_group;
    	int readonly = 0;
    
    	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
    	if (!block_group || block_group->ro)
    		readonly = 1;
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

    static int update_space_info(struct btrfs_fs_info *info, u64 flags,
    			     u64 total_bytes, u64 bytes_used,
    			     struct btrfs_space_info **space_info)
    {
    	struct btrfs_space_info *found;
    
    	int i;
    	int factor;
    
    	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
    		     BTRFS_BLOCK_GROUP_RAID10))
    		factor = 2;
    	else
    		factor = 1;
    
    
    	found = __find_space_info(info, flags);
    	if (found) {
    
    		spin_lock(&found->lock);
    
    		found->total_bytes += total_bytes;
    
    		found->disk_total += total_bytes * factor;
    
    		found->bytes_used += bytes_used;
    
    		found->disk_used += bytes_used * factor;
    
		spin_unlock(&found->lock);
		*space_info = found;
		return 0;
	}
	found = kzalloc(sizeof(*found), GFP_NOFS);
	if (!found)
		return -ENOMEM;

    	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
    		INIT_LIST_HEAD(&found->block_groups[i]);
    
    	init_rwsem(&found->groups_sem);
    
    	spin_lock_init(&found->lock);
    
    	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
    
    	found->total_bytes = total_bytes;
    
    	found->disk_total = total_bytes * factor;
    
    	found->bytes_used = bytes_used;
    
    	found->disk_used = bytes_used * factor;
    
    	found->bytes_pinned = 0;
    
    	found->bytes_reserved = 0;
    
    	found->bytes_readonly = 0;
    
    	found->bytes_may_use = 0;
    
    	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
    
    	found->chunk_alloc = 0;
    
    	found->flush = 0;
	init_waitqueue_head(&found->wait);
	*space_info = found;
	list_add_rcu(&found->list, &info->space_info);
    
    	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = found;
	return 0;
}

    static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
    {
    
    	u64 extra_flags = chunk_to_extended(flags) &
    				BTRFS_EXTENDED_PROFILE_MASK;
    
    	write_seqlock(&fs_info->profiles_lock);
    
    	if (flags & BTRFS_BLOCK_GROUP_DATA)
    		fs_info->avail_data_alloc_bits |= extra_flags;
    	if (flags & BTRFS_BLOCK_GROUP_METADATA)
    		fs_info->avail_metadata_alloc_bits |= extra_flags;
    	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    		fs_info->avail_system_alloc_bits |= extra_flags;
    
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * returns target flags in extended format or 0 if restripe for this
 * chunk_type is not in progress
 *
 * should be called with either volume_mutex or balance_lock held
 */
    static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
    {
    	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
    	u64 target = 0;
    
    	if (!bctl)
    		return 0;
    
    	if (flags & BTRFS_BLOCK_GROUP_DATA &&
    	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
    		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
    	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
    		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
    		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
    	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
    		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
    		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
    	}
    
    	return target;
    }
    
    
    /*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Returns reduced profile in chunk format.  If profile changing is in
 * progress (either running or paused) picks the target profile (if it's
 * already available), otherwise falls back to plain reducing.
 */
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
    
    	/*
    	 * we add in the count of missing devices because we want
    	 * to make sure that any RAID levels on a degraded FS
    	 * continue to be honored.
    	 */
    	u64 num_devices = root->fs_info->fs_devices->rw_devices +
    		root->fs_info->fs_devices->missing_devices;
    
	u64 target;
	u64 tmp;
    
    	/*
    	 * see if restripe for this chunk_type is in progress, if so
    	 * try to reduce to the target profile
    	 */
    
    	spin_lock(&root->fs_info->balance_lock);
    
    	target = get_restripe_target(root->fs_info, flags);
    	if (target) {
    		/* pick target profile only if it's already available */
    		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
    
    			spin_unlock(&root->fs_info->balance_lock);
    
    			return extended_to_chunk(target);
    
    		}
    	}
    	spin_unlock(&root->fs_info->balance_lock);
    
    
    	/* First, mask out the RAID levels which aren't possible */
    
	if (num_devices == 1)
		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
			   BTRFS_BLOCK_GROUP_RAID5);
	if (num_devices < 3)
		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
	if (num_devices < 4)
		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
    
    
    	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
    		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
    		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
    	flags &= ~tmp;
    
    	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
    		tmp = BTRFS_BLOCK_GROUP_RAID6;
    	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
    		tmp = BTRFS_BLOCK_GROUP_RAID5;
    	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
    		tmp = BTRFS_BLOCK_GROUP_RAID10;
    	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
    		tmp = BTRFS_BLOCK_GROUP_RAID1;
    	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
    		tmp = BTRFS_BLOCK_GROUP_RAID0;
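
	/*
	 * The cascade above reduces a multi-profile mask to a single
	 * profile, preferring RAID6, then RAID5, RAID10, RAID1 and RAID0.
	 * For example, a mask of RAID1|RAID0 reduces to just RAID1.
	 */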
    
	return extended_to_chunk(flags | tmp);
}

static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
    	unsigned seq;
    
    	do {
    		seq = read_seqbegin(&root->fs_info->profiles_lock);
    
    		if (flags & BTRFS_BLOCK_GROUP_DATA)
    			flags |= root->fs_info->avail_data_alloc_bits;
    		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    			flags |= root->fs_info->avail_system_alloc_bits;
    		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
    			flags |= root->fs_info->avail_metadata_alloc_bits;
    	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
    
	return btrfs_reduce_alloc_profile(root, flags);
}

u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
	u64 flags;
	u64 ret;
    
    	if (data)
    		flags = BTRFS_BLOCK_GROUP_DATA;
    	else if (root == root->fs_info->chunk_root)
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
		flags = BTRFS_BLOCK_GROUP_METADATA;
    
	ret = get_alloc_profile(root, flags);
	return ret;
}

    /*
     * This will check the space that the inode allocates from to make sure we have
     * enough space for bytes.
     */
    
    int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
    
    {
    	struct btrfs_space_info *data_sinfo;
    
    	struct btrfs_root *root = BTRFS_I(inode)->root;
    
    	struct btrfs_fs_info *fs_info = root->fs_info;
    
	u64 used;
	int ret = 0, committed = 0, alloc_chunk = 1;
    
    
    	/* make sure bytes are sectorsize aligned */
    
    	bytes = ALIGN(bytes, root->sectorsize);
    
    	if (root == root->fs_info->tree_root ||
    	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
    
    		alloc_chunk = 0;
    		committed = 1;
    	}
    
    
    	data_sinfo = fs_info->data_sinfo;
    
    	if (!data_sinfo)
    		goto alloc;
    
    again:
    	/* make sure we have enough space to handle the data first */
    	spin_lock(&data_sinfo->lock);
    
    	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
    		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
    		data_sinfo->bytes_may_use;
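
	/*
	 * Everything counted here is space already spoken for: bytes on
	 * disk, outstanding reservations, pinned extents waiting to be
	 * freed, read-only space and pending delalloc (bytes_may_use).
	 * The check below compares this total against total_bytes to
	 * decide whether a new data chunk or a transaction commit is
	 * needed.
	 */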
    
    
    	if (used + bytes > data_sinfo->total_bytes) {
    
    		struct btrfs_trans_handle *trans;
    
    		/*
    		 * if we don't have enough free bytes in this space then we need
    		 * to alloc a new chunk.
    		 */
    
    		if (!data_sinfo->full && alloc_chunk) {
    
    			u64 alloc_target;
    
    			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
    
    			spin_unlock(&data_sinfo->lock);
    
    			alloc_target = btrfs_get_alloc_profile(root, 1);
    
    			trans = btrfs_join_transaction(root);
    
    			if (IS_ERR(trans))
    				return PTR_ERR(trans);
    
    			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
    
    					     alloc_target,
    					     CHUNK_ALLOC_NO_FORCE);
    
    			btrfs_end_transaction(trans, root);
    
    			if (ret < 0) {
    				if (ret != -ENOSPC)
    					return ret;
    				else
    					goto commit_trans;
    			}
    
			if (!data_sinfo)
				data_sinfo = fs_info->data_sinfo;

			goto again;
		}

    		/*
    		 * If we have less pinned bytes than we want to allocate then
    		 * don't bother committing the transaction, it won't help us.
    		 */
    		if (data_sinfo->bytes_pinned < bytes)
    			committed = 1;
    
    		spin_unlock(&data_sinfo->lock);
    
    
    		/* commit the current transaction and try again */
    
commit_trans:
    		if (!committed &&
    		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
    
    			trans = btrfs_join_transaction(root);
    
    			if (IS_ERR(trans))
    				return PTR_ERR(trans);
    
    			ret = btrfs_commit_transaction(trans, root);
    			if (ret)
    				return ret;
    			goto again;
    		}
    
    		return -ENOSPC;
    	}
    	data_sinfo->bytes_may_use += bytes;
    
    	trace_btrfs_space_reservation(root->fs_info, "space_info",
    
    				      data_sinfo->flags, bytes, 1);
    
    	spin_unlock(&data_sinfo->lock);
    
    
    	return 0;
    }
    
/*
 * Called if we need to clear a data reservation for this inode.
 */
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
    	struct btrfs_root *root = BTRFS_I(inode)->root;
    
    	struct btrfs_space_info *data_sinfo;
    
    	/* make sure bytes are sectorsize aligned */
    
    	bytes = ALIGN(bytes, root->sectorsize);
    
    	data_sinfo = root->fs_info->data_sinfo;
    
    	spin_lock(&data_sinfo->lock);
    	data_sinfo->bytes_may_use -= bytes;
    
    	trace_btrfs_space_reservation(root->fs_info, "space_info",
    
    				      data_sinfo->flags, bytes, 0);
    
	spin_unlock(&data_sinfo->lock);
}

static void force_metadata_allocation(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
    	struct btrfs_space_info *found;
    
    	rcu_read_lock();
    	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
			found->force_alloc = CHUNK_ALLOC_FORCE;
	}
	rcu_read_unlock();
}

static int should_alloc_chunk(struct btrfs_root *root,
			      struct btrfs_space_info *sinfo, int force)
{
    	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
    
    	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
    
	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
	u64 thresh;

    	if (force == CHUNK_ALLOC_FORCE)
    		return 1;
    
    
    	/*
    	 * We need to take into account the global rsv because for all intents
    	 * and purposes it's used space.  Don't worry about locking the
    	 * global_rsv, it doesn't change except when the transaction commits.
    	 */
    
    	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
    		num_allocated += global_rsv->size;
    
    	/*
    	 * in limited mode, we want to have some free space up to
    	 * about 1% of the FS size.
    	 */
    	if (force == CHUNK_ALLOC_LIMITED) {
    
    		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
    
    		thresh = max_t(u64, 64 * 1024 * 1024,
    			       div_factor_fine(thresh, 1));
    
    		if (num_bytes - num_allocated < thresh)
    			return 1;
    	}
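
	/*
	 * Example: on a 1 TiB filesystem div_factor_fine(thresh, 1) is
	 * roughly 1% of the total, about 10.9 GiB, well above the 64 MiB
	 * floor, so limited-mode allocation triggers whenever less than
	 * that much of this space_info remains unallocated.
	 */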
    
    
	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
		return 0;
	return 1;
}

    static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
    {
    	u64 num_dev;
    
    
    	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
    		    BTRFS_BLOCK_GROUP_RAID0 |
    		    BTRFS_BLOCK_GROUP_RAID5 |
    		    BTRFS_BLOCK_GROUP_RAID6))
    
    		num_dev = root->fs_info->fs_devices->rw_devices;
    	else if (type & BTRFS_BLOCK_GROUP_RAID1)
    		num_dev = 2;
    	else
    		num_dev = 1;	/* DUP or single */
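
	/*
	 * Example: for a RAID1 system chunk num_dev is 2, so the return
	 * below reserves metadata for three items: one per device plus
	 * one for the chunk tree update.
	 */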
    
	/* metadata for updating devices and chunk tree */
    	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
    }
    
    static void check_system_chunk(struct btrfs_trans_handle *trans,
    			       struct btrfs_root *root, u64 type)
    {
    	struct btrfs_space_info *info;
    	u64 left;
    	u64 thresh;
    
    	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
    	spin_lock(&info->lock);
    	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
    		info->bytes_reserved - info->bytes_readonly;
    	spin_unlock(&info->lock);
    
    	thresh = get_system_chunk_thresh(root, type);
    	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
    		printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
    		       left, thresh, type);
    		dump_space_info(info, 0, 0);
    	}
    
    	if (left < thresh) {
    		u64 flags;
    
    		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
    		btrfs_alloc_chunk(trans, root, flags);
    	}
    }
    
    
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags, int force)
{
    	struct btrfs_space_info *space_info;
    
    	struct btrfs_fs_info *fs_info = extent_root->fs_info;
    
    	int wait_for_alloc = 0;
    
    	int ret = 0;
    
    
    	/* Don't re-enter if we're already allocating a chunk */
    	if (trans->allocating_chunk)
    		return -ENOSPC;
    
    
    	space_info = __find_space_info(extent_root->fs_info, flags);
    
    	if (!space_info) {
    		ret = update_space_info(extent_root->fs_info, flags,
    					0, 0, &space_info);
    
		BUG_ON(ret); /* -ENOMEM */
	}
	BUG_ON(!space_info); /* Logic error */

again:
	spin_lock(&space_info->lock);
    
    	if (force < space_info->force_alloc)
    
    		force = space_info->force_alloc;
    
    	if (space_info->full) {
		spin_unlock(&space_info->lock);
		return 0;
	}

    	if (!should_alloc_chunk(extent_root, space_info, force)) {
    
    		spin_unlock(&space_info->lock);
    
    		return 0;
    	} else if (space_info->chunk_alloc) {
    		wait_for_alloc = 1;
    	} else {
		space_info->chunk_alloc = 1;
	}

    	spin_unlock(&space_info->lock);
    
    	mutex_lock(&fs_info->chunk_mutex);
    
    	/*
    	 * The chunk_mutex is held throughout the entirety of a chunk
    	 * allocation, so once we've acquired the chunk_mutex we know that the
    	 * other guy is done and we need to recheck and see if we should
    	 * allocate.
    	 */
    	if (wait_for_alloc) {
    		mutex_unlock(&fs_info->chunk_mutex);
    		wait_for_alloc = 0;
    		goto again;
    	}
    
    
    	trans->allocating_chunk = true;
    
    
    	/*
    	 * If we have mixed data/metadata chunks we want to make sure we keep
    	 * allocating mixed chunks instead of individual chunks.
    	 */
    	if (btrfs_mixed_space_info(space_info))
    		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
    
    
    	/*
    	 * if we're doing a data chunk, go ahead and make sure that
    	 * we keep a reasonable number of metadata chunks allocated in the
    	 * FS as well.
    	 */
    
    	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
    
    		fs_info->data_chunk_allocations++;
    		if (!(fs_info->data_chunk_allocations %
    		      fs_info->metadata_ratio))
			force_metadata_allocation(fs_info);
	}

    	/*
    	 * Check if we have enough space in SYSTEM chunk because we may need
    	 * to update devices.
    	 */
    	check_system_chunk(trans, extent_root, flags);
    
    
    	ret = btrfs_alloc_chunk(trans, extent_root, flags);
    
    	trans->allocating_chunk = false;
    
    	spin_lock(&space_info->lock);
    
    	if (ret < 0 && ret != -ENOSPC)
    		goto out;
    
    	if (ret)
    
    		space_info->full = 1;
    
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
	space_info->chunk_alloc = 0;
    
    	spin_unlock(&space_info->lock);
    
    	mutex_unlock(&fs_info->chunk_mutex);
    
	return ret;
}

static int can_overcommit(struct btrfs_root *root,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
	u64 profile = btrfs_get_alloc_profile(root, 0);
	u64 rsv_size;
	u64 avail;
	u64 used;
	u64 to_add;
    
    	used = space_info->bytes_used + space_info->bytes_reserved +
    
    		space_info->bytes_pinned + space_info->bytes_readonly;
    
    	spin_lock(&global_rsv->lock);
    	rsv_size = global_rsv->size;
    	spin_unlock(&global_rsv->lock);
    
    	/*
    	 * We only want to allow over committing if we have lots of actual space
    	 * free, but if we don't have enough space to handle the global reserve
    	 * space then we could end up having a real enospc problem when trying
    	 * to allocate a chunk or some other such important allocation.
    	 */
    	rsv_size <<= 1;
    	if (used + rsv_size >= space_info->total_bytes)
    		return 0;
    
    	used += space_info->bytes_may_use;
    
    
    	spin_lock(&root->fs_info->free_chunk_lock);
    	avail = root->fs_info->free_chunk_space;
    	spin_unlock(&root->fs_info->free_chunk_lock);
    
    	/*
    	 * If we have dup, raid1 or raid10 then only half of the free
    
    	 * space is actually useable.  For raid56, the space info used
    	 * doesn't include the parity drive, so we don't have to
    	 * change the math
    
    	 */
    	if (profile & (BTRFS_BLOCK_GROUP_DUP |
    		       BTRFS_BLOCK_GROUP_RAID1 |
    		       BTRFS_BLOCK_GROUP_RAID10))
    		avail >>= 1;
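
	/*
	 * Example: with RAID1 metadata and 10 GiB of unallocated device
	 * space, only about 5 GiB of new chunks could actually be created,
	 * so avail is halved before the overcommit check below.
	 */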
    
    
	to_add = space_info->total_bytes;

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2th of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		to_add >>= 3;
	else
		to_add >>= 1;

    	/*
    	 * Limit the overcommit to the amount of free space we could possibly
    	 * allocate for chunks.
    	 */
    	to_add = min(avail, to_add);
    
    	if (used + bytes < space_info->total_bytes + to_add)
    
    		return 1;
    	return 0;
    }
    
    
    static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
    						      unsigned long nr_pages,
						      enum wb_reason reason)
{
    	/* the flusher is dealing with the dirty inodes now. */
    	if (writeback_in_progress(sb->s_bdi))
    		return 1;
    
    	if (down_read_trylock(&sb->s_umount)) {
    
    		writeback_inodes_sb_nr(sb, nr_pages, reason);
    		up_read(&sb->s_umount);
    		return 1;
    	}
    
    	return 0;
    }
    
    
    void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
    				  unsigned long nr_pages)
    {
    	struct super_block *sb = root->fs_info->sb;
    	int started;
    
    	/* If we can not start writeback, just sync all the delalloc file. */
    	started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
    						      WB_REASON_FS_FREE_SPACE);
    	if (!started) {
    		/*
    		 * We needn't worry the filesystem going from r/w to r/o though
    		 * we don't acquire ->s_umount mutex, because the filesystem
    		 * should guarantee the delalloc inodes list be empty after
    		 * the filesystem is readonly(all dirty pages are written to
    		 * the disk).
    		 */
    		btrfs_start_delalloc_inodes(root, 0);
    		btrfs_wait_ordered_extents(root, 0);
    	}
    }
    
    
/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
			    bool wait_ordered)
{
	struct btrfs_block_rsv *block_rsv;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 max_reclaim;
	long time_left;
	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
	int loops = 0;
	enum btrfs_reserve_flush_enum flush;
    
    	trans = (struct btrfs_trans_handle *)current->journal_info;
    
    	block_rsv = &root->fs_info->delalloc_block_rsv;
    
    	space_info = block_rsv->space_info;
    
    	delalloc_bytes = percpu_counter_sum_positive(
    						&root->fs_info->delalloc_bytes);
    
	if (delalloc_bytes == 0) {
		if (trans)
			return;
		btrfs_wait_ordered_extents(root, 0);
		return;
	}

    	while (delalloc_bytes && loops < 3) {
    		max_reclaim = min(delalloc_bytes, to_reclaim);
    		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
    
    		btrfs_writeback_inodes_sb_nr(root, nr_pages);
    
    		/*
    		 * We need to wait for the async pages to actually start before
    		 * we do anything.
    		 */
    		wait_event(root->fs_info->async_submit_wait,
    			   !atomic_read(&root->fs_info->async_delalloc_pages));
    
    
    		if (!trans)
    			flush = BTRFS_RESERVE_FLUSH_ALL;
    		else
    			flush = BTRFS_RESERVE_NO_FLUSH;
    
    		spin_lock(&space_info->lock);
    
    		if (can_overcommit(root, space_info, orig, flush)) {
    
    			spin_unlock(&space_info->lock);
    			break;
    		}
    
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_extents(root, 0);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}

    		smp_mb();
    
		delalloc_bytes = percpu_counter_sum_positive(
						&root->fs_info->delalloc_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @root - the root we're allocating for
 * @bytes - the number of bytes we want to reserve
 * @force - force the commit
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */
    static int may_commit_transaction(struct btrfs_root *root,
    				  struct btrfs_space_info *space_info,
    				  u64 bytes, int force)
    {
    	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
    	struct btrfs_trans_handle *trans;
    
    	trans = (struct btrfs_trans_handle *)current->journal_info;
    	if (trans)
    		return -EAGAIN;
    
    	if (force)
    		goto commit;
    
    	/* See if there is enough pinned space to make this reservation */
    	spin_lock(&space_info->lock);
    	if (space_info->bytes_pinned >= bytes) {
    		spin_unlock(&space_info->lock);
    		goto commit;
    	}
    	spin_unlock(&space_info->lock);
    
    	/*
    	 * See if there is some space in the delayed insertion reservation for
    	 * this reservation.
    	 */
    	if (space_info != delayed_rsv->space_info)
    		return -ENOSPC;
    
    
	spin_lock(&space_info->lock);
	spin_lock(&delayed_rsv->lock);
	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
		spin_unlock(&delayed_rsv->lock);
		spin_unlock(&space_info->lock);
		return -ENOSPC;
	}
	spin_unlock(&delayed_rsv->lock);
	spin_unlock(&space_info->lock);
    
    
    commit:
    	trans = btrfs_join_transaction(root);
    	if (IS_ERR(trans))
    		return -ENOSPC;
    
    	return btrfs_commit_transaction(trans, root);
    }
    
    
enum flush_state {
	FLUSH_DELAYED_ITEMS_NR	=	1,
    	FLUSH_DELAYED_ITEMS	=	2,
    	FLUSH_DELALLOC		=	3,
    	FLUSH_DELALLOC_WAIT	=	4,
    
    	ALLOC_CHUNK		=	5,
    	COMMIT_TRANS		=	6,
    
    };
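
/*
 * flush_space() below acts on one of these states at a time; callers
 * typically walk them in order, from cheaper steps (flushing delayed
 * items) toward more expensive ones (allocating a chunk, committing
 * the transaction).
 */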
    
    static int flush_space(struct btrfs_root *root,
    		       struct btrfs_space_info *space_info, u64 num_bytes,
    		       u64 orig_bytes, int state)
    {
    	struct btrfs_trans_handle *trans;
    	int nr;
    
    	int ret = 0;
    
    
    	switch (state) {
    	case FLUSH_DELAYED_ITEMS_NR:
    	case FLUSH_DELAYED_ITEMS:
    		if (state == FLUSH_DELAYED_ITEMS_NR) {
    			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
    
    			nr = (int)div64_u64(num_bytes, bytes);
    			if (!nr)
    				nr = 1;
    			nr *= 2;
    		} else {
    			nr = -1;
    		}
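
		/*
		 * Example: if btrfs_calc_trans_metadata_size(root, 1) came
		 * to 64 KiB (it depends on the tree node size and level
		 * count), reclaiming 1 MiB would flush
		 * nr = (1 MiB / 64 KiB) * 2 = 32 delayed items, while
		 * FLUSH_DELAYED_ITEMS (nr = -1) flushes them all.
		 */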
    		trans = btrfs_join_transaction(root);
    		if (IS_ERR(trans)) {
    			ret = PTR_ERR(trans);
    			break;
    		}
    		ret = btrfs_run_delayed_items_nr(trans, root, nr);