static struct page *
    grow_dev_page(struct block_device *bdev, sector_t block,
    		pgoff_t index, int size)
    {
    	struct inode *inode = bdev->bd_inode;
    	struct page *page;
	struct buffer_head *bh;

	page = find_or_create_page(inode->i_mapping, index,
		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
    	if (!page)
		return NULL;

	BUG_ON(!PageLocked(page));

    	if (page_has_buffers(page)) {
    		bh = page_buffers(page);
    		if (bh->b_size == size) {
    			init_page_buffers(page, bdev, block, size);
    			return page;
    		}
    		if (!try_to_free_buffers(page))
    			goto failed;
    	}
    
    	/*
    	 * Allocate some buffers for this page
    	 */
    	bh = alloc_page_buffers(page, size, 0);
    	if (!bh)
    		goto failed;
    
    	/*
    	 * Link the page to the buffers and initialise them.  Take the
    	 * lock to be atomic wrt __find_get_block(), which does not
    	 * run under the page lock.
    	 */
    	spin_lock(&inode->i_mapping->private_lock);
    	link_dev_buffers(page, bh);
    	init_page_buffers(page, bdev, block, size);
    	spin_unlock(&inode->i_mapping->private_lock);
    	return page;
    
    failed:
    	BUG();
    	unlock_page(page);
    	page_cache_release(page);
    	return NULL;
    }
    
    /*
     * Create buffers for the specified block device block's page.  If
     * that page was dirty, the buffers are set dirty also.
 */
static int
    grow_buffers(struct block_device *bdev, sector_t block, int size)
    {
    	struct page *page;
    	pgoff_t index;
    	int sizebits;
    
    	sizebits = -1;
    	do {
    		sizebits++;
    	} while ((size << sizebits) < PAGE_SIZE);
    
	index = block >> sizebits;

    	/*
    	 * Check for a block which wants to lie outside our maximum possible
    	 * pagecache index.  (this comparison is done using sector_t types).
    	 */
    	if (unlikely(index != block >> sizebits)) {
    		char b[BDEVNAME_SIZE];
    
		printk(KERN_ERR "%s: requested out-of-range block %llu for "
			"device %s\n",
			__func__, (unsigned long long)block,
			bdevname(bdev, b));
    		return -EIO;
    	}
	block = index << sizebits;

    	/* Create a page with the proper size buffers.. */
    	page = grow_dev_page(bdev, block, index, size);
    	if (!page)
    		return 0;
    	unlock_page(page);
    	page_cache_release(page);
    	return 1;
}

static struct buffer_head *
    __getblk_slow(struct block_device *bdev, sector_t block, int size)
    {
	/* Size must be multiple of hard sectorsize */
	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
			(size < 512 || size > PAGE_SIZE))) {
    		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
					size);
    		printk(KERN_ERR "logical block size: %d\n",
					bdev_logical_block_size(bdev));

    		dump_stack();
    		return NULL;
    	}
    
    	for (;;) {
		struct buffer_head * bh;
		int ret;

    		bh = __find_get_block(bdev, block, size);
    		if (bh)
			return bh;

    		ret = grow_buffers(bdev, block, size);
    		if (ret < 0)
    			return NULL;
		if (ret == 0)
    			free_more_memory();
    	}
    }
    
    /*
     * The relationship between dirty buffers and dirty pages:
     *
     * Whenever a page has any dirty buffers, the page's dirty bit is set, and
     * the page is tagged dirty in its radix tree.
     *
     * At all times, the dirtiness of the buffers represents the dirtiness of
     * subsections of the page.  If the page has buffers, the page dirty bit is
     * merely a hint about the true dirty state.
     *
     * When a page is set dirty in its entirety, all its buffers are marked dirty
     * (if the page has buffers).
     *
     * When a buffer is marked dirty, its page is dirtied, but the page's other
     * buffers are not.
     *
     * Also.  When blockdev buffers are explicitly read with bread(), they
     * individually become uptodate.  But their backing page remains not
     * uptodate - even if all of its buffers are uptodate.  A subsequent
     * block_read_full_page() against that page will discover all the uptodate
     * buffers, will set the page uptodate and will perform no I/O.
     */
    
    /**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
     *
     * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
     * backing page dirty, then tag the page as dirty in its address_space's radix
     * tree and then attach the address_space's inode to its superblock's dirty
     * inode list.
     *
     * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
     * mapping->tree_lock and the global inode_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	WARN_ON_ONCE(!buffer_uptodate(bh));

    	/*
    	 * Very *carefully* optimize the it-is-already-dirty case.
    	 *
    	 * Don't let the final "is it dirty" escape to before we
    	 * perhaps modified the buffer.
    	 */
    	if (buffer_dirty(bh)) {
    		smp_mb();
    		if (buffer_dirty(bh))
    			return;
	}

    	if (!test_set_buffer_dirty(bh)) {
    		struct page *page = bh->b_page;
    
    		if (!TestSetPageDirty(page)) {
    			struct address_space *mapping = page_mapping(page);
    			if (mapping)
    				__set_page_dirty(page, mapping, 0);
		}
	}
}
    EXPORT_SYMBOL(mark_buffer_dirty);
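
/*
 * Illustrative sketch only (not called from this file): the usual pattern a
 * filesystem follows when it modifies a metadata buffer it already holds a
 * reference to.  The function name and parameters are hypothetical; only the
 * buffer_head calls are real.
 */
static void __maybe_unused example_update_buffer(struct buffer_head *bh,
					const void *data, size_t len)
{
	lock_buffer(bh);
	memcpy(bh->b_data, data, len);	/* modify the in-memory block */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);		/* schedule the buffer for writeout */
}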
    
    /*
     * Decrement a buffer_head's reference count.  If all buffers against a page
     * have zero reference count, are clean and unlocked, and if the page is clean
     * and unlocked then try_to_free_buffers() may strip the buffers from the page
     * in preparation for freeing it (sometimes, rarely, buffers are removed from
     * a page but it ends up not being freed, and buffers may later be reattached).
     */
    void __brelse(struct buffer_head * buf)
    {
    	if (atomic_read(&buf->b_count)) {
    		put_bh(buf);
    		return;
	}
	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);

    /*
     * bforget() is like brelse(), except it discards any
     * potentially dirty data.
     */
    void __bforget(struct buffer_head *bh)
    {
	clear_buffer_dirty(bh);
	if (bh->b_assoc_map) {
    		struct address_space *buffer_mapping = bh->b_page->mapping;
    
    		spin_lock(&buffer_mapping->private_lock);
		list_del_init(&bh->b_assoc_buffers);
		bh->b_assoc_map = NULL;
    		spin_unlock(&buffer_mapping->private_lock);
    	}
    	__brelse(bh);
}
EXPORT_SYMBOL(__bforget);

    static struct buffer_head *__bread_slow(struct buffer_head *bh)
    {
    	lock_buffer(bh);
    	if (buffer_uptodate(bh)) {
    		unlock_buffer(bh);
    		return bh;
    	} else {
    		get_bh(bh);
    		bh->b_end_io = end_buffer_read_sync;
    		submit_bh(READ, bh);
    		wait_on_buffer(bh);
    		if (buffer_uptodate(bh))
    			return bh;
    	}
    	brelse(bh);
    	return NULL;
    }
    
    /*
 * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
     * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
     * refcount elevated by one when they're in an LRU.  A buffer can only appear
     * once in a particular CPU's LRU.  A single buffer can be present in multiple
     * CPU's LRUs at the same time.
     *
     * This is a transparent caching front-end to sb_bread(), sb_getblk() and
     * sb_find_get_block().
     *
     * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
     * a local interrupt disable for that.
     */
    
    #define BH_LRU_SIZE	8
    
    struct bh_lru {
    	struct buffer_head *bhs[BH_LRU_SIZE];
    };
    
    static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
    
    #ifdef CONFIG_SMP
    #define bh_lru_lock()	local_irq_disable()
    #define bh_lru_unlock()	local_irq_enable()
    #else
    #define bh_lru_lock()	preempt_disable()
    #define bh_lru_unlock()	preempt_enable()
    #endif
    
    static inline void check_irqs_on(void)
    {
    #ifdef irqs_disabled
    	BUG_ON(irqs_disabled());
    #endif
    }
    
    /*
     * The LRU management algorithm is dopey-but-simple.  Sorry.
     */
    static void bh_lru_install(struct buffer_head *bh)
    {
    	struct buffer_head *evictee = NULL;
    	struct bh_lru *lru;
    
    	check_irqs_on();
    	bh_lru_lock();
    	lru = &__get_cpu_var(bh_lrus);
    	if (lru->bhs[0] != bh) {
    		struct buffer_head *bhs[BH_LRU_SIZE];
    		int in;
    		int out = 0;
    
    		get_bh(bh);
    		bhs[out++] = bh;
    		for (in = 0; in < BH_LRU_SIZE; in++) {
    			struct buffer_head *bh2 = lru->bhs[in];
    
    			if (bh2 == bh) {
    				__brelse(bh2);
    			} else {
    				if (out >= BH_LRU_SIZE) {
    					BUG_ON(evictee != NULL);
    					evictee = bh2;
    				} else {
    					bhs[out++] = bh2;
    				}
    			}
    		}
    		while (out < BH_LRU_SIZE)
    			bhs[out++] = NULL;
    		memcpy(lru->bhs, bhs, sizeof(bhs));
    	}
    	bh_lru_unlock();
    
    	if (evictee)
    		__brelse(evictee);
    }
    
    /*
     * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
    {
    	struct buffer_head *ret = NULL;
	struct bh_lru *lru;
	unsigned int i;

    	check_irqs_on();
    	bh_lru_lock();
    	lru = &__get_cpu_var(bh_lrus);
    	for (i = 0; i < BH_LRU_SIZE; i++) {
    		struct buffer_head *bh = lru->bhs[i];
    
    		if (bh && bh->b_bdev == bdev &&
    				bh->b_blocknr == block && bh->b_size == size) {
    			if (i) {
    				while (i) {
    					lru->bhs[i] = lru->bhs[i - 1];
    					i--;
    				}
    				lru->bhs[0] = bh;
    			}
    			get_bh(bh);
    			ret = bh;
    			break;
    		}
    	}
    	bh_lru_unlock();
    	return ret;
    }
    
    /*
     * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
     * it in the LRU and mark it as accessed.  If it is not present then return
     * NULL
     */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
    {
    	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
    
	if (bh == NULL) {
		bh = __find_get_block_slow(bdev, block);
    		if (bh)
    			bh_lru_install(bh);
    	}
    	if (bh)
    		touch_buffer(bh);
    	return bh;
    }
    EXPORT_SYMBOL(__find_get_block);
    
    /*
     * __getblk will locate (and, if necessary, create) the buffer_head
     * which corresponds to the passed block_device, block and size. The
     * returned buffer has its reference count incremented.
     *
     * __getblk() cannot fail - it just keeps trying.  If you pass it an
     * illegal block number, __getblk() will happily return a buffer_head
     * which represents the non-existent block.  Very weird.
     *
     * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
     * attempt is failing.  FIXME, perhaps?
     */
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
    {
    	struct buffer_head *bh = __find_get_block(bdev, block, size);
    
    	might_sleep();
    	if (bh == NULL)
    		bh = __getblk_slow(bdev, block, size);
    	return bh;
    }
    EXPORT_SYMBOL(__getblk);
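
/*
 * Illustrative sketch only (not called from this file): since __getblk() may
 * hand back a buffer that is not uptodate, a caller intending to overwrite
 * the whole block typically zero-fills it before dirtying it.  The function
 * name is hypothetical.
 */
static void __maybe_unused example_zero_new_block(struct block_device *bdev,
					sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (!bh)
		return;
	lock_buffer(bh);
	memset(bh->b_data, 0, size);	/* make the contents well-defined */
	set_buffer_uptodate(bh);	/* in-memory copy is now authoritative */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);			/* drop the reference from __getblk() */
}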
    
    /*
     * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
    {
	struct buffer_head *bh = __getblk(bdev, block, size);
    	if (likely(bh)) {
    		ll_rw_block(READA, 1, &bh);
    		brelse(bh);
	}
    }
    EXPORT_SYMBOL(__breadahead);
    
    /**
 *  __bread() - reads a specified block and returns the bh
 *  @bdev: the block_device to read from
     *  @block: number of block
     *  @size: size (in bytes) to read
     * 
     *  Reads a specified block, and returns buffer head that contains it.
     *  It returns NULL if the block was unreadable.
     */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
    {
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (likely(bh) && !buffer_uptodate(bh))
    		bh = __bread_slow(bh);
    	return bh;
    }
    EXPORT_SYMBOL(__bread);
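
/*
 * Illustrative sketch only (not called from this file): reading one block
 * through __bread() and copying it out.  A NULL return means the block could
 * not be read.  The function name is hypothetical.
 */
static int __maybe_unused example_read_block(struct block_device *bdev,
				sector_t block, unsigned size, void *buf)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;
	memcpy(buf, bh->b_data, size);	/* bh->b_data is uptodate here */
	brelse(bh);			/* release the reference from __bread() */
	return 0;
}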
    
    /*
     * invalidate_bh_lrus() is called rarely - but not only at unmount.
     * This doesn't race because it runs in each cpu either in irq
     * or with preempt disabled.
     */
    static void invalidate_bh_lru(void *arg)
    {
    	struct bh_lru *b = &get_cpu_var(bh_lrus);
    	int i;
    
    	for (i = 0; i < BH_LRU_SIZE; i++) {
    		brelse(b->bhs[i]);
    		b->bhs[i] = NULL;
    	}
    	put_cpu_var(bh_lrus);
}

void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

    void set_bh_page(struct buffer_head *bh,
    		struct page *page, unsigned long offset)
    {
	bh->b_page = page;
	BUG_ON(offset >= PAGE_SIZE);
    	if (PageHighMem(page))
    		/*
    		 * This catches illegal uses and preserves the offset:
    		 */
    		bh->b_data = (char *)(0 + offset);
    	else
    		bh->b_data = page_address(page) + offset;
    }
    EXPORT_SYMBOL(set_bh_page);
    
    /*
     * Called when truncating a buffer on a page completely.
 */
static void discard_buffer(struct buffer_head * bh)
    {
    	lock_buffer(bh);
    	clear_buffer_dirty(bh);
    	bh->b_bdev = NULL;
    	clear_buffer_mapped(bh);
    	clear_buffer_req(bh);
    	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
    	unlock_buffer(bh);
    }
    
    /**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
     *
     * @page: the page which is affected
     * @offset: the index of the truncation point
     *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
     *
     * block_invalidatepage() does not have to release all buffers, but it must
     * ensure that no dirty buffer is left outside @offset and that no I/O
     * is underway against any of the blocks which are outside the truncation
     * point.  Because the caller is about to free (and possibly reuse) those
     * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned long offset)
    {
    	struct buffer_head *head, *bh, *next;
    	unsigned int curr_off = 0;
    
    	BUG_ON(!PageLocked(page));
    	if (!page_has_buffers(page))
    		goto out;
    
    	head = page_buffers(page);
    	bh = head;
    	do {
    		unsigned int next_off = curr_off + bh->b_size;
    		next = bh->b_this_page;
    
    		/*
    		 * is this block fully invalidated?
    		 */
    		if (offset <= curr_off)
    			discard_buffer(bh);
    		curr_off = next_off;
    		bh = next;
    	} while (bh != head);
    
    	/*
    	 * We release buffers only if the entire page is being invalidated.
    	 * The get_block cached value has been unconditionally invalidated,
    	 * so real IO is not possible anymore.
    	 */
	if (offset == 0)
		try_to_release_page(page, 0);
out:
	return;
    }
    EXPORT_SYMBOL(block_invalidatepage);
    
    /*
     * We attach and possibly dirty the buffers atomically wrt
     * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
     * is already excluded via the page lock.
     */
    void create_empty_buffers(struct page *page,
    			unsigned long blocksize, unsigned long b_state)
    {
    	struct buffer_head *bh, *head, *tail;
    
    	head = alloc_page_buffers(page, blocksize, 1);
    	bh = head;
    	do {
    		bh->b_state |= b_state;
    		tail = bh;
    		bh = bh->b_this_page;
    	} while (bh);
    	tail->b_this_page = head;
    
    	spin_lock(&page->mapping->private_lock);
    	if (PageUptodate(page) || PageDirty(page)) {
    		bh = head;
    		do {
    			if (PageDirty(page))
    				set_buffer_dirty(bh);
    			if (PageUptodate(page))
    				set_buffer_uptodate(bh);
    			bh = bh->b_this_page;
    		} while (bh != head);
    	}
    	attach_page_buffers(page, head);
    	spin_unlock(&page->mapping->private_lock);
    }
    EXPORT_SYMBOL(create_empty_buffers);
    
    /*
     * We are taking a block for data and we don't want any output from any
     * buffer-cache aliases starting from return from that function and
     * until the moment when something will explicitly mark the buffer
     * dirty (hopefully that will not happen until we will free that block ;-)
     * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
     * unmap_buffer() for such invalidation, but that was wrong. We definitely
     * don't want to mark the alias unmapped, for example - it would confuse
     * anyone who might pick it with bread() afterwards...
     *
     * Also..  Note that bforget() doesn't lock the buffer.  So there can
     * be writeout I/O going on against recently-freed buffers.  We don't
     * wait on that I/O in bforget() - it's more efficient to wait on the I/O
     * only if we really need to.  That happens here.
     */
    void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
    {
    	struct buffer_head *old_bh;
    
	might_sleep();

	old_bh = __find_get_block_slow(bdev, block);
    	if (old_bh) {
    		clear_buffer_dirty(old_bh);
    		wait_on_buffer(old_bh);
    		clear_buffer_req(old_bh);
    		__brelse(old_bh);
    	}
    }
    EXPORT_SYMBOL(unmap_underlying_metadata);
    
    /*
     * NOTE! All mapped/uptodate combinations are valid:
     *
     *	Mapped	Uptodate	Meaning
     *
     *	No	No		"unknown" - must do get_block()
     *	No	Yes		"hole" - zero-filled
     *	Yes	No		"allocated" - allocated on disk, not read in
     *	Yes	Yes		"valid" - allocated and up-to-date in memory.
     *
     * "Dirty" is valid only with the last case (mapped+uptodate).
     */
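
/*
 * Illustrative sketch only (not part of the original file): how a reader of
 * the table above would decide whether a buffer still needs I/O before its
 * data can be used.  Only the "allocated" state (mapped but not uptodate)
 * requires a read; "unknown" first needs get_block() to map it.  The function
 * name is hypothetical.
 */
static int __maybe_unused example_buffer_needs_read(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_uptodate(bh);
}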
    
    /*
     * While block_write_full_page is writing back the dirty buffers under
     * the page lock, whoever dirtied the buffers may decide to clean them
     * again at any time.  We handle that by only looking at the buffer
     * state inside lock_buffer().
     *
     * If block_write_full_page() is called for regular writeback
     * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.  This can only happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
     *
     * If block_write_full_page() is called with wbc->sync_mode ==
     * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
     * causes the writes to be flagged as synchronous writes, but the
     * block device queue will NOT be unplugged, since usually many pages
 * will be pushed out before the higher-level caller actually
     * waits for the writes to be completed.  The various wait functions,
     * such as wait_on_writeback_range() will ultimately call sync_page()
     * which will ultimately call blk_run_backing_dev(), which will end up
 * unplugging the device queue.
     */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
    {
    	int err;
    	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	const unsigned blocksize = 1 << inode->i_blkbits;
	int nr_underway = 0;
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
			WRITE_SYNC_PLUG : WRITE);

    	BUG_ON(!PageLocked(page));
    
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, blocksize,
    					(1 << BH_Dirty)|(1 << BH_Uptodate));
    	}
    
    	/*
    	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
    	 * here, and the (potentially unmapped) buffers may become dirty at
    	 * any time.  If a buffer becomes dirty here after we've inspected it
    	 * then we just miss that fact, and the page stays dirty.
    	 *
    	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
    	 * handle that here by just cleaning them.
	 */
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    	head = page_buffers(page);
    	bh = head;
    
    	/*
    	 * Get all the dirty buffers mapped to disk addresses and
    	 * handle any aliases from the underlying blockdev's mapping.
    	 */
    	do {
    		if (block > last_block) {
    			/*
    			 * mapped buffers outside i_size will occur, because
    			 * this page can be outside i_size when there is a
    			 * truncate in progress.
    			 */
    			/*
    			 * The buffer was zeroed by block_write_full_page()
    			 */
    			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
    			err = get_block(inode, block, bh, 1);
    			if (err)
				goto recover;
			clear_buffer_delay(bh);
    			if (buffer_new(bh)) {
    				/* blockdev mappings never come here */
    				clear_buffer_new(bh);
    				unmap_underlying_metadata(bh->b_bdev,
    							bh->b_blocknr);
    			}
    		}
    		bh = bh->b_this_page;
    		block++;
    	} while (bh != head);
    
    	do {
    		if (!buffer_mapped(bh))
    			continue;
    		/*
    		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
    		 */
    		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
    			redirty_page_for_writepage(wbc, page);
    			continue;
    		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh, handler);
    		} else {
    			unlock_buffer(bh);
    		}
    	} while ((bh = bh->b_this_page) != head);
    
    	/*
    	 * The page and its buffers are protected by PageWriteback(), so we can
    	 * drop the bh refcounts early.
    	 */
    	BUG_ON(PageWriteback(page));
    	set_page_writeback(page);
    
    	do {
    		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(write_op, bh);
    			nr_underway++;
    		}
    		bh = next;
    	} while (bh != head);
    
	unlock_page(page);

    	err = 0;
    done:
    	if (nr_underway == 0) {
    		/*
    		 * The page was marked dirty, but the buffers were
    		 * clean.  Someone wrote them back by hand with
    		 * ll_rw_block/submit_bh.  A rare case.
    		 */
		end_page_writeback(page);

    		/*
    		 * The page and buffer_heads can be released at any time from
    		 * here on.
    		 */
    	}
    	return err;
    
    recover:
    	/*
    	 * ENOSPC, or some other error.  We may already have added some
    	 * blocks to the file, so we need to write these out to avoid
    	 * exposing stale data.
    	 * The page is currently locked and not marked for writeback
    	 */
    	bh = head;
    	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
    		} else {
    			/*
    			 * The buffer may have been set dirty during
    			 * attachment to a dirty page.
    			 */
    			clear_buffer_dirty(bh);
    		}
    	} while ((bh = bh->b_this_page) != head);
    	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
    	set_page_writeback(page);
    	do {
    		struct buffer_head *next = bh->b_this_page;
    		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(write_op, bh);
    			nr_underway++;
    		}
    		bh = next;
	} while (bh != head);
	unlock_page(page);
    	goto done;
}

    /*
     * If a page has any new buffers, zero them out here, and mark them uptodate
     * and dirty so they'll be written out (in order to prevent uninitialised
     * block data from leaking). And clear the new bit.
     */
    void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
    {
    	unsigned int block_start, block_end;
    	struct buffer_head *head, *bh;
    
    	BUG_ON(!PageLocked(page));
    	if (!page_has_buffers(page))
    		return;
    
    	bh = head = page_buffers(page);
    	block_start = 0;
    	do {
    		block_end = block_start + bh->b_size;
    
    		if (buffer_new(bh)) {
    			if (block_end > from && block_start < to) {
    				if (!PageUptodate(page)) {
    					unsigned start, size;
    
    					start = max(from, block_start);
					size = min(to, block_end) - start;

					zero_user(page, start, size);
    					set_buffer_uptodate(bh);
    				}
    
    				clear_buffer_new(bh);
    				mark_buffer_dirty(bh);
    			}
    		}
    
    		block_start = block_end;
    		bh = bh->b_this_page;
    	} while (bh != head);
    }
EXPORT_SYMBOL(page_zero_new_buffers);

    static int __block_prepare_write(struct inode *inode, struct page *page,
    		unsigned from, unsigned to, get_block_t *get_block)
    {
    	unsigned block_start, block_end;
    	sector_t block;
    	int err = 0;
    	unsigned blocksize, bbits;
    	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
    
    	BUG_ON(!PageLocked(page));
    	BUG_ON(from > PAGE_CACHE_SIZE);
    	BUG_ON(to > PAGE_CACHE_SIZE);
    	BUG_ON(from > to);
    
    	blocksize = 1 << inode->i_blkbits;
    	if (!page_has_buffers(page))
    		create_empty_buffers(page, blocksize, 0);
    	head = page_buffers(page);
    
    	bbits = inode->i_blkbits;
    	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
    
    	for(bh = head, block_start = 0; bh != head || !block_start;
    	    block++, block_start=block_end, bh = bh->b_this_page) {
    		block_end = block_start + blocksize;
    		if (block_end <= from || block_start >= to) {
    			if (PageUptodate(page)) {
    				if (!buffer_uptodate(bh))
    					set_buffer_uptodate(bh);
    			}
    			continue;
    		}
    		if (buffer_new(bh))
    			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
    			err = get_block(inode, block, bh, 1);
			if (err)
				break;
    			if (buffer_new(bh)) {
    				unmap_underlying_metadata(bh->b_bdev,
    							bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
    					continue;
				}
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
    				continue;
    			}
    		}
    		if (PageUptodate(page)) {
    			if (!buffer_uptodate(bh))
    				set_buffer_uptodate(bh);
    			continue; 
    		}
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
    		     (block_start < from || block_end > to)) {
    			ll_rw_block(READ, 1, &bh);
    			*wait_bh++=bh;
    		}
    	}
    	/*
    	 * If we issued read requests - let them complete.
    	 */
    	while(wait_bh > wait) {
    		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
    	}
    
    	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
    	return err;
    }
    
    static int __block_commit_write(struct inode *inode, struct page *page,
    		unsigned from, unsigned to)
    {
    	unsigned block_start, block_end;
    	int partial = 0;
    	unsigned blocksize;
    	struct buffer_head *bh, *head;
    
    	blocksize = 1 << inode->i_blkbits;
    
    	for(bh = head = page_buffers(page), block_start = 0;
    	    bh != head || !block_start;
    	    block_start=block_end, bh = bh->b_this_page) {
    		block_end = block_start + blocksize;
    		if (block_end <= from || block_start >= to) {
    			if (!buffer_uptodate(bh))
    				partial = 1;
    		} else {
    			set_buffer_uptodate(bh);
    			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);
    	}
    
    	/*
    	 * If this is a partial write which happened to make all buffers
    	 * uptodate then we can optimize away a bogus readpage() for
    	 * the next read(). Here we 'discover' whether the page went
    	 * uptodate as a result of this (potentially partial) write.
    	 */
    	if (!partial)
    		SetPageUptodate(page);
    	return 0;
}

    /*
     * block_write_begin takes care of the basic task of block allocation and
     * bringing partial write blocks uptodate first.
     *
     * If *pagep is not NULL, then block_write_begin uses the locked page
     * at *pagep rather than allocating its own. In this case, the page will
     * not be unlocked or deallocated on failure.
     */
    int block_write_begin(struct file *file, struct address_space *mapping,
    			loff_t pos, unsigned len, unsigned flags,
    			struct page **pagep, void **fsdata,
    			get_block_t *get_block)
    {
    	struct inode *inode = mapping->host;
    	int status = 0;
    	struct page *page;
    	pgoff_t index;
    	unsigned start, end;
    	int ownpage = 0;
    
    	index = pos >> PAGE_CACHE_SHIFT;
    	start = pos & (PAGE_CACHE_SIZE - 1);
    	end = start + len;
    
    	page = *pagep;
    	if (page == NULL) {
		ownpage = 1;
		page = grab_cache_page_write_begin(mapping, index, flags);
    		if (!page) {
    			status = -ENOMEM;
    			goto out;
    		}
    		*pagep = page;
    	} else
    		BUG_ON(!PageLocked(page));
    
    	status = __block_prepare_write(inode, page, start, end, get_block);
    	if (unlikely(status)) {
    		ClearPageUptodate(page);
    
    		if (ownpage) {
    			unlock_page(page);