    			submit_bh(READ, bh);
    	}
    	return 0;
    }
    
    /* utility function for filesystems that need to do work on expanding
     * truncates.  Uses prepare/commit_write to allow the filesystem to
     * deal with the hole.  
     */
    
    static int __generic_cont_expand(struct inode *inode, loff_t size,
				 pgoff_t index, unsigned int offset)
    {
    	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long limit;
	int err;
    
    	err = -EFBIG;
	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
    	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
    		send_sig(SIGXFSZ, current, 0);
    		goto out;
    	}
    	if (size > inode->i_sb->s_maxbytes)
    		goto out;
    
    	err = -ENOMEM;
    	page = grab_cache_page(mapping, index);
    	if (!page)
    		goto out;
    	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
    
    	if (err) {
    		/*
    		 * ->prepare_write() may have instantiated a few blocks
    		 * outside i_size.  Trim these off again.
    		 */
    		unlock_page(page);
    		page_cache_release(page);
    		vmtruncate(inode, inode->i_size);
		goto out;
	}

	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
    	unlock_page(page);
    	page_cache_release(page);
    	if (err > 0)
    		err = 0;
    out:
    	return err;
    }
    
    
    int generic_cont_expand(struct inode *inode, loff_t size)
    {
    	pgoff_t index;
    	unsigned int offset;
    
    	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
    
    	/* ugh.  in prepare/commit_write, if from==to==start of block, we
    	** skip the prepare.  make sure we never send an offset for the start
    	** of a block
    	*/
    	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
    		/* caller must handle this extra byte. */
    		offset++;
    	}
    	index = size >> PAGE_CACHE_SHIFT;
    
    	return __generic_cont_expand(inode, size, index, offset);
    }
    
    int generic_cont_expand_simple(struct inode *inode, loff_t size)
    {
    	loff_t pos = size - 1;
    	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
    	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
    
    	/* prepare/commit_write can handle even if from==to==start of block. */
    	return __generic_cont_expand(inode, size, index, offset);
    }
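/*
 * A typical caller (hypothetical sketch, not taken from any particular
 * filesystem) is a ->setattr() implementation that must allocate blocks
 * rather than leave a hole when the file is grown:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
 *		err = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (err)
 *			return err;
 *	}
 *
 * generic_cont_expand() differs only in that it nudges the offset off a
 * block boundary, because some prepare/commit_write implementations skip
 * the prepare step when from == to == start of block (see the comment above).
 */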
    
    
    /*
     * For moronic filesystems that do not allow holes in file.
     * We may have to extend the file.
     */
    
    int cont_prepare_write(struct page *page, unsigned offset,
    		unsigned to, get_block_t *get_block, loff_t *bytes)
    {
    	struct address_space *mapping = page->mapping;
    	struct inode *inode = mapping->host;
    	struct page *new_page;
    	pgoff_t pgpos;
    	long status;
    	unsigned zerofrom;
    	unsigned blocksize = 1 << inode->i_blkbits;
    	void *kaddr;
    
    	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
    		status = -ENOMEM;
    		new_page = grab_cache_page(mapping, pgpos);
    		if (!new_page)
    			goto out;
    		/* we might sleep */
    		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
    			unlock_page(new_page);
    			page_cache_release(new_page);
    			continue;
    		}
    		zerofrom = *bytes & ~PAGE_CACHE_MASK;
    		if (zerofrom & (blocksize-1)) {
    			*bytes |= (blocksize-1);
    			(*bytes)++;
    		}
    		status = __block_prepare_write(inode, new_page, zerofrom,
    						PAGE_CACHE_SIZE, get_block);
    		if (status)
    			goto out_unmap;
    		kaddr = kmap_atomic(new_page, KM_USER0);
    		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
    		flush_dcache_page(new_page);
    		kunmap_atomic(kaddr, KM_USER0);
    		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
    		unlock_page(new_page);
    		page_cache_release(new_page);
    	}
    
    	if (page->index < pgpos) {
    		/* completely inside the area */
    		zerofrom = offset;
    	} else {
    		/* page covers the boundary, find the boundary offset */
    		zerofrom = *bytes & ~PAGE_CACHE_MASK;
    
		/* if we will expand the file, the last block will be filled */
    		if (to > zerofrom && (zerofrom & (blocksize-1))) {
    			*bytes |= (blocksize-1);
    			(*bytes)++;
    		}
    
    		/* starting below the boundary? Nothing to zero out */
    		if (offset <= zerofrom)
    			zerofrom = offset;
    	}
    	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
    	if (status)
    		goto out1;
    	if (zerofrom < offset) {
    		kaddr = kmap_atomic(page, KM_USER0);
    		memset(kaddr+zerofrom, 0, offset-zerofrom);
    		flush_dcache_page(page);
    		kunmap_atomic(kaddr, KM_USER0);
    		__block_commit_write(inode, page, zerofrom, offset);
    	}
    	return 0;
    out1:
    	ClearPageUptodate(page);
    	return status;
    
    out_unmap:
    	ClearPageUptodate(new_page);
    	unlock_page(new_page);
    	page_cache_release(new_page);
    out:
    	return status;
    }
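/*
 * block_prepare_write() and block_commit_write() below are thin wrappers
 * around __block_prepare_write()/__block_commit_write() for filesystems that
 * drive the buffer_head write path themselves instead of going through
 * generic_commit_write().  On a failed prepare the page's uptodate flag is
 * cleared, since the prepare step may have left the page partially filled.
 */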
    
    int block_prepare_write(struct page *page, unsigned from, unsigned to,
    			get_block_t *get_block)
    {
    	struct inode *inode = page->mapping->host;
    	int err = __block_prepare_write(inode, page, from, to, get_block);
    	if (err)
    		ClearPageUptodate(page);
    	return err;
    }
    
    int block_commit_write(struct page *page, unsigned from, unsigned to)
    {
    	struct inode *inode = page->mapping->host;
    	__block_commit_write(inode,page,from,to);
    	return 0;
    }
    
    int generic_commit_write(struct file *file, struct page *page,
    		unsigned from, unsigned to)
    {
    	struct inode *inode = page->mapping->host;
    	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
    	__block_commit_write(inode,page,from,to);
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 */
    	if (pos > inode->i_size) {
    		i_size_write(inode, pos);
    		mark_inode_dirty(inode);
    	}
    	return 0;
    }
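/*
 * A filesystem built on the buffer_head helpers typically wires these into
 * its address_space_operations.  Purely illustrative sketch (the foofs_*
 * names are made up; the prepare_write hook would call block_prepare_write()
 * with the filesystem's own get_block):
 *
 *	static const struct address_space_operations foofs_aops = {
 *		.readpage	= foofs_readpage,
 *		.writepage	= foofs_writepage,
 *		.prepare_write	= foofs_prepare_write,
 *		.commit_write	= generic_commit_write,
 *		.bmap		= foofs_bmap,
 *	};
 */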
    
    
    /*
     * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
     * immediately, while under the page lock.  So it needs a special end_io
     * handler which does not touch the bh after unlocking it.
     *
     * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
     * hashing after unlocking the buffer, so it doesn't actually touch the bh
     * itself.
     */
    static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
    {
    	if (uptodate) {
    		set_buffer_uptodate(bh);
    	} else {
    		/* This happens, due to failed READA attempts. */
    		clear_buffer_uptodate(bh);
    	}
    	unlock_buffer(bh);
    }
    
    /*
     * On entry, the page is fully not uptodate.
     * On exit the page is fully uptodate in the areas outside (from,to)
     */
    int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
    			get_block_t *get_block)
    {
    	struct inode *inode = page->mapping->host;
    	const unsigned blkbits = inode->i_blkbits;
    	const unsigned blocksize = 1 << blkbits;
    	struct buffer_head map_bh;
    	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
    	unsigned block_in_page;
    	unsigned block_start;
    	sector_t block_in_file;
    	char *kaddr;
    	int nr_reads = 0;
    	int i;
    	int ret = 0;
    	int is_mapped_to_disk = 1;
    	int dirtied_it = 0;
    
    	if (PageMappedToDisk(page))
    		return 0;
    
    	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
    	map_bh.b_page = page;
    
    	/*
    	 * We loop across all blocks in the page, whether or not they are
    	 * part of the affected region.  This is so we can discover if the
    	 * page is fully mapped-to-disk.
    	 */
    	for (block_start = 0, block_in_page = 0;
    		  block_start < PAGE_CACHE_SIZE;
    		  block_in_page++, block_start += blocksize) {
    		unsigned block_end = block_start + blocksize;
    		int create;
    
    		map_bh.b_state = 0;
    		create = 1;
    		if (block_start >= to)
    			create = 0;
    
    		map_bh.b_size = blocksize;
    
    		ret = get_block(inode, block_in_file + block_in_page,
    					&map_bh, create);
    		if (ret)
    			goto failed;
    		if (!buffer_mapped(&map_bh))
    			is_mapped_to_disk = 0;
    		if (buffer_new(&map_bh))
    			unmap_underlying_metadata(map_bh.b_bdev,
    							map_bh.b_blocknr);
    		if (PageUptodate(page))
    			continue;
    		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
    			kaddr = kmap_atomic(page, KM_USER0);
    			if (block_start < from) {
    				memset(kaddr+block_start, 0, from-block_start);
    				dirtied_it = 1;
    			}
    			if (block_end > to) {
    				memset(kaddr + to, 0, block_end - to);
    				dirtied_it = 1;
    			}
    			flush_dcache_page(page);
    			kunmap_atomic(kaddr, KM_USER0);
    			continue;
    		}
    		if (buffer_uptodate(&map_bh))
    			continue;	/* reiserfs does this */
    		if (block_start < from || block_end > to) {
    			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
    
    			if (!bh) {
    				ret = -ENOMEM;
    				goto failed;
    			}
    			bh->b_state = map_bh.b_state;
    			atomic_set(&bh->b_count, 0);
    			bh->b_this_page = NULL;
    			bh->b_page = page;
    			bh->b_blocknr = map_bh.b_blocknr;
    			bh->b_size = blocksize;
    			bh->b_data = (char *)(long)block_start;
    			bh->b_bdev = map_bh.b_bdev;
    			bh->b_private = NULL;
    			read_bh[nr_reads++] = bh;
    		}
    	}
    
    	if (nr_reads) {
    		struct buffer_head *bh;
    
    		/*
    		 * The page is locked, so these buffers are protected from
    		 * any VM or truncate activity.  Hence we don't need to care
    		 * for the buffer_head refcounts.
    		 */
    		for (i = 0; i < nr_reads; i++) {
    			bh = read_bh[i];
    			lock_buffer(bh);
    			bh->b_end_io = end_buffer_read_nobh;
    			submit_bh(READ, bh);
    		}
    		for (i = 0; i < nr_reads; i++) {
    			bh = read_bh[i];
    			wait_on_buffer(bh);
    			if (!buffer_uptodate(bh))
    				ret = -EIO;
    			free_buffer_head(bh);
    			read_bh[i] = NULL;
    		}
    		if (ret)
    			goto failed;
    	}
    
    	if (is_mapped_to_disk)
    		SetPageMappedToDisk(page);
    	SetPageUptodate(page);
    
    	/*
    	 * Setting the page dirty here isn't necessary for the prepare_write
    	 * function - commit_write will do that.  But if/when this function is
    	 * used within the pagefault handler to ensure that all mmapped pages
    	 * have backing space in the filesystem, we will need to dirty the page
    	 * if its contents were altered.
    	 */
    	if (dirtied_it)
    		set_page_dirty(page);
    
    	return 0;
    
    failed:
    	for (i = 0; i < nr_reads; i++) {
    		if (read_bh[i])
    			free_buffer_head(read_bh[i]);
    	}
    
    	/*
    	 * Error recovery is pretty slack.  Clear the page and mark it dirty
    	 * so we'll later zero out any blocks which _were_ allocated.
    	 */
    	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr, 0, PAGE_CACHE_SIZE);
	kunmap_atomic(kaddr, KM_USER0);
    	SetPageUptodate(page);
    	set_page_dirty(page);
    	return ret;
    }
    EXPORT_SYMBOL(nobh_prepare_write);
    
    int nobh_commit_write(struct file *file, struct page *page,
    		unsigned from, unsigned to)
    {
    	struct inode *inode = page->mapping->host;
    	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
    
    	set_page_dirty(page);
    	if (pos > inode->i_size) {
    		i_size_write(inode, pos);
    		mark_inode_dirty(inode);
    	}
    	return 0;
    }
    EXPORT_SYMBOL(nobh_commit_write);
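/*
 * nobh_prepare_write()/nobh_commit_write() are drop-in replacements for the
 * buffer_head based prepare/commit pair, intended for address_spaces that
 * want to avoid attaching buffer_heads to their pages; they pair with
 * nobh_writepage() and nobh_truncate_page() below.  Whether a given
 * filesystem uses them is its own choice, e.g. via a mount option.
 */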
    
    /*
 * nobh_writepage() - based on block_write_full_page() except
     * that it tries to operate without attaching bufferheads to
     * the page.
     */
    int nobh_writepage(struct page *page, get_block_t *get_block,
    			struct writeback_control *wbc)
    {
    	struct inode * const inode = page->mapping->host;
    	loff_t i_size = i_size_read(inode);
    	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
    	unsigned offset;
    	void *kaddr;
    	int ret;
    
    	/* Is the page fully inside i_size? */
    	if (page->index < end_index)
    		goto out;
    
    	/* Is the page fully outside i_size? (truncate in progress) */
    	offset = i_size & (PAGE_CACHE_SIZE-1);
    	if (page->index >= end_index+1 || !offset) {
    		/*
    		 * The page may have dirty, unmapped buffers.  For example,
    		 * they may have been added in ext3_writepage().  Make them
    		 * freeable here, so the page does not leak.
    		 */
    #if 0
    		/* Not really sure about this  - do we need this ? */
    		if (page->mapping->a_ops->invalidatepage)
    			page->mapping->a_ops->invalidatepage(page, offset);
    #endif
    		unlock_page(page);
    		return 0; /* don't care */
    	}
    
    	/*
    	 * The page straddles i_size.  It must be zeroed out on each and every
    	 * writepage invocation because it may be mmapped.  "A file is mapped
    	 * in multiples of the page size.  For a file that is not a multiple of
    	 * the  page size, the remaining memory is zeroed when mapped, and
    	 * writes to that region are not written out to the file."
    	 */
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
    	flush_dcache_page(page);
    	kunmap_atomic(kaddr, KM_USER0);
    out:
    	ret = mpage_writepage(page, get_block, wbc);
    	if (ret == -EAGAIN)
    		ret = __block_write_full_page(inode, page, get_block, wbc);
    	return ret;
    }
    EXPORT_SYMBOL(nobh_writepage);
    
    /*
     * This function assumes that ->prepare_write() uses nobh_prepare_write().
     */
    int nobh_truncate_page(struct address_space *mapping, loff_t from)
    {
    	struct inode *inode = mapping->host;
    	unsigned blocksize = 1 << inode->i_blkbits;
    	pgoff_t index = from >> PAGE_CACHE_SHIFT;
    	unsigned offset = from & (PAGE_CACHE_SIZE-1);
    	unsigned to;
    	struct page *page;
    
	const struct address_space_operations *a_ops = mapping->a_ops;
	char *kaddr;
    	int ret = 0;
    
    	if ((offset & (blocksize - 1)) == 0)
    		goto out;
    
    	ret = -ENOMEM;
    	page = grab_cache_page(mapping, index);
    	if (!page)
    		goto out;
    
    	to = (offset + blocksize) & ~(blocksize - 1);
    	ret = a_ops->prepare_write(NULL, page, offset, to);
    	if (ret == 0) {
    		kaddr = kmap_atomic(page, KM_USER0);
    		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
    		flush_dcache_page(page);
    		kunmap_atomic(kaddr, KM_USER0);
    		set_page_dirty(page);
    	}
    	unlock_page(page);
    	page_cache_release(page);
    out:
    	return ret;
    }
    EXPORT_SYMBOL(nobh_truncate_page);
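/*
 * block_truncate_page() below zeroes the part of the block that straddles
 * the new end-of-file, so that a later extension of the file does not expose
 * stale data from the old tail.  It is a no-op when the new size falls on a
 * block boundary or when the block turns out to be a hole.
 */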
    
    int block_truncate_page(struct address_space *mapping,
    			loff_t from, get_block_t *get_block)
    {
    	pgoff_t index = from >> PAGE_CACHE_SHIFT;
    	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
    	struct inode *inode = mapping->host;
    	struct page *page;
    	struct buffer_head *bh;
    	void *kaddr;
    	int err;
    
    	blocksize = 1 << inode->i_blkbits;
    	length = offset & (blocksize - 1);
    
    	/* Block boundary? Nothing to do */
    	if (!length)
    		return 0;
    
    	length = blocksize - length;
    
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    	page = grab_cache_page(mapping, index);
    	err = -ENOMEM;
    	if (!page)
    		goto out;
    
    	if (!page_has_buffers(page))
    		create_empty_buffers(page, blocksize, 0);
    
    	/* Find the buffer that contains "offset" */
    	bh = page_buffers(page);
    	pos = blocksize;
    	while (offset >= pos) {
    		bh = bh->b_this_page;
    		iblock++;
    		pos += blocksize;
    	}
    
    	err = 0;
	if (!buffer_mapped(bh)) {
		WARN_ON(bh->b_size != blocksize);
    		err = get_block(inode, iblock, bh, 0);
    		if (err)
    			goto unlock;
    		/* unmapped? It's a hole - nothing to do */
    		if (!buffer_mapped(bh))
    			goto unlock;
    	}
    
    	/* Ok, it's mapped. Make sure it's up-to-date */
    	if (PageUptodate(page))
    		set_buffer_uptodate(bh);
    
    	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
    		err = -EIO;
    		ll_rw_block(READ, 1, &bh);
    		wait_on_buffer(bh);
    		/* Uhhuh. Read error. Complain and punt. */
    		if (!buffer_uptodate(bh))
    			goto unlock;
    	}
    
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr + offset, 0, length);
    	flush_dcache_page(page);
    	kunmap_atomic(kaddr, KM_USER0);
    
    	mark_buffer_dirty(bh);
    	err = 0;
    
    unlock:
    	unlock_page(page);
    	page_cache_release(page);
    out:
    	return err;
    }
    
    /*
     * The generic ->writepage function for buffer-backed address_spaces
     */
    int block_write_full_page(struct page *page, get_block_t *get_block,
    			struct writeback_control *wbc)
    {
    	struct inode * const inode = page->mapping->host;
    	loff_t i_size = i_size_read(inode);
    	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
    	unsigned offset;
    	void *kaddr;
    
    	/* Is the page fully inside i_size? */
    	if (page->index < end_index)
    		return __block_write_full_page(inode, page, get_block, wbc);
    
    	/* Is the page fully outside i_size? (truncate in progress) */
    	offset = i_size & (PAGE_CACHE_SIZE-1);
    	if (page->index >= end_index+1 || !offset) {
    		/*
    		 * The page may have dirty, unmapped buffers.  For example,
    		 * they may have been added in ext3_writepage().  Make them
    		 * freeable here, so the page does not leak.
    		 */
    
		do_invalidatepage(page, 0);
    		unlock_page(page);
    		return 0; /* don't care */
    	}
    
    	/*
    	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
    	 * in multiples of the page size.  For a file that is not a multiple of
    	 * the  page size, the remaining memory is zeroed when mapped, and
    	 * writes to that region are not written out to the file."
    	 */
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
    	flush_dcache_page(page);
    	kunmap_atomic(kaddr, KM_USER0);
    	return __block_write_full_page(inode, page, get_block, wbc);
    }
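/*
 * generic_block_bmap() implements the ->bmap() address_space operation on
 * top of a get_block callback: it maps a file-relative block number to a
 * device-relative one without allocating anything (create == 0), returning 0
 * for holes or unmapped blocks.
 */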
    
    sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
    			    get_block_t *get_block)
    {
    	struct buffer_head tmp;
    	struct inode *inode = mapping->host;
    	tmp.b_state = 0;
    	tmp.b_blocknr = 0;
    
	tmp.b_size = 1 << inode->i_blkbits;
    	get_block(inode, block, &tmp, 0);
    	return tmp.b_blocknr;
    }
    
    static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
    {
    	struct buffer_head *bh = bio->bi_private;
    
    	if (bio->bi_size)
    		return 1;
    
    	if (err == -EOPNOTSUPP) {
    		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
    		set_bit(BH_Eopnotsupp, &bh->b_state);
    	}
    
    	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
    	bio_put(bio);
    	return 0;
    }
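/*
 * submit_bh() wraps a locked, mapped buffer_head in a single-segment bio and
 * hands it to the block layer.  Ordered buffers are promoted to barrier
 * writes, and an unsupported barrier is reported back as -EOPNOTSUPP via
 * end_bio_bh_io_sync() above.
 */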
    
    int submit_bh(int rw, struct buffer_head * bh)
    {
    	struct bio *bio;
    	int ret = 0;
    
    	BUG_ON(!buffer_locked(bh));
    	BUG_ON(!buffer_mapped(bh));
    	BUG_ON(!bh->b_end_io);
    
    	if (buffer_ordered(bh) && (rw == WRITE))
    		rw = WRITE_BARRIER;
    
    	/*
    	 * Only clear out a write error when rewriting, should this
    	 * include WRITE_SYNC as well?
    	 */
    	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
    		clear_buffer_write_io_error(bh);
    
    	/*
    	 * from here on down, it's all bio -- do the initial mapping,
    	 * submit_bio -> generic_make_request may further map this bio around
    	 */
    	bio = bio_alloc(GFP_NOIO, 1);
    
    	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
    	bio->bi_bdev = bh->b_bdev;
    	bio->bi_io_vec[0].bv_page = bh->b_page;
    	bio->bi_io_vec[0].bv_len = bh->b_size;
    	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
    
    	bio->bi_vcnt = 1;
    	bio->bi_idx = 0;
    	bio->bi_size = bh->b_size;
    
    	bio->bi_end_io = end_bio_bh_io_sync;
    	bio->bi_private = bh;
    
    	bio_get(bio);
    	submit_bio(rw, bio);
    
    	if (bio_flagged(bio, BIO_EOPNOTSUPP))
    		ret = -EOPNOTSUPP;
    
    	bio_put(bio);
    	return ret;
    }
    
/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
 * are sent to disk. The fourth %READA option is described in the documentation
 * for generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
 * clean when doing a write request, and any buffer that appears to be
 * up-to-date when doing a read request.  Further it marks as clean buffers
 * that are processed for writing (the buffer cache won't assume that they are
 * actually clean until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
    void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
    {
    	int i;
    
    	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (rw == SWRITE)
			lock_buffer(bh);
		else if (test_set_buffer_locked(bh))
			continue;

		if (rw == WRITE || rw == SWRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);
    		unlock_buffer(bh);
    	}
    }
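/*
 * Hypothetical synchronous-read pattern built on ll_rw_block() (callers more
 * commonly use __bread()/sb_bread(), which do essentially this internally):
 *
 *	struct buffer_head *bh = __getblk(bdev, block, size);
 *
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			goto io_error;
 *	}
 */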
    
    /*
     * For a data-integrity writeout, we need to wait upon any in-progress I/O
     * and then start new I/O and then wait upon it.  The caller must have a ref on
     * the buffer_head.
     */
    int sync_dirty_buffer(struct buffer_head *bh)
    {
    	int ret = 0;
    
    	WARN_ON(atomic_read(&bh->b_count) < 1);
    	lock_buffer(bh);
    	if (test_clear_buffer_dirty(bh)) {
    		get_bh(bh);
    		bh->b_end_io = end_buffer_write_sync;
    		ret = submit_bh(WRITE, bh);
    		wait_on_buffer(bh);
    		if (buffer_eopnotsupp(bh)) {
    			clear_buffer_eopnotsupp(bh);
    			ret = -EOPNOTSUPP;
    		}
    		if (!ret && !buffer_uptodate(bh))
    			ret = -EIO;
    	} else {
    		unlock_buffer(bh);
    	}
    	return ret;
    }
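/*
 * Illustrative use of sync_dirty_buffer() for a synchronous metadata update
 * (the names are made up; the point is that the caller holds a reference and
 * has already dirtied the buffer):
 *
 *	memcpy(bh->b_data + off, data, len);
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 *	brelse(bh);
 */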
    
    /*
     * try_to_free_buffers() checks if all the buffers on this particular page
     * are unused, and releases them if so.
     *
     * Exclusion against try_to_free_buffers may be obtained by either
     * locking the page or by holding its mapping's private_lock.
     *
     * If the page is dirty but all the buffers are clean then we need to
     * be sure to mark the page clean as well.  This is because the page
     * may be against a block device, and a later reattachment of buffers
     * to a dirty page will set *all* buffers dirty.  Which would corrupt
     * filesystem data on the same device.
     *
     * The same applies to regular filesystem pages: if all the buffers are
     * clean then we set the page clean and proceed.  To do that, we require
     * total exclusion from __set_page_dirty_buffers().  That is obtained with
     * private_lock.
     *
     * try_to_free_buffers() is non-blocking.
     */
    static inline int buffer_busy(struct buffer_head *bh)
    {
    	return atomic_read(&bh->b_count) |
    		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
    }
    
    static int
    drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
    {
    	struct buffer_head *head = page_buffers(page);
    	struct buffer_head *bh;
    
    	bh = head;
	do {
		if (buffer_write_io_error(bh) && page->mapping)
			set_bit(AS_EIO, &page->mapping->flags);
    		if (buffer_busy(bh))
    			goto failed;
    		bh = bh->b_this_page;
    	} while (bh != head);
    
    	do {
    		struct buffer_head *next = bh->b_this_page;
    
    		if (!list_empty(&bh->b_assoc_buffers))
    			__remove_assoc_queue(bh);
    		bh = next;
    	} while (bh != head);
    	*buffers_to_free = head;
    	__clear_page_buffers(page);
    	return 1;
    failed:
    	return 0;
    }
    
    int try_to_free_buffers(struct page *page)
    {
    	struct address_space * const mapping = page->mapping;
    	struct buffer_head *buffers_to_free = NULL;
    	int ret = 0;
    
    	BUG_ON(!PageLocked(page));
    	if (PageWriteback(page))
    		return 0;
    
    	if (mapping == NULL) {		/* can this still happen? */
    		ret = drop_buffers(page, &buffers_to_free);
    		goto out;
    	}
    
    	spin_lock(&mapping->private_lock);
    	ret = drop_buffers(page, &buffers_to_free);
    
	spin_unlock(&mapping->private_lock);
    	if (ret) {
    		/*
    		 * If the filesystem writes its buffers by hand (eg ext3)
    		 * then we can have clean buffers against a dirty page.  We
    		 * clean the page here; otherwise later reattachment of buffers
    		 * could encounter a non-uptodate page, which is unresolvable.
    		 * This only applies in the rare case where try_to_free_buffers
    		 * succeeds but the page is not freed.
    		 */
    		clear_page_dirty(page);
    	}
    out:
    	if (buffers_to_free) {
    		struct buffer_head *bh = buffers_to_free;
    
    		do {
    			struct buffer_head *next = bh->b_this_page;
    			free_buffer_head(bh);
    			bh = next;
    		} while (bh != buffers_to_free);
    	}
    	return ret;
    }
    EXPORT_SYMBOL(try_to_free_buffers);
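/*
 * block_sync_page() below is the generic ->sync_page() implementation: a
 * memory barrier followed by a kick of the backing device's request queue,
 * so that I/O already queued for this page actually gets dispatched while
 * someone is waiting on it.
 */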
    
    
void block_sync_page(struct page *page)
{
    	struct address_space *mapping;
    
    	smp_mb();
    	mapping = page_mapping(page);
    	if (mapping)
    		blk_run_backing_dev(mapping->backing_dev_info, page);
    }
    
    /*
     * There are no bdflush tunables left.  But distributions are
     * still running obsolete flush daemons, so we terminate them here.
     *
     * Use of bdflush() is deprecated and will be removed in a future kernel.
     * The `pdflush' kernel threads fully replace bdflush daemons and this call.
     */
    asmlinkage long sys_bdflush(int func, long data)
    {
    	static int msg_count;
    
    	if (!capable(CAP_SYS_ADMIN))
    		return -EPERM;
    
    	if (msg_count < 5) {
    		msg_count++;
    		printk(KERN_INFO
    			"warning: process `%s' used the obsolete bdflush"
    			" system call\n", current->comm);
    		printk(KERN_INFO "Fix your initscripts?\n");
    	}
    
    	if (func == 1)
    		do_exit(0);
    	return 0;
    }
    
    /*
     * Buffer-head allocation
     */
    
    static struct kmem_cache *bh_cachep;
    
    /*
     * Once the number of bh's in the machine exceeds this level, we start
     * stripping them in writeback.
     */
    static int max_buffer_heads;
    
    int buffer_heads_over_limit;
    
    struct bh_accounting {
    	int nr;			/* Number of live bh's */
    	int ratelimit;		/* Limit cacheline bouncing */
    };
    
    static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
    
    static void recalc_bh_state(void)
    {
    	int i;
    	int tot = 0;
    
    	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
    		return;
	__get_cpu_var(bh_accounting).ratelimit = 0;
	for_each_online_cpu(i)
		tot += per_cpu(bh_accounting, i).nr;
    	buffer_heads_over_limit = (tot > max_buffer_heads);
}

struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
	if (ret) {
		get_cpu_var(bh_accounting).nr++;
		recalc_bh_state();
		put_cpu_var(bh_accounting);
	}
    	return ret;
    }
    EXPORT_SYMBOL(alloc_buffer_head);
    
    void free_buffer_head(struct buffer_head *bh)
    {
    	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	get_cpu_var(bh_accounting).nr--;
	recalc_bh_state();
	put_cpu_var(bh_accounting);
    }
    EXPORT_SYMBOL(free_buffer_head);
    
static void
init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
{
    	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
    			    SLAB_CTOR_CONSTRUCTOR) {
    		struct buffer_head * bh = (struct buffer_head *)data;
    
    		memset(bh, 0, sizeof(*bh));
    		INIT_LIST_HEAD(&bh->b_assoc_buffers);
    	}
    }
    
    static void buffer_exit_cpu(int cpu)
    {
    	int i;
    	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
    
    	for (i = 0; i < BH_LRU_SIZE; i++) {
    		brelse(b->bhs[i]);
    		b->bhs[i] = NULL;
    	}
    
    	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
    	per_cpu(bh_accounting, cpu).nr = 0;
	put_cpu_var(bh_accounting);
    }
    
    static int buffer_cpu_notify(struct notifier_block *self,
    			      unsigned long action, void *hcpu)
    {
    	if (action == CPU_DEAD)
    		buffer_exit_cpu((unsigned long)hcpu);
    	return NOTIFY_OK;
    }
    
    void __init buffer_init(void)
    {
    	int nrpages;
    
    	bh_cachep = kmem_cache_create("buffer_head",