    	 * data.  If BH_New is set, we know that the block was newly
    	 * allocated in the above loop.
    	 */
    	bh = head;
    	block_start = 0;
    	do {
    		block_end = block_start+blocksize;
    		if (block_end <= from)
    			goto next_bh;
    		if (block_start >= to)
    			break;
    		if (buffer_new(bh)) {
    			void *kaddr;
    
    			clear_buffer_new(bh);
    			kaddr = kmap_atomic(page, KM_USER0);
    			memset(kaddr+block_start, 0, bh->b_size);
    			kunmap_atomic(kaddr, KM_USER0);
    			set_buffer_uptodate(bh);
    			mark_buffer_dirty(bh);
    		}
    next_bh:
    		block_start = block_end;
    		bh = bh->b_this_page;
    	} while (bh != head);
    	return err;
    }
    
    static int __block_commit_write(struct inode *inode, struct page *page,
    		unsigned from, unsigned to)
    {
    	unsigned block_start, block_end;
    	int partial = 0;
    	unsigned blocksize;
    	struct buffer_head *bh, *head;
    
    	blocksize = 1 << inode->i_blkbits;
    
    	for(bh = head = page_buffers(page), block_start = 0;
    	    bh != head || !block_start;
    	    block_start=block_end, bh = bh->b_this_page) {
    		block_end = block_start + blocksize;
    		if (block_end <= from || block_start >= to) {
    			if (!buffer_uptodate(bh))
    				partial = 1;
    		} else {
    			set_buffer_uptodate(bh);
    			mark_buffer_dirty(bh);
    		}
    	}
    
    	/*
    	 * If this is a partial write which happened to make all buffers
    	 * uptodate then we can optimize away a bogus readpage() for
    	 * the next read(). Here we 'discover' whether the page went
    	 * uptodate as a result of this (potentially partial) write.
    	 */
    	if (!partial)
    		SetPageUptodate(page);
    	return 0;
    }
    
    /*
     * Generic "read page" function for block devices that have the normal
     * get_block functionality. This is most of the block device filesystems.
     * Reads the page asynchronously --- the unlock_buffer() and
     * set/clear_buffer_uptodate() functions propagate buffer state into the
     * page struct once IO has completed.
     */
    int block_read_full_page(struct page *page, get_block_t *get_block)
    {
    	struct inode *inode = page->mapping->host;
    	sector_t iblock, lblock;
    	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    	unsigned int blocksize;
    	int nr, i;
	int fully_mapped = 1;

	BUG_ON(!PageLocked(page));
	blocksize = 1 << inode->i_blkbits;
    	if (!page_has_buffers(page))
    		create_empty_buffers(page, blocksize, 0);
    	head = page_buffers(page);
    
    	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
    	bh = head;
    	nr = 0;
    	i = 0;
    
    	do {
    		if (buffer_uptodate(bh))
    			continue;
    
		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			if (iblock < lblock) {
				err = get_block(inode, iblock, bh, 0);
				if (err)
					SetPageError(page);
			}
			if (!buffer_mapped(bh)) {
				void *kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + i * blocksize, 0, blocksize);
				flush_dcache_page(page);
				kunmap_atomic(kaddr, KM_USER0);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
    			/*
    			 * get_block() might have updated the buffer
    			 * synchronously
    			 */
    			if (buffer_uptodate(bh))
    				continue;
    		}
    		arr[nr++] = bh;
    	} while (i++, iblock++, (bh = bh->b_this_page) != head);
    
    	if (fully_mapped)
    		SetPageMappedToDisk(page);
    
    	if (!nr) {
    		/*
    		 * All buffers are uptodate - we can set the page uptodate
    		 * as well. But not if get_block() returned an error.
    		 */
    		if (!PageError(page))
    			SetPageUptodate(page);
    		unlock_page(page);
    		return 0;
    	}
    
    	/* Stage two: lock the buffers */
    	for (i = 0; i < nr; i++) {
    		bh = arr[i];
    		lock_buffer(bh);
    		mark_buffer_async_read(bh);
    	}
    
    	/*
    	 * Stage 3: start the IO.  Check for uptodateness
    	 * inside the buffer lock in case another process reading
    	 * the underlying blockdev brought it uptodate (the sct fix).
    	 */
    	for (i = 0; i < nr; i++) {
    		bh = arr[i];
    		if (buffer_uptodate(bh))
    			end_buffer_async_read(bh, 1);
    		else
    			submit_bh(READ, bh);
    	}
    	return 0;
    }
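
/*
 * Illustrative sketch, not part of the original file: a filesystem can
 * wire its ->readpage straight to block_read_full_page() above.  The
 * examplefs_* names are hypothetical, and the identity block mapping is
 * purely for illustration - a real get_block callback would consult the
 * filesystem's block map.
 */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
			       struct buffer_head *bh_result, int create)
{
	/* purely illustrative 1:1 logical-to-physical mapping */
	map_bh(bh_result, inode->i_sb, iblock);
	return 0;
}

static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}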
    
    /* utility function for filesystems that need to do work on expanding
     * truncates.  Uses prepare/commit_write to allow the filesystem to
     * deal with the hole.  
     */
    int generic_cont_expand(struct inode *inode, loff_t size)
    {
    	struct address_space *mapping = inode->i_mapping;
    	struct page *page;
    	unsigned long index, offset, limit;
    	int err;
    
    	err = -EFBIG;
            limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
    	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
    		send_sig(SIGXFSZ, current, 0);
    		goto out;
    	}
    	if (size > inode->i_sb->s_maxbytes)
    		goto out;
    
    	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
    
    	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
    	** skip the prepare.  make sure we never send an offset for the start
    	** of a block
    	*/
    	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
    		offset++;
    	}
    	index = size >> PAGE_CACHE_SHIFT;
    	err = -ENOMEM;
    	page = grab_cache_page(mapping, index);
    	if (!page)
    		goto out;
    	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
    	if (!err) {
    		err = mapping->a_ops->commit_write(NULL, page, offset, offset);
    	}
    	unlock_page(page);
    	page_cache_release(page);
    	if (err > 0)
    		err = 0;
    out:
    	return err;
    }
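
/*
 * Usage illustration, not part of the original file: a filesystem
 * growing a file (e.g. from its ->setattr path) can let
 * generic_cont_expand() allocate and zero the new tail via the
 * mapping's prepare_write/commit_write operations.  examplefs_expand_file
 * is a hypothetical helper.
 */
static int examplefs_expand_file(struct inode *inode, loff_t new_size)
{
	if (new_size <= i_size_read(inode))
		return 0;	/* not an expanding truncate */
	return generic_cont_expand(inode, new_size);
}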
    
    /*
 * For moronic filesystems that do not allow holes in files.
 * We may have to extend the file.
     */
    
    int cont_prepare_write(struct page *page, unsigned offset,
    		unsigned to, get_block_t *get_block, loff_t *bytes)
    {
    	struct address_space *mapping = page->mapping;
    	struct inode *inode = mapping->host;
    	struct page *new_page;
    	pgoff_t pgpos;
    	long status;
    	unsigned zerofrom;
    	unsigned blocksize = 1 << inode->i_blkbits;
    	void *kaddr;
    
    	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
    		status = -ENOMEM;
    		new_page = grab_cache_page(mapping, pgpos);
    		if (!new_page)
    			goto out;
    		/* we might sleep */
    		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
    			unlock_page(new_page);
    			page_cache_release(new_page);
    			continue;
    		}
    		zerofrom = *bytes & ~PAGE_CACHE_MASK;
    		if (zerofrom & (blocksize-1)) {
    			*bytes |= (blocksize-1);
    			(*bytes)++;
    		}
    		status = __block_prepare_write(inode, new_page, zerofrom,
    						PAGE_CACHE_SIZE, get_block);
    		if (status)
    			goto out_unmap;
    		kaddr = kmap_atomic(new_page, KM_USER0);
    		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
    		flush_dcache_page(new_page);
    		kunmap_atomic(kaddr, KM_USER0);
    		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
    		unlock_page(new_page);
    		page_cache_release(new_page);
    	}
    
    	if (page->index < pgpos) {
    		/* completely inside the area */
    		zerofrom = offset;
    	} else {
    		/* page covers the boundary, find the boundary offset */
    		zerofrom = *bytes & ~PAGE_CACHE_MASK;
    
    		/* if we will expand the thing last block will be filled */
    		if (to > zerofrom && (zerofrom & (blocksize-1))) {
    			*bytes |= (blocksize-1);
    			(*bytes)++;
    		}
    
    		/* starting below the boundary? Nothing to zero out */
    		if (offset <= zerofrom)
    			zerofrom = offset;
    	}
    	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
    	if (status)
    		goto out1;
    	if (zerofrom < offset) {
    		kaddr = kmap_atomic(page, KM_USER0);
    		memset(kaddr+zerofrom, 0, offset-zerofrom);
    		flush_dcache_page(page);
    		kunmap_atomic(kaddr, KM_USER0);
    		__block_commit_write(inode, page, zerofrom, offset);
    	}
    	return 0;
    out1:
    	ClearPageUptodate(page);
    	return status;
    
    out_unmap:
    	ClearPageUptodate(new_page);
    	unlock_page(new_page);
    	page_cache_release(new_page);
    out:
    	return status;
    }
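
/*
 * Sketch, not part of the original file: a filesystem that cannot
 * represent holes (FAT-style) typically wraps cont_prepare_write() in
 * its ->prepare_write and passes a pointer to its "bytes allocated on
 * disk" counter, so the gap up to the new write gets zeroed and mapped.
 * Everything below is hypothetical: the examplefs_inode_info container,
 * EXAMPLEFS_I() and the reuse of examplefs_get_block from the earlier
 * sketch.
 */
struct examplefs_inode_info {
	loff_t		alloc_bytes;	/* bytes allocated on disk so far */
	struct inode	vfs_inode;
};

static inline struct examplefs_inode_info *EXAMPLEFS_I(struct inode *inode)
{
	return container_of(inode, struct examplefs_inode_info, vfs_inode);
}

static int examplefs_cont_prepare_write(struct file *file, struct page *page,
					unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	return cont_prepare_write(page, from, to, examplefs_get_block,
				  &EXAMPLEFS_I(inode)->alloc_bytes);
}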
    
    int block_prepare_write(struct page *page, unsigned from, unsigned to,
    			get_block_t *get_block)
    {
    	struct inode *inode = page->mapping->host;
    	int err = __block_prepare_write(inode, page, from, to, get_block);
    	if (err)
    		ClearPageUptodate(page);
    	return err;
    }
    
    int block_commit_write(struct page *page, unsigned from, unsigned to)
    {
    	struct inode *inode = page->mapping->host;
    	__block_commit_write(inode,page,from,to);
    	return 0;
    }
    
    int generic_commit_write(struct file *file, struct page *page,
    		unsigned from, unsigned to)
    {
    	struct inode *inode = page->mapping->host;
    	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
    	__block_commit_write(inode,page,from,to);
    	/*
    	 * No need to use i_size_read() here, the i_size
    	 * cannot change under us because we hold i_sem.
    	 */
    	if (pos > inode->i_size) {
    		i_size_write(inode, pos);
    		mark_inode_dirty(inode);
    	}
    	return 0;
    }
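
/*
 * Sketch, not part of the original file: the classic pairing used by
 * many block-based filesystems is block_prepare_write() for
 * ->prepare_write and generic_commit_write() for ->commit_write.  The
 * examplefs_* names are hypothetical; examplefs_get_block and
 * examplefs_readpage come from the earlier sketch.
 */
static int examplefs_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
	.readpage	= examplefs_readpage,
	.prepare_write	= examplefs_prepare_write,
	.commit_write	= generic_commit_write,
};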
    
    
    /*
     * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
     * immediately, while under the page lock.  So it needs a special end_io
     * handler which does not touch the bh after unlocking it.
     *
     * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
     * hashing after unlocking the buffer, so it doesn't actually touch the bh
     * itself.
     */
    static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
    {
    	if (uptodate) {
    		set_buffer_uptodate(bh);
    	} else {
    		/* This happens, due to failed READA attempts. */
    		clear_buffer_uptodate(bh);
    	}
    	unlock_buffer(bh);
    }
    
    /*
     * On entry, the page is fully not uptodate.
     * On exit the page is fully uptodate in the areas outside (from,to)
     */
    int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
    			get_block_t *get_block)
    {
    	struct inode *inode = page->mapping->host;
    	const unsigned blkbits = inode->i_blkbits;
    	const unsigned blocksize = 1 << blkbits;
    	struct buffer_head map_bh;
    	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
    	unsigned block_in_page;
    	unsigned block_start;
    	sector_t block_in_file;
    	char *kaddr;
    	int nr_reads = 0;
    	int i;
    	int ret = 0;
    	int is_mapped_to_disk = 1;
    	int dirtied_it = 0;
    
    	if (PageMappedToDisk(page))
    		return 0;
    
    	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
    	map_bh.b_page = page;
    
    	/*
    	 * We loop across all blocks in the page, whether or not they are
    	 * part of the affected region.  This is so we can discover if the
    	 * page is fully mapped-to-disk.
    	 */
    	for (block_start = 0, block_in_page = 0;
    		  block_start < PAGE_CACHE_SIZE;
    		  block_in_page++, block_start += blocksize) {
    		unsigned block_end = block_start + blocksize;
    		int create;
    
    		map_bh.b_state = 0;
    		create = 1;
    		if (block_start >= to)
    			create = 0;
    		ret = get_block(inode, block_in_file + block_in_page,
    					&map_bh, create);
    		if (ret)
    			goto failed;
    		if (!buffer_mapped(&map_bh))
    			is_mapped_to_disk = 0;
    		if (buffer_new(&map_bh))
    			unmap_underlying_metadata(map_bh.b_bdev,
    							map_bh.b_blocknr);
    		if (PageUptodate(page))
    			continue;
    		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
    			kaddr = kmap_atomic(page, KM_USER0);
    			if (block_start < from) {
    				memset(kaddr+block_start, 0, from-block_start);
    				dirtied_it = 1;
    			}
    			if (block_end > to) {
    				memset(kaddr + to, 0, block_end - to);
    				dirtied_it = 1;
    			}
    			flush_dcache_page(page);
    			kunmap_atomic(kaddr, KM_USER0);
    			continue;
    		}
    		if (buffer_uptodate(&map_bh))
    			continue;	/* reiserfs does this */
    		if (block_start < from || block_end > to) {
    			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
    
    			if (!bh) {
    				ret = -ENOMEM;
    				goto failed;
    			}
    			bh->b_state = map_bh.b_state;
    			atomic_set(&bh->b_count, 0);
    			bh->b_this_page = NULL;
    			bh->b_page = page;
    			bh->b_blocknr = map_bh.b_blocknr;
    			bh->b_size = blocksize;
    			bh->b_data = (char *)(long)block_start;
    			bh->b_bdev = map_bh.b_bdev;
    			bh->b_private = NULL;
    			read_bh[nr_reads++] = bh;
    		}
    	}
    
    	if (nr_reads) {
    		struct buffer_head *bh;
    
    		/*
    		 * The page is locked, so these buffers are protected from
    		 * any VM or truncate activity.  Hence we don't need to care
    		 * for the buffer_head refcounts.
    		 */
    		for (i = 0; i < nr_reads; i++) {
    			bh = read_bh[i];
    			lock_buffer(bh);
    			bh->b_end_io = end_buffer_read_nobh;
    			submit_bh(READ, bh);
    		}
    		for (i = 0; i < nr_reads; i++) {
    			bh = read_bh[i];
    			wait_on_buffer(bh);
    			if (!buffer_uptodate(bh))
    				ret = -EIO;
    			free_buffer_head(bh);
    			read_bh[i] = NULL;
    		}
    		if (ret)
    			goto failed;
    	}
    
    	if (is_mapped_to_disk)
    		SetPageMappedToDisk(page);
    	SetPageUptodate(page);
    
    	/*
    	 * Setting the page dirty here isn't necessary for the prepare_write
    	 * function - commit_write will do that.  But if/when this function is
    	 * used within the pagefault handler to ensure that all mmapped pages
    	 * have backing space in the filesystem, we will need to dirty the page
    	 * if its contents were altered.
    	 */
    	if (dirtied_it)
    		set_page_dirty(page);
    
    	return 0;
    
    failed:
    	for (i = 0; i < nr_reads; i++) {
    		if (read_bh[i])
    			free_buffer_head(read_bh[i]);
    	}
    
    	/*
    	 * Error recovery is pretty slack.  Clear the page and mark it dirty
    	 * so we'll later zero out any blocks which _were_ allocated.
    	 */
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr, 0, PAGE_CACHE_SIZE);
    	kunmap_atomic(kaddr, KM_USER0);
    	SetPageUptodate(page);
    	set_page_dirty(page);
    	return ret;
    }
    EXPORT_SYMBOL(nobh_prepare_write);
    
    int nobh_commit_write(struct file *file, struct page *page,
    		unsigned from, unsigned to)
    {
    	struct inode *inode = page->mapping->host;
    	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
    
    	set_page_dirty(page);
    	if (pos > inode->i_size) {
    		i_size_write(inode, pos);
    		mark_inode_dirty(inode);
    	}
    	return 0;
    }
    EXPORT_SYMBOL(nobh_commit_write);
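
/*
 * Sketch, not part of the original file: an address_space that wants to
 * avoid attaching buffer_heads on the write path pairs the two nobh
 * helpers above.  The examplefs_* names are hypothetical;
 * examplefs_get_block and examplefs_readpage come from the earlier
 * sketch.
 */
static int examplefs_nobh_prepare_write(struct file *file, struct page *page,
					unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_nobh_aops = {
	.readpage	= examplefs_readpage,
	.prepare_write	= examplefs_nobh_prepare_write,
	.commit_write	= nobh_commit_write,
};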
    
    /*
 * nobh_writepage() - based on block_write_full_page() except
     * that it tries to operate without attaching bufferheads to
     * the page.
     */
    int nobh_writepage(struct page *page, get_block_t *get_block,
    			struct writeback_control *wbc)
    {
    	struct inode * const inode = page->mapping->host;
    	loff_t i_size = i_size_read(inode);
    	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
    	unsigned offset;
    	void *kaddr;
    	int ret;
    
    	/* Is the page fully inside i_size? */
    	if (page->index < end_index)
    		goto out;
    
    	/* Is the page fully outside i_size? (truncate in progress) */
    	offset = i_size & (PAGE_CACHE_SIZE-1);
    	if (page->index >= end_index+1 || !offset) {
    		/*
    		 * The page may have dirty, unmapped buffers.  For example,
    		 * they may have been added in ext3_writepage().  Make them
    		 * freeable here, so the page does not leak.
    		 */
    #if 0
    		/* Not really sure about this  - do we need this ? */
    		if (page->mapping->a_ops->invalidatepage)
    			page->mapping->a_ops->invalidatepage(page, offset);
    #endif
    		unlock_page(page);
    		return 0; /* don't care */
    	}
    
    	/*
    	 * The page straddles i_size.  It must be zeroed out on each and every
    	 * writepage invocation because it may be mmapped.  "A file is mapped
    	 * in multiples of the page size.  For a file that is not a multiple of
    	 * the  page size, the remaining memory is zeroed when mapped, and
    	 * writes to that region are not written out to the file."
    	 */
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
    	flush_dcache_page(page);
    	kunmap_atomic(kaddr, KM_USER0);
    out:
    	ret = mpage_writepage(page, get_block, wbc);
    	if (ret == -EAGAIN)
    		ret = __block_write_full_page(inode, page, get_block, wbc);
    	return ret;
    }
    EXPORT_SYMBOL(nobh_writepage);
    
    /*
     * This function assumes that ->prepare_write() uses nobh_prepare_write().
     */
    int nobh_truncate_page(struct address_space *mapping, loff_t from)
    {
    	struct inode *inode = mapping->host;
    	unsigned blocksize = 1 << inode->i_blkbits;
    	pgoff_t index = from >> PAGE_CACHE_SHIFT;
    	unsigned offset = from & (PAGE_CACHE_SIZE-1);
    	unsigned to;
    	struct page *page;
    	struct address_space_operations *a_ops = mapping->a_ops;
    	char *kaddr;
    	int ret = 0;
    
    	if ((offset & (blocksize - 1)) == 0)
    		goto out;
    
    	ret = -ENOMEM;
    	page = grab_cache_page(mapping, index);
    	if (!page)
    		goto out;
    
    	to = (offset + blocksize) & ~(blocksize - 1);
    	ret = a_ops->prepare_write(NULL, page, offset, to);
    	if (ret == 0) {
    		kaddr = kmap_atomic(page, KM_USER0);
    		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
    		flush_dcache_page(page);
    		kunmap_atomic(kaddr, KM_USER0);
    		set_page_dirty(page);
    	}
    	unlock_page(page);
    	page_cache_release(page);
    out:
    	return ret;
    }
    EXPORT_SYMBOL(nobh_truncate_page);
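
/*
 * Usage illustration, not part of the original file: a filesystem whose
 * ->prepare_write is nobh_prepare_write() would zero the partial tail
 * block from its ->truncate roughly like this (hypothetical helper).
 */
static void examplefs_truncate(struct inode *inode)
{
	nobh_truncate_page(inode->i_mapping, i_size_read(inode));
	/* ... filesystem-specific block freeing would follow here ... */
}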
    
    int block_truncate_page(struct address_space *mapping,
    			loff_t from, get_block_t *get_block)
    {
    	pgoff_t index = from >> PAGE_CACHE_SHIFT;
    	unsigned offset = from & (PAGE_CACHE_SIZE-1);
    	unsigned blocksize;
    	pgoff_t iblock;
    	unsigned length, pos;
    	struct inode *inode = mapping->host;
    	struct page *page;
    	struct buffer_head *bh;
    	void *kaddr;
    	int err;
    
    	blocksize = 1 << inode->i_blkbits;
    	length = offset & (blocksize - 1);
    
    	/* Block boundary? Nothing to do */
    	if (!length)
    		return 0;
    
    	length = blocksize - length;
    	iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    	
    	page = grab_cache_page(mapping, index);
    	err = -ENOMEM;
    	if (!page)
    		goto out;
    
    	if (!page_has_buffers(page))
    		create_empty_buffers(page, blocksize, 0);
    
    	/* Find the buffer that contains "offset" */
    	bh = page_buffers(page);
    	pos = blocksize;
    	while (offset >= pos) {
    		bh = bh->b_this_page;
    		iblock++;
    		pos += blocksize;
    	}
    
    	err = 0;
    	if (!buffer_mapped(bh)) {
    		err = get_block(inode, iblock, bh, 0);
    		if (err)
    			goto unlock;
    		/* unmapped? It's a hole - nothing to do */
    		if (!buffer_mapped(bh))
    			goto unlock;
    	}
    
    	/* Ok, it's mapped. Make sure it's up-to-date */
    	if (PageUptodate(page))
    		set_buffer_uptodate(bh);
    
    	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
    		err = -EIO;
    		ll_rw_block(READ, 1, &bh);
    		wait_on_buffer(bh);
    		/* Uhhuh. Read error. Complain and punt. */
    		if (!buffer_uptodate(bh))
    			goto unlock;
    	}
    
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr + offset, 0, length);
    	flush_dcache_page(page);
    	kunmap_atomic(kaddr, KM_USER0);
    
    	mark_buffer_dirty(bh);
    	err = 0;
    
    unlock:
    	unlock_page(page);
    	page_cache_release(page);
    out:
    	return err;
    }
    
    /*
     * The generic ->writepage function for buffer-backed address_spaces
     */
    int block_write_full_page(struct page *page, get_block_t *get_block,
    			struct writeback_control *wbc)
    {
    	struct inode * const inode = page->mapping->host;
    	loff_t i_size = i_size_read(inode);
    	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
    	unsigned offset;
    	void *kaddr;
    
    	/* Is the page fully inside i_size? */
    	if (page->index < end_index)
    		return __block_write_full_page(inode, page, get_block, wbc);
    
    	/* Is the page fully outside i_size? (truncate in progress) */
    	offset = i_size & (PAGE_CACHE_SIZE-1);
    	if (page->index >= end_index+1 || !offset) {
    		/*
    		 * The page may have dirty, unmapped buffers.  For example,
    		 * they may have been added in ext3_writepage().  Make them
    		 * freeable here, so the page does not leak.
		 */
		do_invalidatepage(page, 0);
    		unlock_page(page);
    		return 0; /* don't care */
    	}
    
    	/*
    	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
    	 * in multiples of the page size.  For a file that is not a multiple of
    	 * the  page size, the remaining memory is zeroed when mapped, and
    	 * writes to that region are not written out to the file."
    	 */
    	kaddr = kmap_atomic(page, KM_USER0);
    	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
    	flush_dcache_page(page);
    	kunmap_atomic(kaddr, KM_USER0);
    	return __block_write_full_page(inode, page, get_block, wbc);
    }
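
/*
 * Sketch, not part of the original file: a typical buffer-backed
 * ->writepage simply forwards to block_write_full_page() with the
 * filesystem's get_block.  examplefs_get_block is the hypothetical
 * callback from the earlier sketch.
 */
static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}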
    
    sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
    			    get_block_t *get_block)
    {
    	struct buffer_head tmp;
    	struct inode *inode = mapping->host;
    	tmp.b_state = 0;
    	tmp.b_blocknr = 0;
    	get_block(inode, block, &tmp, 0);
    	return tmp.b_blocknr;
    }
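
/*
 * Sketch, not part of the original file: ->bmap is usually just a thin
 * wrapper around generic_block_bmap().  examplefs_get_block is the
 * hypothetical callback from the earlier sketch.
 */
static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, examplefs_get_block);
}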
    
    static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
    {
    	struct buffer_head *bh = bio->bi_private;
    
    	if (bio->bi_size)
    		return 1;
    
    	if (err == -EOPNOTSUPP) {
    		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
    		set_bit(BH_Eopnotsupp, &bh->b_state);
    	}
    
    	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
    	bio_put(bio);
    	return 0;
    }
    
    int submit_bh(int rw, struct buffer_head * bh)
    {
    	struct bio *bio;
    	int ret = 0;
    
    	BUG_ON(!buffer_locked(bh));
    	BUG_ON(!buffer_mapped(bh));
    	BUG_ON(!bh->b_end_io);
    
    	if (buffer_ordered(bh) && (rw == WRITE))
    		rw = WRITE_BARRIER;
    
    	/*
    	 * Only clear out a write error when rewriting, should this
    	 * include WRITE_SYNC as well?
    	 */
    	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
    		clear_buffer_write_io_error(bh);
    
    	/*
    	 * from here on down, it's all bio -- do the initial mapping,
    	 * submit_bio -> generic_make_request may further map this bio around
    	 */
    	bio = bio_alloc(GFP_NOIO, 1);
    
    	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
    	bio->bi_bdev = bh->b_bdev;
    	bio->bi_io_vec[0].bv_page = bh->b_page;
    	bio->bi_io_vec[0].bv_len = bh->b_size;
    	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
    
    	bio->bi_vcnt = 1;
    	bio->bi_idx = 0;
    	bio->bi_size = bh->b_size;
    
    	bio->bi_end_io = end_bio_bh_io_sync;
    	bio->bi_private = bh;
    
    	bio_get(bio);
    	submit_bio(rw, bio);
    
    	if (bio_flagged(bio, BIO_EOPNOTSUPP))
    		ret = -EOPNOTSUPP;
    
    	bio_put(bio);
    	return ret;
    }
    
/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 * %SWRITE is like %WRITE only we make sure that the *current* data in the
 * buffers are sent to disk. The fourth %READA option is described in the
 * documentation for generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
 * clean when doing a write request, and any buffer that appears to be
 * up-to-date when doing a read request.  Further it marks as clean buffers
 * that are processed for writing (the buffer cache won't assume that they
 * are actually clean until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
    void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
    {
    	int i;
    
    	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (rw == SWRITE)
			lock_buffer(bh);
		else if (test_set_buffer_locked(bh))
			continue;

		get_bh(bh);
		if (rw == WRITE || rw == SWRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);
		put_bh(bh);
	}
    }
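
/*
 * Usage illustration, not part of the original file: the common pattern
 * is to kick off readahead on a batch of buffers with ll_rw_block() and
 * then read-and-wait on the one that is actually needed, much as
 * block_truncate_page() above does for a single buffer.
 * example_read_block is hypothetical.
 */
static int example_read_block(struct buffer_head *bh,
			      struct buffer_head *ra_bhs[], int nr_ra)
{
	if (nr_ra > 0)
		ll_rw_block(READA, nr_ra, ra_bhs);	/* best-effort readahead */
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}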
    
    /*
     * For a data-integrity writeout, we need to wait upon any in-progress I/O
     * and then start new I/O and then wait upon it.  The caller must have a ref on
     * the buffer_head.
     */
    int sync_dirty_buffer(struct buffer_head *bh)
    {
    	int ret = 0;
    
    	WARN_ON(atomic_read(&bh->b_count) < 1);
    	lock_buffer(bh);
    	if (test_clear_buffer_dirty(bh)) {
    		get_bh(bh);
    		bh->b_end_io = end_buffer_write_sync;
    		ret = submit_bh(WRITE, bh);
    		wait_on_buffer(bh);
    		if (buffer_eopnotsupp(bh)) {
    			clear_buffer_eopnotsupp(bh);
    			ret = -EOPNOTSUPP;
    		}
    		if (!ret && !buffer_uptodate(bh))
    			ret = -EIO;
    	} else {
    		unlock_buffer(bh);
    	}
    	return ret;
    }
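
/*
 * Usage illustration, not part of the original file: dirty a single
 * metadata buffer and force it to disk before continuing.  The block
 * number and the modification step are placeholders.
 */
static int example_sync_one_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, blocknr);
	if (!bh)
		return -EIO;
	/* ... modify bh->b_data here ... */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);	/* waits for the write to finish */
	brelse(bh);
	return err;
}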
    
    /*
     * try_to_free_buffers() checks if all the buffers on this particular page
     * are unused, and releases them if so.
     *
     * Exclusion against try_to_free_buffers may be obtained by either
     * locking the page or by holding its mapping's private_lock.
     *
     * If the page is dirty but all the buffers are clean then we need to
     * be sure to mark the page clean as well.  This is because the page
     * may be against a block device, and a later reattachment of buffers
     * to a dirty page will set *all* buffers dirty.  Which would corrupt
     * filesystem data on the same device.
     *
     * The same applies to regular filesystem pages: if all the buffers are
     * clean then we set the page clean and proceed.  To do that, we require
     * total exclusion from __set_page_dirty_buffers().  That is obtained with
     * private_lock.
     *
     * try_to_free_buffers() is non-blocking.
     */
    static inline int buffer_busy(struct buffer_head *bh)
    {
    	return atomic_read(&bh->b_count) |
    		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
    }
    
    static int
    drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
    {
    	struct buffer_head *head = page_buffers(page);
    	struct buffer_head *bh;
    
    	bh = head;
	do {
		if (buffer_write_io_error(bh) && page->mapping)
			set_bit(AS_EIO, &page->mapping->flags);
    		if (buffer_busy(bh))
    			goto failed;
    		bh = bh->b_this_page;
    	} while (bh != head);
    
    	do {
    		struct buffer_head *next = bh->b_this_page;
    
    		if (!list_empty(&bh->b_assoc_buffers))
    			__remove_assoc_queue(bh);
    		bh = next;
    	} while (bh != head);
    	*buffers_to_free = head;
    	__clear_page_buffers(page);
    	return 1;
    failed:
    	return 0;
    }
    
    int try_to_free_buffers(struct page *page)
    {
    	struct address_space * const mapping = page->mapping;
    	struct buffer_head *buffers_to_free = NULL;
    	int ret = 0;
    
    	BUG_ON(!PageLocked(page));
    	if (PageWriteback(page))
    		return 0;
    
    	if (mapping == NULL) {		/* can this still happen? */
    		ret = drop_buffers(page, &buffers_to_free);
    		goto out;
    	}
    
    	spin_lock(&mapping->private_lock);
    	ret = drop_buffers(page, &buffers_to_free);
    	if (ret) {
    		/*
    		 * If the filesystem writes its buffers by hand (eg ext3)
    		 * then we can have clean buffers against a dirty page.  We
    		 * clean the page here; otherwise later reattachment of buffers
    		 * could encounter a non-uptodate page, which is unresolvable.
    		 * This only applies in the rare case where try_to_free_buffers
    		 * succeeds but the page is not freed.
    		 */
    		clear_page_dirty(page);
    	}
    	spin_unlock(&mapping->private_lock);
    out:
    	if (buffers_to_free) {
    		struct buffer_head *bh = buffers_to_free;
    
    		do {
    			struct buffer_head *next = bh->b_this_page;
    			free_buffer_head(bh);
    			bh = next;
    		} while (bh != buffers_to_free);
    	}
    	return ret;
    }
    EXPORT_SYMBOL(try_to_free_buffers);
    
    int block_sync_page(struct page *page)
    {
    	struct address_space *mapping;
    
    	smp_mb();
    	mapping = page_mapping(page);
    	if (mapping)
    		blk_run_backing_dev(mapping->backing_dev_info, page);
    	return 0;
    }
    
    /*
     * There are no bdflush tunables left.  But distributions are