Newer
Older
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <linux/radix-tree.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "async-thread.h"
#include "tree-log.h"
#include "check-integrity.h"
#include "dev-replace.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
#endif
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
/*
* end_io_wq structs are used to do processing in task context when an IO is
* complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
struct end_io_wq {
struct bio *bio;
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
int error;
struct list_head list;
struct btrfs_work work;
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
struct inode *inode;
struct bio *bio;
struct list_head list;
extent_submit_bio_hook_t *submit_bio_start;
extent_submit_bio_hook_t *submit_bio_done;
/*
* bio_offset is optional, can be used if the pages in the bio
* can't tell us where in the file the bio should go
*/
u64 bio_offset;
struct btrfs_work work;
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
* eb, the lockdep key is determined by the btrfs_root it belongs to and
* the level the eb occupies in the tree.
*
* Different roots are used for different purposes and may nest inside each
* other and they require separate keysets. As lockdep keys should be
* static, assign keysets according to the purpose of the root as indicated
* by btrfs_root->objectid. This ensures that all special purpose roots
* have separate keysets.
* Lock-nesting across peer nodes is always done with the immediate parent
* node locked thus preventing deadlock. As lockdep doesn't know this, use
* subclass to avoid triggering lockdep warning in such cases.
* The key is set by the readpage_end_io_hook after the buffer has passed
* csum validation but before the pages are unlocked. It is also set by
* btrfs_init_new_buffer on freshly allocated blocks.
* We also add a check to make sure the highest level of the tree is the
* same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
* needs update as well.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
# error
# endif
static struct btrfs_lockdep_keyset {
u64 id; /* root objectid */
const char *name_stem; /* lock name stem */
char names[BTRFS_MAX_LEVEL + 1][20];
struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
} btrfs_lockdep_keysets[] = {
{ .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
{ .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
{ .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
{ .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
{ .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
{ .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
{ .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
{ .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
{ .id = 0, .name_stem = "tree" },
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
/*
 * Fill in the lockdep class name of every (keyset, level) pair in
 * btrfs_lockdep_keysets.  Each name has the form "btrfs-<stem>-<level>",
 * with the level printed as two digits.
 */
void __init btrfs_init_lockdep(void)
{
	struct btrfs_lockdep_keyset *ks;
	int set, level;

	for (set = 0; set < ARRAY_SIZE(btrfs_lockdep_keysets); set++) {
		ks = &btrfs_lockdep_keysets[set];

		/* one name per level; snprintf bounds the write to the slot */
		for (level = 0; level < ARRAY_SIZE(ks->names); level++) {
			char *name = ks->names[level];

			snprintf(name, sizeof(ks->names[level]),
				 "btrfs-%s-%02d", ks->name_stem, level);
		}
	}
}
/*
 * Set the lockdep class and name of eb->lock from the keyset that matches
 * objectid and from the level of the buffer.
 *
 * A level that does not fit in the keys[] array trips BUG_ON.  When no
 * keyset has a matching id, the scan stops on the entry whose id is 0 and
 * that entry is used as the default.
 */
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				    int level)
{
	struct btrfs_lockdep_keyset *ks = btrfs_lockdep_keysets;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* walk until we hit the requested id or the id-0 default entry */
	while (ks->id && ks->id != objectid)
		ks++;

	lockdep_set_class_and_name(&eb->lock,
				   &ks->keys[level], ks->names[level]);
}
/*
* extents on the btree inode are pretty simple, there's one extent
* that covers the entire device
*/
static struct extent_map *btree_get_extent(struct inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret;
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
em->bdev =
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
if (!em) {
em = ERR_PTR(-ENOMEM);
goto out;
}
em->start = 0;
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
free_extent_map(em);
em = lookup_extent_mapping(em_tree, start, len);
em = ERR_PTR(-EIO);
} else if (ret) {
free_extent_map(em);
em = ERR_PTR(ret);
out:
return em;
/*
 * Fold len bytes starting at data into the running crc32c value seed and
 * return the updated value.  root is not used by this function.
 */
u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
{
	u32 crc = crc32c(seed, data, len);

	return crc;
}
void btrfs_csum_final(u32 crc, char *result)
{
put_unaligned_le32(~crc, result);
/*
* compute the csum for a btree block, and either verify it or write it
* into the csum field of the block.
*/
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
char *kaddr;
unsigned long map_start;
unsigned long map_len;
int err;
u32 crc = ~(u32)0;
unsigned long inline_result;
err = map_private_extent_buffer(buf, offset, 32,
&kaddr, &map_start, &map_len);
return 1;
cur_len = min(len, map_len - (offset - map_start));
crc = btrfs_csum_data(root, kaddr + offset - map_start,
crc, cur_len);
len -= cur_len;
offset += cur_len;
}
if (csum_size > sizeof(inline_result)) {
result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
if (!result)
return 1;
} else {
result = (char *)&inline_result;
}
btrfs_csum_final(crc, result);
if (verify) {
if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
u32 val;
u32 found = 0;
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
"failed on %llu wanted %X found %X "
"level %d\n",
root->fs_info->sb->s_id,
(unsigned long long)buf->start, val, found,
btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
write_extent_buffer(buf, result, 0, csum_size);
if (result != (char *)&inline_result)
kfree(result);
/*
* we can't consider a given block up to date unless the transid of the
* block matches the transid in the parent node's pointer. This is how we
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
static int verify_parent_transid(struct extent_io_tree *io_tree,
struct extent_buffer *eb, u64 parent_transid,
int atomic)
struct extent_state *cached_state = NULL;
int ret;
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
if (atomic)
return -EAGAIN;
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
printk_ratelimited("parent transid verify failed on %llu wanted %llu "
"found %llu\n",
(unsigned long long)eb->start,
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
clear_extent_buffer_uptodate(eb);
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*/
static int btree_read_extent_buffer_pages(struct btrfs_root *root,
struct extent_buffer *eb,
u64 start, u64 parent_transid)
{
struct extent_io_tree *io_tree;
int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
int failed_mirror = 0;
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
if (!ret) {
if (!verify_parent_transid(io_tree, eb,
parent_transid, 0))
break;
else
ret = -EIO;
}
/*
* This buffer's crc is fine, but its contents are corrupted, so
* there is no reason to read the other copies, they won't be
* any less wrong.
*/
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
num_copies = btrfs_num_copies(root->fs_info,
eb->start, eb->len);
if (!failed_mirror) {
failed = 1;
failed_mirror = eb->read_mirror;
}
mirror_num++;
if (mirror_num == failed_mirror)
mirror_num++;
if (failed && !ret && failed_mirror)
repair_eb_io_failure(root, eb, failed_mirror);
return ret;
* checksum a dirty tree block before IO. This has extra checks to make sure
* we only fill in the checksum field in the first page of a multi-page block
static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
struct extent_io_tree *tree;
u64 found_start;
struct extent_buffer *eb;
tree = &BTRFS_I(page->mapping->host)->io_tree;
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
return 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
}
if (!PageUptodate(page)) {
WARN_ON(1);
}
csum_tree_block(root, eb, 0);
return 0;
}
/*
 * Check that the fsid recorded in the header of eb belongs to this
 * filesystem.
 *
 * The header fsid is compared against root->fs_info->fs_devices and then
 * against every entry reached by following the ->seed chain.
 *
 * Returns 0 when a matching fsid is found, 1 otherwise.
 */
static int check_tree_block_fsid(struct btrfs_root *root,
				 struct extent_buffer *eb)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	/* sized by the same constant that bounds the copy and compare below */
	u8 fsid[BTRFS_FSID_SIZE];
	int ret = 1;

	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
			   BTRFS_FSID_SIZE);

	for (; fs_devices; fs_devices = fs_devices->seed) {
		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
			ret = 0;
			break;
		}
	}
	return ret;
}
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
/*
 * Log a KERN_CRIT message about a corrupt leaf: the reason string, the block
 * number read from eb's header, the objectid of root and the slot index.
 * Arguments are parenthesized so that any expression can be passed safely.
 */
#define CORRUPT(reason, eb, root, slot)				\
	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
	       "root=%llu, slot=%d\n", (reason),			\
	       (unsigned long long)btrfs_header_bytenr(eb),	\
	       (unsigned long long)(root)->objectid, (slot))
static noinline int check_leaf(struct btrfs_root *root,
struct extent_buffer *leaf)
{
struct btrfs_key key;
struct btrfs_key leaf_key;
u32 nritems = btrfs_header_nritems(leaf);
int slot;
if (nritems == 0)
return 0;
/* Check the 0 item */
if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
BTRFS_LEAF_DATA_SIZE(root)) {
CORRUPT("invalid item offset size pair", leaf, root, 0);
return -EIO;
}
/*
* Check to make sure each items keys are in the correct order and their
* offsets make sense. We only have to loop through nritems-1 because
* we check the current slot against the next slot, which verifies the
* next slot's offset+size makes sense and that the current's slot
* offset is correct.
*/
for (slot = 0; slot < nritems - 1; slot++) {
btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
btrfs_item_key_to_cpu(leaf, &key, slot + 1);
/* Make sure the keys are in the right order */
if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
CORRUPT("bad key order", leaf, root, slot);
return -EIO;
}
/*
* Make sure the offset and ends are right, remember that the
* item data starts at the end of the leaf and grows towards the
* front.
*/
if (btrfs_item_offset_nr(leaf, slot) !=
btrfs_item_end_nr(leaf, slot + 1)) {
CORRUPT("slot offset bad", leaf, root, slot);
return -EIO;
}
/*
* Check to make sure that we don't point outside of the leaf,
* just incase all the items are consistent to eachother, but
* all point outside of the leaf.
*/
if (btrfs_item_end_nr(leaf, slot) >
BTRFS_LEAF_DATA_SIZE(root)) {
CORRUPT("slot end outside of leaf", leaf, root, slot);
return -EIO;
}
}
return 0;
}
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
/*
 * Look for the extent buffer that contains the given page.
 *
 * Starting at the page's offset, step backwards one PAGE_CACHE_SIZE at a
 * time, for at most max_walk bytes (clamped at offset 0), calling
 * find_extent_buffer() at each position.  The first buffer found is returned
 * if its [start, start + len) range covers the page's offset; otherwise it
 * is released with free_extent_buffer() and the search ends.
 *
 * Returns the covering extent buffer, or NULL when none is found.
 */
struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
				       struct page *page, int max_walk)
{
	const u64 target = page_offset(page);
	struct extent_buffer *eb;
	u64 lowest;
	u64 cur;

	/* do not walk below offset 0 */
	lowest = (target < max_walk) ? 0 : target - max_walk;

	for (cur = target; cur >= lowest; cur -= PAGE_CACHE_SIZE) {
		eb = find_extent_buffer(tree, cur, 0);
		if (eb) {
			/* hooray, this buffer covers our page */
			if (eb->start <= target &&
			    eb->start + eb->len > target)
				return eb;

			/* the buffer we ran into belongs to someone else */
			free_extent_buffer(eb);
			return NULL;
		}

		/* stop before the decrement would wrap around */
		if (cur == 0)
			break;
	}

	return NULL;
}
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror)
{
struct extent_io_tree *tree;
u64 found_start;
int found_level;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
if (!page->private)
goto out;
tree = &BTRFS_I(page->mapping->host)->io_tree;
eb = (struct extent_buffer *)page->private;
/* the pending IO might have been the only thing that kept this buffer
* in memory. Make sure we have a ref for all this other checks
*/
extent_buffer_get(eb);
reads_done = atomic_dec_and_test(&eb->io_pages);
if (!reads_done)
goto err;
eb->read_mirror = mirror;
if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n",
(unsigned long long)found_start,
(unsigned long long)eb->start);
printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
ret = csum_tree_block(root, eb, 1);
goto err;
}
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
if (found_level == 0 && check_leaf(root, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
if (!ret)
set_extent_buffer_uptodate(eb);
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, ret);
}
if (ret) {
/*
* our io error hook is going to dec the io pages
* again, we have to make sure it has something
* to decrement
*/
atomic_inc(&eb->io_pages);
clear_extent_buffer_uptodate(eb);
/*
 * Record a failed read on the extent buffer attached to page.
 *
 * Sets EXTENT_BUFFER_IOERR on the buffer, stores failed_mirror in
 * eb->read_mirror and, if the buffer was flagged EXTENT_BUFFER_READAHEAD,
 * clears that flag and passes -EIO to btree_readahead_hook().
 *
 * Always returns -EIO.
 */
static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct extent_buffer *eb = (struct extent_buffer *)page->private;

	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = failed_mirror;

	/* only readahead buffers need their hook notified */
	if (!test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		return -EIO;

	btree_readahead_hook(root, eb, eb->start, -EIO);

	/* nothing was repaired here */
	return -EIO;
}
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
fs_info = end_io_wq->info;
end_io_wq->error = err;
end_io_wq->work.func = end_workqueue_fn;
end_io_wq->work.flags = 0;
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
btrfs_queue_worker(&fs_info->endio_meta_write_workers,
&end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
btrfs_queue_worker(&fs_info->endio_freespace_worker,
&end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
btrfs_queue_worker(&fs_info->endio_raid56_workers,
&end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
btrfs_queue_worker(&fs_info->endio_raid56_workers,
&end_io_wq->work);
else if (end_io_wq->metadata)
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_workers,
&end_io_wq->work);
}
/*
* For the metadata arg you want
*
* 0 - if data
* 1 - if normal metadata
* 2 - if writing to the free space cache area
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata)
struct end_io_wq *end_io_wq;
end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
if (!end_io_wq)
return -ENOMEM;
end_io_wq->private = bio->bi_private;
end_io_wq->end_io = bio->bi_end_io;
end_io_wq->info = info;
end_io_wq->error = 0;
end_io_wq->bio = bio;
end_io_wq->metadata = metadata;
bio->bi_private = end_io_wq;
bio->bi_end_io = end_workqueue_bio;
return 0;
}
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
unsigned long limit = min_t(unsigned long,
info->workers.max_workers,
info->fs_devices->open_devices);
return 256 * limit;
}
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
async = container_of(work, struct async_submit_bio, work);
ret = async->submit_bio_start(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
async->error = ret;
}
static void run_one_async_done(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info;
struct async_submit_bio *async;
int limit;
async = container_of(work, struct async_submit_bio, work);
fs_info = BTRFS_I(async->inode)->root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
/* If an error occured we just want to clean up the bio and move on */
if (async->error) {
bio_endio(async->bio, async->error);
return;
}
async->submit_bio_done(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
}
/*
 * Release the async_submit_bio that embeds the given work item.
 */
static void run_one_async_free(struct btrfs_work *work)
{
	kfree(container_of(work, struct async_submit_bio, work));
}
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done)
{
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
return -ENOMEM;
async->inode = inode;
async->rw = rw;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
async->work.func = run_one_async_start;
async->work.ordered_func = run_one_async_done;
async->work.ordered_free = run_one_async_free;
async->work.flags = 0;
async->bio_offset = bio_offset;
async->error = 0;

Chris Mason
committed
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
btrfs_set_work_high_prio(&async->work);
btrfs_queue_worker(&fs_info->workers, &async->work);
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
wait_event(fs_info->async_submit_wait,
(atomic_read(&fs_info->nr_async_submits) == 0));
}
static int btree_csum_one_bio(struct bio *bio)
{
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
struct btrfs_root *root;
WARN_ON(bio->bi_vcnt <= 0);
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root, bvec->bv_page);
if (ret)
break;
bio_index++;
bvec++;
}
static int __btree_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
unsigned long bio_flags,
u64 bio_offset)
/*
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
return btree_csum_one_bio(bio);
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
if (ret)
bio_endio(bio, ret);
return ret;
/*
 * Decide whether a btree write should go through the async submit path.
 *
 * Returns 0 when bio_flags carries EXTENT_BIO_TREE_LOG, or, on CONFIG_X86
 * builds, when cpu_has_xmm4_2 is set; returns 1 in every other case.
 * inode is not used by this function.
 *
 * NOTE(review): presumably the SSE4.2 case is skipped because checksumming
 * is cheap enough to do inline there -- confirm.
 */
static int check_async_write(struct inode *inode, unsigned long bio_flags)
{
	int async = 1;

	if (bio_flags & EXTENT_BIO_TREE_LOG)
		async = 0;
#ifdef CONFIG_X86
	else if (cpu_has_xmm4_2)
		async = 0;
#endif
	return async;
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
int async = check_async_write(inode, bio_flags);
if (!(rw & REQ_WRITE)) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
bio, 1);
goto out_w_error;
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num, 0,
bio_offset,
__btree_submit_bio_start,
__btree_submit_bio_done);
if (ret) {
out_w_error:
bio_endio(bio, ret);
}
return ret;
static int btree_migratepage(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
* we haven't done the locking hook
*/
if (PageDirty(page))
return -EAGAIN;
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
return migrate_page(mapping, newpage, page, mode);
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_io_tree *tree;
struct btrfs_fs_info *fs_info;
int ret;
tree = &BTRFS_I(mapping->host)->io_tree;

Chris Mason
committed
if (wbc->sync_mode == WB_SYNC_NONE) {
if (wbc->for_kupdate)
return 0;
fs_info = BTRFS_I(mapping->host)->root->fs_info;
/* this is a bit racy, but that's ok */
ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
BTRFS_DIRTY_METADATA_THRESH);
if (ret < 0)
return btree_write_cache_pages(mapping, wbc);
static int btree_readpage(struct file *file, struct page *page)
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
return extent_read_full_page(tree, page, btree_get_extent, 0);
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
if (PageWriteback(page) || PageDirty(page))
/*
* We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
* slab allocation from alloc_extent_state down the callchain where
* it'd hit a BUG_ON as those flags are not allowed.
*/