Skip to content
Snippets Groups Projects
check-integrity.c 103 KiB
Newer Older
/*
 * Copyright (C) STRATO AG 2011.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

/*
 * This module can be used to catch cases when the btrfs kernel
 * code executes write requests to the disk that bring the file
 * system in an inconsistent state. In such a state, a power-loss
 * or kernel panic event would cause that the data on disk is
 * lost or at least damaged.
 *
 * Code is added that examines all block write requests during
 * runtime (including writes of the super block). Three rules
 * are verified and an error is printed on violation of the
 * rules:
 * 1. It is not allowed to write a disk block which is
 *    currently referenced by the super block (either directly
 *    or indirectly).
 * 2. When a super block is written, it is verified that all
 *    referenced (directly or indirectly) blocks fulfill the
 *    following requirements:
 *    2a. All referenced blocks have either been present when
 *        the file system was mounted, (i.e., they have been
 *        referenced by the super block) or they have been
 *        written since then and the write completion callback
 *        was called and no write error was indicated and a
 *        FLUSH request to the device where these blocks are
 *        located was received and completed.
 *    2b. All referenced blocks need to have a generation
 *        number which is equal to the parent's number.
 *
 * One issue that was found using this module was that the log
 * tree on disk became temporarily corrupted because disk blocks
 * that had been in use for the log tree had been freed and
 * reused too early, while being referenced by the written super
 * block.
 *
 * The search term in the kernel log that can be used to filter
 * on the existence of detected integrity issues is
 * "btrfs: attempt".
 *
 * The integrity check is enabled via mount options. These
 * mount options are only supported if the integrity check
 * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
 *
 * Example #1, apply integrity checks to all metadata:
 * mount /dev/sdb1 /mnt -o check_int
 *
 * Example #2, apply integrity checks to all metadata and
 * to data extents:
 * mount /dev/sdb1 /mnt -o check_int_data
 *
 * Example #3, apply integrity checks to all metadata and dump
 * the tree that the super block references to kernel messages
 * each time after a super block was written:
 * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
 *
 * If the integrity check tool is included and activated in
 * the mount options, plenty of kernel memory is used, and
 * plenty of additional CPU cycles are spent. Enabling this
 * functionality is not intended for normal use. In most
 * cases, unless you are a btrfs developer who needs to verify
 * the integrity of (super)-block write requests, do not
 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
 * include and compile the integrity check tool.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/crc32c.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "extent_io.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "check-integrity.h"
#include "rcu-string.h"

#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
							 * excluding " [...]" */
#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)

/*
 * The definition of the bitmask fields for the print_mask.
 * They are specified with the mount option check_integrity_print_mask.
 */
#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE			0x00000001
#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION		0x00000002
#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE			0x00000004
#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE			0x00000008
#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH			0x00000010
#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH			0x00000020
#define BTRFSIC_PRINT_MASK_VERBOSE				0x00000040
#define BTRFSIC_PRINT_MASK_VERY_VERBOSE				0x00000080
#define BTRFSIC_PRINT_MASK_INITIAL_TREE				0x00000100
#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES			0x00000200
#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE			0x00000400
#define BTRFSIC_PRINT_MASK_NUM_COPIES				0x00000800
#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS		0x00001000

struct btrfsic_dev_state;
struct btrfsic_state;

struct btrfsic_block {
	u32 magic_num;		/* only used for debug purposes */
	unsigned int is_metadata:1;	/* if it is meta-data, not data-data */
	unsigned int is_superblock:1;	/* if it is one of the superblocks */
	unsigned int is_iodone:1;	/* if is done by lower subsystem */
	unsigned int iodone_w_error:1;	/* error was indicated to endio */
	unsigned int never_written:1;	/* block was added because it was
					 * referenced, not because it was
					 * written */
	unsigned int mirror_num;	/* large enough to hold
					 * BTRFS_SUPER_MIRROR_MAX */
	struct btrfsic_dev_state *dev_state;
	u64 dev_bytenr;		/* key, physical byte num on disk */
	u64 logical_bytenr;	/* logical byte num on disk */
	u64 generation;
	struct btrfs_disk_key disk_key;	/* extra info to print in case of
					 * issues, will not always be correct */
	struct list_head collision_resolving_node;	/* list node */
	struct list_head all_blocks_node;	/* list node */

	/* the following two lists contain block_link items */
	struct list_head ref_to_list;	/* list */
	struct list_head ref_from_list;	/* list */
	struct btrfsic_block *next_in_same_bio;
	void *orig_bio_bh_private;
	union {
		bio_end_io_t *bio;
		bh_end_io_t *bh;
	} orig_bio_bh_end_io;
	int submit_bio_bh_rw;
	u64 flush_gen; /* only valid if !never_written */
};

/*
 * Elements of this type are allocated dynamically and required because
 * each block object can refer to and can be ref from multiple blocks.
 * The key to lookup them in the hashtable is the dev_bytenr of
 * the block ref to plus the one from the block refered from.
 * The fact that they are searchable via a hashtable and that a
 * ref_cnt is maintained is not required for the btrfs integrity
 * check algorithm itself, it is only used to make the output more
 * beautiful in case that an error is detected (an error is defined
 * as a write operation to a block while that block is still referenced).
 */
struct btrfsic_block_link {
	u32 magic_num;		/* only used for debug purposes */
	u32 ref_cnt;
	struct list_head node_ref_to;	/* list node */
	struct list_head node_ref_from;	/* list node */
	struct list_head collision_resolving_node;	/* list node */
	struct btrfsic_block *block_ref_to;
	struct btrfsic_block *block_ref_from;
	u64 parent_generation;
};

struct btrfsic_dev_state {
	u32 magic_num;		/* only used for debug purposes */
	struct block_device *bdev;
	struct btrfsic_state *state;
	struct list_head collision_resolving_node;	/* list node */
	struct btrfsic_block dummy_block_for_bio_bh_flush;
	u64 last_flush_gen;
	char name[BDEVNAME_SIZE];
};

struct btrfsic_block_hashtable {
	struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
};

struct btrfsic_block_link_hashtable {
	struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
};

struct btrfsic_dev_state_hashtable {
	struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
};

struct btrfsic_block_data_ctx {
	u64 start;		/* virtual bytenr */
	u64 dev_bytenr;		/* physical bytenr on device */
	u32 len;
	struct btrfsic_dev_state *dev;
	char **datav;
	struct page **pagev;
	void *mem_to_free;
};

/* This structure is used to implement recursion without occupying
 * any stack space, refer to btrfsic_process_metablock() */
struct btrfsic_stack_frame {
	u32 magic;
	u32 nr;
	int error;
	int i;
	int limit_nesting;
	int num_copies;
	int mirror_num;
	struct btrfsic_block *block;
	struct btrfsic_block_data_ctx *block_ctx;
	struct btrfsic_block *next_block;
	struct btrfsic_block_data_ctx next_block_ctx;
	struct btrfs_header *hdr;
	struct btrfsic_stack_frame *prev;
};

/* Some state per mounted filesystem */
struct btrfsic_state {
	u32 print_mask;
	int include_extent_data;
	int csum_size;
	struct list_head all_blocks_list;
	struct btrfsic_block_hashtable block_hashtable;
	struct btrfsic_block_link_hashtable block_link_hashtable;
	struct btrfs_root *root;
	u64 max_superblock_generation;
	struct btrfsic_block *latest_superblock;
	u32 metablock_size;
	u32 datablock_size;
};

static void btrfsic_block_init(struct btrfsic_block *b);
static struct btrfsic_block *btrfsic_block_alloc(void);
static void btrfsic_block_free(struct btrfsic_block *b);
static void btrfsic_block_link_init(struct btrfsic_block_link *n);
static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
static void btrfsic_block_link_free(struct btrfsic_block_link *n);
static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
					struct btrfsic_block_hashtable *h);
static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
static struct btrfsic_block *btrfsic_block_hashtable_lookup(
		struct block_device *bdev,
		u64 dev_bytenr,
		struct btrfsic_block_hashtable *h);
static void btrfsic_block_link_hashtable_init(
		struct btrfsic_block_link_hashtable *h);
static void btrfsic_block_link_hashtable_add(
		struct btrfsic_block_link *l,
		struct btrfsic_block_link_hashtable *h);
static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
		struct block_device *bdev_ref_to,
		u64 dev_bytenr_ref_to,
		struct block_device *bdev_ref_from,
		u64 dev_bytenr_ref_from,
		struct btrfsic_block_link_hashtable *h);
static void btrfsic_dev_state_hashtable_init(
		struct btrfsic_dev_state_hashtable *h);
static void btrfsic_dev_state_hashtable_add(
		struct btrfsic_dev_state *ds,
		struct btrfsic_dev_state_hashtable *h);
static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
		struct block_device *bdev,
		struct btrfsic_dev_state_hashtable *h);
static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
static int btrfsic_process_superblock(struct btrfsic_state *state,
				      struct btrfs_fs_devices *fs_devices);
static int btrfsic_process_metablock(struct btrfsic_state *state,
				     struct btrfsic_block *block,
				     struct btrfsic_block_data_ctx *block_ctx,
				     int limit_nesting, int force_iodone_flag);
static void btrfsic_read_from_block_data(
	struct btrfsic_block_data_ctx *block_ctx,
	void *dst, u32 offset, size_t len);
static int btrfsic_create_link_to_next_block(
		struct btrfsic_state *state,
		struct btrfsic_block *block,
		struct btrfsic_block_data_ctx
		*block_ctx, u64 next_bytenr,
		int limit_nesting,
		struct btrfsic_block_data_ctx *next_block_ctx,
		struct btrfsic_block **next_blockp,
		int force_iodone_flag,
		int *num_copiesp, int *mirror_nump,
		struct btrfs_disk_key *disk_key,
		u64 parent_generation);
static int btrfsic_handle_extent_data(struct btrfsic_state *state,
				      struct btrfsic_block *block,
				      struct btrfsic_block_data_ctx *block_ctx,
				      u32 item_offset, int force_iodone_flag);
static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
			     struct btrfsic_block_data_ctx *block_ctx_out,
			     int mirror_num);
static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
				  u32 len, struct block_device *bdev,
				  struct btrfsic_block_data_ctx *block_ctx_out);
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
			      struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
				     char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
					  u64 dev_bytenr, char **mapped_datav,
					  unsigned int num_pages,
					  struct bio *bio, int *bio_is_patched,
					  struct buffer_head *bh,
					  int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
		struct btrfsic_state *state,
		struct btrfsic_block *const block,
		struct btrfs_super_block *const super_hdr);
static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
					      const struct btrfsic_block *block,
					      int recursion_level);
static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
					struct btrfsic_block *const block,
					int recursion_level);
static void btrfsic_print_add_link(const struct btrfsic_state *state,
				   const struct btrfsic_block_link *l);
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
				   const struct btrfsic_block_link *l);
static char btrfsic_get_block_type(const struct btrfsic_state *state,
				   const struct btrfsic_block *block);
static void btrfsic_dump_tree(const struct btrfsic_state *state);
static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
				  const struct btrfsic_block *block,
				  int indent_level);
static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
		struct btrfsic_state *state,
		struct btrfsic_block_data_ctx *next_block_ctx,
		struct btrfsic_block *next_block,
		struct btrfsic_block *from_block,
		u64 parent_generation);
static struct btrfsic_block *btrfsic_block_lookup_or_add(
		struct btrfsic_state *state,
		struct btrfsic_block_data_ctx *block_ctx,
		const char *additional_string,
		int is_metadata,
		int is_iodone,
		int never_written,
		int mirror_num,
		int *was_created);
static int btrfsic_process_superblock_dev_mirror(
		struct btrfsic_state *state,
		struct btrfsic_dev_state *dev_state,
		struct btrfs_device *device,
		int superblock_mirror_num,
		struct btrfsic_dev_state **selected_dev_state,
		struct btrfs_super_block *selected_super);
static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
		struct block_device *bdev);
static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
					   u64 bytenr,
					   struct btrfsic_dev_state *dev_state,

static struct mutex btrfsic_mutex;
static int btrfsic_is_initialized;
static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;


static void btrfsic_block_init(struct btrfsic_block *b)
{
	b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
	b->dev_state = NULL;
	b->dev_bytenr = 0;
	b->logical_bytenr = 0;
	b->generation = BTRFSIC_GENERATION_UNKNOWN;
	b->disk_key.objectid = 0;
	b->disk_key.type = 0;
	b->disk_key.offset = 0;
	b->is_metadata = 0;
	b->is_superblock = 0;
	b->is_iodone = 0;
	b->iodone_w_error = 0;
	b->never_written = 0;
	b->mirror_num = 0;
	b->next_in_same_bio = NULL;
	b->orig_bio_bh_private = NULL;
	b->orig_bio_bh_end_io.bio = NULL;
	INIT_LIST_HEAD(&b->collision_resolving_node);
	INIT_LIST_HEAD(&b->all_blocks_node);
	INIT_LIST_HEAD(&b->ref_to_list);
	INIT_LIST_HEAD(&b->ref_from_list);
	b->submit_bio_bh_rw = 0;
	b->flush_gen = 0;
}

static struct btrfsic_block *btrfsic_block_alloc(void)
{
	struct btrfsic_block *b;

	b = kzalloc(sizeof(*b), GFP_NOFS);
	if (NULL != b)
		btrfsic_block_init(b);

	return b;
}

static void btrfsic_block_free(struct btrfsic_block *b)
{
	BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
	kfree(b);
}

static void btrfsic_block_link_init(struct btrfsic_block_link *l)
{
	l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
	l->ref_cnt = 1;
	INIT_LIST_HEAD(&l->node_ref_to);
	INIT_LIST_HEAD(&l->node_ref_from);
	INIT_LIST_HEAD(&l->collision_resolving_node);
	l->block_ref_to = NULL;
	l->block_ref_from = NULL;
}

static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
{
	struct btrfsic_block_link *l;

	l = kzalloc(sizeof(*l), GFP_NOFS);
	if (NULL != l)
		btrfsic_block_link_init(l);

	return l;
}

static void btrfsic_block_link_free(struct btrfsic_block_link *l)
{
	BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
	kfree(l);
}

static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
{
	ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
	ds->bdev = NULL;
	ds->state = NULL;
	ds->name[0] = '\0';
	INIT_LIST_HEAD(&ds->collision_resolving_node);
	ds->last_flush_gen = 0;
	btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
	ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
	ds->dummy_block_for_bio_bh_flush.dev_state = ds;
}

static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
{
	struct btrfsic_dev_state *ds;

	ds = kzalloc(sizeof(*ds), GFP_NOFS);
	if (NULL != ds)
		btrfsic_dev_state_init(ds);

	return ds;
}

static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
{
	BUG_ON(!(NULL == ds ||
		 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
	kfree(ds);
}

static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
{
	int i;

	for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
		INIT_LIST_HEAD(h->table + i);
}

static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
					struct btrfsic_block_hashtable *h)
{
	const unsigned int hashval =
	    (((unsigned int)(b->dev_bytenr >> 16)) ^
	     ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);

	list_add(&b->collision_resolving_node, h->table + hashval);
}

static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
{
	list_del(&b->collision_resolving_node);
}

static struct btrfsic_block *btrfsic_block_hashtable_lookup(
		struct block_device *bdev,
		u64 dev_bytenr,
		struct btrfsic_block_hashtable *h)
{
	const unsigned int hashval =
	    (((unsigned int)(dev_bytenr >> 16)) ^
	     ((unsigned int)((uintptr_t)bdev))) &
	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
	struct list_head *elem;

	list_for_each(elem, h->table + hashval) {
		struct btrfsic_block *const b =
		    list_entry(elem, struct btrfsic_block,
			       collision_resolving_node);

		if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
			return b;
	}

	return NULL;
}

static void btrfsic_block_link_hashtable_init(
		struct btrfsic_block_link_hashtable *h)
{
	int i;

	for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
		INIT_LIST_HEAD(h->table + i);
}

static void btrfsic_block_link_hashtable_add(
		struct btrfsic_block_link *l,
		struct btrfsic_block_link_hashtable *h)
{
	const unsigned int hashval =
	    (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
	     ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
	     ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
	     ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
	     & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);

	BUG_ON(NULL == l->block_ref_to);
	BUG_ON(NULL == l->block_ref_from);
	list_add(&l->collision_resolving_node, h->table + hashval);
}

static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
{
	list_del(&l->collision_resolving_node);
}

static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
		struct block_device *bdev_ref_to,
		u64 dev_bytenr_ref_to,
		struct block_device *bdev_ref_from,
		u64 dev_bytenr_ref_from,
		struct btrfsic_block_link_hashtable *h)
{
	const unsigned int hashval =
	    (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
	     ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
	     ((unsigned int)((uintptr_t)bdev_ref_to)) ^
	     ((unsigned int)((uintptr_t)bdev_ref_from))) &
	     (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
	struct list_head *elem;

	list_for_each(elem, h->table + hashval) {
		struct btrfsic_block_link *const l =
		    list_entry(elem, struct btrfsic_block_link,
			       collision_resolving_node);

		BUG_ON(NULL == l->block_ref_to);
		BUG_ON(NULL == l->block_ref_from);
		if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
		    l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
		    l->block_ref_from->dev_state->bdev == bdev_ref_from &&
		    l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
			return l;
	}

	return NULL;
}

static void btrfsic_dev_state_hashtable_init(
		struct btrfsic_dev_state_hashtable *h)
{
	int i;

	for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
		INIT_LIST_HEAD(h->table + i);
}

static void btrfsic_dev_state_hashtable_add(
		struct btrfsic_dev_state *ds,
		struct btrfsic_dev_state_hashtable *h)
{
	const unsigned int hashval =
	    (((unsigned int)((uintptr_t)ds->bdev)) &
	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));

	list_add(&ds->collision_resolving_node, h->table + hashval);
}

static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
{
	list_del(&ds->collision_resolving_node);
}

static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
		struct block_device *bdev,
		struct btrfsic_dev_state_hashtable *h)
{
	const unsigned int hashval =
	    (((unsigned int)((uintptr_t)bdev)) &
	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
	struct list_head *elem;

	list_for_each(elem, h->table + hashval) {
		struct btrfsic_dev_state *const ds =
		    list_entry(elem, struct btrfsic_dev_state,
			       collision_resolving_node);

		if (ds->bdev == bdev)
			return ds;
	}

	return NULL;
}

static int btrfsic_process_superblock(struct btrfsic_state *state,
				      struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_super_block *selected_super;
	struct list_head *dev_head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfsic_dev_state *selected_dev_state = NULL;
	int pass;

	BUG_ON(NULL == state);
	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
	if (NULL == selected_super) {
		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
		return -1;
	}

	list_for_each_entry(device, dev_head, dev_list) {
		int i;
		struct btrfsic_dev_state *dev_state;

		if (!device->bdev || !device->name)
			continue;

		dev_state = btrfsic_dev_state_lookup(device->bdev);
		BUG_ON(NULL == dev_state);
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			ret = btrfsic_process_superblock_dev_mirror(
					state, dev_state, device, i,
					&selected_dev_state, selected_super);
			if (0 != ret && 0 == i) {
				kfree(selected_super);
				return ret;
			}
		}
	}

	if (NULL == state->latest_superblock) {
		printk(KERN_INFO "btrfsic: no superblock found!\n");
		kfree(selected_super);
		return -1;
	}

	state->csum_size = btrfs_super_csum_size(selected_super);

	for (pass = 0; pass < 3; pass++) {
		int num_copies;
		int mirror_num;
		u64 next_bytenr;

		switch (pass) {
		case 0:
			next_bytenr = btrfs_super_root(selected_super);
			if (state->print_mask &
			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
				printk(KERN_INFO "root@%llu\n",
				       (unsigned long long)next_bytenr);
			break;
		case 1:
			next_bytenr = btrfs_super_chunk_root(selected_super);
			if (state->print_mask &
			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
				printk(KERN_INFO "chunk@%llu\n",
				       (unsigned long long)next_bytenr);
			break;
		case 2:
			next_bytenr = btrfs_super_log_root(selected_super);
			if (0 == next_bytenr)
				continue;
			if (state->print_mask &
			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
				printk(KERN_INFO "log@%llu\n",
				       (unsigned long long)next_bytenr);
			break;
		}

		num_copies =
		    btrfs_num_copies(state->root->fs_info,
				     next_bytenr, state->metablock_size);
		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			       (unsigned long long)next_bytenr, num_copies);

		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
			struct btrfsic_block *next_block;
			struct btrfsic_block_data_ctx tmp_next_block_ctx;
			struct btrfsic_block_link *l;

			ret = btrfsic_map_block(state, next_bytenr,
						state->metablock_size,
						&tmp_next_block_ctx,
						mirror_num);
			if (ret) {
				printk(KERN_INFO "btrfsic:"
				       " btrfsic_map_block(root @%llu,"
				       " mirror %d) failed!\n",
				       (unsigned long long)next_bytenr,
				       mirror_num);
				kfree(selected_super);
				return -1;
			}

			next_block = btrfsic_block_hashtable_lookup(
					tmp_next_block_ctx.dev->bdev,
					tmp_next_block_ctx.dev_bytenr,
					&state->block_hashtable);
			BUG_ON(NULL == next_block);

			l = btrfsic_block_link_hashtable_lookup(
					tmp_next_block_ctx.dev->bdev,
					tmp_next_block_ctx.dev_bytenr,
					state->latest_superblock->dev_state->
					bdev,
					state->latest_superblock->dev_bytenr,
					&state->block_link_hashtable);
			BUG_ON(NULL == l);

			ret = btrfsic_read_block(state, &tmp_next_block_ctx);
			if (ret < (int)PAGE_CACHE_SIZE) {
				printk(KERN_INFO
				       "btrfsic: read @logical %llu failed!\n",
				       (unsigned long long)
				       tmp_next_block_ctx.start);
				btrfsic_release_block_ctx(&tmp_next_block_ctx);
				kfree(selected_super);
				return -1;
			}

			ret = btrfsic_process_metablock(state,
							next_block,
							&tmp_next_block_ctx,
							BTRFS_MAX_LEVEL + 3, 1);
			btrfsic_release_block_ctx(&tmp_next_block_ctx);
		}
	}

	kfree(selected_super);
	return ret;
}

static int btrfsic_process_superblock_dev_mirror(
		struct btrfsic_state *state,
		struct btrfsic_dev_state *dev_state,
		struct btrfs_device *device,
		int superblock_mirror_num,
		struct btrfsic_dev_state **selected_dev_state,
		struct btrfs_super_block *selected_super)
{
	struct btrfs_super_block *super_tmp;
	u64 dev_bytenr;
	struct buffer_head *bh;
	struct btrfsic_block *superblock_tmp;
	int pass;
	struct block_device *const superblock_bdev = device->bdev;

	/* super block bytenr is always the unmapped device bytenr */
	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
		return -1;
	bh = __bread(superblock_bdev, dev_bytenr / 4096,
		     BTRFS_SUPER_INFO_SIZE);
	if (NULL == bh)
		return -1;
	super_tmp = (struct btrfs_super_block *)
	    (bh->b_data + (dev_bytenr & 4095));

	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
		brelse(bh);
		return 0;
	}

	superblock_tmp =
	    btrfsic_block_hashtable_lookup(superblock_bdev,
					   dev_bytenr,
					   &state->block_hashtable);
	if (NULL == superblock_tmp) {
		superblock_tmp = btrfsic_block_alloc();
		if (NULL == superblock_tmp) {
			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
			brelse(bh);
			return -1;
		}
		/* for superblock, only the dev_bytenr makes sense */
		superblock_tmp->dev_bytenr = dev_bytenr;
		superblock_tmp->dev_state = dev_state;
		superblock_tmp->logical_bytenr = dev_bytenr;
		superblock_tmp->generation = btrfs_super_generation(super_tmp);
		superblock_tmp->is_metadata = 1;
		superblock_tmp->is_superblock = 1;
		superblock_tmp->is_iodone = 1;
		superblock_tmp->never_written = 0;
		superblock_tmp->mirror_num = 1 + superblock_mirror_num;
		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
			printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
				     " @%llu (%s/%llu/%d)\n",
				     superblock_bdev,
				     rcu_str_deref(device->name),
				     (unsigned long long)dev_bytenr,
				     dev_state->name,
				     (unsigned long long)dev_bytenr,
				     superblock_mirror_num);
		list_add(&superblock_tmp->all_blocks_node,
			 &state->all_blocks_list);
		btrfsic_block_hashtable_add(superblock_tmp,
					    &state->block_hashtable);
	}

	/* select the one with the highest generation field */
	if (btrfs_super_generation(super_tmp) >
	    state->max_superblock_generation ||
	    0 == state->max_superblock_generation) {
		memcpy(selected_super, super_tmp, sizeof(*selected_super));
		*selected_dev_state = dev_state;
		state->max_superblock_generation =
		    btrfs_super_generation(super_tmp);
		state->latest_superblock = superblock_tmp;
	}

	for (pass = 0; pass < 3; pass++) {
		u64 next_bytenr;
		int num_copies;
		int mirror_num;
		const char *additional_string = NULL;
		struct btrfs_disk_key tmp_disk_key;

		tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_disk_key.offset = 0;
		switch (pass) {
		case 0:
			tmp_disk_key.objectid =
			    cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
			additional_string = "initial root ";
			next_bytenr = btrfs_super_root(super_tmp);
			break;
		case 1:
			tmp_disk_key.objectid =
			    cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
			additional_string = "initial chunk ";
			next_bytenr = btrfs_super_chunk_root(super_tmp);
			break;
		case 2:
			tmp_disk_key.objectid =
			    cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
			additional_string = "initial log ";
			next_bytenr = btrfs_super_log_root(super_tmp);
			if (0 == next_bytenr)
				continue;
			break;
		}

		num_copies =
		    btrfs_num_copies(state->root->fs_info,
				     next_bytenr, state->metablock_size);
		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
			       (unsigned long long)next_bytenr, num_copies);
		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
			struct btrfsic_block *next_block;
			struct btrfsic_block_data_ctx tmp_next_block_ctx;
			struct btrfsic_block_link *l;

			if (btrfsic_map_block(state, next_bytenr,
					      state->metablock_size,
					      &tmp_next_block_ctx,
					      mirror_num)) {
				printk(KERN_INFO "btrfsic: btrfsic_map_block("
				       "bytenr @%llu, mirror %d) failed!\n",
				       (unsigned long long)next_bytenr,
				       mirror_num);
				brelse(bh);
				return -1;
			}

			next_block = btrfsic_block_lookup_or_add(
					state, &tmp_next_block_ctx,
					additional_string, 1, 1, 0,
					mirror_num, NULL);
			if (NULL == next_block) {
				btrfsic_release_block_ctx(&tmp_next_block_ctx);
				brelse(bh);
				return -1;
			}

			next_block->disk_key = tmp_disk_key;
			next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
			l = btrfsic_block_link_lookup_or_add(
					state, &tmp_next_block_ctx,
					next_block, superblock_tmp,
					BTRFSIC_GENERATION_UNKNOWN);
			btrfsic_release_block_ctx(&tmp_next_block_ctx);
			if (NULL == l) {
				brelse(bh);
				return -1;
			}
		}
	}
	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
		btrfsic_dump_tree_sub(state, superblock_tmp, 0);

	brelse(bh);
	return 0;
}

static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
{
	struct btrfsic_stack_frame *sf;

	sf = kzalloc(sizeof(*sf), GFP_NOFS);
	if (NULL == sf)
		printk(KERN_INFO "btrfsic: alloc memory failed!\n");
	else
		sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
	return sf;
}

static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
{
	BUG_ON(!(NULL == sf ||
		 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
	kfree(sf);
}

static int btrfsic_process_metablock(
		struct btrfsic_state *state,
		struct btrfsic_block *const first_block,
		struct btrfsic_block_data_ctx *const first_block_ctx,
		int first_limit_nesting, int force_iodone_flag)
{
	struct btrfsic_stack_frame initial_stack_frame = { 0 };
	struct btrfsic_stack_frame *sf;
	struct btrfsic_stack_frame *next_stack;
	struct btrfs_header *const first_hdr =
		(struct btrfs_header *)first_block_ctx->datav[0];
	sf = &initial_stack_frame;
	sf->error = 0;
	sf->i = -1;
	sf->limit_nesting = first_limit_nesting;
	sf->block = first_block;
	sf->block_ctx = first_block_ctx;
	sf->next_block = NULL;
	sf->hdr = first_hdr;
	sf->prev = NULL;

continue_with_new_stack_frame:
	sf->block->generation = le64_to_cpu(sf->hdr->generation);