aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/Makefile4
-rw-r--r--fs/btrfs/acl.c17
-rw-r--r--fs/btrfs/btrfs_inode.h17
-rw-r--r--fs/btrfs/compression.c3
-rw-r--r--fs/btrfs/ctree.c10
-rw-r--r--fs/btrfs/ctree.h198
-rw-r--r--fs/btrfs/delayed-inode.c50
-rw-r--r--fs/btrfs/disk-io.c430
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c838
-rw-r--r--fs/btrfs/extent_io.c223
-rw-r--r--fs/btrfs/extent_io.h12
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c49
-rw-r--r--fs/btrfs/free-space-cache.c926
-rw-r--r--fs/btrfs/inode-map.c6
-rw-r--r--fs/btrfs/inode.c300
-rw-r--r--fs/btrfs/ioctl.c97
-rw-r--r--fs/btrfs/print-tree.c8
-rw-r--r--fs/btrfs/reada.c949
-rw-r--r--fs/btrfs/relocation.c24
-rw-r--r--fs/btrfs/scrub.c114
-rw-r--r--fs/btrfs/super.c298
-rw-r--r--fs/btrfs/transaction.c133
-rw-r--r--fs/btrfs/tree-log.c19
-rw-r--r--fs/btrfs/volumes.c77
-rw-r--r--fs/btrfs/volumes.h8
-rw-r--r--fs/btrfs/xattr.c11
-rw-r--r--fs/cifs/cifsencrypt.c54
-rw-r--r--fs/cifs/cifsfs.c10
-rw-r--r--fs/cifs/cifssmb.c3
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/ext3/inode.c4
-rw-r--r--fs/ext3/namei.c3
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/namei.c3
-rw-r--r--fs/gfs2/log.c4
-rw-r--r--fs/gfs2/meta_io.c6
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/hfsplus/super.c15
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/namei.c12
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4proc.c20
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/proc/task_mmu.c80
-rw-r--r--fs/quota/quota.c2
-rw-r--r--fs/stat.c2
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_buf_item.c3
-rw-r--r--fs/xfs/xfs_dquot_item.c10
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_super.c13
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c83
-rw-r--r--fs/xfs/xfs_trans_priv.h8
62 files changed, 3745 insertions, 1522 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 89b6ce3634fd..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
- compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \
- scrub.o
+ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+ reada.o backref.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
- if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl)) {
- kfree(value);
- return acl;
- }
- set_cached_acl(inode, type, acl);
- }
- kfree(value);
+ }
+ if (size > 0) {
+ acl = posix_acl_from_xattr(value, size);
} else if (size == -ENOENT || size == -ENODATA || size == 0) {
/* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
- set_cached_acl(inode, type, acl);
} else {
acl = ERR_PTR(-EIO);
}
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
return acl;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
*/
u64 delalloc_bytes;
- /* total number of bytes that may be used for this inode for
- * delalloc
- */
- u64 reserved_bytes;
-
/*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
*/
u64 disk_i_size;
- /* flags field from the on disk inode */
- u32 flags;
-
/*
* if this is a directory then index_cnt is the counter for the index
* number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Number of bytes outstanding that are going to need csums. This is
+ * used in ENOSPC accounting.
+ */
+ u64 csum_bytes;
+
+ /* flags field from the on disk inode */
+ u32 flags;
+
+ /*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
* items we think we'll end up using, and reserved_extents is the number
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
static inline int compressed_bio_size(struct btrfs_root *root,
unsigned long disk_size)
{
- u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
return sizeof(struct compressed_bio) +
((disk_size + root->sectorsize - 1) / root->sectorsize) *
csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
- if (level < BTRFS_MAX_LEVEL - 1)
+ if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
- pslot = path->slots[level + 1];
+ pslot = path->slots[level + 1];
+ }
/*
* deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
WARN_ON(btrfs_header_generation(mid) != trans->transid);
- if (level < BTRFS_MAX_LEVEL - 1)
+ if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
- pslot = path->slots[level + 1];
+ pslot = path->slots[level + 1];
+ }
if (!parent)
return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..b9ba59ff9292 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
#include <linux/kobject.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
+#include <linux/pagemap.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
#define BTRFS_LABEL_SIZE 256
/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unsed_64[4];
+
+ u8 tree_root_level;
+ u8 chunk_root_level;
+ u8 extent_root_level;
+ u8 fs_root_level;
+ u8 dev_root_level;
+ u8 csum_root_level;
+ /* future and to align */
+ u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -405,6 +447,7 @@ struct btrfs_super_block {
/* future expansion */
__le64 reserved[31];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
struct btrfs_block_rsv {
u64 size;
u64 reserved;
- u64 freed[2];
struct btrfs_space_info *space_info;
- struct list_head list;
spinlock_t lock;
- atomic_t usage;
- unsigned int priority:8;
- unsigned int durable:1;
- unsigned int refill_used:1;
unsigned int full:1;
};
@@ -840,10 +877,10 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
- u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
+ u64 cache_generation;
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@@ -899,6 +936,10 @@ struct btrfs_fs_info {
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
+ /* keep track of unallocated space */
+ spinlock_t free_chunk_lock;
+ u64 free_chunk_space;
+
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
@@ -916,14 +957,11 @@ struct btrfs_fs_info {
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
+ /* block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
struct btrfs_block_rsv empty_block_rsv;
- /* list of block reservations that cross multiple transactions */
- struct list_head durable_block_rsv_list;
-
- struct mutex durable_block_rsv_mutex;
-
u64 generation;
u64 last_trans_committed;
@@ -942,8 +980,8 @@ struct btrfs_fs_info {
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
- struct btrfs_super_block super_copy;
- struct btrfs_super_block super_for_commit;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
struct block_device *__bdev;
struct super_block *sb;
struct inode *btree_inode;
@@ -1036,6 +1074,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_freespace_worker;
struct btrfs_workers submit_workers;
struct btrfs_workers caching_workers;
+ struct btrfs_workers readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1158,13 @@ struct btrfs_fs_info {
u64 fs_state;
struct btrfs_delayed_root *delayed_root;
+
+ /* readahead tree */
+ spinlock_t reada_lock;
+ struct radix_tree_root reada_tree;
+
+ /* next backup root to be overwritten */
+ int backup_root_index;
};
/*
@@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+#define BTRFS_MOUNT_RECOVERY (1 << 18)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
}
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
/* extent-tree.c */
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
@@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3 * num_items;
}
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ num_items;
+}
+
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len);
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor);
+ u64 min_reserved);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
smp_mb();
return fs_info->closing;
}
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ kfree(fs_info->delayed_root);
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kfree(fs_info);
+}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+/* reada.c */
+struct reada_control {
+ struct btrfs_root *root; /* tree to prefetch */
+ struct btrfs_key key_start;
+ struct btrfs_key key_end; /* exclusive */
+ atomic_t elems;
+ struct kref refcnt;
+ wait_queue_head_t wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err);
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..bbe8496d5339 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return 0;
src_rsv = trans->block_rsv;
- dst_rsv = &root->fs_info->global_block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
if (!item->bytes_reserved)
return;
- rsv = &root->fs_info->global_block_rsv;
+ rsv = &root->fs_info->delayed_block_rsv;
btrfs_block_rsv_release(root, rsv,
item->bytes_reserved);
}
@@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(
u64 num_bytes;
int ret;
- if (!trans->bytes_reserved)
- return 0;
-
src_rsv = trans->block_rsv;
- dst_rsv = &root->fs_info->global_block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+ /*
+ * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+ * which doesn't reserve space for speed. This is a problem since we
+ * still need to reserve space for this update, so try to reserve the
+ * space.
+ *
+ * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+ * we're accounted for.
+ */
+ if (!trans->bytes_reserved &&
+ src_rsv != &root->fs_info->delalloc_block_rsv) {
+ ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ /*
+ * Since we're under a transaction reserve_metadata_bytes could
+ * try to commit the transaction which will make it return
+ * EAGAIN to make us stop the transaction we have, so return
+ * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+ */
+ if (ret == -EAGAIN)
+ ret = -ENOSPC;
+ if (!ret)
+ node->bytes_reserved = num_bytes;
+ return ret;
+ }
+
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
if (!ret)
node->bytes_reserved = num_bytes;
@@ -646,7 +669,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
if (!node->bytes_reserved)
return;
- rsv = &root->fs_info->global_block_rsv;
+ rsv = &root->fs_info->delayed_block_rsv;
btrfs_block_rsv_release(root, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
@@ -1026,7 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
delayed_root = btrfs_get_delayed_root(root);
@@ -1069,7 +1092,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &node->root->fs_info->global_block_rsv;
+ trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
ret = btrfs_insert_delayed_items(trans, path, node->root, node);
if (!ret)
@@ -1149,7 +1172,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
goto free_path;
block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
if (!ret)
@@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
- /*
- * we must reserve enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ if (ret)
+ goto release_node;
fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dc0343802535..0eb1f0951251 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
- ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ ret = read_extent_buffer_pages(io_tree, eb, start,
+ WAIT_COMPLETE,
btree_get_extent, mirror_num);
if (!ret &&
!verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
err:
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, ret);
+ }
+
free_extent_buffer(eb);
out:
return ret;
}
+static int btree_io_failed_hook(struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ u64 mirror_num, struct extent_state *state)
+{
+ struct extent_io_tree *tree;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page);
+ if (eb == NULL)
+ goto out;
+
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, -EIO);
+ }
+
+out:
+ return -EIO; /* we fixed nothing */
+}
+
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
@@ -974,11 +1010,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 0, btree_get_extent, 0);
+ buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+ btree_get_extent, mirror_num);
+ if (ret) {
+ free_extent_buffer(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+}
+
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
@@ -1135,10 +1203,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
free_extent_buffer(root->node);
+ root->node = NULL;
return -EIO;
}
root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1647,228 @@ sleep:
return 0;
}
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+ u64 cur;
+ int newest_index = -1;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = i;
+ }
+
+ /* check to see if we actually wrapped around */
+ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+ root_backup = info->super_copy->super_roots;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = 0;
+ }
+ return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+ u64 newest_gen)
+{
+ int newest_index = -1;
+
+ newest_index = find_newest_super_backup(info, newest_gen);
+ /* if there was garbage in there, just move along */
+ if (newest_index == -1) {
+ info->backup_root_index = 0;
+ } else {
+ info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+ }
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ int next_backup;
+ struct btrfs_root_backup *root_backup;
+ int last_backup;
+
+ next_backup = info->backup_root_index;
+ last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+
+ /*
+ * just overwrite the last backup if we're at the same generation
+ * this happens only at umount
+ */
+ root_backup = info->super_for_commit->super_roots + last_backup;
+ if (btrfs_backup_tree_root_gen(root_backup) ==
+ btrfs_header_generation(info->tree_root->node))
+ next_backup = last_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(info->extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(info->extent_root->node));
+
+ btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(info->csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(info->csum_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+ struct btrfs_super_block *super,
+ int *num_backups_tried, int *backup_index)
+{
+ struct btrfs_root_backup *root_backup;
+ int newest = *backup_index;
+
+ if (*num_backups_tried == 0) {
+ u64 gen = btrfs_super_generation(super);
+
+ newest = find_newest_super_backup(info, gen);
+ if (newest == -1)
+ return -1;
+
+ *backup_index = newest;
+ *num_backups_tried = 1;
+ } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+ /* we've tried all the backups, all done */
+ return -1;
+ } else {
+ /* jump to the next oldest backup */
+ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+ *backup_index = newest;
+ *num_backups_tried += 1;
+ }
+ root_backup = super->super_roots + newest;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+ * fixme: the total bytes and num_devices need to match or we should
+ * need a fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+ return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+ free_extent_buffer(info->tree_root->node);
+ free_extent_buffer(info->tree_root->commit_root);
+ free_extent_buffer(info->dev_root->node);
+ free_extent_buffer(info->dev_root->commit_root);
+ free_extent_buffer(info->extent_root->node);
+ free_extent_buffer(info->extent_root->commit_root);
+ free_extent_buffer(info->csum_root->node);
+ free_extent_buffer(info->csum_root->commit_root);
+
+ info->tree_root->node = NULL;
+ info->tree_root->commit_root = NULL;
+ info->dev_root->node = NULL;
+ info->dev_root->commit_root = NULL;
+ info->extent_root->node = NULL;
+ info->extent_root->commit_root = NULL;
+ info->csum_root->node = NULL;
+ info->csum_root->commit_root = NULL;
+
+ if (chunk_root) {
+ free_extent_buffer(info->chunk_root->node);
+ free_extent_buffer(info->chunk_root->commit_root);
+ info->chunk_root->node = NULL;
+ info->chunk_root->commit_root = NULL;
+ }
+}
+
+
struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -1604,6 +1896,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
int ret;
int err = -EINVAL;
+ int num_backups_tried = 0;
+ int backup_index = 0;
struct btrfs_super_block *disk_super;
@@ -1648,6 +1942,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->free_chunk_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
@@ -1665,8 +1960,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_block_rsv(&fs_info->trans_block_rsv);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
- mutex_init(&fs_info->durable_block_rsv_mutex);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1971,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
+ fs_info->free_chunk_space = 0;
+
+ /* readahead state */
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1766,14 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
- memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
- memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
- sizeof(fs_info->super_for_commit));
+ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@@ -1783,6 +2082,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
/*
+ * run through our array of backup supers and setup
+ * our ring pointer to the oldest one
+ */
+ generation = btrfs_super_generation(disk_super);
+ find_oldest_super_backup(fs_info, generation);
+
+ /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
@@ -1870,6 +2176,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1880,6 +2189,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
+ fs_info->readahead_workers.idle_thresh = 2;
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2203,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
btrfs_start_workers(&fs_info->delayed_workers, 1);
btrfs_start_workers(&fs_info->caching_workers, 1);
+ btrfs_start_workers(&fs_info->readahead_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2250,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2265,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_close_extra_devices(fs_devices);
+retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2278,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
blocksize, generation);
- if (!tree_root->node)
- goto fail_chunk_root;
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (!tree_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
sb->s_id);
- goto fail_tree_root;
+
+ goto recovery_tree_root;
}
+
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_EXTENT_TREE_OBJECTID, extent_root);
if (ret)
- goto fail_tree_root;
+ goto recovery_tree_root;
extent_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_DEV_TREE_OBJECTID, dev_root);
if (ret)
- goto fail_extent_root;
+ goto recovery_tree_root;
dev_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
- goto fail_dev_root;
+ goto recovery_tree_root;
csum_root->track_dirty = 1;
@@ -2124,20 +2437,10 @@ fail_cleaner:
fail_block_groups:
btrfs_free_block_groups(fs_info);
- free_extent_buffer(csum_root->node);
- free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
- free_extent_buffer(dev_root->node);
- free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
- free_extent_buffer(extent_root->node);
- free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
- free_extent_buffer(tree_root->node);
- free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
- free_extent_buffer(chunk_root->node);
- free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+ free_root_pointers(fs_info, 1);
+
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2152,7 +2455,6 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
- kfree(fs_info->delayed_root);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2164,13 +2466,27 @@ fail_bdi:
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- kfree(extent_root);
- kfree(tree_root);
- kfree(fs_info);
- kfree(chunk_root);
- kfree(dev_root);
- kfree(csum_root);
+ free_fs_info(fs_info);
return ERR_PTR(err);
+
+recovery_tree_root:
+
+ if (!btrfs_test_opt(tree_root, RECOVERY))
+ goto fail_tree_roots;
+
+ free_root_pointers(fs_info, 0);
+
+ /* don't use the log in recovery mode, it won't be valid */
+ btrfs_set_super_log_root(disk_super, 0);
+
+ /* we can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = next_root_backup(fs_info, fs_info->super_copy,
+ &num_backups_tried, &backup_index);
+ if (ret == -1)
+ goto fail_block_groups;
+ goto retry_root_backup;
}
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2338,10 +2654,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ backup_super_roots(root->fs_info);
- sb = &root->fs_info->super_for_commit;
+ sb = root->fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2545,8 +2862,6 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);
- btrfs_put_block_group_cache(fs_info);
-
/*
* Here come 2 situations when btrfs is broken to flip readonly:
*
@@ -2572,6 +2887,8 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ btrfs_put_block_group_cache(fs_info);
+
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
@@ -2603,7 +2920,6 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
- kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +2933,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +2941,7 @@ int close_ctree(struct btrfs_root *root)
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info);
+ free_fs_info(fs_info);
return 0;
}
@@ -2735,7 +3047,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return ret;
}
-int btree_lock_page_hook(struct page *page)
+static int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *))
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3065,10 @@ int btree_lock_page_hook(struct page *page)
if (!eb)
goto out;
- btrfs_tree_lock(eb);
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush_fn(data);
+ btrfs_tree_lock(eb);
+ }
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3083,10 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
- lock_page(page);
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
return 0;
}
@@ -3123,6 +3442,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
+ .readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid);
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
u64 parent_transid);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btree_lock_page_hook(struct page *page);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 119f842c1d4f..18ea90c8943b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
CHUNK_ALLOC_LIMITED = 2,
};
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ * ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ * bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+ RESERVE_FREE = 0,
+ RESERVE_ALLOC = 1,
+ RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
- WARN_ON(cache->reserved_pinned > 0);
kfree(cache->free_space_ctl);
kfree(cache);
}
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
* we likely hold important locks.
*/
if (trans && (!trans->transaction->in_commit) &&
- (root && root != root->fs_info->tree_root)) {
+ (root && root != root->fs_info->tree_root) &&
+ btrfs_test_opt(root, SPACE_CACHE)) {
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
spin_unlock(&cache->lock);
@@ -2700,6 +2718,13 @@ again:
goto again;
}
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
if (!ret)
dcs = BTRFS_DC_SETUP;
btrfs_free_reserved_data_space(inode, num_pages);
+
out_put:
iput(inode);
out_free:
btrfs_release_path(path);
out:
spin_lock(&block_group->lock);
+ if (!ret)
+ block_group->cache_generation = trans->transid;
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
@@ -3122,16 +3150,13 @@ commit_trans:
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
- BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
return 0;
}
/*
- * called when we are clearing an delalloc extent from the
- * inode's io_tree or there was an error for whatever reason
- * after calling btrfs_check_data_free_space
+ * Called if we need to clear a data reservation for this inode.
*/
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = BTRFS_I(inode)->space_info;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
- BTRFS_I(inode)->reserved_bytes -= bytes;
spin_unlock(&data_sinfo->lock);
}
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
struct btrfs_space_info *sinfo, u64 alloc_bytes,
int force)
{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
u64 thresh;
@@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
/*
+ * We need to take into account the global rsv because for all intents
+ * and purposes it's used space. Don't worry about locking the
+ * global_rsv, it doesn't change except when the transaction commits.
+ */
+ num_allocated += global_rsv->size;
+
+ /*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
if (force == CHUNK_ALLOC_LIMITED) {
- thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
thresh = max_t(u64, 64 * 1024 * 1024,
div_factor_fine(thresh, 1));
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
return 0;
- thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
/* 256MB or 5% of the FS */
thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3302,24 +3334,26 @@ out:
/*
* shrink metadata reservation for delalloc
*/
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 to_reclaim, int sync)
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
+ bool wait_ordered)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
u64 reserved;
u64 max_reclaim;
u64 reclaimed = 0;
long time_left;
- int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
unsigned long progress;
+ trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
smp_mb();
- reserved = space_info->bytes_reserved;
+ reserved = space_info->bytes_may_use;
progress = space_info->reservation_progress;
if (reserved == 0)
@@ -3334,7 +3368,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
}
max_reclaim = min(reserved, to_reclaim);
-
+ nr_pages = max_t(unsigned long, nr_pages,
+ max_reclaim >> PAGE_CACHE_SHIFT);
while (loops < 1024) {
/* have the flusher threads jump in and do some IO */
smp_mb();
@@ -3343,9 +3378,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
spin_lock(&space_info->lock);
- if (reserved > space_info->bytes_reserved)
- reclaimed += reserved - space_info->bytes_reserved;
- reserved = space_info->bytes_reserved;
+ if (reserved > space_info->bytes_may_use)
+ reclaimed += reserved - space_info->bytes_may_use;
+ reserved = space_info->bytes_may_use;
spin_unlock(&space_info->lock);
loops++;
@@ -3356,11 +3391,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
if (trans && trans->transaction->blocked)
return -EAGAIN;
- time_left = schedule_timeout_interruptible(1);
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_extents(root, 0, 0);
+ } else {
+ time_left = schedule_timeout_interruptible(1);
- /* We were interrupted, exit */
- if (time_left)
- break;
+ /* We were interrupted, exit */
+ if (time_left)
+ break;
+ }
/* we've kicked the IO a few times, if anything has been freed,
* exit. There is no sense in looping here for a long time
@@ -3375,34 +3414,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
}
}
- if (reclaimed >= to_reclaim && !trans)
- btrfs_wait_ordered_extents(root, 0, 0);
+
return reclaimed >= to_reclaim;
}
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes. The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space. That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+/**
+ * maybe_commit_transaction - possibly commit the transaction if its ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
*
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does. Otherwise it
+ * will return -ENOSPC.
*/
-static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int may_commit_transaction(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 bytes, int force)
+{
+ struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+ struct btrfs_trans_handle *trans;
+
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ if (trans)
+ return -EAGAIN;
+
+ if (force)
+ goto commit;
+
+ /* See if there is enough pinned space to make this reservation */
+ spin_lock(&space_info->lock);
+ if (space_info->bytes_pinned >= bytes) {
+ spin_unlock(&space_info->lock);
+ goto commit;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * See if there is some space in the delayed insertion reservation for
+ * this reservation.
+ */
+ if (space_info != delayed_rsv->space_info)
+ return -ENOSPC;
+
+ spin_lock(&delayed_rsv->lock);
+ if (delayed_rsv->size < bytes) {
+ spin_unlock(&delayed_rsv->lock);
+ return -ENOSPC;
+ }
+ spin_unlock(&delayed_rsv->lock);
+
+commit:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return -ENOSPC;
+
+ return btrfs_commit_transaction(trans, root);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - wether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes, int flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
- u64 unused;
+ u64 used;
u64 num_bytes = orig_bytes;
int retries = 0;
int ret = 0;
bool committed = false;
bool flushing = false;
+ bool wait_ordered = false;
again:
ret = 0;
@@ -3419,7 +3514,7 @@ again:
* deadlock since we are waiting for the flusher to finish, but
* hold the current transaction open.
*/
- if (trans)
+ if (current->journal_info)
return -EAGAIN;
ret = wait_event_interruptible(space_info->wait,
!space_info->flush);
@@ -3431,9 +3526,9 @@ again:
}
ret = -ENOSPC;
- unused = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
/*
* The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3537,9 @@ again:
* lets start flushing stuff first and then come back and try to make
* our reservation.
*/
- if (unused <= space_info->total_bytes) {
- unused = space_info->total_bytes - unused;
- if (unused >= num_bytes) {
- space_info->bytes_reserved += orig_bytes;
+ if (used <= space_info->total_bytes) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
ret = 0;
} else {
/*
@@ -3461,10 +3555,64 @@ again:
* amount plus the amount of bytes that we need for this
* reservation.
*/
- num_bytes = unused - space_info->total_bytes +
+ wait_ordered = true;
+ num_bytes = used - space_info->total_bytes +
(orig_bytes * (retries + 1));
}
+ if (ret) {
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 avail;
+
+ /*
+ * If we have a lot of space that's pinned, don't bother doing
+ * the overcommit dance yet and just commit the transaction.
+ */
+ avail = (space_info->total_bytes - space_info->bytes_used) * 8;
+ do_div(avail, 10);
+ if (space_info->bytes_pinned >= avail && flush && !committed) {
+ space_info->flush = 1;
+ flushing = true;
+ spin_unlock(&space_info->lock);
+ ret = may_commit_transaction(root, space_info,
+ orig_bytes, 1);
+ if (ret)
+ goto out;
+ committed = true;
+ goto again;
+ }
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually useable.
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we aren't flushing don't let us overcommit too much, say
+ * 1/8th of the space. If we can flush, let it overcommit up to
+ * 1/2 of the space.
+ */
+ if (flush)
+ avail >>= 3;
+ else
+ avail >>= 1;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ if (used + num_bytes < space_info->total_bytes + avail) {
+ space_info->bytes_may_use += orig_bytes;
+ ret = 0;
+ } else {
+ wait_ordered = true;
+ }
+ }
+
/*
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
@@ -3484,7 +3632,7 @@ again:
* We do synchronous shrinking since we don't actually unreserve
* metadata until after the IO is completed.
*/
- ret = shrink_delalloc(trans, root, num_bytes, 1);
+ ret = shrink_delalloc(root, num_bytes, wait_ordered);
if (ret < 0)
goto out;
@@ -3496,35 +3644,17 @@ again:
* so go back around and try again.
*/
if (retries < 2) {
+ wait_ordered = true;
retries++;
goto again;
}
- /*
- * Not enough space to be reclaimed, don't bother committing the
- * transaction.
- */
- spin_lock(&space_info->lock);
- if (space_info->bytes_pinned < orig_bytes)
- ret = -ENOSPC;
- spin_unlock(&space_info->lock);
- if (ret)
- goto out;
-
- ret = -EAGAIN;
- if (trans)
- goto out;
-
ret = -ENOSPC;
if (committed)
goto out;
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- goto out;
- ret = btrfs_commit_transaction(trans, root);
+ ret = may_commit_transaction(root, space_info, orig_bytes, 0);
if (!ret) {
- trans = NULL;
committed = true;
goto again;
}
@@ -3542,10 +3672,12 @@ out:
static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_block_rsv *block_rsv;
- if (root->ref_cows)
+ struct btrfs_block_rsv *block_rsv = NULL;
+
+ if (root->ref_cows || root == root->fs_info->csum_root)
block_rsv = trans->block_rsv;
- else
+
+ if (!block_rsv)
block_rsv = root->block_rsv;
if (!block_rsv)
@@ -3616,7 +3748,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
}
if (num_bytes) {
spin_lock(&space_info->lock);
- space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_may_use -= num_bytes;
space_info->reservation_progress++;
spin_unlock(&space_info->lock);
}
@@ -3640,9 +3772,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
- atomic_set(&rsv->usage, 1);
- rsv->priority = 6;
- INIT_LIST_HEAD(&rsv->list);
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3792,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- if (rsv && atomic_dec_and_test(&rsv->usage)) {
- btrfs_block_rsv_release(root, rsv, (u64)-1);
- if (!rsv->durable)
- kfree(rsv);
- }
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ kfree(rsv);
}
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
{
- block_rsv->durable = 1;
- mutex_lock(&fs_info->durable_block_rsv_mutex);
- list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
- mutex_unlock(&fs_info->durable_block_rsv_mutex);
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 1);
+ return 0;
+ }
+
+ return ret;
}
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
{
int ret;
if (num_bytes == 0)
return 0;
- ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
@@ -3703,55 +3832,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor)
{
u64 num_bytes = 0;
- int commit_trans = 0;
int ret = -ENOSPC;
if (!block_rsv)
return 0;
spin_lock(&block_rsv->lock);
- if (min_factor > 0)
- num_bytes = div_factor(block_rsv->size, min_factor);
- if (min_reserved > num_bytes)
- num_bytes = min_reserved;
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ spin_unlock(&block_rsv->lock);
- if (block_rsv->reserved >= num_bytes) {
+ return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = min_reserved;
+ if (block_rsv->reserved >= num_bytes)
ret = 0;
- } else {
+ else
num_bytes -= block_rsv->reserved;
- if (block_rsv->durable &&
- block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
- commit_trans = 1;
- }
spin_unlock(&block_rsv->lock);
+
if (!ret)
return 0;
- if (block_rsv->refill_used) {
- ret = reserve_metadata_bytes(trans, root, block_rsv,
- num_bytes, 0);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
- return 0;
- }
- }
-
- if (commit_trans) {
- if (trans)
- return -EAGAIN;
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_commit_transaction(trans, root);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
return 0;
}
- return -ENOSPC;
+ return ret;
}
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3909,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
u64 num_bytes;
u64 meta_used;
u64 data_used;
- int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ int csum_size = btrfs_super_csum_size(fs_info->super_copy);
sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
spin_lock(&sinfo->lock);
@@ -3827,12 +3953,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (sinfo->total_bytes > num_bytes) {
num_bytes = sinfo->total_bytes - num_bytes;
block_rsv->reserved += num_bytes;
- sinfo->bytes_reserved += num_bytes;
+ sinfo->bytes_may_use += num_bytes;
}
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- sinfo->bytes_reserved -= num_bytes;
+ sinfo->bytes_may_use -= num_bytes;
sinfo->reservation_progress++;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
@@ -3848,16 +3974,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
- fs_info->chunk_block_rsv.priority = 10;
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
- fs_info->global_block_rsv.priority = 10;
- fs_info->global_block_rsv.refill_used = 1;
fs_info->delalloc_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
- fs_info->empty_block_rsv.priority = 10;
+ fs_info->delayed_block_rsv.space_info = space_info;
fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +3988,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
- btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
- btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
update_global_block_rsv(fs_info);
}
@@ -3881,37 +4000,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
-}
-
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *rsv)
-{
- struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
- u64 num_bytes;
- int ret;
-
- /*
- * Truncate should be freeing data, but give us 2 items just in case it
- * needs to use some space. We may want to be smarter about this in the
- * future.
- */
- num_bytes = btrfs_calc_trans_metadata_size(root, 2);
-
- /* We already have enough bytes, just return */
- if (rsv->reserved >= num_bytes)
- return 0;
-
- num_bytes -= rsv->reserved;
-
- /*
- * You should have reserved enough space before hand to do this, so this
- * should not fail.
- */
- ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
- BUG_ON(ret);
-
- return 0;
+ WARN_ON(fs_info->delayed_block_rsv.size > 0);
+ WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4010,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
if (!trans->bytes_reserved)
return;
- BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
- btrfs_block_rsv_release(root, trans->block_rsv,
- trans->bytes_reserved);
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
trans->bytes_reserved = 0;
}
@@ -3964,11 +4052,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written. This will return the number of
+ * reserved extents that need to be freed. This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
static unsigned drop_outstanding_extent(struct inode *inode)
{
unsigned dropped_extents = 0;
- spin_lock(&BTRFS_I(inode)->lock);
BUG_ON(!BTRFS_I(inode)->outstanding_extents);
BTRFS_I(inode)->outstanding_extents--;
@@ -3978,19 +4074,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
*/
if (BTRFS_I(inode)->outstanding_extents >=
BTRFS_I(inode)->reserved_extents)
- goto out;
+ return 0;
dropped_extents = BTRFS_I(inode)->reserved_extents -
BTRFS_I(inode)->outstanding_extents;
BTRFS_I(inode)->reserved_extents -= dropped_extents;
-out:
- spin_unlock(&BTRFS_I(inode)->lock);
return dropped_extents;
}
-static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+/**
+ * calc_csum_metadata_size - return the amount of metada space that must be
+ * reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed. We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksumed by this value to figure out
+ * how many checksums will be required. If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved. If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+ int reserve)
{
- return num_bytes >>= 3;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 csum_size;
+ int num_csums_per_leaf;
+ int num_csums;
+ int old_csums;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+ BTRFS_I(inode)->csum_bytes == 0)
+ return 0;
+
+ old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ if (reserve)
+ BTRFS_I(inode)->csum_bytes += num_bytes;
+ else
+ BTRFS_I(inode)->csum_bytes -= num_bytes;
+ csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ num_csums_per_leaf = (int)div64_u64(csum_size,
+ sizeof(struct btrfs_csum_item) +
+ sizeof(struct btrfs_disk_key));
+ num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ num_csums = num_csums + num_csums_per_leaf - 1;
+ num_csums = num_csums / num_csums_per_leaf;
+
+ old_csums = old_csums + num_csums_per_leaf - 1;
+ old_csums = old_csums / num_csums_per_leaf;
+
+ /* No change, no need to reserve more */
+ if (old_csums == num_csums)
+ return 0;
+
+ if (reserve)
+ return btrfs_calc_trans_metadata_size(root,
+ num_csums - old_csums);
+
+ return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
}
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4146,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
u64 to_reserve = 0;
unsigned nr_extents = 0;
+ int flush = 1;
int ret;
- if (btrfs_transaction_in_commit(root->fs_info))
+ if (btrfs_is_free_space_inode(root, inode))
+ flush = 0;
+
+ if (flush && btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4017,18 +4168,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
}
+ to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
spin_unlock(&BTRFS_I(inode)->lock);
- to_reserve += calc_csum_metadata_size(inode, num_bytes);
- ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (ret) {
+ u64 to_free = 0;
unsigned dropped;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
/*
- * We don't need the return value since our reservation failed,
- * we just need to clean up our counter.
+ * Somebody could have come in and twiddled with the
+ * reservation, so if we have to free more than we would have
+ * reserved from this reservation go ahead and release those
+ * bytes.
*/
- dropped = drop_outstanding_extent(inode);
- WARN_ON(dropped > 1);
+ to_free -= to_reserve;
+ if (to_free)
+ btrfs_block_rsv_release(root, block_rsv, to_free);
return ret;
}
@@ -4037,6 +4199,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
return 0;
}
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode. This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4215,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
unsigned dropped;
num_bytes = ALIGN(num_bytes, root->sectorsize);
+ spin_lock(&BTRFS_I(inode)->lock);
dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes);
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4054,6 +4227,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
to_free);
}
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ * extents and how much csums will be needed
+ * o add to the inodes ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
{
int ret;
@@ -4071,6 +4259,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
return 0;
}
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space. This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore. So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
{
btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4291,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
/* block accounting for super block */
spin_lock(&info->delalloc_lock);
- old_val = btrfs_super_bytes_used(&info->super_copy);
+ old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
- btrfs_set_super_bytes_used(&info->super_copy, old_val);
+ btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_lock);
while (total) {
@@ -4123,7 +4324,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
@@ -4135,7 +4336,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->reservation_progress++;
cache->space_info->bytes_used += num_bytes;
cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
@@ -4187,7 +4387,6 @@ static int pin_down_extent(struct btrfs_root *root,
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->reservation_progress++;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4414,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
}
/*
- * update size of reserved extents. this function may return -EAGAIN
- * if 'reserve' is true or 'sinfo' is false.
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache);
+
+ /*
+ * pull in the free space cache (if any) so that our pin
+ * removes the free space from the cache. We have load_only set
+ * to one because the slow code to read in the free extents does check
+ * the pinned extents.
+ */
+ cache_block_group(cache, trans, root, 1);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+ /* remove us from the free space cache (if we're there at all) */
+ btrfs_remove_free_space(cache, bytenr, num_bytes);
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @reserve: One of the reservation enums
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk. For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting. For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree. We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
*/
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo)
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve)
{
+ struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
- if (sinfo) {
- struct btrfs_space_info *space_info = cache->space_info;
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- if (cache->ro) {
- ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- }
- } else {
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->reservation_progress++;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- spin_lock(&cache->lock);
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve != RESERVE_FREE) {
if (cache->ro) {
ret = -EAGAIN;
} else {
- if (reserve)
- cache->reserved += num_bytes;
- else
- cache->reserved -= num_bytes;
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ if (reserve == RESERVE_ALLOC) {
+ BUG_ON(space_info->bytes_may_use < num_bytes);
+ space_info->bytes_may_use -= num_bytes;
+ }
}
- spin_unlock(&cache->lock);
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->reservation_progress++;
}
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
return ret;
}
@@ -4319,13 +4555,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
spin_lock(&cache->lock);
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
- if (cache->ro) {
+ if (cache->ro)
cache->space_info->bytes_readonly += len;
- } else if (cache->reserved_pinned > 0) {
- len = min(len, cache->reserved_pinned);
- cache->reserved_pinned -= len;
- cache->space_info->bytes_reserved += len;
- }
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
}
@@ -4340,11 +4571,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
- struct btrfs_block_rsv *block_rsv;
- struct btrfs_block_rsv *next_rsv;
u64 start;
u64 end;
- int idx;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4595,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
- mutex_lock(&fs_info->durable_block_rsv_mutex);
- list_for_each_entry_safe(block_rsv, next_rsv,
- &fs_info->durable_block_rsv_list, list) {
-
- idx = trans->transid & 0x1;
- if (block_rsv->freed[idx] > 0) {
- block_rsv_add_bytes(block_rsv,
- block_rsv->freed[idx], 0);
- block_rsv->freed[idx] = 0;
- }
- if (atomic_read(&block_rsv->usage) == 0) {
- btrfs_block_rsv_release(root, block_rsv, (u64)-1);
-
- if (block_rsv->freed[0] == 0 &&
- block_rsv->freed[1] == 0) {
- list_del_init(&block_rsv->list);
- kfree(block_rsv);
- }
- } else {
- btrfs_block_rsv_release(root, block_rsv, 0);
- }
- }
- mutex_unlock(&fs_info->durable_block_rsv_mutex);
-
return 0;
}
@@ -4668,7 +4872,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_block_rsv *block_rsv;
struct btrfs_block_group_cache *cache = NULL;
int ret;
@@ -4683,64 +4886,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (!last_ref)
return;
- block_rsv = get_block_rsv(trans, root);
cache = btrfs_lookup_block_group(root->fs_info, buf->start);
- if (block_rsv->space_info != cache->space_info)
- goto out;
if (btrfs_header_generation(buf) == trans->transid) {
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, root, buf->start);
if (!ret)
- goto pin;
+ goto out;
}
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(root, cache, buf->start, buf->len, 1);
- goto pin;
+ goto out;
}
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
- if (ret == -EAGAIN) {
- /* block group became read-only */
- btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
- goto out;
- }
-
- ret = 1;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size) {
- block_rsv->reserved += buf->len;
- ret = 0;
- }
- spin_unlock(&block_rsv->lock);
-
- if (ret) {
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_reserved -= buf->len;
- cache->space_info->reservation_progress++;
- spin_unlock(&cache->space_info->lock);
- }
- goto out;
- }
-pin:
- if (block_rsv->durable && !cache->ro) {
- ret = 0;
- spin_lock(&cache->lock);
- if (!cache->ro) {
- cache->reserved_pinned += buf->len;
- ret = 1;
- }
- spin_unlock(&cache->lock);
-
- if (ret) {
- spin_lock(&block_rsv->lock);
- block_rsv->freed[trans->transid & 0x1] += buf->len;
- spin_unlock(&block_rsv->lock);
- }
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
}
out:
/*
@@ -4883,10 +5046,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int last_ptr_loop = 0;
int loop = 0;
int index = 0;
+ int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
+ bool have_caching_bg = false;
u64 ideal_cache_percent = 0;
u64 ideal_cache_offset = 0;
@@ -4969,6 +5135,7 @@ ideal_cache:
}
}
search:
+ have_caching_bg = false;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
@@ -5177,6 +5344,8 @@ refill_cluster:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
+ if (!cached)
+ have_caching_bg = true;
goto loop;
}
checks:
@@ -5202,8 +5371,8 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
- (data & BTRFS_BLOCK_GROUP_DATA));
+ ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+ alloc_type);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -5227,6 +5396,9 @@ loop:
}
up_read(&space_info->groups_sem);
+ if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+ goto search;
+
if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
goto search;
@@ -5325,7 +5497,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int index = 0;
spin_lock(&info->lock);
- printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+ printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
+ (unsigned long long)info->flags,
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved -
info->bytes_readonly),
@@ -5411,7 +5584,8 @@ again:
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len, int pin)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -5426,8 +5600,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
if (btrfs_test_opt(root, DISCARD))
ret = btrfs_discard_extent(root, start, len, NULL);
- btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, 0, 1);
+ if (pin)
+ pin_down_extent(root, cache, start, len, 1);
+ else {
+ btrfs_add_free_space(cache, start, len);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ }
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5613,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
return ret;
}
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 0);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 1);
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -5630,7 +5820,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
put_caching_control(caching_ctl);
}
- ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
+ ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+ RESERVE_ALLOC_NO_ACCOUNT);
BUG_ON(ret);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5878,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(trans, root, block_rsv,
- blocksize, 0);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
@@ -5708,13 +5898,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
if (!ret)
return block_rsv;
if (ret) {
- WARN_ON(1);
- ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
- 0);
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ /*DEFAULT_RATELIMIT_BURST*/ 2);
+ if (__ratelimit(&_rs)) {
+ printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
+ WARN_ON(1);
+ }
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
if (!ret) {
- spin_lock(&block_rsv->lock);
- block_rsv->size += blocksize;
- spin_unlock(&block_rsv->lock);
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6784,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
cache->bytes_super - btrfs_block_group_used(&cache->item);
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
- sinfo->bytes_may_use + sinfo->bytes_readonly +
- cache->reserved_pinned + num_bytes + min_allocable_bytes <=
- sinfo->total_bytes) {
+ sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+ min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- sinfo->bytes_reserved += cache->reserved_pinned;
- cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
@@ -6964,7 +7153,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_space_info,
list);
if (space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0) {
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0) {
WARN_ON(1);
dump_space_info(space_info, 0, 0);
}
@@ -7006,14 +7196,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
return -ENOMEM;
path->reada = 1;
- cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
- if (cache_gen != 0 &&
- btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
+ btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
need_clear = 1;
if (btrfs_test_opt(root, CLEAR_CACHE))
need_clear = 1;
- if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
- printk(KERN_INFO "btrfs: disk space caching is enabled\n");
while (1) {
ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7440,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
}
- inode = lookup_free_space_inode(root, block_group, path);
+ inode = lookup_free_space_inode(tree_root, block_group, path);
if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
BUG_ON(ret);
@@ -7268,7 +7456,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
}
/* One for our lookup ref */
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7527,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
int mixed = 0;
int ret;
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 624ef10d36cc..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -895,6 +895,194 @@ search_again:
goto again;
}
+/**
+ * convert_extent - convert all bits in a given range from one bit to another
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @mask: the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+ err = insert_state(tree, prealloc, start, end, &bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ struct rb_node *next_node;
+
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+
+ start = last_end + 1;
+ next_node = rb_next(&state->rb_node);
+ if (next_node && start < end && prealloc && !need_resched()) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ clear_state_bit(tree, state, &clear_bits, 0);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ &bits);
+ BUG_ON(err == -EEXIST);
+ if (err) {
+ free_extent_state(prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ return -ENOMEM;
+
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ set_state_bits(tree, prealloc, &bits);
+ clear_state_bit(tree, prealloc, &clear_bits, 0);
+
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
@@ -920,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
0, NULL, cached_state, mask);
}
@@ -2102,7 +2290,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (tree->ops && tree->ops->readpage_io_failed_hook)
ret = tree->ops->readpage_io_failed_hook(
bio, page, start, end,
- failed_mirror, NULL);
+ failed_mirror, state);
else
ret = bio_readpage_error(bio, page, start, end,
failed_mirror, NULL);
@@ -2511,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
int compressed;
int write_flags;
unsigned long nr_written = 0;
+ bool fill_delalloc = true;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC;
@@ -2520,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
@@ -2541,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
+ if (!tree->ops || !tree->ops->fill_delalloc)
+ fill_delalloc = false;
+
delalloc_start = start;
delalloc_end = 0;
page_started = 0;
- if (!epd->extent_locked) {
+ if (!epd->extent_locked && fill_delalloc) {
u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
@@ -2796,10 +2991,16 @@ retry:
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops && tree->ops->write_cache_pages_lock_hook)
- tree->ops->write_cache_pages_lock_hook(page);
- else
- lock_page(page);
+ if (tree->ops &&
+ tree->ops->write_cache_pages_lock_hook) {
+ tree->ops->write_cache_pages_lock_hook(page,
+ data, flush_fn);
+ } else {
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
+ }
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
@@ -3579,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
+ ClearPageError(page);
unlock_page(page);
}
return 0;
@@ -3724,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- u64 start, int wait,
+ struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
@@ -3761,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!wait) {
+ if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -3805,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
if (bio)
submit_one_bio(READ, bio, mirror_num, bio_flags);
- if (ret || !wait)
+ if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a8e20b672922..feb9be0e23bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,7 +17,8 @@
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_FIRST_DELALLOC (1 << 12)
-#define EXTENT_DAMAGED (1 << 13)
+#define EXTENT_NEED_WAIT (1 << 13)
+#define EXTENT_DAMAGED (1 << 14)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -33,6 +34,7 @@
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -86,7 +88,8 @@ struct extent_io_ops {
struct extent_state *other);
void (*split_extent_hook)(struct inode *inode,
struct extent_state *orig, u64 split);
- int (*write_cache_pages_lock_hook)(struct page *page);
+ int (*write_cache_pages_lock_hook)(struct page *page, void *data,
+ void (*flush_fn)(void *));
};
struct extent_io_tree {
@@ -215,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int clear_bits, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -249,6 +254,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
+#define WAIT_NONE 0
+#define WAIT_COMPLETE 1
+#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 item_last_offset = 0;
u64 disk_bytenr;
u32 diff;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int ret;
struct btrfs_path *path;
struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_sector_sum *sector_sum;
u32 nritems;
u32 ins_size;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a381cd22f518..f2e928289600 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1036,11 +1036,13 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos)
+static int prepare_uptodate_page(struct page *page, u64 pos,
+ bool force_uptodate)
{
int ret = 0;
- if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+ if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+ !PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
if (ret)
return ret;
@@ -1061,12 +1063,13 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
struct page **pages, size_t num_pages,
loff_t pos, unsigned long first_index,
- size_t write_bytes)
+ size_t write_bytes, bool force_uptodate)
{
struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = fdentry(file)->d_inode;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili = 0;
u64 start_pos;
@@ -1078,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
again:
for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i,
- GFP_NOFS);
+ mask);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@@ -1086,10 +1089,11 @@ again:
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos);
+ err = prepare_uptodate_page(pages[i], pos,
+ force_uptodate);
if (i == num_pages - 1)
err = prepare_uptodate_page(pages[i],
- pos + write_bytes);
+ pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
faili = i - 1;
@@ -1158,6 +1162,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
size_t num_written = 0;
int nrptrs;
int ret = 0;
+ bool force_page_uptodate = false;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1200,7 +1205,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* contents of pages from loop to loop
*/
ret = prepare_pages(root, file, pages, num_pages,
- pos, first_index, write_bytes);
+ pos, first_index, write_bytes,
+ force_page_uptodate);
if (ret) {
btrfs_delalloc_release_space(inode,
num_pages << PAGE_CACHE_SHIFT);
@@ -1217,12 +1223,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (copied < write_bytes)
nrptrs = 1;
- if (copied == 0)
+ if (copied == 0) {
+ force_page_uptodate = true;
dirty_pages = 0;
- else
+ } else {
+ force_page_uptodate = false;
dirty_pages = (copied + offset +
PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
+ }
/*
* If we had a short copy we need to release the excess delaloc
@@ -1607,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
- if (ret)
- goto out;
-
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -1656,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+
+ /*
+ * Make sure we have enough space before we do the
+ * allocation.
+ */
+ ret = btrfs_check_data_free_space(inode, last_byte -
+ cur_offset);
+ if (ret) {
+ free_extent_map(em);
+ break;
+ }
+
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
+
+ /* Let go of our reservation. */
+ btrfs_free_reserved_data_space(inode, last_byte -
+ cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
@@ -1686,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
-
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..7a15fcfb3e1f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
+#include <linux/ratelimit.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
*block_group, struct btrfs_path *path)
{
struct inode *inode = NULL;
+ u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
spin_lock(&block_group->lock);
if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return inode;
spin_lock(&block_group->lock);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
+ if (!((BTRFS_I(inode)->flags & flags) == flags)) {
printk(KERN_INFO "Old style space inode found, converting.\n");
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+ BTRFS_INODE_NODATACOW;
block_group->disk_cache_state = BTRFS_DC_CLEAR;
}
- if (!btrfs_fs_closing(root->fs_info)) {
+ if (!block_group->iref) {
block_group->inode = igrab(inode);
block_group->iref = 1;
}
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
+ /* We inline crc's for the free disk space cache */
+ if (ino != BTRFS_FREE_INO_OBJECTID)
+ flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_uid(leaf, inode_item, 0);
btrfs_set_inode_gid(leaf, inode_item, 0);
btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
- BTRFS_INODE_PREALLOC);
+ btrfs_set_inode_flags(leaf, inode_item, flags);
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct inode *inode)
{
struct btrfs_block_rsv *rsv;
+ u64 needed_bytes;
loff_t oldsize;
int ret = 0;
rsv = trans->block_rsv;
- trans->block_rsv = root->orphan_block_rsv;
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv,
- 0, 5);
- if (ret)
- return ret;
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+
+ /* 1 for slack space, 1 for updating the inode */
+ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
+ spin_lock(&trans->block_rsv->lock);
+ if (trans->block_rsv->reserved < needed_bytes) {
+ spin_unlock(&trans->block_rsv->lock);
+ trans->block_rsv = rsv;
+ return -ENOSPC;
+ }
+ spin_unlock(&trans->block_rsv->lock);
oldsize = i_size_read(inode);
btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
- trans->block_rsv = rsv;
if (ret) {
+ trans->block_rsv = rsv;
WARN_ON(1);
return ret;
}
ret = btrfs_update_inode(trans, root, inode);
+ trans->block_rsv = rsv;
+
return ret;
}
@@ -242,26 +259,342 @@ static int readahead_cache(struct inode *inode)
return 0;
}
+struct io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_root *root;
+ unsigned long size;
+ int index;
+ int num_pages;
+ unsigned check_crcs:1;
+};
+
+static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+ struct btrfs_root *root)
+{
+ memset(io_ctl, 0, sizeof(struct io_ctl));
+ io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
+ GFP_NOFS);
+ if (!io_ctl->pages)
+ return -ENOMEM;
+ io_ctl->root = root;
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ io_ctl->check_crcs = 1;
+ return 0;
+}
+
+static void io_ctl_free(struct io_ctl *io_ctl)
+{
+ kfree(io_ctl->pages);
+}
+
+static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+{
+ if (io_ctl->cur) {
+ kunmap(io_ctl->page);
+ io_ctl->cur = NULL;
+ io_ctl->orig = NULL;
+ }
+}
+
+static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+{
+ WARN_ON(io_ctl->cur);
+ BUG_ON(io_ctl->index >= io_ctl->num_pages);
+ io_ctl->page = io_ctl->pages[io_ctl->index++];
+ io_ctl->cur = kmap(io_ctl->page);
+ io_ctl->orig = io_ctl->cur;
+ io_ctl->size = PAGE_CACHE_SIZE;
+ if (clear)
+ memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+}
+
+static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+{
+ int i;
+
+ io_ctl_unmap_page(io_ctl);
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
+}
+
+static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+ int uptodate)
+{
+ struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int i;
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ page = find_or_create_page(inode->i_mapping, i, mask);
+ if (!page) {
+ io_ctl_drop_pages(io_ctl);
+ return -ENOMEM;
+ }
+ io_ctl->pages[i] = page;
+ if (uptodate && !PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ printk(KERN_ERR "btrfs: error reading free "
+ "space cache\n");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+{
+ u64 *val;
+
+ io_ctl_map_page(io_ctl, 1);
+
+ /*
+ * Skip the csum areas. If we don't check crcs then we just have a
+ * 64bit chunk at the front of the first page.
+ */
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ val = io_ctl->cur;
+ *val = cpu_to_le64(generation);
+ io_ctl->cur += sizeof(u64);
+}
+
+static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+{
+ u64 *gen;
+
+ /*
+ * Skip the crc area. If we don't check crcs then we just have a 64bit
+ * chunk at the front of the first page.
+ */
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) +
+ (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ gen = io_ctl->cur;
+ if (le64_to_cpu(*gen) != generation) {
+ printk_ratelimited(KERN_ERR "btrfs: space cache generation "
+ "(%Lu) does not match inode (%Lu)\n", *gen,
+ generation);
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+ io_ctl->cur += sizeof(u64);
+ return 0;
+}
+
+static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+{
+ u32 *tmp;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_unmap_page(io_ctl);
+ return;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;;
+
+ crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ io_ctl_unmap_page(io_ctl);
+ tmp = kmap(io_ctl->pages[0]);
+ tmp += index;
+ *tmp = crc;
+ kunmap(io_ctl->pages[0]);
+}
+
+static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+{
+ u32 *tmp, val;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ tmp = kmap(io_ctl->pages[0]);
+ tmp += index;
+ val = *tmp;
+ kunmap(io_ctl->pages[0]);
+
+ io_ctl_map_page(io_ctl, 0);
+ crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ if (val != crc) {
+ printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
+ "space cache\n");
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+ void *bitmap)
+{
+ struct btrfs_free_space_entry *entry;
+
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ entry = io_ctl->cur;
+ entry->offset = cpu_to_le64(offset);
+ entry->bytes = cpu_to_le64(bytes);
+ entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+ BTRFS_FREE_SPACE_EXTENT;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+
+ /* No more pages to map */
+ if (io_ctl->index >= io_ctl->num_pages)
+ return 0;
+
+ /* map the next page */
+ io_ctl_map_page(io_ctl, 1);
+ return 0;
+}
+
+static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+{
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ /*
+ * If we aren't at the start of the current page, unmap this one and
+ * map the next one if there is any left.
+ */
+ if (io_ctl->cur != io_ctl->orig) {
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index >= io_ctl->num_pages)
+ return -ENOSPC;
+ io_ctl_map_page(io_ctl, 0);
+ }
+
+ memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index < io_ctl->num_pages)
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+{
+ /*
+ * If we're not on the boundary we know we've modified the page and we
+ * need to crc the page.
+ */
+ if (io_ctl->cur != io_ctl->orig)
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ else
+ io_ctl_unmap_page(io_ctl);
+
+ while (io_ctl->index < io_ctl->num_pages) {
+ io_ctl_map_page(io_ctl, 1);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ }
+}
+
+static int io_ctl_read_entry(struct io_ctl *io_ctl,
+ struct btrfs_free_space *entry, u8 *type)
+{
+ struct btrfs_free_space_entry *e;
+
+ e = io_ctl->cur;
+ entry->offset = le64_to_cpu(e->offset);
+ entry->bytes = le64_to_cpu(e->bytes);
+ *type = e->type;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_unmap_page(io_ctl);
+
+ if (io_ctl->index >= io_ctl->num_pages)
+ return 0;
+
+ return io_ctl_check_crc(io_ctl, io_ctl->index);
+}
+
+static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+ struct btrfs_free_space *entry)
+{
+ int ret;
+
+ if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
+ io_ctl_unmap_page(io_ctl);
+
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+
+ memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct page *page;
+ struct io_ctl io_ctl;
struct btrfs_key key;
+ struct btrfs_free_space *e, *n;
struct list_head bitmaps;
u64 num_entries;
u64 num_bitmaps;
u64 generation;
- pgoff_t index = 0;
+ u8 type;
int ret = 0;
INIT_LIST_HEAD(&bitmaps);
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
- goto out;
+ return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
@@ -269,11 +602,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return 0;
else if (ret > 0) {
btrfs_release_path(path);
- ret = 0;
- goto out;
+ return 0;
}
ret = -1;
@@ -291,169 +623,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
" not match free space cache generation (%llu)\n",
(unsigned long long)BTRFS_I(inode)->generation,
(unsigned long long)generation);
- goto out;
+ return 0;
}
if (!num_entries)
- goto out;
+ return 0;
+ io_ctl_init(&io_ctl, inode, root);
ret = readahead_cache(inode);
if (ret)
goto out;
- while (1) {
- struct btrfs_free_space_entry *entry;
- struct btrfs_free_space *e;
- void *addr;
- unsigned long offset = 0;
- int need_loop = 0;
+ ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ if (ret)
+ goto out;
- if (!num_entries && !num_bitmaps)
- break;
+ ret = io_ctl_check_crc(&io_ctl, 0);
+ if (ret)
+ goto free_cache;
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
- if (!page)
+ ret = io_ctl_check_generation(&io_ctl, generation);
+ if (ret)
+ goto free_cache;
+
+ while (num_entries) {
+ e = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!e)
goto free_cache;
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- printk(KERN_ERR "btrfs: error reading free "
- "space cache\n");
- goto free_cache;
- }
+ ret = io_ctl_read_entry(&io_ctl, e, &type);
+ if (ret) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
}
- addr = kmap(page);
- if (index == 0) {
- u64 *gen;
+ if (!e->bytes) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
- /*
- * We put a bogus crc in the front of the first page in
- * case old kernels try to mount a fs with the new
- * format to make sure they discard the cache.
- */
- addr += sizeof(u64);
- offset += sizeof(u64);
-
- gen = addr;
- if (*gen != BTRFS_I(inode)->generation) {
- printk(KERN_ERR "btrfs: space cache generation"
- " (%llu) does not match inode (%llu)\n",
- (unsigned long long)*gen,
- (unsigned long long)
- BTRFS_I(inode)->generation);
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
+ if (type == BTRFS_FREE_SPACE_EXTENT) {
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ printk(KERN_ERR "Duplicate entries in "
+ "free space cache, dumping\n");
+ kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
- addr += sizeof(u64);
- offset += sizeof(u64);
- }
- entry = addr;
-
- while (1) {
- if (!num_entries)
- break;
-
- need_loop = 1;
- e = kmem_cache_zalloc(btrfs_free_space_cachep,
- GFP_NOFS);
- if (!e) {
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
+ } else {
+ BUG_ON(!num_bitmaps);
+ num_bitmaps--;
+ e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!e->bitmap) {
+ kmem_cache_free(
+ btrfs_free_space_cachep, e);
goto free_cache;
}
-
- e->offset = le64_to_cpu(entry->offset);
- e->bytes = le64_to_cpu(entry->bytes);
- if (!e->bytes) {
- kunmap(page);
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ printk(KERN_ERR "Duplicate entries in "
+ "free space cache, dumping\n");
kmem_cache_free(btrfs_free_space_cachep, e);
- unlock_page(page);
- page_cache_release(page);
goto free_cache;
}
-
- if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
- spin_lock(&ctl->tree_lock);
- ret = link_free_space(ctl, e);
- spin_unlock(&ctl->tree_lock);
- if (ret) {
- printk(KERN_ERR "Duplicate entries in "
- "free space cache, dumping\n");
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- } else {
- e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!e->bitmap) {
- kunmap(page);
- kmem_cache_free(
- btrfs_free_space_cachep, e);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- spin_lock(&ctl->tree_lock);
- ret = link_free_space(ctl, e);
- ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
- spin_unlock(&ctl->tree_lock);
- if (ret) {
- printk(KERN_ERR "Duplicate entries in "
- "free space cache, dumping\n");
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- list_add_tail(&e->list, &bitmaps);
- }
-
- num_entries--;
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- break;
- entry++;
+ list_add_tail(&e->list, &bitmaps);
}
- /*
- * We read an entry out of this page, we need to move on to the
- * next page.
- */
- if (need_loop) {
- kunmap(page);
- goto next;
- }
+ num_entries--;
+ }
- /*
- * We add the bitmaps at the end of the entries in order that
- * the bitmap entries are added to the cache.
- */
- e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+ /*
+ * We add the bitmaps at the end of the entries in order that
+ * the bitmap entries are added to the cache.
+ */
+ list_for_each_entry_safe(e, n, &bitmaps, list) {
list_del_init(&e->list);
- memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
- kunmap(page);
- num_bitmaps--;
-next:
- unlock_page(page);
- page_cache_release(page);
- index++;
+ ret = io_ctl_read_bitmap(&io_ctl, e);
+ if (ret)
+ goto free_cache;
}
+ io_ctl_drop_pages(&io_ctl);
ret = 1;
out:
+ io_ctl_free(&io_ctl);
return ret;
free_cache:
+ io_ctl_drop_pages(&io_ctl);
__btrfs_remove_free_space_cache(ctl);
goto out;
}
@@ -465,7 +728,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode;
struct btrfs_path *path;
- int ret;
+ int ret = 0;
bool matched;
u64 used = btrfs_block_group_used(&block_group->item);
@@ -497,6 +760,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
return 0;
}
+ /* We may have converted the inode and made the cache invalid. */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+
ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
path, block_group->key.objectid);
btrfs_free_path(path);
@@ -530,6 +801,19 @@ out:
return ret;
}
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successfull in writing the cache out,
+ * and -1 if it was not.
+ */
int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
@@ -540,42 +824,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct extent_buffer *leaf;
struct rb_node *node;
struct list_head *pos, *n;
- struct page **pages;
- struct page *page;
struct extent_state *cached_state = NULL;
struct btrfs_free_cluster *cluster = NULL;
struct extent_io_tree *unpin = NULL;
+ struct io_ctl io_ctl;
struct list_head bitmap_list;
struct btrfs_key key;
u64 start, end, len;
- u64 bytes = 0;
- u32 crc = ~(u32)0;
- int index = 0, num_pages = 0;
int entries = 0;
int bitmaps = 0;
- int ret = -1;
- bool next_page = false;
- bool out_of_space = false;
+ int ret;
+ int err = -1;
INIT_LIST_HEAD(&bitmap_list);
- node = rb_first(&ctl->free_space_offset);
- if (!node)
- return 0;
-
if (!i_size_read(inode))
return -1;
- num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
-
- filemap_write_and_wait(inode->i_mapping);
- btrfs_wait_ordered_range(inode, inode->i_size &
- ~(root->sectorsize - 1), (u64)-1);
-
- pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
- if (!pages)
- return -1;
+ io_ctl_init(&io_ctl, inode, root);
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
*/
unpin = root->fs_info->pinned_extents;
- /*
- * Lock all pages first so we can lock the extent safely.
- *
- * NOTE: Because we hold the ref the entire time we're going to write to
- * the page find_get_page should never fail, so we don't do a check
- * after find_get_page at this point. Just putting this here so people
- * know and don't freak out.
- */
- while (index < num_pages) {
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
- if (!page) {
- int i;
-
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- page_cache_release(pages[i]);
- }
- goto out;
- }
- pages[index] = page;
- index++;
- }
+ /* Lock all pages first so we can lock the extent safely. */
+ io_ctl_prepare_pages(&io_ctl, inode, 0);
- index = 0;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state, GFP_NOFS);
@@ -623,189 +868,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (block_group)
start = block_group->key.objectid;
- /* Write out the extent entries */
- do {
- struct btrfs_free_space_entry *entry;
- void *addr, *orig;
- unsigned long offset = 0;
+ node = rb_first(&ctl->free_space_offset);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
- next_page = false;
+ /* Make sure we can fit our crcs into the first page */
+ if (io_ctl.check_crcs &&
+ (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
+ WARN_ON(1);
+ goto out_nospc;
+ }
- if (index >= num_pages) {
- out_of_space = true;
- break;
- }
+ io_ctl_set_generation(&io_ctl, trans->transid);
- page = pages[index];
+ /* Write out the extent entries */
+ while (node) {
+ struct btrfs_free_space *e;
- orig = addr = kmap(page);
- if (index == 0) {
- u64 *gen;
+ e = rb_entry(node, struct btrfs_free_space, offset_index);
+ entries++;
- /*
- * We're going to put in a bogus crc for this page to
- * make sure that old kernels who aren't aware of this
- * format will be sure to discard the cache.
- */
- addr += sizeof(u64);
- offset += sizeof(u64);
+ ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
+ e->bitmap);
+ if (ret)
+ goto out_nospc;
- gen = addr;
- *gen = trans->transid;
- addr += sizeof(u64);
- offset += sizeof(u64);
+ if (e->bitmap) {
+ list_add_tail(&e->list, &bitmap_list);
+ bitmaps++;
}
- entry = addr;
-
- memset(addr, 0, PAGE_CACHE_SIZE - offset);
- while (node && !next_page) {
- struct btrfs_free_space *e;
-
- e = rb_entry(node, struct btrfs_free_space, offset_index);
- entries++;
-
- entry->offset = cpu_to_le64(e->offset);
- entry->bytes = cpu_to_le64(e->bytes);
- if (e->bitmap) {
- entry->type = BTRFS_FREE_SPACE_BITMAP;
- list_add_tail(&e->list, &bitmap_list);
- bitmaps++;
- } else {
- entry->type = BTRFS_FREE_SPACE_EXTENT;
- }
- node = rb_next(node);
- if (!node && cluster) {
- node = rb_first(&cluster->root);
- cluster = NULL;
- }
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- next_page = true;
- entry++;
+ node = rb_next(node);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
}
+ }
- /*
- * We want to add any pinned extents to our free space cache
- * so we don't leak the space
- */
- while (block_group && !next_page &&
- (start < block_group->key.objectid +
- block_group->key.offset)) {
- ret = find_first_extent_bit(unpin, start, &start, &end,
- EXTENT_DIRTY);
- if (ret) {
- ret = 0;
- break;
- }
-
- /* This pinned extent is out of our range */
- if (start >= block_group->key.objectid +
- block_group->key.offset)
- break;
-
- len = block_group->key.objectid +
- block_group->key.offset - start;
- len = min(len, end + 1 - start);
-
- entries++;
- entry->offset = cpu_to_le64(start);
- entry->bytes = cpu_to_le64(len);
- entry->type = BTRFS_FREE_SPACE_EXTENT;
-
- start = end + 1;
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- next_page = true;
- entry++;
+ /*
+ * We want to add any pinned extents to our free space cache
+ * so we don't leak the space
+ */
+ while (block_group && (start < block_group->key.objectid +
+ block_group->key.offset)) {
+ ret = find_first_extent_bit(unpin, start, &start, &end,
+ EXTENT_DIRTY);
+ if (ret) {
+ ret = 0;
+ break;
}
- /* Generate bogus crc value */
- if (index == 0) {
- u32 *tmp;
- crc = btrfs_csum_data(root, orig + sizeof(u64), crc,
- PAGE_CACHE_SIZE - sizeof(u64));
- btrfs_csum_final(crc, (char *)&crc);
- crc++;
- tmp = orig;
- *tmp = crc;
- }
+ /* This pinned extent is out of our range */
+ if (start >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
- kunmap(page);
+ len = block_group->key.objectid +
+ block_group->key.offset - start;
+ len = min(len, end + 1 - start);
- bytes += PAGE_CACHE_SIZE;
+ entries++;
+ ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+ if (ret)
+ goto out_nospc;
- index++;
- } while (node || next_page);
+ start = end + 1;
+ }
/* Write out the bitmaps */
list_for_each_safe(pos, n, &bitmap_list) {
- void *addr;
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
- if (index >= num_pages) {
- out_of_space = true;
- break;
- }
- page = pages[index];
-
- addr = kmap(page);
- memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
- kunmap(page);
- bytes += PAGE_CACHE_SIZE;
-
+ ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
+ if (ret)
+ goto out_nospc;
list_del_init(&entry->list);
- index++;
- }
-
- if (out_of_space) {
- btrfs_drop_pages(pages, num_pages);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state,
- GFP_NOFS);
- ret = 0;
- goto out;
}
/* Zero out the rest of the pages just to make sure */
- while (index < num_pages) {
- void *addr;
-
- page = pages[index];
- addr = kmap(page);
- memset(addr, 0, PAGE_CACHE_SIZE);
- kunmap(page);
- bytes += PAGE_CACHE_SIZE;
- index++;
- }
+ io_ctl_zero_remaining_pages(&io_ctl);
- ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
- bytes, &cached_state);
- btrfs_drop_pages(pages, num_pages);
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ io_ctl_drop_pages(&io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
- if (ret) {
- ret = 0;
+ if (ret)
goto out;
- }
- BTRFS_I(inode)->generation = trans->transid;
- filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ goto out;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
- ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- ret = -1;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
goto out;
}
leaf = path->nodes[0];
@@ -816,15 +983,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
- ret = -1;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL,
- GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
btrfs_release_path(path);
goto out;
}
}
+
+ BTRFS_I(inode)->generation = trans->transid;
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1001,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- ret = 1;
-
+ err = 0;
out:
- kfree(pages);
- if (ret != 1) {
- invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+ io_ctl_free(&io_ctl);
+ if (err) {
+ invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return ret;
+ return err;
+
+out_nospc:
+ list_for_each_safe(pos, n, &bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
+ }
+ io_ctl_drop_pages(&io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ goto out;
}
int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1047,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
path, block_group->key.objectid);
- if (ret < 0) {
+ if (ret) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
ret = 0;
-
+#ifdef DEBUG
printk(KERN_ERR "btrfs: failed to write free space cace "
"for block group %llu\n", block_group->key.objectid);
+#endif
}
iput(inode);
@@ -1701,6 +1880,7 @@ again:
ctl->total_bitmaps--;
}
kmem_cache_free(btrfs_free_space_cachep, info);
+ ret = 0;
goto out_lock;
}
@@ -1708,7 +1888,8 @@ again:
unlink_free_space(ctl, info);
info->offset += bytes;
info->bytes -= bytes;
- link_free_space(ctl, info);
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
goto out_lock;
}
@@ -2472,9 +2653,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
spin_unlock(&ctl->tree_lock);
if (bytes >= minlen) {
- int update_ret;
- update_ret = btrfs_update_reserved_bytes(block_group,
- bytes, 1, 1);
+ struct btrfs_space_info *space_info;
+ int update = 0;
+
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += bytes;
+ space_info->bytes_reserved += bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
ret = btrfs_error_discard_extent(fs_info->extent_root,
start,
@@ -2482,9 +2673,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
&actually_trimmed);
btrfs_add_free_space(block_group, start, bytes);
- if (!update_ret)
- btrfs_update_reserved_bytes(block_group,
- bytes, 0, 1);
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += bytes;
+ block_group->reserved -= bytes;
+ space_info->bytes_reserved -= bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
+ }
if (ret)
break;
@@ -2643,9 +2841,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
return 0;
ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
- if (ret < 0)
+ if (ret) {
+ btrfs_delalloc_release_metadata(inode, inode->i_size);
+#ifdef DEBUG
printk(KERN_ERR "btrfs: failed to write free ino cache "
"for root %llu\n", root->root_key.objectid);
+#endif
+ }
iput(inode);
return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_release_space(inode, prealloc);
goto out_put;
+ }
btrfs_free_reserved_data_space(inode, prealloc);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9327f45434e8..9d0eaa57d4ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,7 +393,10 @@ again:
(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
- BUG_ON(!pages);
+ if (!pages) {
+ /* just bail out to the uncompressed code */
+ goto cont;
+ }
if (BTRFS_I(inode)->force_compress)
compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
will_compress = 1;
}
}
+cont:
if (start == 0) {
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
@@ -820,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
}
BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(&root->fs_info->super_copy));
+ btrfs_super_total_bytes(root->fs_info->super_copy));
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1792,12 +1796,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
}
ret = 0;
out:
- if (nolock) {
- if (trans)
- btrfs_end_transaction_nolock(trans, root);
- } else {
+ if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
- if (trans)
+ if (trans) {
+ if (nolock)
+ btrfs_end_transaction_nolock(trans, root);
+ else
btrfs_end_transaction(trans, root);
}
@@ -1931,89 +1935,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
up_read(&root->fs_info->cleanup_work_sem);
}
-/*
- * calculate extra metadata reservation when snapshotting a subvolume
- * contains orphan files.
- */
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve)
-{
- struct btrfs_root *root;
- struct btrfs_block_rsv *block_rsv;
- u64 num_bytes;
- int index;
-
- root = pending->root;
- if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
- return;
-
- block_rsv = root->orphan_block_rsv;
-
- /* orphan block reservation for the snapshot */
- num_bytes = block_rsv->size;
-
- /*
- * after the snapshot is created, COWing tree blocks may use more
- * space than it frees. So we should make sure there is enough
- * reserved space.
- */
- index = trans->transid & 0x1;
- if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
- num_bytes += block_rsv->size -
- (block_rsv->reserved + block_rsv->freed[index]);
- }
-
- *bytes_to_reserve += num_bytes;
-}
-
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
-{
- struct btrfs_root *root = pending->root;
- struct btrfs_root *snap = pending->snap;
- struct btrfs_block_rsv *block_rsv;
- u64 num_bytes;
- int index;
- int ret;
-
- if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
- return;
-
- /* refill source subvolume's orphan block reservation */
- block_rsv = root->orphan_block_rsv;
- index = trans->transid & 0x1;
- if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
- num_bytes = block_rsv->size -
- (block_rsv->reserved + block_rsv->freed[index]);
- ret = btrfs_block_rsv_migrate(&pending->block_rsv,
- root->orphan_block_rsv,
- num_bytes);
- BUG_ON(ret);
- }
-
- /* setup orphan block reservation for the snapshot */
- block_rsv = btrfs_alloc_block_rsv(snap);
- BUG_ON(!block_rsv);
-
- btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
- snap->orphan_block_rsv = block_rsv;
-
- num_bytes = root->orphan_block_rsv->size;
- ret = btrfs_block_rsv_migrate(&pending->block_rsv,
- block_rsv, num_bytes);
- BUG_ON(ret);
-
-#if 0
- /* insert orphan item for the snapshot */
- WARN_ON(!root->orphan_item_inserted);
- ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
- snap->root_key.objectid);
- BUG_ON(ret);
- snap->orphan_item_inserted = 1;
-#endif
-}
-
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -2099,9 +2020,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
}
spin_unlock(&root->orphan_lock);
- if (block_rsv)
- btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2168,6 +2086,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
struct inode *inode;
+ u64 last_objectid = 0;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2219,41 +2138,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* crossing root thing. we store the inode number in the
* offset of the orphan item.
*/
+
+ if (found_key.offset == last_objectid) {
+ printk(KERN_ERR "btrfs: Error removing orphan entry, "
+ "stopping orphan cleanup\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ last_objectid = found_key.offset;
+
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
+ ret = PTR_RET(inode);
+ if (ret && ret != -ESTALE)
goto out;
- }
/*
- * add this inode to the orphan list so btrfs_orphan_del does
- * the proper thing when we hit it
- */
- spin_lock(&root->orphan_lock);
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->orphan_lock);
-
- /*
- * if this is a bad inode, means we actually succeeded in
- * removing the inode, but not the orphan record, which means
- * we need to manually delete the orphan since iput will just
- * do a destroy_inode
+ * Inode is already gone but the orphan item is still there,
+ * kill the orphan item.
*/
- if (is_bad_inode(inode)) {
- trans = btrfs_start_transaction(root, 0);
+ if (ret == -ESTALE) {
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
- btrfs_orphan_del(trans, inode);
+ ret = btrfs_del_orphan_item(trans, root,
+ found_key.objectid);
+ BUG_ON(ret);
btrfs_end_transaction(trans, root);
- iput(inode);
continue;
}
+ /*
+ * add this inode to the orphan list so btrfs_orphan_del does
+ * the proper thing when we hit it
+ */
+ spin_lock(&root->orphan_lock);
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ spin_unlock(&root->orphan_lock);
+
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
if (!S_ISREG(inode->i_mode)) {
@@ -2687,7 +2614,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
u64 ino = btrfs_ino(inode);
u64 dir_ino = btrfs_ino(dir);
- trans = btrfs_start_transaction(root, 10);
+ /*
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode ref in the tree log
+ * 2 for the dir entries in the log
+ * 1 for the inode
+ */
+ trans = btrfs_start_transaction(root, 8);
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
@@ -2710,7 +2646,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
return ERR_PTR(-ENOMEM);
}
- trans = btrfs_start_transaction(root, 0);
+ /* 1 for the orphan item */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_free_path(path);
root->fs_info->enospc_unlink = 0;
@@ -2815,6 +2752,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = 0;
out:
btrfs_free_path(path);
+ /* Migrate the orphan reservation over */
+ if (!err)
+ err = btrfs_block_rsv_migrate(trans->block_rsv,
+ &root->fs_info->global_block_rsv,
+ trans->bytes_reserved);
+
if (err) {
btrfs_end_transaction(trans, root);
root->fs_info->enospc_unlink = 0;
@@ -2829,6 +2772,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ btrfs_block_rsv_release(root, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
BUG_ON(!root->fs_info->enospc_unlink);
root->fs_info->enospc_unlink = 0;
}
@@ -3220,6 +3166,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
pgoff_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
int ret = 0;
u64 page_start;
u64 page_end;
@@ -3232,7 +3179,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
ret = -ENOMEM;
again:
- page = find_or_create_page(mapping, index, GFP_NOFS);
+ page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto out;
@@ -3465,6 +3412,8 @@ void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv, *global_rsv;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
unsigned long nr;
int ret;
@@ -3492,22 +3441,55 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ rsv = btrfs_alloc_block_rsv(root);
+ if (!rsv) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ rsv->size = min_size;
+ global_rsv = &root->fs_info->global_block_rsv;
+
btrfs_i_size_write(inode, 0);
+ /*
+ * This is a bit simpler than btrfs_truncate since
+ *
+ * 1) We've already reserved our space for our orphan item in the
+ * unlink.
+ * 2) We're going to delete the inode item, so we don't need to update
+ * it at all.
+ *
+ * So we just need to reserve some slack space in case we add bytes when
+ * doing the truncate.
+ */
while (1) {
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = root->orphan_block_rsv;
+ ret = btrfs_block_rsv_refill(root, rsv, min_size);
+
+ /*
+ * Try and steal from the global reserve since we will
+ * likely not use this space anyway, we want to try as
+ * hard as possible to get this to work.
+ */
+ if (ret)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv, 0, 5);
if (ret) {
- BUG_ON(ret != -EAGAIN);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- continue;
+ printk(KERN_WARNING "Could not get space for a "
+ "delete, will truncate on mount %d\n", ret);
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
}
+ trans->block_rsv = rsv;
+
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
if (ret != -EAGAIN)
break;
@@ -3516,14 +3498,17 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root, nr);
-
}
+ btrfs_free_block_rsv(root, rsv);
+
if (ret == 0) {
+ trans->block_rsv = root->orphan_block_rsv;
ret = btrfs_orphan_del(trans, inode);
BUG_ON(ret);
}
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
if (!(root == root->fs_info->tree_root ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(inode));
@@ -5647,8 +5632,7 @@ again:
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
if (!ret)
- ret = btrfs_update_inode(trans, root, inode);
- err = ret;
+ err = btrfs_update_inode(trans, root, inode);
goto out;
}
@@ -6393,6 +6377,7 @@ static int btrfs_truncate(struct inode *inode)
struct btrfs_trans_handle *trans;
unsigned long nr;
u64 mask = root->sectorsize - 1;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
if (ret)
@@ -6440,19 +6425,23 @@ static int btrfs_truncate(struct inode *inode)
rsv = btrfs_alloc_block_rsv(root);
if (!rsv)
return -ENOMEM;
- btrfs_add_durable_block_rsv(root->fs_info, rsv);
+ rsv->size = min_size;
+ /*
+ * 1 for the truncate slack space
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion
+ * 1 for updating the inode.
+ */
trans = btrfs_start_transaction(root, 4);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
}
- /*
- * Reserve space for the truncate process. Truncate should be adding
- * space, but if there are snapshots it may end up using space.
- */
- ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+ /* Migrate the slack space for the truncate to our reserve */
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
BUG_ON(ret);
ret = btrfs_orphan_add(trans, inode);
@@ -6461,21 +6450,6 @@ static int btrfs_truncate(struct inode *inode)
goto out;
}
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
-
- /*
- * Ok so we've already migrated our bytes over for the truncate, so here
- * just reserve the one slot we need for updating the inode.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out;
- }
- trans->block_rsv = rsv;
-
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
@@ -6497,20 +6471,30 @@ static int btrfs_truncate(struct inode *inode)
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
+ ret = btrfs_block_rsv_refill(root, rsv, min_size);
+ if (ret) {
+ /*
+ * This can only happen with the original transaction we
+ * started above, every other time we shouldn't have a
+ * transaction started yet.
+ */
+ if (ret == -EAGAIN)
+ goto end_trans;
+ err = ret;
+ break;
+ }
+
if (!trans) {
- trans = btrfs_start_transaction(root, 3);
+ /* Just need the 1 for updating the inode */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
}
-
- ret = btrfs_truncate_reserve_metadata(trans, root,
- rsv);
- BUG_ON(ret);
-
- trans->block_rsv = rsv;
}
+ trans->block_rsv = rsv;
+
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
@@ -6525,7 +6509,7 @@ static int btrfs_truncate(struct inode *inode)
err = ret;
break;
}
-
+end_trans:
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
trans = NULL;
@@ -6607,9 +6591,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
- ei->reserved_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->last_unlink_trans = 0;
@@ -6655,6 +6639,8 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(inode->i_data.nrpages);
WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);
+ WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->csum_bytes);
/*
* This can happen where we create an inode, but somebody else also
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7f57efa76d11..cc9893990341 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
/*
* Inherit flags from the parent inode.
*
- * Unlike extN we don't have any flags we don't want to inherit currently.
+ * Currently only the compression flags and the cow flags are inherited.
*/
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
@@ -129,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
flags = BTRFS_I(dir)->flags;
- if (S_ISREG(inode->i_mode))
- flags &= ~BTRFS_INODE_DIRSYNC;
- else if (!S_ISDIR(inode->i_mode))
- flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+ if (flags & BTRFS_INODE_NOCOMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (flags & BTRFS_INODE_COMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ }
+
+ if (flags & BTRFS_INODE_NODATACOW)
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
- BTRFS_I(inode)->flags = flags;
btrfs_update_iflags(inode);
}
@@ -278,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
+ u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -296,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
}
}
rcu_read_unlock();
+
if (!num_devices)
return -EOPNOTSUPP;
-
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
+ if (range.start > total_bytes)
+ return -EINVAL;
+ range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(root, &range);
if (ret < 0)
@@ -761,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
int ret = 1;
/*
- * make sure that once we start defragging and extent, we keep on
+ * make sure that once we start defragging an extent, we keep on
* defragging it
*/
if (start < *defrag_end)
@@ -806,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
* extent will force at least part of that big extent to be defragged.
*/
if (ret) {
- *last_len += len;
*defrag_end = extent_map_end(em);
} else {
*last_len = 0;
@@ -844,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
int i_done;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
if (isize == 0)
return 0;
@@ -861,7 +870,7 @@ again:
for (i = 0; i < num_pages; i++) {
struct page *page;
page = find_or_create_page(inode->i_mapping,
- start_index + i, GFP_NOFS);
+ start_index + i, mask);
if (!page)
break;
@@ -973,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_super_block *disk_super;
struct file_ra_state *ra = NULL;
unsigned long last_index;
+ u64 isize = i_size_read(inode);
u64 features;
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
u64 newer_off = range->start;
- int newer_left = 0;
unsigned long i;
+ unsigned long ra_index = 0;
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
int extent_thresh = range->extent_thresh;
- int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ int cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
struct page **pages = NULL;
@@ -998,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
compress_type = range->compress_type;
}
- if (inode->i_size == 0)
+ if (isize == 0)
return 0;
/*
@@ -1014,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra = &file->f_ra;
}
- pages = kmalloc(sizeof(struct page *) * newer_cluster,
+ pages = kmalloc(sizeof(struct page *) * max_cluster,
GFP_NOFS);
if (!pages) {
ret = -ENOMEM;
@@ -1023,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
/* find the last page to defrag */
if (range->start + range->len > range->start) {
- last_index = min_t(u64, inode->i_size - 1,
+ last_index = min_t(u64, isize - 1,
range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
} else {
- last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
}
if (newer_than) {
@@ -1039,16 +1050,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the extents in the file evenly spaced
*/
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
- newer_left = newer_cluster;
} else
goto out_ra;
} else {
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index - 1;
+ max_to_defrag = last_index;
+
+ /*
+ * make writeback starts from i, so the defrag range can be
+ * written sequentially.
+ */
+ if (i < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = i;
- while (i <= last_index && defrag_count < max_to_defrag) {
+ while (i <= last_index && defrag_count < max_to_defrag &&
+ (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT)) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1071,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = max(i + 1, next);
continue;
}
+
+ if (!newer_than) {
+ cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+ PAGE_CACHE_SHIFT) - i;
+ cluster = min(cluster, max_cluster);
+ } else {
+ cluster = max_cluster;
+ }
+
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
- btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
+ if (i + cluster > ra_index) {
+ ra_index = max(i, ra_index);
+ btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+ cluster);
+ ra_index += max_cluster;
+ }
- ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
if (ret < 0)
goto out_ra;
defrag_count += ret;
balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
- i += ret;
if (newer_than) {
if (newer_off == (u64)-1)
@@ -1097,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
- newer_left = newer_cluster;
} else {
break;
}
} else {
- i++;
+ if (ret > 0) {
+ i += ret;
+ last_len += ret << PAGE_CACHE_SHIFT;
+ } else {
+ i++;
+ last_len = 0;
+ }
}
}
@@ -1128,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
mutex_unlock(&inode->i_mutex);
}
- disk_super = &root->fs_info->super_copy;
+ disk_super = root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (range->compress_type == BTRFS_COMPRESS_LZO) {
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
btrfs_set_super_incompat_flags(disk_super, features);
}
- if (!file)
- kfree(ra);
- return defrag_count;
+ ret = defrag_count;
out_ra:
if (!file)
@@ -2579,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
return PTR_ERR(trans);
}
- dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
@@ -2595,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- disk_super = &root->fs_info->super_copy;
+ disk_super = root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2862,7 +2897,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
int i;
unsigned long rel_ptr;
int size;
- struct btrfs_ioctl_ino_path_args *ipa;
+ struct btrfs_ioctl_ino_path_args *ipa = NULL;
struct inode_fs_paths *ipath = NULL;
struct btrfs_path *path;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
{
int i;
- u32 type;
- u32 nr = btrfs_header_nritems(l);
+ u32 type, nr;
struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
struct btrfs_key key;
struct btrfs_key found_key;
+ if (!l)
+ return;
+
+ nr = btrfs_header_nritems(l);
+
printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
(unsigned long long)btrfs_header_bytenr(l), nr,
btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..cd857119ba8a
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,949 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * pointers that lie in the given range will also be enqueued. The reads will
+ * be triggered in sequential order, thus giving a big win over a naive
+ * enumeration. It will also make use of multi-device layouts. Each disk
+ * will have its on read pointer and all disks will by utilized in parallel.
+ * Also will no two disks read both sides of a mirror simultaneously, as this
+ * would waste seeking capacity. Instead both disks will read different parts
+ * of the filesystem.
+ * Any number of readaheads can be started in parallel. The read order will be
+ * determined globally, i.e. 2 parallel readaheads will normally finish faster
+ * than the 2 started one after another.
+ */
+
+#define MAX_MIRRORS 2
+#define MAX_IN_FLIGHT 6
+
+struct reada_extctl {
+ struct list_head list;
+ struct reada_control *rc;
+ u64 generation;
+};
+
+struct reada_extent {
+ u64 logical;
+ struct btrfs_key top;
+ u32 blocksize;
+ int err;
+ struct list_head extctl;
+ struct kref refcnt;
+ spinlock_t lock;
+ struct reada_zone *zones[MAX_MIRRORS];
+ int nzones;
+ struct btrfs_device *scheduled_for;
+};
+
+struct reada_zone {
+ u64 start;
+ u64 end;
+ u64 elems;
+ struct list_head list;
+ spinlock_t lock;
+ int locked;
+ struct btrfs_device *device;
+ struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
+ int ndevs;
+ struct kref refcnt;
+};
+
+struct reada_machine_work {
+ struct btrfs_work work;
+ struct btrfs_fs_info *fs_info;
+};
+
+static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
+static void reada_control_release(struct kref *kref);
+static void reada_zone_release(struct kref *kref);
+static void reada_start_machine(struct btrfs_fs_info *fs_info);
+static void __reada_start_machine(struct btrfs_fs_info *fs_info);
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+ struct btrfs_key *top, int level, u64 generation);
+
+/* recurses */
+/* in case of err, eb might be NULL */
+static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err)
+{
+ int level = 0;
+ int nritems;
+ int i;
+ u64 bytenr;
+ u64 generation;
+ struct reada_extent *re;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head list;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ struct btrfs_device *for_dev;
+
+ if (eb)
+ level = btrfs_header_level(eb);
+
+ /* find extent */
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree, index);
+ if (re)
+ kref_get(&re->refcnt);
+ spin_unlock(&fs_info->reada_lock);
+
+ if (!re)
+ return -1;
+
+ spin_lock(&re->lock);
+ /*
+ * just take the full list from the extent. afterwards we
+ * don't need the lock anymore
+ */
+ list_replace_init(&re->extctl, &list);
+ for_dev = re->scheduled_for;
+ re->scheduled_for = NULL;
+ spin_unlock(&re->lock);
+
+ if (err == 0) {
+ nritems = level ? btrfs_header_nritems(eb) : 0;
+ generation = btrfs_header_generation(eb);
+ /*
+ * FIXME: currently we just set nritems to 0 if this is a leaf,
+ * effectively ignoring the content. In a next step we could
+ * trigger more readahead depending from the content, e.g.
+ * fetch the checksums for the extents in the leaf.
+ */
+ } else {
+ /*
+ * this is the error case, the extent buffer has not been
+ * read correctly. We won't access anything from it and
+ * just cleanup our data structures. Effectively this will
+ * cut the branch below this node from read ahead.
+ */
+ nritems = 0;
+ generation = 0;
+ }
+
+ for (i = 0; i < nritems; i++) {
+ struct reada_extctl *rec;
+ u64 n_gen;
+ struct btrfs_key key;
+ struct btrfs_key next_key;
+
+ btrfs_node_key_to_cpu(eb, &key, i);
+ if (i + 1 < nritems)
+ btrfs_node_key_to_cpu(eb, &next_key, i + 1);
+ else
+ next_key = re->top;
+ bytenr = btrfs_node_blockptr(eb, i);
+ n_gen = btrfs_node_ptr_generation(eb, i);
+
+ list_for_each_entry(rec, &list, list) {
+ struct reada_control *rc = rec->rc;
+
+ /*
+ * if the generation doesn't match, just ignore this
+ * extctl. This will probably cut off a branch from
+ * prefetch. Alternatively one could start a new (sub-)
+ * prefetch for this branch, starting again from root.
+ * FIXME: move the generation check out of this loop
+ */
+#ifdef DEBUG
+ if (rec->generation != generation) {
+ printk(KERN_DEBUG "generation mismatch for "
+ "(%llu,%d,%llu) %llu != %llu\n",
+ key.objectid, key.type, key.offset,
+ rec->generation, generation);
+ }
+#endif
+ if (rec->generation == generation &&
+ btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
+ btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
+ reada_add_block(rc, bytenr, &next_key,
+ level - 1, n_gen);
+ }
+ }
+ /*
+ * free extctl records
+ */
+ while (!list_empty(&list)) {
+ struct reada_control *rc;
+ struct reada_extctl *rec;
+
+ rec = list_first_entry(&list, struct reada_extctl, list);
+ list_del(&rec->list);
+ rc = rec->rc;
+ kfree(rec);
+
+ kref_get(&rc->refcnt);
+ if (atomic_dec_and_test(&rc->elems)) {
+ kref_put(&rc->refcnt, reada_control_release);
+ wake_up(&rc->wait);
+ }
+ kref_put(&rc->refcnt, reada_control_release);
+
+ reada_extent_put(fs_info, re); /* one ref for each entry */
+ }
+ reada_extent_put(fs_info, re); /* our ref */
+ if (for_dev)
+ atomic_dec(&for_dev->reada_in_flight);
+
+ return 0;
+}
+
+/*
+ * start is passed separately in case eb in NULL, which may be the case with
+ * failed I/O
+ */
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err)
+{
+ int ret;
+
+ ret = __readahead_hook(root, eb, start, err);
+
+ reada_start_machine(root->fs_info);
+
+ return ret;
+}
+
+static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev, u64 logical,
+ struct btrfs_bio *multi)
+{
+ int ret;
+ int looped = 0;
+ struct reada_zone *zone;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 start;
+ u64 end;
+ int i;
+
+again:
+ zone = NULL;
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+ logical >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 1)
+ kref_get(&zone->refcnt);
+ spin_unlock(&fs_info->reada_lock);
+
+ if (ret == 1) {
+ if (logical >= zone->start && logical < zone->end)
+ return zone;
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+
+ if (looped)
+ return NULL;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return NULL;
+
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+ btrfs_put_block_group(cache);
+
+ zone = kzalloc(sizeof(*zone), GFP_NOFS);
+ if (!zone)
+ return NULL;
+
+ zone->start = start;
+ zone->end = end;
+ INIT_LIST_HEAD(&zone->list);
+ spin_lock_init(&zone->lock);
+ zone->locked = 0;
+ kref_init(&zone->refcnt);
+ zone->elems = 0;
+ zone->device = dev; /* our device always sits at index 0 */
+ for (i = 0; i < multi->num_stripes; ++i) {
+ /* bounds have already been checked */
+ zone->devs[i] = multi->stripes[i].dev;
+ }
+ zone->ndevs = multi->num_stripes;
+
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_insert(&dev->reada_zones,
+ (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
+ zone);
+ spin_unlock(&fs_info->reada_lock);
+
+ if (ret) {
+ kfree(zone);
+ looped = 1;
+ goto again;
+ }
+
+ return zone;
+}
+
+static struct reada_extent *reada_find_extent(struct btrfs_root *root,
+ u64 logical,
+ struct btrfs_key *top, int level)
+{
+ int ret;
+ int looped = 0;
+ struct reada_extent *re = NULL;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct btrfs_bio *multi = NULL;
+ struct btrfs_device *dev;
+ u32 blocksize;
+ u64 length;
+ int nzones = 0;
+ int i;
+ unsigned long index = logical >> PAGE_CACHE_SHIFT;
+
+again:
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree, index);
+ if (re)
+ kref_get(&re->refcnt);
+ spin_unlock(&fs_info->reada_lock);
+
+ if (re || looped)
+ return re;
+
+ re = kzalloc(sizeof(*re), GFP_NOFS);
+ if (!re)
+ return NULL;
+
+ blocksize = btrfs_level_size(root, level);
+ re->logical = logical;
+ re->blocksize = blocksize;
+ re->top = *top;
+ INIT_LIST_HEAD(&re->extctl);
+ spin_lock_init(&re->lock);
+ kref_init(&re->refcnt);
+
+ /*
+ * map block
+ */
+ length = blocksize;
+ ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
+ if (ret || !multi || length < blocksize)
+ goto error;
+
+ if (multi->num_stripes > MAX_MIRRORS) {
+ printk(KERN_ERR "btrfs readahead: more than %d copies not "
+ "supported", MAX_MIRRORS);
+ goto error;
+ }
+
+ for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
+ struct reada_zone *zone;
+
+ dev = multi->stripes[nzones].dev;
+ zone = reada_find_zone(fs_info, dev, logical, multi);
+ if (!zone)
+ break;
+
+ re->zones[nzones] = zone;
+ spin_lock(&zone->lock);
+ if (!zone->elems)
+ kref_get(&zone->refcnt);
+ ++zone->elems;
+ spin_unlock(&zone->lock);
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ re->nzones = nzones;
+ if (nzones == 0) {
+ /* not a single zone found, error and out */
+ goto error;
+ }
+
+ /* insert extent in reada_tree + all per-device trees, all or nothing */
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+ if (ret) {
+ spin_unlock(&fs_info->reada_lock);
+ if (ret != -ENOMEM) {
+ /* someone inserted the extent in the meantime */
+ looped = 1;
+ }
+ goto error;
+ }
+ for (i = 0; i < nzones; ++i) {
+ dev = multi->stripes[i].dev;
+ ret = radix_tree_insert(&dev->reada_extents, index, re);
+ if (ret) {
+ while (--i >= 0) {
+ dev = multi->stripes[i].dev;
+ BUG_ON(dev == NULL);
+ radix_tree_delete(&dev->reada_extents, index);
+ }
+ BUG_ON(fs_info == NULL);
+ radix_tree_delete(&fs_info->reada_tree, index);
+ spin_unlock(&fs_info->reada_lock);
+ goto error;
+ }
+ }
+ spin_unlock(&fs_info->reada_lock);
+
+ return re;
+
+error:
+ while (nzones) {
+ struct reada_zone *zone;
+
+ --nzones;
+ zone = re->zones[nzones];
+ kref_get(&zone->refcnt);
+ spin_lock(&zone->lock);
+ --zone->elems;
+ if (zone->elems == 0) {
+ /*
+ * no fs_info->reada_lock needed, as this can't be
+ * the last ref
+ */
+ kref_put(&zone->refcnt, reada_zone_release);
+ }
+ spin_unlock(&zone->lock);
+
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ kfree(re);
+ if (looped)
+ goto again;
+ return NULL;
+}
+
+static void reada_kref_dummy(struct kref *kr)
+{
+}
+
+static void reada_extent_put(struct btrfs_fs_info *fs_info,
+ struct reada_extent *re)
+{
+ int i;
+ unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+
+ spin_lock(&fs_info->reada_lock);
+ if (!kref_put(&re->refcnt, reada_kref_dummy)) {
+ spin_unlock(&fs_info->reada_lock);
+ return;
+ }
+
+ radix_tree_delete(&fs_info->reada_tree, index);
+ for (i = 0; i < re->nzones; ++i) {
+ struct reada_zone *zone = re->zones[i];
+
+ radix_tree_delete(&zone->device->reada_extents, index);
+ }
+
+ spin_unlock(&fs_info->reada_lock);
+
+ for (i = 0; i < re->nzones; ++i) {
+ struct reada_zone *zone = re->zones[i];
+
+ kref_get(&zone->refcnt);
+ spin_lock(&zone->lock);
+ --zone->elems;
+ if (zone->elems == 0) {
+ /* no fs_info->reada_lock needed, as this can't be
+ * the last ref */
+ kref_put(&zone->refcnt, reada_zone_release);
+ }
+ spin_unlock(&zone->lock);
+
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ if (re->scheduled_for)
+ atomic_dec(&re->scheduled_for->reada_in_flight);
+
+ kfree(re);
+}
+
+static void reada_zone_release(struct kref *kref)
+{
+ struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+
+ radix_tree_delete(&zone->device->reada_zones,
+ zone->end >> PAGE_CACHE_SHIFT);
+
+ kfree(zone);
+}
+
+static void reada_control_release(struct kref *kref)
+{
+ struct reada_control *rc = container_of(kref, struct reada_control,
+ refcnt);
+
+ kfree(rc);
+}
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+ struct btrfs_key *top, int level, u64 generation)
+{
+ struct btrfs_root *root = rc->root;
+ struct reada_extent *re;
+ struct reada_extctl *rec;
+
+ re = reada_find_extent(root, logical, top, level); /* takes one ref */
+ if (!re)
+ return -1;
+
+ rec = kzalloc(sizeof(*rec), GFP_NOFS);
+ if (!rec) {
+ reada_extent_put(root->fs_info, re);
+ return -1;
+ }
+
+ rec->rc = rc;
+ rec->generation = generation;
+ atomic_inc(&rc->elems);
+
+ spin_lock(&re->lock);
+ list_add_tail(&rec->list, &re->extctl);
+ spin_unlock(&re->lock);
+
+ /* leave the ref on the extent */
+
+ return 0;
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
+{
+ int i;
+ unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < zone->ndevs; ++i) {
+ struct reada_zone *peer;
+ peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
+ if (peer && peer->device != zone->device)
+ peer->locked = lock;
+ }
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static int reada_pick_zone(struct btrfs_device *dev)
+{
+ struct reada_zone *top_zone = NULL;
+ struct reada_zone *top_locked_zone = NULL;
+ u64 top_elems = 0;
+ u64 top_locked_elems = 0;
+ unsigned long index = 0;
+ int ret;
+
+ if (dev->reada_curr_zone) {
+ reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
+ kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
+ dev->reada_curr_zone = NULL;
+ }
+ /* pick the zone with the most elements */
+ while (1) {
+ struct reada_zone *zone;
+
+ ret = radix_tree_gang_lookup(&dev->reada_zones,
+ (void **)&zone, index, 1);
+ if (ret == 0)
+ break;
+ index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ if (zone->locked) {
+ if (zone->elems > top_locked_elems) {
+ top_locked_elems = zone->elems;
+ top_locked_zone = zone;
+ }
+ } else {
+ if (zone->elems > top_elems) {
+ top_elems = zone->elems;
+ top_zone = zone;
+ }
+ }
+ }
+ if (top_zone)
+ dev->reada_curr_zone = top_zone;
+ else if (top_locked_zone)
+ dev->reada_curr_zone = top_locked_zone;
+ else
+ return 0;
+
+ dev->reada_next = dev->reada_curr_zone->start;
+ kref_get(&dev->reada_curr_zone->refcnt);
+ reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
+
+ return 1;
+}
+
+static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev)
+{
+ struct reada_extent *re = NULL;
+ int mirror_num = 0;
+ struct extent_buffer *eb = NULL;
+ u64 logical;
+ u32 blocksize;
+ int ret;
+ int i;
+ int need_kick = 0;
+
+ spin_lock(&fs_info->reada_lock);
+ if (dev->reada_curr_zone == NULL) {
+ ret = reada_pick_zone(dev);
+ if (!ret) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ }
+ /*
+ * FIXME currently we issue the reads one extent at a time. If we have
+ * a contiguous block of extents, we could also coagulate them or use
+ * plugging to speed things up
+ */
+ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+ dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
+ ret = reada_pick_zone(dev);
+ if (!ret) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ re = NULL;
+ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+ dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ }
+ if (ret == 0) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ dev->reada_next = re->logical + re->blocksize;
+ kref_get(&re->refcnt);
+
+ spin_unlock(&fs_info->reada_lock);
+
+ /*
+ * find mirror num
+ */
+ for (i = 0; i < re->nzones; ++i) {
+ if (re->zones[i]->device == dev) {
+ mirror_num = i + 1;
+ break;
+ }
+ }
+ logical = re->logical;
+ blocksize = re->blocksize;
+
+ spin_lock(&re->lock);
+ if (re->scheduled_for == NULL) {
+ re->scheduled_for = dev;
+ need_kick = 1;
+ }
+ spin_unlock(&re->lock);
+
+ reada_extent_put(fs_info, re);
+
+ if (!need_kick)
+ return 0;
+
+ atomic_inc(&dev->reada_in_flight);
+ ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
+ mirror_num, &eb);
+ if (ret)
+ __readahead_hook(fs_info->extent_root, NULL, logical, ret);
+ else if (eb)
+ __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
+
+ if (eb)
+ free_extent_buffer(eb);
+
+ return 1;
+
+}
+
+static void reada_start_machine_worker(struct btrfs_work *work)
+{
+ struct reada_machine_work *rmw;
+ struct btrfs_fs_info *fs_info;
+
+ rmw = container_of(work, struct reada_machine_work, work);
+ fs_info = rmw->fs_info;
+
+ kfree(rmw);
+
+ __reada_start_machine(fs_info);
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 enqueued;
+ u64 total = 0;
+ int i;
+
+ do {
+ enqueued = 0;
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (atomic_read(&device->reada_in_flight) <
+ MAX_IN_FLIGHT)
+ enqueued += reada_start_machine_dev(fs_info,
+ device);
+ }
+ total += enqueued;
+ } while (enqueued && total < 10000);
+
+ if (enqueued == 0)
+ return;
+
+ /*
+ * If everything is already in the cache, this is effectively single
+ * threaded. To a) not hold the caller for too long and b) to utilize
+ * more cores, we broke the loop above after 10000 iterations and now
+ * enqueue to workers to finish it. This will distribute the load to
+ * the cores.
+ */
+ for (i = 0; i < 2; ++i)
+ reada_start_machine(fs_info);
+}
+
+static void reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct reada_machine_work *rmw;
+
+ rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
+ if (!rmw) {
+ /* FIXME we cannot handle this properly right now */
+ BUG();
+ }
+ rmw->work.func = reada_start_machine_worker;
+ rmw->fs_info = fs_info;
+
+ btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+}
+
+#ifdef DEBUG
+static void dump_devs(struct btrfs_fs_info *fs_info, int all)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ unsigned long index;
+ int ret;
+ int i;
+ int j;
+ int cnt;
+
+ spin_lock(&fs_info->reada_lock);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
+ atomic_read(&device->reada_in_flight));
+ index = 0;
+ while (1) {
+ struct reada_zone *zone;
+ ret = radix_tree_gang_lookup(&device->reada_zones,
+ (void **)&zone, index, 1);
+ if (ret == 0)
+ break;
+ printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
+ "%d devs", zone->start, zone->end, zone->elems,
+ zone->locked);
+ for (j = 0; j < zone->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ zone->devs[j]->devid);
+ }
+ if (device->reada_curr_zone == zone)
+ printk(KERN_CONT " curr off %llu",
+ device->reada_next - zone->start);
+ printk(KERN_CONT "\n");
+ index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ }
+ cnt = 0;
+ index = 0;
+ while (all) {
+ struct reada_extent *re = NULL;
+
+ ret = radix_tree_gang_lookup(&device->reada_extents,
+ (void **)&re, index, 1);
+ if (ret == 0)
+ break;
+ printk(KERN_DEBUG
+ " re: logical %llu size %u empty %d for %lld",
+ re->logical, re->blocksize,
+ list_empty(&re->extctl), re->scheduled_for ?
+ re->scheduled_for->devid : -1);
+
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (j = 0; j < re->zones[i]->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ re->zones[i]->devs[j]->devid);
+ }
+ }
+ printk(KERN_CONT "\n");
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ if (++cnt > 15)
+ break;
+ }
+ }
+
+ index = 0;
+ cnt = 0;
+ while (all) {
+ struct reada_extent *re = NULL;
+
+ ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
+ index, 1);
+ if (ret == 0)
+ break;
+ if (!re->scheduled_for) {
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ continue;
+ }
+ printk(KERN_DEBUG
+ "re: logical %llu size %u list empty %d for %lld",
+ re->logical, re->blocksize, list_empty(&re->extctl),
+ re->scheduled_for ? re->scheduled_for->devid : -1);
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (j = 0; j < re->zones[i]->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ re->zones[i]->devs[j]->devid);
+ }
+ }
+ }
+ printk(KERN_CONT "\n");
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ }
+ spin_unlock(&fs_info->reada_lock);
+}
+#endif
+
+/*
+ * interface
+ */
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *key_start, struct btrfs_key *key_end)
+{
+ struct reada_control *rc;
+ u64 start;
+ u64 generation;
+ int level;
+ struct extent_buffer *node;
+ static struct btrfs_key max_key = {
+ .objectid = (u64)-1,
+ .type = (u8)-1,
+ .offset = (u64)-1
+ };
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return ERR_PTR(-ENOMEM);
+
+ rc->root = root;
+ rc->key_start = *key_start;
+ rc->key_end = *key_end;
+ atomic_set(&rc->elems, 0);
+ init_waitqueue_head(&rc->wait);
+ kref_init(&rc->refcnt);
+ kref_get(&rc->refcnt); /* one ref for having elements */
+
+ node = btrfs_root_node(root);
+ start = node->start;
+ level = btrfs_header_level(node);
+ generation = btrfs_header_generation(node);
+ free_extent_buffer(node);
+
+ reada_add_block(rc, start, &max_key, level, generation);
+
+ reada_start_machine(root->fs_info);
+
+ return rc;
+}
+
+#ifdef DEBUG
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ while (atomic_read(&rc->elems)) {
+ wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+ 5 * HZ);
+ dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+ }
+
+ dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+
+ kref_put(&rc->refcnt, reada_control_release);
+
+ return 0;
+}
+#else
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ while (atomic_read(&rc->elems)) {
+ wait_event(rc->wait, atomic_read(&rc->elems) == 0);
+ }
+
+ kref_put(&rc->refcnt, reada_control_release);
+
+ return 0;
+}
+#endif
+
+void btrfs_reada_detach(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ kref_put(&rc->refcnt, reada_control_release);
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(IS_ERR(trans));
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
- min_reserved, 0);
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
if (ret) {
BUG_ON(ret != -EAGAIN);
ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
- num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
if (ret)
err = ret;
}
@@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
if (ret) {
if (ret == -EAGAIN)
rc->commit_transaction = 1;
@@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
unsigned long last_index;
struct page *page;
struct file_ra_state *ra;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int nr = 0;
int ret = 0;
@@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
ra, NULL, index,
last_index + 1 - index);
page = find_or_create_page(inode->i_mapping, index,
- GFP_NOFS);
+ mask);
if (!page) {
btrfs_delalloc_release_metadata(inode,
PAGE_CACHE_SIZE);
@@ -3323,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
}
key.objectid = ref_objectid;
- key.offset = ref_offset;
key.type = BTRFS_EXTENT_DATA_KEY;
+ if (ref_offset > ((u64)-1 << 32))
+ key.offset = 0;
+ else
+ key.offset = ref_offset;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3645,14 +3647,11 @@ int prepare_to_relocate(struct reloc_control *rc)
* btrfs_init_reloc_root will use them when there
* is no reservation in transaction handle.
*/
- ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+ ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
rc->extent_root->nodesize * 256);
if (ret)
return ret;
- rc->block_rsv->refill_used = 1;
- btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
-
memset(&rc->cluster, 0, sizeof(rc->cluster));
rc->search_start = rc->block_group->key.objectid;
rc->extents_found = 0;
@@ -3777,8 +3776,7 @@ restart:
}
}
- ret = btrfs_block_rsv_check(trans, rc->extent_root,
- rc->block_rsv, 0, 5);
+ ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
if (ret < 0) {
if (ret != -EAGAIN) {
err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index eba42e5fd5fd..94cd3a19e9c8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -33,15 +33,12 @@
* any can be found.
*
* Future enhancements:
- * - To enhance the performance, better read-ahead strategies for the
- * extent-tree can be employed.
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
* - In case of a read error on files with nodatasum, map the file and read
* the extent to trigger a writeback of the good copy
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
- * - make the prefetch cancellable
*/
struct scrub_bio;
@@ -209,7 +206,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
atomic_set(&sdev->in_flight, 0);
atomic_set(&sdev->fixup_cnt, 0);
atomic_set(&sdev->cancel_req, 0);
- sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
INIT_LIST_HEAD(&sdev->csum_list);
spin_lock_init(&sdev->list_lock);
@@ -1130,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
int slot;
int i;
u64 nstripes;
- int start_stripe;
struct extent_buffer *l;
struct btrfs_key key;
u64 physical;
u64 logical;
u64 generation;
int mirror_num;
+ struct reada_control *reada1;
+ struct reada_control *reada2;
+ struct btrfs_key key_start;
+ struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
@@ -1168,81 +1168,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
if (!path)
return -ENOMEM;
- path->reada = 2;
path->search_commit_root = 1;
path->skip_locking = 1;
/*
- * find all extents for each stripe and just read them to get
- * them into the page cache
- * FIXME: we can do better. build a more intelligent prefetching
+ * trigger the readahead for extent tree csum tree and wait for
+ * completion. During readahead, the scrub is officially paused
+ * to not hold off transaction commits
*/
logical = base + offset;
- physical = map->stripes[num].physical;
- ret = 0;
- for (i = 0; i < nstripes; ++i) {
- key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out_noplug;
-
- /*
- * we might miss half an extent here, but that doesn't matter,
- * as it's only the prefetch
- */
- while (1) {
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out_noplug;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
+ wait_event(sdev->list_wait,
+ atomic_read(&sdev->in_flight) == 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
- if (key.objectid >= logical + map->stripe_len)
- break;
+ /* FIXME it might be better to start readahead at commit root */
+ key_start.objectid = logical;
+ key_start.type = BTRFS_EXTENT_ITEM_KEY;
+ key_start.offset = (u64)0;
+ key_end.objectid = base + offset + nstripes * increment;
+ key_end.type = BTRFS_EXTENT_ITEM_KEY;
+ key_end.offset = (u64)0;
+ reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+ key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_start.type = BTRFS_EXTENT_CSUM_KEY;
+ key_start.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = base + offset + nstripes * increment;
+ reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+ if (!IS_ERR(reada1))
+ btrfs_reada_wait(reada1);
+ if (!IS_ERR(reada2))
+ btrfs_reada_wait(reada2);
- path->slots[0]++;
- }
- btrfs_release_path(path);
- logical += increment;
- physical += map->stripe_len;
- cond_resched();
+ mutex_lock(&fs_info->scrub_lock);
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
}
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
/*
* collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB
*/
- start_stripe = 0;
blk_start_plug(&plug);
-again:
- logical = base + offset + start_stripe * increment;
- for (i = start_stripe; i < nstripes; ++i) {
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sdev->csum_list, 1);
- if (ret)
- goto out;
- logical += increment;
- cond_resched();
- }
/*
* now find all extents for each stripe and scrub them
*/
- logical = base + offset + start_stripe * increment;
- physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+ logical = base + offset;
+ physical = map->stripes[num].physical;
ret = 0;
- for (i = start_stripe; i < nstripes; ++i) {
+ for (i = 0; i < nstripes; ++i) {
/*
* canceled?
*/
@@ -1271,11 +1257,14 @@ again:
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
- scrub_free_csums(sdev);
- start_stripe = i;
- goto again;
}
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sdev->csum_list, 1);
+ if (ret)
+ goto out;
+
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
@@ -1371,7 +1360,6 @@ next:
out:
blk_finish_plug(&plug);
-out_noplug:
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/cleancache.h>
+#include <linux/mnt_namespace.h>
#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -58,6 +59,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
+static struct file_system_type btrfs_fs_type;
static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
char nbuf[16])
@@ -162,7 +164,7 @@ enum {
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- Opt_inode_cache, Opt_err,
+ Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
};
static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
{Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
{Opt_inode_cache, "inode_cache"},
+ {Opt_no_space_cache, "no_space_cache"},
+ {Opt_recovery, "recovery"},
{Opt_err, NULL},
};
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
- char *p, *num, *orig;
+ char *p, *num, *orig = NULL;
+ u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (cache_gen)
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+
if (!options)
- return 0;
+ goto out;
/*
* strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, DISCARD);
break;
case Opt_space_cache:
- printk(KERN_INFO "btrfs: enabling disk space caching\n");
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
break;
+ case Opt_no_space_cache:
+ printk(KERN_INFO "btrfs: disabling disk space caching\n");
+ btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
+ break;
case Opt_inode_cache:
printk(KERN_INFO "btrfs: enabling inode map caching\n");
btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto defrag");
btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
break;
+ case Opt_recovery:
+ printk(KERN_INFO "btrfs: enabling auto recovery");
+ btrfs_set_opt(info->mount_opt, RECOVERY);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
+ if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+ printk(KERN_INFO "btrfs: disk space caching is enabled\n");
kfree(orig);
return ret;
}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
+ char *device_name, *opts, *orig, *p;
int error = 0;
int intarg;
if (!options)
- goto out;
+ return 0;
/*
* strsep changes the string, duplicate it because parse_options
@@ -457,29 +475,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
}
break;
case Opt_device:
- error = btrfs_scan_one_device(match_strdup(&args[0]),
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = btrfs_scan_one_device(device_name,
flags, holder, fs_devices);
+ kfree(device_name);
if (error)
- goto out_free_opts;
+ goto out;
break;
default:
break;
}
}
- out_free_opts:
+out:
kfree(orig);
- out:
- /*
- * If no subvolume name is specified we use the default one. Allocate
- * a copy of the string "." here so that code later in the
- * mount path doesn't care if it's the default volume or another one.
- */
- if (!*subvol_name) {
- *subvol_name = kstrdup(".", GFP_KERNEL);
- if (!*subvol_name)
- return -ENOMEM;
- }
return error;
}
@@ -492,7 +505,6 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
- struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -517,7 +529,7 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
- dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
@@ -566,29 +578,7 @@ setup_root:
return dget(sb->s_root);
}
- if (new) {
- const struct qstr name = { .name = "/", .len = 1 };
-
- /*
- * New inode, we need to make the dentry a sibling of s_root so
- * everything gets cleaned up properly on unmount.
- */
- dentry = d_alloc(sb->s_root, &name);
- if (!dentry) {
- iput(inode);
- return ERR_PTR(-ENOMEM);
- }
- d_splice_alias(inode, dentry);
- } else {
- /*
- * We found the inode in cache, just find a dentry for it and
- * put the reference to the inode we just got.
- */
- dentry = d_find_alias(inode);
- iput(inode);
- }
-
- return dentry;
+ return d_obtain_alias(inode);
}
static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +709,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else
+ seq_puts(seq, ",no_space_cache");
if (btrfs_test_opt(root, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +745,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
return set_anon_super(s, data);
}
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+ if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+/*
+ * This will strip out the subvol=%s argument for an argument string and add
+ * subvolid=0 to make sure we get the actual tree root for path walking to the
+ * subvol we want.
+ */
+static char *setup_root_args(char *args)
+{
+ unsigned copied = 0;
+ unsigned len = strlen(args) + 2;
+ char *pos;
+ char *ret;
+
+ /*
+ * We need the same args as before, but minus
+ *
+ * subvol=a
+ *
+ * and add
+ *
+ * subvolid=0
+ *
+ * which is a difference of 2 characters, so we allocate strlen(args) +
+ * 2 characters.
+ */
+ ret = kzalloc(len * sizeof(char), GFP_NOFS);
+ if (!ret)
+ return NULL;
+ pos = strstr(args, "subvol=");
+
+ /* This shouldn't happen, but just in case.. */
+ if (!pos) {
+ kfree(ret);
+ return NULL;
+ }
+
+ /*
+ * The subvol=<> arg is not at the front of the string, copy everybody
+ * up to that into ret.
+ */
+ if (pos != args) {
+ *pos = '\0';
+ strcpy(ret, args);
+ copied += strlen(args);
+ pos++;
+ }
+
+ strncpy(ret + copied, "subvolid=0", len - copied);
+
+ /* Length of subvolid=0 */
+ copied += 10;
+
+ /*
+ * If there is no , after the subvol= option then we know there's no
+ * other options and we can just return.
+ */
+ pos = strchr(pos, ',');
+ if (!pos)
+ return ret;
+
+ /* Copy the rest of the arguments into our buffer */
+ strncpy(ret + copied, pos, len - copied);
+ copied += strlen(pos);
+
+ return ret;
+}
+
+static struct dentry *mount_subvol(const char *subvol_name, int flags,
+ const char *device_name, char *data)
+{
+ struct super_block *s;
+ struct dentry *root;
+ struct vfsmount *mnt;
+ struct mnt_namespace *ns_private;
+ char *newargs;
+ struct path path;
+ int error;
+
+ newargs = setup_root_args(data);
+ if (!newargs)
+ return ERR_PTR(-ENOMEM);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
+ newargs);
+ kfree(newargs);
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ ns_private = create_mnt_ns(mnt);
+ if (IS_ERR(ns_private)) {
+ mntput(mnt);
+ return ERR_CAST(ns_private);
+ }
+
+ /*
+ * This will trigger the automount of the subvol so we can just
+ * drop the mnt we have here and return the dentry that we
+ * found.
+ */
+ error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
+ LOOKUP_FOLLOW, &path);
+ put_mnt_ns(ns_private);
+ if (error)
+ return ERR_PTR(error);
+
+ if (!is_subvolume_inode(path.dentry->d_inode)) {
+ path_put(&path);
+ mntput(mnt);
+ error = -EINVAL;
+ printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
+ subvol_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Get a ref to the sb and the dentry we found and return it */
+ s = path.mnt->mnt_sb;
+ atomic_inc(&s->s_active);
+ root = dget(path.dentry);
+ path_put(&path);
+ down_write(&s->s_umount);
+
+ return root;
+}
/*
* Find a superblock for the given device / mount point.
@@ -784,13 +907,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (error)
return ERR_PTR(error);
+ if (subvol_name) {
+ root = mount_subvol(subvol_name, flags, device_name, data);
+ kfree(subvol_name);
+ return root;
+ }
+
error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
- goto error_free_subvol_name;
+ return ERR_PTR(error);
error = btrfs_open_devices(fs_devices, mode, fs_type);
if (error)
- goto error_free_subvol_name;
+ return ERR_PTR(error);
if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
error = -EACCES;
@@ -813,88 +942,57 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
fs_info->fs_devices = fs_devices;
tree_root->fs_info = fs_info;
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ error = -ENOMEM;
+ goto error_close_devices;
+ }
+
bdev = fs_devices->latest_bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
- if (IS_ERR(s))
- goto error_s;
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ goto error_close_devices;
+ }
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
deactivate_locked_super(s);
- error = -EBUSY;
- goto error_close_devices;
+ return ERR_PTR(-EBUSY);
}
btrfs_close_devices(fs_devices);
- kfree(fs_info);
+ free_fs_info(fs_info);
kfree(tree_root);
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags | MS_NOSEC;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ btrfs_sb(s)->fs_info->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
- goto error_free_subvol_name;
+ return ERR_PTR(error);
}
- btrfs_sb(s)->fs_info->bdev_holder = fs_type;
s->s_flags |= MS_ACTIVE;
}
- /* if they gave us a subvolume name bind mount into that */
- if (strcmp(subvol_name, ".")) {
- struct dentry *new_root;
-
- root = get_default_root(s, subvol_rootid);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
-
- mutex_lock(&root->d_inode->i_mutex);
- new_root = lookup_one_len(subvol_name, root,
- strlen(subvol_name));
- mutex_unlock(&root->d_inode->i_mutex);
-
- if (IS_ERR(new_root)) {
- dput(root);
- deactivate_locked_super(s);
- error = PTR_ERR(new_root);
- goto error_free_subvol_name;
- }
- if (!new_root->d_inode) {
- dput(root);
- dput(new_root);
- deactivate_locked_super(s);
- error = -ENXIO;
- goto error_free_subvol_name;
- }
- dput(root);
- root = new_root;
- } else {
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
+ root = get_default_root(s, subvol_objectid);
+ if (IS_ERR(root)) {
+ deactivate_locked_super(s);
+ return root;
}
- kfree(subvol_name);
return root;
-error_s:
- error = PTR_ERR(s);
error_close_devices:
btrfs_close_devices(fs_devices);
- kfree(fs_info);
+ free_fs_info(fs_info);
kfree(tree_root);
-error_free_subvol_name:
- kfree(subvol_name);
return ERR_PTR(error);
}
@@ -919,7 +1017,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (root->fs_info->fs_devices->rw_devices == 0)
return -EACCES;
- if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+ if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
return -EINVAL;
ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1183,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..29f782cc2cc9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
- ret = btrfs_block_rsv_add(NULL, root,
+ ret = btrfs_block_rsv_add(root,
&root->fs_info->trans_block_rsv,
num_bytes);
if (ret)
@@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
- ret = btrfs_block_rsv_check(trans, root,
- &root->fs_info->global_block_rsv, 0, 5);
+
+ ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
return ret ? 1 : 0;
}
@@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_block_rsv *rsv = trans->block_rsv;
int updates;
smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
return 1;
+ /*
+ * We need to do this in case we're deleting csums so the global block
+ * rsv get's used instead of the csum block rsv.
+ */
+ trans->block_rsv = NULL;
+
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates)
btrfs_run_delayed_refs(trans, root, updates);
+ trans->block_rsv = rsv;
+
return should_end_transaction(trans, root);
}
@@ -453,6 +462,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
while (count < 4) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
@@ -473,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
count++;
}
- btrfs_trans_release_metadata(trans, root);
-
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root)) {
trans->transaction->blocked = 1;
@@ -562,50 +571,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
- int ret;
int err = 0;
int werr = 0;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
u64 start = 0;
u64 end;
- unsigned long index;
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
- if (ret)
- break;
- while (start <= end) {
- cond_resched();
-
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
- continue;
-
- btree_lock_page_hook(page);
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- continue;
- }
-
- if (PageWriteback(page)) {
- if (PageDirty(page))
- wait_on_page_writeback(page);
- else {
- unlock_page(page);
- page_cache_release(page);
- continue;
- }
- }
- err = write_one_page(page, 0);
- if (err)
- werr = err;
- page_cache_release(page);
- }
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark)) {
+ convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
+ GFP_NOFS);
+ err = filemap_fdatawrite_range(mapping, start, end);
+ if (err)
+ werr = err;
+ cond_resched();
+ start = end + 1;
}
if (err)
werr = err;
@@ -621,39 +601,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
- int ret;
int err = 0;
int werr = 0;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
u64 start = 0;
u64 end;
- unsigned long index;
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
- if (ret)
- break;
-
- clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
- while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
- continue;
- if (PageDirty(page)) {
- btree_lock_page_hook(page);
- wait_on_page_writeback(page);
- err = write_one_page(page, 0);
- if (err)
- werr = err;
- }
- wait_on_page_writeback(page);
- page_cache_release(page);
- cond_resched();
- }
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT)) {
+ clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+ err = filemap_fdatawait_range(mapping, start, end);
+ if (err)
+ werr = err;
+ cond_resched();
+ start = end + 1;
}
if (err)
werr = err;
@@ -673,7 +634,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
- return ret || ret2;
+
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
}
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -911,10 +877,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
- btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
- ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+ ret = btrfs_block_rsv_add(root, &pending->block_rsv,
to_reserve);
if (ret) {
pending->error = ret;
@@ -1002,7 +967,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
BUG_ON(IS_ERR(pending->snap));
btrfs_reloc_post_snapshot(trans, pending);
- btrfs_orphan_post_snapshot(trans, pending);
fail:
kfree(new_root_item);
trans->block_rsv = rsv;
@@ -1032,7 +996,7 @@ static void update_super_roots(struct btrfs_root *root)
struct btrfs_root_item *root_item;
struct btrfs_super_block *super;
- super = &root->fs_info->super_copy;
+ super = root->fs_info->super_copy;
root_item = &root->fs_info->chunk_root->root_item;
super->chunk_root = root_item->bytenr;
@@ -1043,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
- if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root, SPACE_CACHE))
super->cache_generation = root_item->generation;
}
@@ -1168,14 +1132,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_run_ordered_operations(root, 0);
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
- btrfs_trans_release_metadata(trans, root);
-
cur_trans = trans->transaction;
/*
* set the flushing flag so procs in this transaction have to
@@ -1341,12 +1306,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
update_super_roots(root);
if (!root->fs_info->log_root_recovering) {
- btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
- btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
}
- memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
- sizeof(root->fs_info->super_copy));
+ memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+ sizeof(*root->fs_info->super_copy));
trans->transaction->blocked = 0;
spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_pin_extent(log->fs_info->extent_root,
- eb->start, eb->len, 0);
+ btrfs_pin_extent_for_log_replay(wc->trans,
+ log->fs_info->extent_root,
+ eb->start, eb->len);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
BUG_ON(ret);
}
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(next);
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(root,
path->nodes[*level]->start,
path->nodes[*level]->len);
BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(log, next->start,
+ ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
BUG_ON(ret);
}
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
wait_log_commit(trans, root, root->log_transid - 1);
-
while (1) {
unsigned long batch = root->log_batch;
- if (root->log_multiple_pids) {
+ /* when we're on an ssd, just kick the log commit out */
+ if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
BUG_ON(ret);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+ btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
- btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+ btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18baac5a3f6c..f8e2943101a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
}
INIT_LIST_HEAD(&device->dev_alloc_list);
+ /* init readahead state */
+ spin_lock_init(&device->reada_lock);
+ device->reada_curr_zone = NULL;
+ atomic_set(&device->reada_in_flight, 0);
+ device->reada_next = 0;
+ INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
+ if (!bh)
goto error_close;
- }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
continue;
}
if (fs_devices->open_devices == 0) {
- ret = -EIO;
+ ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
@@ -1013,8 +1019,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
}
BUG_ON(ret);
- if (device->bytes_used > 0)
- device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+ if (device->bytes_used > 0) {
+ u64 len = btrfs_dev_extent_length(leaf, extent);
+ device->bytes_used -= len;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = btrfs_del_item(trans, root, path);
out:
@@ -1356,6 +1367,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space = device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);
@@ -1387,8 +1403,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
call_rcu(&device->rcu, free_device);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
- btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+ num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1466,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
@@ -1691,15 +1707,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
- btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(root->fs_info->super_copy,
total_bytes + device->total_bytes);
- total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
- btrfs_set_super_num_devices(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
@@ -1790,7 +1810,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
- &device->dev_root->fs_info->super_copy;
+ device->dev_root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 diff = new_size - device->total_bytes;
@@ -1849,7 +1869,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
@@ -2175,7 +2195,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2212,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
device->total_bytes = new_size;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space -= diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
unlock_chunks(root);
again:
@@ -2257,6 +2281,9 @@ again:
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
@@ -2292,7 +2319,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
@@ -2615,6 +2642,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
index++;
}
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
index = 0;
stripe = &chunk->stripe;
while (index < map->num_stripes) {
@@ -3626,15 +3658,20 @@ static int read_one_dev(struct btrfs_root *root,
fill_device_from_item(leaf, dev_item, device);
device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = 0;
return ret;
}
int btrfs_read_sys_array(struct btrfs_root *root)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 71f4f3f67495..ab5b1c49f352 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
struct btrfs_work work;
struct rcu_head rcu;
struct work_struct rcu_work;
+
+ /* readahead state */
+ spinlock_t reada_lock;
+ atomic_t reada_in_flight;
+ u64 reada_next;
+ struct reada_zone *reada_curr_zone;
+ struct radix_tree_root reada_zones;
+ struct radix_tree_root reada_extents;
};
struct btrfs_fs_devices {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a0..a76e41c04b71 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
again:
ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
name, name_len, value, size);
+ /*
+ * If we're setting an xattr to a new value but the new value is say
+ * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
+ * back from split_leaf. This is because it thinks we'll be extending
+ * the existing item size, but we're asking for enough space to add the
+ * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
+ * the rest of the function figure it out.
+ */
+ if (ret == -EOVERFLOW)
+ ret = -EEXIST;
+
if (ret == -EEXIST) {
if (flags & XATTR_CREATE)
goto out;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e76bfeb68267..30acd22147e1 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -351,9 +351,7 @@ static int
build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
unsigned int dlen;
- unsigned int wlen;
- unsigned int size = 6 * sizeof(struct ntlmssp2_name);
- __le64 curtime;
+ unsigned int size = 2 * sizeof(struct ntlmssp2_name);
char *defdmname = "WORKGROUP";
unsigned char *blobptr;
struct ntlmssp2_name *attrptr;
@@ -365,15 +363,14 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
dlen = strlen(ses->domainName);
- wlen = strlen(ses->server->hostname);
- /* The length of this blob is a size which is
- * six times the size of a structure which holds name/size +
- * two times the unicode length of a domain name +
- * two times the unicode length of a server name +
- * size of a timestamp (which is 8 bytes).
+ /*
+ * The length of this blob is two times the size of a
+ * structure (av pair) which holds name/size
+ * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
+ * unicode length of a netbios domain name
*/
- ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+ ses->auth_key.len = size + 2 * dlen;
ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
if (!ses->auth_key.response) {
ses->auth_key.len = 0;
@@ -384,44 +381,15 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
blobptr = ses->auth_key.response;
attrptr = (struct ntlmssp2_name *) blobptr;
+ /*
+ * As defined in MS-NTLM 3.3.2, just this av pair field
+ * is sufficient as part of the temp
+ */
attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
attrptr->length = cpu_to_le16(2 * dlen);
blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
- blobptr += 2 * dlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
- attrptr->length = cpu_to_le16(2 * wlen);
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
-
- blobptr += 2 * wlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
- attrptr->length = cpu_to_le16(2 * dlen);
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
-
- blobptr += 2 * dlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
- attrptr->length = cpu_to_le16(2 * wlen);
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
-
- blobptr += 2 * wlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
- attrptr->length = cpu_to_le16(sizeof(__le64));
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
- memcpy(blobptr, &curtime, sizeof(__le64));
-
return 0;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f93eb948d071..54b8f1e7da94 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -548,6 +548,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
struct inode *dir = dentry->d_inode;
struct dentry *child;
+ if (!dir) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOENT);
+ break;
+ }
+
/* skip separators */
while (*s == sep)
s++;
@@ -563,10 +569,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
mutex_unlock(&dir->i_mutex);
dput(dentry);
dentry = child;
- if (!dentry->d_inode) {
- dput(dentry);
- dentry = ERR_PTR(-ENOENT);
- }
} while (!IS_ERR(dentry));
_FreeXid(xid);
kfree(full_path);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index aac37d99a487..a80f7bd97b90 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4079,7 +4079,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
T2_FNEXT_RSP_PARMS *parms;
char *response_data;
int rc = 0;
- int bytes_returned, name_len;
+ int bytes_returned;
+ unsigned int name_len;
__u16 params, byte_count;
cFYI(1, "In FindNext");
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 633c246b6775..71beb0201970 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1298,7 +1298,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
/* ignore */
} else if (strnicmp(data, "guest", 5) == 0) {
/* ignore */
- } else if (strnicmp(data, "rw", 2) == 0) {
+ } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) {
/* ignore */
} else if (strnicmp(data, "ro", 2) == 0) {
/* ignore */
@@ -1401,7 +1401,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->server_ino = 1;
} else if (strnicmp(data, "noserverino", 9) == 0) {
vol->server_ino = 0;
- } else if (strnicmp(data, "rwpidforward", 4) == 0) {
+ } else if (strnicmp(data, "rwpidforward", 12) == 0) {
vol->rwpidforward = 1;
} else if (strnicmp(data, "cifsacl", 7) == 0) {
vol->cifs_acl = 1;
@@ -2018,7 +2018,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
warned_on_ntlm = true;
cERROR(1, "default security mechanism requested. The default "
"security mechanism will be upgraded from ntlm to "
- "ntlmv2 in kernel release 3.1");
+ "ntlmv2 in kernel release 3.2");
}
ses->overrideSecFlg = volume_info->secFlg;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 04da6acde85d..12661e1deedd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
return bh;
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -2807,7 +2807,7 @@ make_io:
trace_ext3_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext3_error(inode->i_sb, "ext3_get_inode_loc",
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 5571708b6a58..0629e09f6511 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -922,7 +922,8 @@ restart:
bh = ext3_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
if (bh)
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO,
+ 1, &bh);
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 18d2558b7624..986e2388f031 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -647,7 +647,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return bh;
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -3298,7 +3298,7 @@ make_io:
trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f8068c7bae9f..1c924faeb6c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -922,7 +922,8 @@ restart:
bh = ext4_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
if (bh)
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO,
+ 1, &bh);
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 85c62923ee29..598646434362 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
- submit_bh(WRITE_SYNC | REQ_META, bh);
+ submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
else
- submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+ submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 747238cd9f96..be29858900f6 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
{
struct buffer_head *bh, *head;
int nr_underway = 0;
- int write_op = REQ_META |
+ int write_op = REQ_META | REQ_PRIO |
(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
BUG_ON(!PageLocked(page));
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
}
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(READ_SYNC | REQ_META, bh);
+ submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
if (!(flags & DIO_WAIT))
return 0;
@@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
+ ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3bc073a4cf82..079587e53849 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- submit_bio(READ_SYNC | REQ_META, bio);
+ submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio);
wait_on_page_locked(page);
bio_put(bio);
if (!PageUptodate(page)) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 42e8d23bc047..0e8bb13381e4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -709,7 +709,7 @@ get_a_page:
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
goto unlock_out;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c106ca22e812..d24a9b666a23 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -344,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
struct inode *root, *inode;
struct qstr str;
struct nls_table *nls = NULL;
+ u64 last_fs_block, last_fs_page;
int err;
err = -EINVAL;
@@ -399,9 +400,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi->rsrc_clump_blocks)
sbi->rsrc_clump_blocks = 1;
- err = generic_check_addressable(sbi->alloc_blksz_shift,
- sbi->total_blocks);
- if (err) {
+ err = -EFBIG;
+ last_fs_block = sbi->total_blocks - 1;
+ last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
+ PAGE_CACHE_SHIFT;
+
+ if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
+ (last_fs_page > (pgoff_t)(~0ULL))) {
printk(KERN_ERR "hfs: filesystem size too large.\n");
goto out_free_vhdr;
}
@@ -525,8 +530,8 @@ out_close_cat_tree:
out_close_ext_tree:
hfs_btree_close(sbi->ext_tree);
out_free_vhdr:
- kfree(sbi->s_vhdr);
- kfree(sbi->s_backup_vhdr);
+ kfree(sbi->s_vhdr_buf);
+ kfree(sbi->s_backup_vhdr_buf);
out_unload_nls:
unload_nls(sbi->nls);
unload_nls(nls);
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 10e515a0d452..7daf4b852d1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -272,9 +272,9 @@ reread:
return 0;
out_free_backup_vhdr:
- kfree(sbi->s_backup_vhdr);
+ kfree(sbi->s_backup_vhdr_buf);
out_free_vhdr:
- kfree(sbi->s_vhdr);
+ kfree(sbi->s_vhdr_buf);
out:
return error;
}
diff --git a/fs/namei.c b/fs/namei.c
index b52bc685465f..0b3138de2a3b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -721,12 +721,6 @@ static int follow_automount(struct path *path, unsigned flags,
if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
return -EREMOTE;
- /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
- * and this is the terminal part of the path.
- */
- if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
- return -EISDIR; /* we actually want to stop here */
-
/* We don't want to mount if someone's just doing a stat -
* unless they're stat'ing a directory and appended a '/' to
* the name.
@@ -739,7 +733,7 @@ static int follow_automount(struct path *path, unsigned flags,
* of the daemon to instantiate them before they can be used.
*/
if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
- LOOKUP_OPEN | LOOKUP_CREATE)) &&
+ LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
path->dentry->d_inode)
return -EISDIR;
@@ -2616,6 +2610,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!dir->i_op->rmdir)
return -EPERM;
+ dget(dentry);
mutex_lock(&dentry->d_inode->i_mutex);
error = -EBUSY;
@@ -2636,6 +2631,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
out:
mutex_unlock(&dentry->d_inode->i_mutex);
+ dput(dentry);
if (!error)
d_delete(dentry);
return error;
@@ -3025,6 +3021,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
+ dget(new_dentry);
if (target)
mutex_lock(&target->i_mutex);
@@ -3045,6 +3042,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
out:
if (target)
mutex_unlock(&target->i_mutex);
+ dput(new_dentry);
if (!error)
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
d_move(old_dentry,new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 22bfe8273c68..b4febb29d3bb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1757,7 +1757,7 @@ static int do_loopback(struct path *path, char *old_name,
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+ err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
if (err)
return err;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1ec1a85fa71c..3e93e9a1bee1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -56,6 +56,9 @@ enum nfs4_session_state {
NFS4_SESSION_DRAINING,
};
+#define NFS4_RENEW_TIMEOUT 0x01
+#define NFS4_RENEW_DELEGATION_CB 0x02
+
struct nfs4_minor_version_ops {
u32 minor_version;
@@ -225,7 +228,7 @@ struct nfs4_state_recovery_ops {
};
struct nfs4_state_maintenance_ops {
- int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *);
+ int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
};
@@ -237,8 +240,6 @@ extern const struct inode_operations nfs4_dir_inode_operations;
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
-extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
@@ -349,6 +350,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
extern void nfs4_schedule_lease_recovery(struct nfs_client *);
extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
extern void nfs41_handle_recall_slot(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8c77039e7a81..4700fae1ada0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3374,9 +3374,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
if (task->tk_status < 0) {
/* Unless we're shutting down, schedule state recovery! */
- if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
+ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
+ return;
+ if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
nfs4_schedule_lease_recovery(clp);
- return;
+ return;
+ }
+ nfs4_schedule_path_down_recovery(clp);
}
do_renew_lease(clp, timestamp);
}
@@ -3386,7 +3390,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
.rpc_release = nfs4_renew_release,
};
-int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3395,9 +3399,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
};
struct nfs4_renewdata *data;
+ if (renew_flags == 0)
+ return 0;
if (!atomic_inc_not_zero(&clp->cl_count))
return -EIO;
- data = kmalloc(sizeof(*data), GFP_KERNEL);
+ data = kmalloc(sizeof(*data), GFP_NOFS);
if (data == NULL)
return -ENOMEM;
data->client = clp;
@@ -3406,7 +3412,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
&nfs4_renew_ops, data);
}
-int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5504,11 +5510,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
return rpc_run_task(&task_setup_data);
}
-static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
{
struct rpc_task *task;
int ret = 0;
+ if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
+ return 0;
task = _nfs41_proc_sequence(clp, cred);
if (IS_ERR(task))
ret = PTR_ERR(task);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index df8e7f3ca56d..dc484c0eae7f 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work)
struct rpc_cred *cred;
long lease;
unsigned long last, now;
+ unsigned renew_flags = 0;
ops = clp->cl_mvops->state_renewal_ops;
dprintk("%s: start\n", __func__);
@@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work)
last = clp->cl_last_renewal;
now = jiffies;
/* Are we close to a lease timeout? */
- if (time_after(now, last + lease/3)) {
+ if (time_after(now, last + lease/3))
+ renew_flags |= NFS4_RENEW_TIMEOUT;
+ if (nfs_delegations_present(clp))
+ renew_flags |= NFS4_RENEW_DELEGATION_CB;
+
+ if (renew_flags != 0) {
cred = ops->get_state_renewal_cred_locked(clp);
spin_unlock(&clp->cl_lock);
if (cred == NULL) {
- if (!nfs_delegations_present(clp)) {
+ if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
goto out;
}
nfs_expire_all_delegations(clp);
} else {
/* Queue an asynchronous RENEW. */
- ops->sched_state_renewal(clp, cred);
+ ops->sched_state_renewal(clp, cred, renew_flags);
put_rpccred(cred);
goto out_exp;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 72ab97ef3d61..39914be40b03 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
nfs4_schedule_state_manager(clp);
}
+void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
+{
+ nfs_handle_cb_pathdown(clp);
+ nfs4_schedule_state_manager(clp);
+}
+
static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
{
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b961ceac66b4..5b19b6aabe18 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2035,9 +2035,6 @@ static inline void nfs_initialise_sb(struct super_block *sb)
sb->s_blocksize = nfs_block_bits(server->wsize,
&sb->s_blocksize_bits);
- if (server->flags & NFS_MOUNT_NOAC)
- sb->s_flags |= MS_SYNCHRONOUS;
-
sb->s_bdi = &server->backing_dev_info;
nfs_super_set_maxbytes(sb, server->maxfilesize);
@@ -2249,6 +2246,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
if (server->flags & NFS_MOUNT_UNSHARED)
compare_super = NULL;
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
@@ -2361,6 +2362,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
if (server->flags & NFS_MOUNT_UNSHARED)
compare_super = NULL;
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
@@ -2628,6 +2633,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
if (server->flags & NFS4_MOUNT_UNSHARED)
compare_super = NULL;
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
@@ -2789,7 +2798,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
goto out_put_mnt_ns;
ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
- export_path, LOOKUP_FOLLOW, &path);
+ export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
nfs_referral_loop_unprotect();
put_mnt_ns(ns_private);
@@ -2916,6 +2925,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
if (server->flags & NFS4_MOUNT_UNSHARED)
compare_super = NULL;
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
@@ -3003,6 +3016,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
if (server->flags & NFS4_MOUNT_UNSHARED)
compare_super = NULL;
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b39b37f80913..c9bd2a6b7d4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -958,7 +958,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
if (!data)
goto out_bad;
data->pagevec[0] = page;
- nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags);
+ nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
list_add(&data->list, res);
requests++;
nbytes -= len;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 25b6a887adb9..5afaa58a8630 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -877,30 +877,54 @@ struct numa_maps_private {
struct numa_maps md;
};
-static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
+ unsigned long nr_pages)
{
int count = page_mapcount(page);
- md->pages++;
+ md->pages += nr_pages;
if (pte_dirty || PageDirty(page))
- md->dirty++;
+ md->dirty += nr_pages;
if (PageSwapCache(page))
- md->swapcache++;
+ md->swapcache += nr_pages;
if (PageActive(page) || PageUnevictable(page))
- md->active++;
+ md->active += nr_pages;
if (PageWriteback(page))
- md->writeback++;
+ md->writeback += nr_pages;
if (PageAnon(page))
- md->anon++;
+ md->anon += nr_pages;
if (count > md->mapcount_max)
md->mapcount_max = count;
- md->node[page_to_nid(page)]++;
+ md->node[page_to_nid(page)] += nr_pages;
+}
+
+static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pte_present(pte))
+ return NULL;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ return NULL;
+
+ if (PageReserved(page))
+ return NULL;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+ return NULL;
+
+ return page;
}
static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
@@ -912,26 +936,32 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *pte;
md = walk->private;
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- do {
- struct page *page;
- int nid;
+ spin_lock(&walk->mm->page_table_lock);
+ if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_splitting(*pmd)) {
+ spin_unlock(&walk->mm->page_table_lock);
+ wait_split_huge_page(md->vma->anon_vma, pmd);
+ } else {
+ pte_t huge_pte = *(pte_t *)pmd;
+ struct page *page;
- if (!pte_present(*pte))
- continue;
+ page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ if (page)
+ gather_stats(page, md, pte_dirty(huge_pte),
+ HPAGE_PMD_SIZE/PAGE_SIZE);
+ spin_unlock(&walk->mm->page_table_lock);
+ return 0;
+ }
+ } else {
+ spin_unlock(&walk->mm->page_table_lock);
+ }
- page = vm_normal_page(md->vma, addr, *pte);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ do {
+ struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
if (!page)
continue;
-
- if (PageReserved(page))
- continue;
-
- nid = page_to_nid(page);
- if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
- continue;
-
- gather_stats(page, md, pte_dirty(*pte));
+ gather_stats(page, md, pte_dirty(*pte), 1);
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
@@ -952,7 +982,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
return 0;
md = walk->private;
- gather_stats(page, md, pte_dirty(*pte));
+ gather_stats(page, md, pte_dirty(*pte), 1);
return 0;
}
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b34bdb25490c..10b6be3ca280 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -355,7 +355,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
* resolution (think about autofs) and thus deadlocks could arise.
*/
if (cmds == Q_QUOTAON) {
- ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
if (ret)
pathp = ERR_PTR(ret);
else
diff --git a/fs/stat.c b/fs/stat.c
index ba5316ffac61..78a3aa83c7ea 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
if (!(flag & AT_SYMLINK_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
- if (flag & AT_NO_AUTOMOUNT)
- lookup_flags |= LOOKUP_NO_AUTOMOUNT;
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..8c37dde4c521 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1300,6 +1300,7 @@ xfs_end_io_direct_write(
bool is_async)
{
struct xfs_ioend *ioend = iocb->private;
+ struct inode *inode = ioend->io_inode;
/*
* blockdev_direct_IO can return an error even after the I/O
@@ -1331,7 +1332,7 @@ xfs_end_io_direct_write(
}
/* XXX: probably should move into the real I/O completion handler */
- inode_dio_done(ioend->io_inode);
+ inode_dio_done(inode);
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cac2ecfa6746..ef43fce519a1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -629,7 +629,7 @@ xfs_buf_item_push(
* the xfsbufd to get this buffer written. We have to unlock the buffer
* to allow the xfsbufd to write it, too.
*/
-STATIC void
+STATIC bool
xfs_buf_item_pushbuf(
struct xfs_log_item *lip)
{
@@ -643,6 +643,7 @@ xfs_buf_item_pushbuf(
xfs_buf_delwri_promote(bp);
xfs_buf_relse(bp);
+ return true;
}
STATIC void
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c8..bb3f71d236d2 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait(
* search the buffer cache can be a time consuming thing, and AIL lock is a
* spinlock.
*/
-STATIC void
+STATIC bool
xfs_qm_dquot_logitem_pushbuf(
struct xfs_log_item *lip)
{
struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
struct xfs_dquot *dqp = qlip->qli_dquot;
struct xfs_buf *bp;
+ bool ret = true;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf(
if (completion_done(&dqp->q_flush) ||
!(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_dqunlock(dqp);
- return;
+ return true;
}
bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
xfs_dqunlock(dqp);
if (!bp)
- return;
+ return true;
if (XFS_BUF_ISDELAYWRITE(bp))
xfs_buf_delwri_promote(bp);
+ if (xfs_buf_ispinned(bp))
+ ret = false;
xfs_buf_relse(bp);
+ return ret;
}
/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 588406dc6a35..836ad80d4f2b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -708,13 +708,14 @@ xfs_inode_item_committed(
* marked delayed write. If that's the case, we'll promote it and that will
* allow the caller to write the buffer by triggering the xfsbufd to run.
*/
-STATIC void
+STATIC bool
xfs_inode_item_pushbuf(
struct xfs_log_item *lip)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
struct xfs_buf *bp;
+ bool ret = true;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
@@ -725,7 +726,7 @@ xfs_inode_item_pushbuf(
if (completion_done(&ip->i_flush) ||
!(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return;
+ return true;
}
bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
@@ -733,10 +734,13 @@ xfs_inode_item_pushbuf(
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!bp)
- return;
+ return true;
if (XFS_BUF_ISDELAYWRITE(bp))
xfs_buf_delwri_promote(bp);
+ if (xfs_buf_ispinned(bp))
+ ret = false;
xfs_buf_relse(bp);
+ return ret;
}
/*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 1e8a45e74c3e..828662f70d64 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -68,6 +68,8 @@
#include <linux/ctype.h>
#include <linux/writeback.h>
#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/list_sort.h>
#include <asm/page.h>
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2366c54cc4fa..5cf06b85fd9d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1652,24 +1652,13 @@ xfs_init_workqueues(void)
*/
xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
if (!xfs_syncd_wq)
- goto out;
-
- xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
- if (!xfs_ail_wq)
- goto out_destroy_syncd;
-
+ return -ENOMEM;
return 0;
-
-out_destroy_syncd:
- destroy_workqueue(xfs_syncd_wq);
-out:
- return -ENOMEM;
}
STATIC void
xfs_destroy_workqueues(void)
{
- destroy_workqueue(xfs_ail_wq);
destroy_workqueue(xfs_syncd_wq);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 06a9759b6352..53597f4db9b5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops {
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_push)(xfs_log_item_t *);
- void (*iop_pushbuf)(xfs_log_item_t *);
+ bool (*iop_pushbuf)(xfs_log_item_t *);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
} xfs_item_ops_t;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c15aa29fa169..3a1e7ca54c2d 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,6 @@
#include "xfs_trans_priv.h"
#include "xfs_error.h"
-struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
-
#ifdef DEBUG
/*
* Check that the list is sorted as it should be.
@@ -356,16 +354,10 @@ xfs_ail_delete(
xfs_trans_ail_cursor_clear(ailp, lip);
}
-/*
- * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
- * to run at a later time if there is more work to do to complete the push.
- */
-STATIC void
-xfs_ail_worker(
- struct work_struct *work)
+static long
+xfsaild_push(
+ struct xfs_ail *ailp)
{
- struct xfs_ail *ailp = container_of(to_delayed_work(work),
- struct xfs_ail, xa_work);
xfs_mount_t *mp = ailp->xa_mount;
struct xfs_ail_cursor cur;
xfs_log_item_t *lip;
@@ -427,8 +419,13 @@ xfs_ail_worker(
case XFS_ITEM_PUSHBUF:
XFS_STATS_INC(xs_push_ail_pushbuf);
- IOP_PUSHBUF(lip);
- ailp->xa_last_pushed_lsn = lsn;
+
+ if (!IOP_PUSHBUF(lip)) {
+ stuck++;
+ flush_log = 1;
+ } else {
+ ailp->xa_last_pushed_lsn = lsn;
+ }
push_xfsbufd = 1;
break;
@@ -440,7 +437,6 @@ xfs_ail_worker(
case XFS_ITEM_LOCKED:
XFS_STATS_INC(xs_push_ail_locked);
- ailp->xa_last_pushed_lsn = lsn;
stuck++;
break;
@@ -501,20 +497,6 @@ out_done:
/* We're past our target or empty, so idle */
ailp->xa_last_pushed_lsn = 0;
- /*
- * We clear the XFS_AIL_PUSHING_BIT first before checking
- * whether the target has changed. If the target has changed,
- * this pushes the requeue race directly onto the result of the
- * atomic test/set bit, so we are guaranteed that either the
- * the pusher that changed the target or ourselves will requeue
- * the work (but not both).
- */
- clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
- smp_rmb();
- if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
- test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
- return;
-
tout = 50;
} else if (XFS_LSN_CMP(lsn, target) >= 0) {
/*
@@ -537,9 +519,30 @@ out_done:
tout = 20;
}
- /* There is more to do, requeue us. */
- queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
- msecs_to_jiffies(tout));
+ return tout;
+}
+
+static int
+xfsaild(
+ void *data)
+{
+ struct xfs_ail *ailp = data;
+ long tout = 0; /* milliseconds */
+
+ while (!kthread_should_stop()) {
+ if (tout && tout <= 20)
+ __set_current_state(TASK_KILLABLE);
+ else
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(tout ?
+ msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+
+ try_to_freeze();
+
+ tout = xfsaild_push(ailp);
+ }
+
+ return 0;
}
/*
@@ -574,8 +577,9 @@ xfs_ail_push(
*/
smp_wmb();
xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
- if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
- queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
+ smp_wmb();
+
+ wake_up_process(ailp->xa_task);
}
/*
@@ -813,9 +817,18 @@ xfs_trans_ail_init(
INIT_LIST_HEAD(&ailp->xa_ail);
INIT_LIST_HEAD(&ailp->xa_cursors);
spin_lock_init(&ailp->xa_lock);
- INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
+
+ ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+ ailp->xa_mount->m_fsname);
+ if (IS_ERR(ailp->xa_task))
+ goto out_free_ailp;
+
mp->m_ail = ailp;
return 0;
+
+out_free_ailp:
+ kmem_free(ailp);
+ return ENOMEM;
}
void
@@ -824,6 +837,6 @@ xfs_trans_ail_destroy(
{
struct xfs_ail *ailp = mp->m_ail;
- cancel_delayed_work_sync(&ailp->xa_work);
+ kthread_stop(ailp->xa_task);
kmem_free(ailp);
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 212946b97239..22750b5e4a8f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -64,23 +64,17 @@ struct xfs_ail_cursor {
*/
struct xfs_ail {
struct xfs_mount *xa_mount;
+ struct task_struct *xa_task;
struct list_head xa_ail;
xfs_lsn_t xa_target;
struct list_head xa_cursors;
spinlock_t xa_lock;
- struct delayed_work xa_work;
xfs_lsn_t xa_last_pushed_lsn;
- unsigned long xa_flags;
};
-#define XFS_AIL_PUSHING_BIT 0
-
/*
* From xfs_trans_ail.c
*/
-
-extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
-
void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur,
struct xfs_log_item **log_items, int nr_items,