aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/autofs4/root.c4
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/bio.c2
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/btrfs/Kconfig3
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/backref.c5
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/btrfs_inode.h20
-rw-r--r--fs/btrfs/check-integrity.c3
-rw-r--r--fs/btrfs/compression.c4
-rw-r--r--fs/btrfs/ctree.c68
-rw-r--r--fs/btrfs/ctree.h150
-rw-r--r--fs/btrfs/delayed-inode.c147
-rw-r--r--fs/btrfs/delayed-inode.h1
-rw-r--r--fs/btrfs/delayed-ref.c82
-rw-r--r--fs/btrfs/delayed-ref.h52
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c227
-rw-r--r--fs/btrfs/disk-io.h7
-rw-r--r--fs/btrfs/extent-tree.c584
-rw-r--r--fs/btrfs/extent_io.c138
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file-item.c67
-rw-r--r--fs/btrfs/file.c57
-rw-r--r--fs/btrfs/free-space-cache.c62
-rw-r--r--fs/btrfs/inode.c1064
-rw-r--r--fs/btrfs/ioctl.c211
-rw-r--r--fs/btrfs/ioctl.h502
-rw-r--r--fs/btrfs/locking.c5
-rw-r--r--fs/btrfs/ordered-data.c98
-rw-r--r--fs/btrfs/ordered-data.h14
-rw-r--r--fs/btrfs/print-tree.c1
-rw-r--r--fs/btrfs/qgroup.c55
-rw-r--r--fs/btrfs/raid56.c2099
-rw-r--r--fs/btrfs/raid56.h51
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/scrub.c10
-rw-r--r--fs/btrfs/send.c53
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/super.c89
-rw-r--r--fs/btrfs/sysfs.c1
-rw-r--r--fs/btrfs/transaction.c151
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c19
-rw-r--r--fs/btrfs/tree-log.c166
-rw-r--r--fs/btrfs/ulist.c2
-rw-r--r--fs/btrfs/volumes.c636
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/buffer.c10
-rw-r--r--fs/ceph/addr.c38
-rw-r--r--fs/ceph/caps.c32
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/ioctl.c6
-rw-r--r--fs/ceph/mds_client.c33
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/mdsmap.c12
-rw-r--r--fs/ceph/strings.c4
-rw-r--r--fs/ceph/super.c7
-rw-r--r--fs/ceph/super.h10
-rw-r--r--fs/ceph/xattr.c214
-rw-r--r--fs/cifs/cifsfs.c5
-rw-r--r--fs/cifs/cifssmb.c5
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/file.c12
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/extents_status.c39
-rw-r--r--fs/ext4/inode.c8
-rw-r--r--fs/ext4/mballoc.c8
-rw-r--r--fs/ext4/resize.c6
-rw-r--r--fs/ext4/super.c61
-rw-r--r--fs/fs-writeback.c60
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--fs/lockd/clntlock.c14
-rw-r--r--fs/lockd/clntproc.c6
-rw-r--r--fs/lockd/host.c1
-rw-r--r--fs/lockd/mon.c1
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/nfs/cache_lib.c12
-rw-r--r--fs/nfs/cache_lib.h2
-rw-r--r--fs/nfs/dns_resolve.c67
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/nfs4client.c1
-rw-r--r--fs/nfs/nfs4filelayout.c6
-rw-r--r--fs/nfs/nfs4filelayout.h2
-rw-r--r--fs/nfs/nfs4filelayoutdev.c1
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c21
-rw-r--r--fs/nfs/pnfs.c21
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/super.c1
-rw-r--r--fs/nfs/unlink.c20
-rw-r--r--fs/nfsd/cache.h17
-rw-r--r--fs/nfsd/export.c16
-rw-r--r--fs/nfsd/fault_inject.c2
-rw-r--r--fs/nfsd/nfs4idmap.c16
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4recover.c6
-rw-r--r--fs/nfsd/nfs4state.c101
-rw-r--r--fs/nfsd/nfs4xdr.c21
-rw-r--r--fs/nfsd/nfscache.c352
-rw-r--r--fs/nfsd/nfsctl.c81
-rw-r--r--fs/nfsd/nfssvc.c6
-rw-r--r--fs/nfsd/xdr4.h2
-rw-r--r--fs/open.c15
-rw-r--r--fs/read_write.c9
-rw-r--r--fs/timerfd.c10
111 files changed, 6401 insertions, 2007 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 230bd2aad4f..9bd16255dd9 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -383,8 +383,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
goto done;
}
} else {
- if (!simple_empty(dentry))
+ if (!simple_empty(dentry)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 03bc1d347d8..3db70dae40d 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
while (wq) {
nwq = wq->next;
wq->status = -ENOENT; /* Magic is gone - report failure */
- if (wq->name.name) {
- kfree(wq->name.name);
- wq->name.name = NULL;
- }
+ kfree(wq->name.name);
+ wq->name.name = NULL;
wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
wq = nwq;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a5702d74d2b..3939829f6c5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -322,6 +322,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return 0;
}
+#ifndef elf_map
+
static unsigned long elf_map(struct file *filep, unsigned long addr,
struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -356,6 +358,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+#endif /* !elf_map */
+
static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
diff --git a/fs/bio.c b/fs/bio.c
index b96fc6ce485..bb5768f59b3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error)
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
error = -EIO;
+ trace_block_bio_complete(bio, error);
+
if (bio->bi_end_io)
bio->bi_end_io(bio, error);
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 53f5fae5cfb..aea605c98ba 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_logical_block_size(bdev);
- bdev->bd_inode->i_size = size;
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, size);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
@@ -1118,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret && !bdev->bd_openers) {
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ccd25ba7a9a..9a8622a5b86 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,9 @@ config BTRFS_FS
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
+ select RAID6_PQ
+ select XOR_BLOCKS
+
help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee5..3932224f99e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be87..bd605c87adf 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
err = __resolve_indirect_ref(fs_info, search_commit_root,
time_seq, ref, parents,
extent_item_pos);
- if (err) {
- if (ret == 0)
- ret = err;
+ if (err)
continue;
- }
/* we put the first parent into the ref at hand */
ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca7945..310a7f6d09b 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
#ifndef __BTRFS_BACKREF__
#define __BTRFS_BACKREF__
-#include "ioctl.h"
+#include <linux/btrfs.h>
#include "ulist.h"
#include "extent_io.h"
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f..d9b97d4960e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
#define BTRFS_INODE_NEEDS_FULL_SYNC 7
#define BTRFS_INODE_COPY_EVERYTHING 8
+#define BTRFS_INODE_IN_DELALLOC_LIST 9
+#define BTRFS_INODE_READDIO_NEED_LOCK 10
/* in memory btrfs inode */
struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
return 0;
}
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+ set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+ smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b..18af6f48781 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
(bh->b_data + (dev_bytenr & 4095));
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
- strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
- sizeof(super_tmp->magic)) ||
+ super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
btrfs_super_leafsize(super_tmp) != state->metablock_size ||
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e..15b94089abc 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_size)
- ret = io_tree->ops->merge_bio_hook(page, 0,
+ ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
PAGE_CACHE_SIZE,
bio, 0);
else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->index = em_start >> PAGE_CACHE_SHIFT;
if (comp_bio->bi_size)
- ret = tree->ops->merge_bio_hook(page, 0,
+ ret = tree->ops->merge_bio_hook(READ, page, 0,
PAGE_CACHE_SIZE,
comp_bio, 0);
else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9..ecd25a1b4e5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
+ /* Fallthrough */
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
__tree_mod_log_rewind(eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
return eb_rewin;
}
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
*/
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, int cache_only, u64 *last_ret,
+ int start_slot, u64 *last_ret,
struct btrfs_key *progress)
{
struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
parent_level = btrfs_header_level(parent);
- if (cache_only && parent_level != 1)
- return 0;
WARN_ON(trans->transaction != root->fs_info->running_transaction);
WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
else
uptodate = 0;
if (!cur || !uptodate) {
- if (cache_only) {
- free_extent_buffer(cur);
- continue;
- }
if (!cur) {
cur = read_tree_block(root, blocknr,
blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_key *max_key,
- struct btrfs_path *path, int cache_only,
+ struct btrfs_path *path,
u64 min_trans)
{
struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
if (sret && slot > 0)
slot--;
/*
- * check this node pointer against the cache_only and
- * min_trans parameters. If it isn't in cache or is too
- * old, skip to the next one.
+ * check this node pointer against the min_trans parameters.
+ * If it is too old, old, skip to the next one.
*/
while (slot < nritems) {
u64 blockptr;
u64 gen;
- struct extent_buffer *tmp;
- struct btrfs_disk_key disk_key;
blockptr = btrfs_node_blockptr(cur, slot);
gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
slot++;
continue;
}
- if (!cache_only)
- break;
-
- if (max_key) {
- btrfs_node_key(cur, &disk_key, slot);
- if (comp_keys(&disk_key, max_key) >= 0) {
- ret = 1;
- goto out;
- }
- }
-
- tmp = btrfs_find_tree_block(root, blockptr,
- btrfs_level_size(root, level - 1));
-
- if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
- free_extent_buffer(tmp);
- break;
- }
- if (tmp)
- free_extent_buffer(tmp);
- slot++;
+ break;
}
find_next_key:
/*
@@ -4934,7 +4906,7 @@ find_next_key:
path->slots[level] = slot;
btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
- cache_only, min_trans);
+ min_trans);
if (sret == 0) {
btrfs_release_path(path);
goto again;
@@ -5399,8 +5371,7 @@ out:
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
*
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *key, int level,
- int cache_only, u64 min_trans)
+ struct btrfs_key *key, int level, u64 min_trans)
{
int slot;
struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else {
- u64 blockptr = btrfs_node_blockptr(c, slot);
u64 gen = btrfs_node_ptr_generation(c, slot);
- if (cache_only) {
- struct extent_buffer *cur;
- cur = btrfs_find_tree_block(root, blockptr,
- btrfs_level_size(root, level - 1));
- if (!cur ||
- btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
- slot++;
- if (cur)
- free_extent_buffer(cur);
- goto next;
- }
- free_extent_buffer(cur);
- }
if (gen < min_trans) {
slot++;
goto next;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727..0d82922179d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
+#include <linux/btrfs.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
-#include "ioctl.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
/*
* File system states
*/
+#define BTRFS_FS_STATE_ERROR 0
+#define BTRFS_FS_STATE_REMOUNTING 1
+/* Super block flags */
/* Errors detected */
#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
@@ -502,6 +507,7 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
/*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES 5
+
+enum btrfs_raid_types {
+ BTRFS_RAID_RAID10,
+ BTRFS_RAID_RAID1,
+ BTRFS_RAID_DUP,
+ BTRFS_RAID_RAID0,
+ BTRFS_RAID_SINGLE,
+ BTRFS_RAID_RAID5,
+ BTRFS_RAID_RAID6,
+ BTRFS_NR_RAID_TYPES
+};
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
/*
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
u64 flags;
u64 sectorsize;
u64 cache_generation;
+
+ /* for raid56, this is a full stripe, without parity */
+ unsigned long full_stripe_len;
+
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
u64 seq;
};
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ wait_queue_head_t wait;
+ spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
/* fs_info */
struct reloc_control;
struct btrfs_device;
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {
/* block group cache stuff */
spinlock_t block_group_cache_lock;
+ u64 first_logical_byte;
struct rb_root block_group_cache_tree;
/* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long mount_opt;
unsigned long compress_type:4;
+ /*
+ * It is a suggestive number, the read side is safe even it gets a
+ * wrong number because we will write out the data into a regular
+ * extent. The write side(mount/remount) is under ->s_umount lock,
+ * so it is also safe.
+ */
u64 max_inline;
+ /*
+ * Protected by ->chunk_mutex and sb->s_umount.
+ *
+ * The reason that we use two lock to protect it is because only
+ * remount and mount operations can change it and these two operations
+ * are under sb->s_umount, but the read side (chunk allocation) can not
+ * acquire sb->s_umount or the deadlock would happen. So we use two
+ * locks to protect it. On the write side, we must acquire two locks,
+ * and on the read side, we just need acquire one of them.
+ */
u64 alloc_start;
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+
+ /* this is used during read/modify/write to make sure
+ * no two ios are trying to mod the same stripe at the same
+ * time
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
*/
struct list_head ordered_extents;
+ spinlock_t delalloc_lock;
/*
* all of the inodes that have delalloc bytes. It is possible for
* this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
struct list_head delalloc_inodes;
/*
- * special rename and truncate targets that must be on disk before
- * we're allowed to commit. This is basically the ext3 style
- * data=ordered list.
- */
- struct list_head ordered_operations;
-
- /*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming after reads. This is because readers
* can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
struct btrfs_workers flush_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
+ struct btrfs_workers endio_raid56_workers;
+ struct btrfs_workers rmw_workers;
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {
u64 total_pinned;
- /* protected by the delalloc lock, used to keep from writing
- * metadata until there is a nice batch
- */
- u64 dirty_metadata_bytes;
+ /* used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {
struct reloc_control *reloc_ctl;
- spinlock_t delalloc_lock;
- u64 delalloc_bytes;
-
/* data_alloc_cluster is only used in ssd mode */
struct btrfs_free_cluster data_alloc_cluster;
@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
struct rb_root defrag_inodes;
atomic_t defrag_running;
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
/*
* these three are in extended format (availability of single
* chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
u64 qgroup_seq;
/* filesystem state */
- u64 fs_state;
+ unsigned long fs_state;
struct btrfs_delayed_root *delayed_root;
@@ -1623,6 +1691,9 @@ struct btrfs_root {
struct list_head root_list;
+ spinlock_t log_extents_lock[2];
+ struct list_head logged_list[2];
+
spinlock_t orphan_lock;
atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
/*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems,
+ u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
- int cache_only, u64 min_trans);
+ u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_key *max_key,
- struct btrfs_path *path, int cache_only,
+ struct btrfs_path *path,
u64 min_trans);
enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
int find_higher, int return_any);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, int cache_only, u64 *last_ret,
+ int start_slot, u64 *last_ret,
struct btrfs_key *progress);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int cache_only);
+ struct btrfs_root *root);
/* sysfs.c */
int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
#define btrfs_panic(fs_info, errno, fmt, args...) \
do { \
- struct btrfs_fs_info *_i = (fs_info); \
- __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \
- BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
} while (0)
/* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
return 1;
return 0;
}
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01..0b278b117cb 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *delayed_item)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
char *ptr;
int ret;
@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
- item = btrfs_item_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
}
}
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
{
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
int ret;
- mutex_lock(&node->mutex);
- if (!node->inode_dirty) {
- mutex_unlock(&node->mutex);
- return 0;
- }
-
key.objectid = node->inode_id;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
+
ret = btrfs_lookup_inode(trans, root, path, &key, 1);
if (ret > 0) {
btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return -ENOENT;
} else if (ret < 0) {
- mutex_unlock(&node->mutex);
return ret;
}
@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_delayed_inode_release_metadata(root, node);
btrfs_release_delayed_inode(node);
- mutex_unlock(&node->mutex);
return 0;
}
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ mutex_lock(&node->mutex);
+ if (!node->inode_dirty) {
+ mutex_unlock(&node->mutex);
+ return 0;
+ }
+
+ ret = __btrfs_update_delayed_inode(trans, root, path, node);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+ return ret;
+}
+
/*
* Called when committing the transaction.
* Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int nr)
{
- struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
curr_node = btrfs_first_delayed_node(delayed_root);
while (curr_node && (!count || (count && nr--))) {
- curr_root = curr_node->root;
- ret = btrfs_insert_delayed_items(trans, path, curr_root,
- curr_node);
- if (!ret)
- ret = btrfs_delete_delayed_items(trans, path,
- curr_root, curr_node);
- if (!ret)
- ret = btrfs_update_delayed_inode(trans, curr_root,
- path, curr_node);
+ ret = __btrfs_commit_inode_delayed_items(trans, path,
+ curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
return __btrfs_run_delayed_items(trans, root, nr);
}
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode)
{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret;
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!delayed_node->count) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
- ret = btrfs_insert_delayed_items(trans, path, node->root, node);
- if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, node->root, node);
- if (!ret)
- ret = btrfs_update_delayed_inode(trans, node->root, path, node);
- btrfs_free_path(path);
+ ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+ btrfs_free_path(path);
trans->block_rsv = block_rsv;
+
return ret;
}
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
- struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
int ret;
if (!delayed_node)
return 0;
mutex_lock(&delayed_node->mutex);
- if (!delayed_node->count) {
+ if (!delayed_node->inode_dirty) {
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
return 0;
}
mutex_unlock(&delayed_node->mutex);
- ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+ trans = btrfs_join_transaction(delayed_node->root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto trans_out;
+ }
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->inode_dirty)
+ ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+ path, delayed_node);
+ else
+ ret = 0;
+ mutex_unlock(&delayed_node->mutex);
+
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+trans_out:
+ btrfs_end_transaction(trans, delayed_node->root);
+ btrfs_btree_balance_dirty(delayed_node->root);
+out:
btrfs_release_delayed_node(delayed_node);
+
return ret;
}
@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
int need_requeue = 0;
- int ret;
async_node = container_of(work, struct btrfs_async_delayed_node, work);
@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
block_rsv = trans->block_rsv;
trans->block_rsv = &root->fs_info->delayed_block_rsv;
- ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
- if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, root,
- delayed_node);
-
- if (!ret)
- btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
/*
* Maybe new delayed items have been inserted, so we need requeue
* the work. Besides that, we must dequeue the empty delayed nodes
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baee..78b6ad0fc66 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
/* Used for evicting the inode. */
void btrfs_remove_delayed_node(struct inode *inode);
void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae941177339..b7a0641ead7 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
#include "delayed-ref.h"
#include "transaction.h"
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
* we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
return 1;
}
+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+ struct list_head *pos, *q;
+
+ list_for_each_safe(pos, q, cluster)
+ list_del_init(pos);
+}
+
/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref->extent_op->flags_to_set;
existing_ref->extent_op->update_flags = 1;
}
- kfree(ref->extent_op);
+ btrfs_free_delayed_extent_op(ref->extent_op);
}
}
/*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(head_ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(full_ref);
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(full_ref);
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
BUG_ON(extent_op && extent_op->is_data);
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
return -ENOMEM;
}
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
BUG_ON(extent_op && !extent_op->is_data);
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
return -ENOMEM;
}
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
return btrfs_delayed_node_to_head(ref);
return NULL;
}
+
+void btrfs_delayed_ref_exit(void)
+{
+ if (btrfs_delayed_ref_head_cachep)
+ kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+ if (btrfs_delayed_tree_ref_cachep)
+ kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+ if (btrfs_delayed_data_ref_cachep)
+ kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ if (btrfs_delayed_extent_op_cachep)
+ kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int btrfs_delayed_ref_init(void)
+{
+ btrfs_delayed_ref_head_cachep = kmem_cache_create(
+ "btrfs_delayed_ref_head",
+ sizeof(struct btrfs_delayed_ref_head), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_ref_head_cachep)
+ goto fail;
+
+ btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_tree_ref",
+ sizeof(struct btrfs_delayed_tree_ref), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_tree_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_data_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_data_ref",
+ sizeof(struct btrfs_delayed_data_ref), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_data_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_extent_op_cachep = kmem_cache_create(
+ "btrfs_delayed_extent_op",
+ sizeof(struct btrfs_delayed_extent_op), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_extent_op_cachep)
+ goto fail;
+
+ return 0;
+fail:
+ btrfs_delayed_ref_exit();
+ return -ENOMEM;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c9d703693df..f75fcaf79ae 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
unsigned long num_heads_ready;
/*
+ * bumped when someone is making progress on the delayed
+ * refs, so that other procs know they are just adding to
+ * contention intead of helping
+ */
+ atomic_t procs_running_refs;
+ atomic_t ref_seq;
+ wait_queue_head_t wait;
+
+ /*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
* to be run right away
@@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root {
u64 run_delayed_start;
};
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+ return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+ if (op)
+ kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
WARN_ON(atomic_read(&ref->refs) == 0);
if (atomic_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree);
- kfree(ref);
+ switch (ref->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ break;
+ case 0:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+ break;
+ default:
+ BUG();
+ }
}
}
@@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+ mutex_unlock(&head->mutex);
+}
+
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
+void btrfs_release_ref_cluster(struct list_head *cluster);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66dbc8dbddf..7ba7b3900cb 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return ret;
+ }
btrfs_wait_ordered_extents(root, 0);
trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8f652dc940..02369a3c162 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "raid56.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+ struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
@@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
struct extent_io_tree *tree;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 found_start;
struct extent_buffer *eb;
@@ -639,8 +641,15 @@ err:
btree_readahead_hook(root, eb, eb->start, ret);
}
- if (ret)
+ if (ret) {
+ /*
+ * our io error hook is going to dec the io pages
+ * again, we have to make sure it has something
+ * to decrement
+ */
+ atomic_inc(&eb->io_pages);
clear_extent_buffer_uptodate(eb);
+ }
free_extent_buffer(eb);
out:
return ret;
@@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb = (struct extent_buffer *)page->private;
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
eb->read_mirror = failed_mirror;
+ atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, -EIO);
return -EIO; /* we fixed nothing */
@@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
end_io_wq->work.flags = 0;
if (bio->bi_rw & REQ_WRITE) {
- if (end_io_wq->metadata == 1)
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
btrfs_queue_worker(&fs_info->endio_meta_write_workers,
&end_io_wq->work);
- else if (end_io_wq->metadata == 2)
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
btrfs_queue_worker(&fs_info->endio_freespace_worker,
&end_io_wq->work);
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ btrfs_queue_worker(&fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
} else {
- if (end_io_wq->metadata)
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ btrfs_queue_worker(&fs_info->endio_raid56_workers,
+ &end_io_wq->work);
+ else if (end_io_wq->metadata)
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
else
@@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
* 0 - if data
* 1 - if normal metadta
* 2 - if writing to the free space cache area
+ * 3 - raid parity work
*/
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata)
@@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_io_tree *tree;
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
tree = &BTRFS_I(mapping->host)->io_tree;
if (wbc->sync_mode == WB_SYNC_NONE) {
- struct btrfs_root *root = BTRFS_I(mapping->host)->root;
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
if (wbc->for_kupdate)
return 0;
+ fs_info = BTRFS_I(mapping->host)->root->fs_info;
/* this is a bit racy, but that's ok */
- num_dirty = root->fs_info->dirty_metadata_bytes;
- if (num_dirty < thresh)
+ ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret < 0)
return 0;
}
return btree_write_cache_pages(mapping, wbc);
@@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
if (btrfs_header_generation(buf) ==
- root->fs_info->running_transaction->transid) {
+ fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= buf->len)
- root->fs_info->dirty_metadata_bytes -= buf->len;
- else {
- spin_unlock(&root->fs_info->delalloc_lock);
- btrfs_panic(root->fs_info, -EOVERFLOW,
- "Can't clear %lu bytes from "
- " dirty_mdatadata_bytes (%llu)",
- buf->len,
- root->fs_info->dirty_metadata_bytes);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
-
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(buf);
@@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->logged_list[0]);
+ INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->accounting_lock);
+ spin_lock_init(&root->log_extents_lock[0]);
+ spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
init_waitqueue_head(&root->log_writer_wait);
@@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb,
goto fail_srcu;
}
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ if (ret) {
+ err = ret;
+ goto fail_bdi;
+ }
+ fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ if (ret) {
+ err = ret;
+ goto fail_dirty_metadata_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_bdi;
+ goto fail_delalloc_bytes;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
- INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->trans_lock);
@@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->tree_mod_seq_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
+ seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
+ fs_info->first_logical_byte = (u64)-1;
extent_io_tree_init(&fs_info->freed_extents[0],
fs_info->btree_inode->i_mapping);
@@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ ret = btrfs_alloc_stripe_hash_table(fs_info);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
/* check FS state, whether FS is broken. */
- fs_info->fs_state |= btrfs_super_flags(disk_super);
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
if (ret) {
@@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb,
leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
+ fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
* mixed block groups end up with duplicate but slightly offset
@@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->endio_meta_write_workers,
"endio-meta-write", fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_raid56_workers,
+ "endio-raid56", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->rmw_workers,
+ "rmw", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
fs_info->thread_pool_size,
&fs_info->generic_worker);
@@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb,
*/
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
+ fs_info->endio_raid56_workers.idle_thresh = 4;
+ fs_info->rmw_workers.idle_thresh = 2;
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->fixup_workers);
ret |= btrfs_start_workers(&fs_info->endio_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+ ret |= btrfs_start_workers(&fs_info->rmw_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
- sizeof(disk_super->magic))) {
+ if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -2694,13 +2742,13 @@ fail_cleaner:
* kthreads
*/
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_block_groups:
btrfs_free_block_groups(fs_info);
fail_tree_roots:
free_root_pointers(fs_info, 1);
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
@@ -2710,6 +2758,8 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_raid56_workers);
+ btrfs_stop_workers(&fs_info->rmw_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2721,13 +2771,17 @@ fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
+ btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
@@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
super = (struct btrfs_super_block *)bh->b_data;
if (btrfs_super_bytenr(super) != bytenr ||
- strncmp((char *)(&super->magic), BTRFS_MAGIC,
- sizeof(super->magic))) {
+ super->magic != cpu_to_le64(BTRFS_MAGIC)) {
brelse(bh);
continue;
}
@@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
== 0)))
num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1
- &&
- (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10)))
- num_tolerated_disk_barrier_failures = 1;
+ else if (num_tolerated_disk_barrier_failures > 1) {
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ num_tolerated_disk_barrier_failures = 1;
+ } else if (flags &
+ BTRFS_BLOCK_GROUP_RAID5) {
+ num_tolerated_disk_barrier_failures = 2;
+ }
+ }
}
}
up_read(&sinfo->groups_sem);
@@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ btrfs_free_log(NULL, root);
+ btrfs_free_log_root_tree(NULL, fs_info);
+ }
+
__btrfs_remove_free_space_cache(root->free_ino_pinned);
__btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
@@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_error_commit_super(root);
btrfs_put_block_group_cache(fs_info);
@@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_qgroup_config(root->fs_info);
- if (fs_info->delalloc_bytes) {
- printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
- (unsigned long long)fs_info->delalloc_bytes);
+ if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+ printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+ percpu_counter_sum(&fs_info->delalloc_bytes));
}
free_extent_buffer(fs_info->extent_root->node);
@@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_raid56_workers);
+ btrfs_stop_workers(&fs_info->rmw_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root)
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
+ btrfs_free_stripe_hash_table(fs_info);
+
return 0;
}
@@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)transid,
(unsigned long long)root->fs_info->generation);
was_dirty = set_extent_buffer_dirty(buf);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += buf->len;
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ if (!was_dirty)
+ __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+ buf->len,
+ root->fs_info->dirty_metadata_batch);
}
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
* looks as though older kernels can get into trouble with
* this code, they end up stuck in balance_dirty_pages forever
*/
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
+ int ret;
if (current->flags & PF_MEMALLOC)
return;
@@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
if (flush_delayed)
btrfs_balance_delayed_items(root);
- num_dirty = root->fs_info->dirty_metadata_bytes;
-
- if (num_dirty > thresh) {
+ ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret > 0) {
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
@@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
}
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+ struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
- list_splice_init(&root->fs_info->ordered_operations, &splice);
+ list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
@@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
- struct list_head splice;
struct btrfs_ordered_extent *ordered;
- struct inode *inode;
-
- INIT_LIST_HEAD(&splice);
spin_lock(&root->fs_info->ordered_extent_lock);
-
- list_splice_init(&root->fs_info->ordered_extents, &splice);
- while (!list_empty(&splice)) {
- ordered = list_entry(splice.next, struct btrfs_ordered_extent,
- root_extent_list);
-
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
-
- /* the inode may be getting freed (in sys_unlink path). */
- inode = igrab(ordered->inode);
-
- spin_unlock(&root->fs_info->ordered_extent_lock);
- if (inode)
- iput(inode);
-
- atomic_set(&ordered->refs, 1);
- btrfs_put_ordered_extent(ordered);
-
- spin_lock(&root->fs_info->ordered_extent_lock);
- }
-
+ /*
+ * This will just short circuit the ordered completion stuff which will
+ * make sure the ordered extent gets properly cleaned up.
+ */
+ list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+ root_extent_list)
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
spin_unlock(&root->fs_info->ordered_extent_lock);
}
@@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
}
while ((node = rb_first(&delayed_refs->root)) != NULL) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ struct btrfs_delayed_ref_head *head = NULL;
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
atomic_set(&ref->refs, 1);
if (btrfs_delayed_ref_is_head(ref)) {
- struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
if (!mutex_trylock(&head->mutex)) {
@@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
- kfree(head->extent_op);
+ btrfs_free_delayed_extent_op(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
}
+
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
-
+ if (head)
+ mutex_unlock(&head->mutex);
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
delalloc_inodes);
list_del_init(&btrfs_inode->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &btrfs_inode->runtime_flags);
btrfs_invalidate_inodes(btrfs_inode->root);
}
@@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
while (!list_empty(&list)) {
t = list_entry(list.next, struct btrfs_transaction, list);
- if (!t)
- break;
- btrfs_destroy_ordered_operations(root);
+ btrfs_destroy_ordered_operations(t, root);
btrfs_destroy_ordered_extents(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e..034d7dc552b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
+enum {
+ BTRFS_WQ_ENDIO_DATA = 0,
+ BTRFS_WQ_ENDIO_METADATA = 1,
+ BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+ BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = 16 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1e59ed575cc..3e074dab2d5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
+#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
@@ -72,8 +73,7 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
rb_link_node(&block_group->cache_node, parent, p);
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
+
+ if (info->first_logical_byte > block_group->key.objectid)
+ info->first_logical_byte = block_group->key.objectid;
+
spin_unlock(&info->block_group_cache_lock);
return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
break;
}
}
- if (ret)
+ if (ret) {
btrfs_get_block_group(ret);
+ if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+ info->first_logical_byte = ret->key.objectid;
+ }
spin_unlock(&info->block_group_cache_lock);
return ret;
@@ -468,8 +477,6 @@ out:
}
static int cache_block_group(struct btrfs_block_group_cache *cache,
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
int load_cache_only)
{
DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_FAST;
spin_unlock(&cache->lock);
- /*
- * We can't do the read from on-disk cache during a commit since we need
- * to have the normal tree locking. Also if we are currently trying to
- * allocate blocks for the tree root we can't do the fast caching since
- * we likely hold important locks.
- */
if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
ret = load_free_space_cache(fs_info, cache);
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*actual_bytes = discarded_bytes;
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
return ret;
}
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
- mutex_unlock(&head->mutex);
return ret;
}
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
* process of being added. Don't run this ref yet.
*/
list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
+ btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref = &locked_ref->node;
if (extent_op && must_insert_reserved) {
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
extent_op = NULL;
}
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
-
- printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+ printk(KERN_DEBUG
+ "btrfs: run_delayed_extent_op "
+ "returned %d\n", ret);
spin_lock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
return ret;
}
goto next;
}
-
- list_del_init(&locked_ref->cluster);
- locked_ref = NULL;
}
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (locked_ref) {
+ if (!btrfs_delayed_ref_is_head(ref)) {
/*
* when we play the delayed ref, also correct the
* ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- btrfs_put_delayed_ref(ref);
- kfree(extent_op);
- count++;
-
+ btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- if (locked_ref) {
- list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
- }
- printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+ btrfs_delayed_ref_unlock(locked_ref);
+ btrfs_put_delayed_ref(ref);
+ printk(KERN_DEBUG
+ "btrfs: run_one_delayed_ref returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
}
+ /*
+ * If this node is a head, that means all the refs in this head
+ * have been dealt with, and we will pick the next head to deal
+ * with, so we must unlock the head and drop it from the cluster
+ * list before we release it.
+ */
+ if (btrfs_delayed_ref_is_head(ref)) {
+ list_del_init(&locked_ref->cluster);
+ btrfs_delayed_ref_unlock(locked_ref);
+ locked_ref = NULL;
+ }
+ btrfs_put_delayed_ref(ref);
+ count++;
next:
cond_resched();
spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
return ret;
}
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+ int count)
+{
+ int val = atomic_read(&delayed_refs->ref_seq);
+
+ if (val < seq || val >= seq + count)
+ return 1;
+ return 0;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+
+ if (!run_all && !run_most) {
+ int old;
+ int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+ old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ if (old) {
+ DEFINE_WAIT(__wait);
+ if (delayed_refs->num_entries < 16348)
+ return 0;
+
+ prepare_to_wait(&delayed_refs->wait, &__wait,
+ TASK_UNINTERRUPTIBLE);
+
+ old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ if (old) {
+ schedule();
+ finish_wait(&delayed_refs->wait, &__wait);
+
+ if (!refs_newer(delayed_refs, seq, 256))
+ goto progress;
+ else
+ return 0;
+ } else {
+ finish_wait(&delayed_refs->wait, &__wait);
+ goto again;
+ }
+ }
+
+ } else {
+ atomic_inc(&delayed_refs->procs_running_refs);
+ }
+
again:
loops = 0;
spin_lock(&delayed_refs->lock);
@@ -2477,10 +2533,6 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- if (count == 0) {
- count = delayed_refs->num_entries * 2;
- run_most = 1;
- }
while (1) {
if (!(run_all || run_most) &&
delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
ret = run_clustered_refs(trans, root, &cluster);
if (ret < 0) {
+ btrfs_release_ref_cluster(&cluster);
spin_unlock(&delayed_refs->lock);
btrfs_abort_transaction(trans, root, ret);
+ atomic_dec(&delayed_refs->procs_running_refs);
return ret;
}
+ atomic_add(ret, &delayed_refs->ref_seq);
+
count -= min_t(unsigned long, ret, count);
if (count == 0)
@@ -2573,6 +2629,11 @@ again:
goto again;
}
out:
+ atomic_dec(&delayed_refs->procs_running_refs);
+ smp_mb();
+ if (waitqueue_active(&delayed_refs->wait))
+ wake_up(&delayed_refs->wait);
+
spin_unlock(&delayed_refs->lock);
assert_qgroups_uptodate(trans);
return 0;
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op;
int ret;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op)
return -ENOMEM;
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
num_bytes, extent_op);
if (ret)
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
return ret;
}
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
+ write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits |= extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits |= extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
}
/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
u64 target;
+ u64 tmp;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
}
spin_unlock(&root->fs_info->balance_lock);
+ /* First, mask out the RAID levels which aren't possible */
if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5);
+ if (num_devices < 3)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID6;
if (num_devices < 4)
flags &= ~BTRFS_BLOCK_GROUP_RAID10;
- if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
- (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))) {
- flags &= ~BTRFS_BLOCK_GROUP_DUP;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
- (flags & BTRFS_BLOCK_GROUP_RAID10)) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID1;
- }
+ tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+ flags &= ~tmp;
- if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
- ((flags & BTRFS_BLOCK_GROUP_RAID1) |
- (flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP))) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- }
+ if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+ tmp = BTRFS_BLOCK_GROUP_RAID6;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+ tmp = BTRFS_BLOCK_GROUP_RAID5;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+ tmp = BTRFS_BLOCK_GROUP_RAID10;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+ tmp = BTRFS_BLOCK_GROUP_RAID1;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+ tmp = BTRFS_BLOCK_GROUP_RAID0;
- return extended_to_chunk(flags);
+ return extended_to_chunk(flags | tmp);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits;
+ unsigned seq;
+
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
+ u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- return get_alloc_profile(root, flags);
+ ret = get_alloc_profile(root, flags);
+ return ret;
}
/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
int ret = 0, committed = 0, alloc_chunk = 1;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
if (root == root->fs_info->tree_root ||
BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
{
u64 num_dev;
- if (type & BTRFS_BLOCK_GROUP_RAID10 ||
- type & BTRFS_BLOCK_GROUP_RAID0)
+ if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
num_dev = root->fs_info->fs_devices->rw_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
+
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
goto again;
}
+ trans->allocating_chunk = true;
+
/*
* If we have mixed data/metadata chunks we want to make sure we keep
* allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
check_system_chunk(trans, extent_root, flags);
ret = btrfs_alloc_chunk(trans, extent_root, flags);
- if (ret < 0 && ret != -ENOSPC)
- goto out;
+ trans->allocating_chunk = false;
spin_lock(&space_info->lock);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out;
if (ret)
space_info->full = 1;
else
ret = 1;
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
-out:
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 rsv_size = 0;
u64 avail;
u64 used;
+ u64 to_add;
used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ spin_lock(&global_rsv->lock);
+ rsv_size = global_rsv->size;
+ spin_unlock(&global_rsv->lock);
+
+ /*
+ * We only want to allow over committing if we have lots of actual space
+ * free, but if we don't have enough space to handle the global reserve
+ * space then we could end up having a real enospc problem when trying
+ * to allocate a chunk or some other such important allocation.
+ */
+ rsv_size <<= 1;
+ if (used + rsv_size >= space_info->total_bytes)
+ return 0;
+
+ used += space_info->bytes_may_use;
spin_lock(&root->fs_info->free_chunk_lock);
avail = root->fs_info->free_chunk_space;
@@ -3667,40 +3768,58 @@ static int can_overcommit(struct btrfs_root *root,
/*
* If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable.
+ * space is actually useable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
*/
if (profile & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
avail >>= 1;
+ to_add = space_info->total_bytes;
+
/*
* If we aren't flushing all things, let us overcommit up to
* 1/2th of the space. If we can flush, don't let us overcommit
* too much, let it overcommit up to 1/8 of the space.
*/
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- avail >>= 3;
+ to_add >>= 3;
else
- avail >>= 1;
+ to_add >>= 1;
+
+ /*
+ * Limit the overcommit to the amount of free space we could possibly
+ * allocate for chunks.
+ */
+ to_add = min(avail, to_add);
- if (used + bytes < space_info->total_bytes + avail)
+ if (used + bytes < space_info->total_bytes + to_add)
return 1;
return 0;
}
-static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
- unsigned long nr_pages,
- enum wb_reason reason)
+void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+ unsigned long nr_pages)
{
- if (!writeback_in_progress(sb->s_bdi) &&
- down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, reason);
- up_read(&sb->s_umount);
- return 1;
- }
+ struct super_block *sb = root->fs_info->sb;
+ int started;
- return 0;
+ /* If we can not start writeback, just sync all the delalloc file. */
+ started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
+ WB_REASON_FS_FREE_SPACE);
+ if (!started) {
+ /*
+ * We needn't worry the filesystem going from r/w to r/o though
+ * we don't acquire ->s_umount mutex, because the filesystem
+ * should guarantee the delalloc inodes list be empty after
+ * the filesystem is readonly(all dirty pages are written to
+ * the disk).
+ */
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0);
+ }
}
/*
@@ -3724,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
space_info = block_rsv->space_info;
smp_mb();
- delalloc_bytes = root->fs_info->delalloc_bytes;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
if (delalloc_bytes == 0) {
if (trans)
return;
@@ -3735,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
- nr_pages,
- WB_REASON_FS_FREE_SPACE);
-
+ btrfs_writeback_inodes_sb_nr(root, nr_pages);
/*
* We need to wait for the async pages to actually start before
* we do anything.
@@ -3766,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
break;
}
smp_mb();
- delalloc_bytes = root->fs_info->delalloc_bytes;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
}
}
@@ -4030,6 +4148,15 @@ again:
goto again;
out:
+ if (ret == -ENOSPC &&
+ unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+ struct btrfs_block_rsv *global_rsv =
+ &root->fs_info->global_block_rsv;
+
+ if (block_rsv != global_rsv &&
+ !block_rsv_use_bytes(global_rsv, orig_bytes))
+ ret = 0;
+ }
if (flushing) {
spin_lock(&space_info->lock);
space_info->flush = 0;
@@ -4416,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reseravtion mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int items,
+ u64 *qgroup_reserved)
{
- struct btrfs_root *root = pending->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
- struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
- /*
- * two for root back/forward refs, two for directory entries,
- * one for root of the snapshot and one for parent inode.
- */
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
- dst_rsv->space_info = src_rsv->space_info;
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ u64 num_bytes;
+ int ret;
+
+ if (root->fs_info->quota_enabled) {
+ /* One for parent inode, two for dir entries */
+ num_bytes = 3 * root->leafsize;
+ ret = btrfs_qgroup_reserve(root, num_bytes);
+ if (ret)
+ return ret;
+ } else {
+ num_bytes = 0;
+ }
+
+ *qgroup_reserved = num_bytes;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, items);
+ rsv->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ if (*qgroup_reserved)
+ btrfs_qgroup_free(root, *qgroup_reserved);
+ }
+
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved)
+{
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
}
/**
@@ -4536,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
+ u64 to_free = 0;
+ unsigned dropped;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
@@ -4579,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
- if (root->fs_info->quota_enabled)
+ if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
nr_extents * root->leafsize);
+ if (ret)
+ goto out_fail;
+ }
- /*
- * ret != 0 here means the qgroup reservation failed, we go straight to
- * the shared error handling then.
- */
- if (ret == 0)
- ret = reserve_metadata_bytes(root, block_rsv,
- to_reserve, flush);
-
- if (ret) {
- u64 to_free = 0;
- unsigned dropped;
-
- spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
- /*
- * If the inodes csum_bytes is the same as the original
- * csum_bytes then we know we haven't raced with any free()ers
- * so we can just reduce our inodes csum bytes and carry on.
- * Otherwise we have to do the normal free thing to account for
- * the case that the free side didn't free up its reserve
- * because of this outstanding reservation.
- */
- if (BTRFS_I(inode)->csum_bytes == csum_bytes)
- calc_csum_metadata_size(inode, num_bytes, 0);
- else
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
- spin_unlock(&BTRFS_I(inode)->lock);
- if (dropped)
- to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
- if (to_free) {
- btrfs_block_rsv_release(root, block_rsv, to_free);
- trace_btrfs_space_reservation(root->fs_info,
- "delalloc",
- btrfs_ino(inode),
- to_free, 0);
- }
- if (root->fs_info->quota_enabled) {
+ ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ if (unlikely(ret)) {
+ if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
nr_extents * root->leafsize);
- }
- if (delalloc_lock)
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- return ret;
+ goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
@@ -4647,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
+
+out_fail:
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
+ /*
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ * Otherwise we have to do the normal free thing to account for
+ * the case that the free side didn't free up its reserve
+ * because of this outstanding reservation.
+ */
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ else
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (to_free) {
+ btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ return ret;
}
/**
@@ -4668,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
spin_lock(&BTRFS_I(inode)->lock);
dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ if (num_bytes)
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
spin_unlock(&BTRFS_I(inode)->lock);
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4735,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -4773,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && cache->cached == BTRFS_CACHE_NO)
- cache_block_group(cache, trans, NULL, 1);
+ cache_block_group(cache, 1);
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -4823,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
struct btrfs_block_group_cache *cache;
u64 bytenr;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ bytenr = root->fs_info->first_logical_byte;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ if (bytenr < (u64)-1)
+ return bytenr;
+
cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
if (!cache)
return 0;
@@ -4873,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group_cache *cache;
@@ -4888,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
* to one because the slow code to read in the free extents does check
* the pinned extents.
*/
- cache_block_group(cache, trans, root, 1);
+ cache_block_group(cache, 1);
pin_down_extent(root, cache, bytenr, num_bytes, 0);
@@ -5285,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+ ret = update_block_group(root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5330,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->extent_op) {
if (!head->must_insert_reserved)
goto out;
- kfree(head->extent_op);
+ btrfs_free_delayed_extent_op(head->extent_op);
head->extent_op = NULL;
}
@@ -5453,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 val, u64 num_bytes)
{
- u64 mask = ((u64)root->stripesize - 1);
- u64 ret = (val + mask) & ~mask;
+ u64 ret = ALIGN(val, root->stripesize);
return ret;
}
@@ -5476,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
@@ -5493,7 +5662,6 @@ static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
@@ -5507,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
int __get_raid_index(u64 flags)
{
- int index;
-
if (flags & BTRFS_BLOCK_GROUP_RAID10)
- index = 0;
+ return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- index = 1;
+ return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
- index = 2;
+ return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- index = 3;
- else
- index = 4;
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
- return index;
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5663,6 +5831,8 @@ search:
if (!block_group_bits(block_group, data)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10;
/*
@@ -5678,8 +5848,7 @@ have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
found_uncached_bg = true;
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
+ ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
}
@@ -5692,6 +5861,7 @@ have_block_group:
* lets look there
*/
if (last_ptr) {
+ unsigned long aligned_cluster;
/*
* the refill lock keeps out other
* people trying to start a new cluster
@@ -5758,11 +5928,15 @@ refill_cluster:
goto unclustered_alloc;
}
+ aligned_cluster = max_t(unsigned long,
+ empty_cluster + empty_size,
+ block_group->full_stripe_len);
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
search_start, num_bytes,
- empty_cluster + empty_size);
+ aligned_cluster);
if (ret == 0) {
/*
* now pull our allocation out of this
@@ -5833,7 +6007,8 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, offset);
+ search_start = stripe_align(root, used_block_group,
+ offset, num_bytes);
/* move on to the next group */
if (search_start + num_bytes >
@@ -5984,7 +6159,7 @@ again:
if (ret == -ENOSPC) {
if (!final_tried) {
num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = round_down(num_bytes, root->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
if (num_bytes == min_alloc_size)
final_tried = true;
@@ -6108,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6172,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6215,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, trans, NULL, 0);
+ cache_block_group(block_group, 0);
caching_ctl = get_caching_control(block_group);
if (!caching_ctl) {
@@ -6329,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
if (!ret)
return block_rsv;
if (ret && !block_rsv->failfast) {
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- /*DEFAULT_RATELIMIT_BURST*/ 2);
- if (__ratelimit(&_rs))
- WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
- ret);
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "btrfs: block rsv returned %d\n", ret);
+ }
ret = reserve_metadata_bytes(root, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
@@ -6400,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -7203,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
root->fs_info->fs_devices->missing_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
if (num_devices == 1) {
@@ -7481,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
index = get_block_group_index(block_group);
}
- if (index == 0) {
+ if (index == BTRFS_RAID_RAID10) {
dev_min = 4;
/* Divide by 2 */
min_free >>= 1;
- } else if (index == 1) {
+ } else if (index == BTRFS_RAID_RAID1) {
dev_min = 2;
- } else if (index == 2) {
+ } else if (index == BTRFS_RAID_DUP) {
/* Multiply by 2 */
min_free <<= 1;
- } else if (index == 3) {
+ } else if (index == BTRFS_RAID_RAID0) {
dev_min = fs_devices->rw_devices;
do_div(min_free, dev_min);
}
@@ -7651,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0) {
- WARN_ON(1);
- dump_space_info(space_info, 0, 0);
+ if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+ if (space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0) {
+ WARN_ON(1);
+ dump_space_info(space_info, 0, 0);
+ }
}
list_del(&space_info->list);
kfree(space_info);
@@ -7754,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
btrfs_release_path(path);
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
-
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ found_key.objectid);
btrfs_init_free_space_ctl(cache);
/*
@@ -7808,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (!(get_alloc_profile(root, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_DUP)))
continue;
/*
@@ -7883,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = root->sectorsize;
cache->fs_info = root->fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ chunk_offset);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
@@ -7932,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
+ write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8036,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+
+ if (root->fs_info->first_logical_byte == block_group->key.objectid)
+ root->fs_info->first_logical_byte = (u64)-1;
spin_unlock(&root->fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
@@ -8158,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!block_group_cache_done(cache)) {
- ret = cache_block_group(cache, NULL, root, 0);
+ ret = cache_block_group(cache, 0);
if (!ret)
wait_block_group_cache_done(cache);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df29ee..f173c5af646 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
-#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
@@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
*/
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
@@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
*/
static void check_page_locked(struct extent_io_tree *tree, struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
@@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
if (ret)
err = ret;
- if (did_repair) {
- ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_DAMAGED, GFP_NOFS);
- if (ret && !err)
- err = ret;
- }
+ ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret && !err)
+ err = ret;
kfree(rec);
return err;
@@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
BUG_ON(!mirror_num);
+ /* we can't repair anything in raid56 yet */
+ if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+ return 0;
+
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
@@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_add_page(bio, page, length, start-page_offset(page));
+ bio_add_page(bio, page, length, start - page_offset(page));
btrfsic_submit_bio(WRITE_SYNC, bio);
wait_for_completion(&compl);
@@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page)
failrec->failed_mirror);
did_repair = !ret;
}
+ ret = 0;
}
out:
@@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
struct page *page = bvec->bv_page;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
(long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
struct extent_io_tree *tree = bio->bi_private;
u64 start;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
@@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
return ret;
}
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
unsigned long offset, size_t size, struct bio *bio,
unsigned long bio_flags)
{
int ret = 0;
if (tree->ops && tree->ops->merge_bio_hook)
- ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+ ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
bio_flags);
BUG_ON(ret < 0);
return ret;
@@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector;
if (prev_bio_flags != bio_flags || !contig ||
- merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
+ merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
@@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
unsigned long *bio_flags)
{
struct inode *inode = page->mapping->host;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
@@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
}
while (cur <= end) {
+ unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
if (cur >= last_byte) {
char *userpage;
struct extent_state *cached = NULL;
@@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
- iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ iosize = ALIGN(iosize, blocksize);
if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
disk_io_size = em->block_len;
sector = em->block_start >> 9;
@@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
continue;
}
- ret = 0;
- if (tree->ops && tree->ops->readpage_io_hook) {
- ret = tree->ops->readpage_io_hook(page, cur,
- cur + iosize - 1);
- }
- if (!ret) {
- unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
- pnr -= page->index;
- ret = submit_extent_page(READ, tree, page,
+ pnr -= page->index;
+ ret = submit_extent_page(READ, tree, page,
sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
- if (!ret) {
- nr++;
- *bio_flags = this_bio_flag;
- }
- }
- if (ret) {
+ if (!ret) {
+ nr++;
+ *bio_flags = this_bio_flag;
+ } else {
SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
}
@@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct inode *inode = page->mapping->host;
struct extent_page_data *epd = data;
struct extent_io_tree *tree = epd->tree;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 delalloc_start;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
@@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ iosize = ALIGN(iosize, blocksize);
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
block_start = em->block_start;
@@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- spin_lock(&fs_info->delalloc_lock);
- if (fs_info->dirty_metadata_bytes >= eb->len)
- fs_info->dirty_metadata_bytes -= eb->len;
- else
- WARN_ON(1);
- spin_unlock(&fs_info->delalloc_lock);
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
ret = 1;
} else {
spin_unlock(&eb->refs_lock);
@@ -3446,15 +3437,9 @@ retry:
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops &&
- tree->ops->write_cache_pages_lock_hook) {
- tree->ops->write_cache_pages_lock_hook(page,
- data, flush_fn);
- } else {
- if (!trylock_page(page)) {
- flush_fn(data);
- lock_page(page);
- }
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
}
if (unlikely(page->mapping != mapping)) {
@@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset)
{
struct extent_state *cached_state = NULL;
- u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
- start += (offset + blocksize - 1) & ~(blocksize - 1);
+ start += ALIGN(offset, blocksize);
if (start > end)
return 0;
@@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
int ret = 1;
@@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
gfp_t mask)
{
struct extent_map *em;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if ((mask & __GFP_WAIT) &&
@@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
len = last - offset;
if (len == 0)
break;
- len = (len + sectorsize - 1) & ~(sectorsize - 1);
+ len = ALIGN(len, sectorsize);
em = get_extent(inode, NULL, 0, offset, len, 0);
if (IS_ERR_OR_NULL(em))
return em;
@@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)
list_del(&eb->leak_list);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
- if (eb->pages && eb->pages != eb->inline_pages)
- kfree(eb->pages);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
- struct page **pages;
- int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- pages = kzalloc(num_pages, mask);
- if (!pages) {
- __free_extent_buffer(eb);
- return NULL;
- }
- eb->pages = pages;
- } else {
- eb->pages = eb->inline_pages;
- }
+ /*
+ * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+ */
+ BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+ > MAX_INLINE_EXTENT_BUFFER_SIZE);
+ BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
return eb;
}
@@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
+ int refs;
/* the ref bit is tricky. We have to make sure it is set
* if we have the buffer dirty. Otherwise the
* code to free a buffer can end up dropping a dirty
@@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* So bump the ref count first, then set the bit. If someone
* beat us to it, drop the ref we added.
*/
+ refs = atomic_read(&eb->refs);
+ if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ return;
+
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_inc(&eb->refs);
@@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
void free_extent_buffer(struct extent_buffer *eb)
{
+ int refs;
+ int old;
if (!eb)
return;
+ while (1) {
+ refs = atomic_read(&eb->refs);
+ if (refs <= 3)
+ break;
+ old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+ if (old == refs)
+ return;
+ }
+
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfabd326..6068a198556 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,10 +72,9 @@ struct extent_io_ops {
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
- int (*merge_bio_hook)(struct page *page, unsigned long offset,
+ int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
- int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror);
@@ -90,8 +89,6 @@ struct extent_io_ops {
struct extent_state *other);
void (*split_extent_hook)(struct inode *inode,
struct extent_state *orig, u64 split);
- int (*write_cache_pages_lock_hook)(struct page *page, void *data,
- void (*flush_fn)(void *));
};
struct extent_io_tree {
@@ -161,8 +158,7 @@ struct extent_buffer {
*/
wait_queue_head_t read_lock_wq;
wait_queue_head_t lock_wq;
- struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
- struct page **pages;
+ struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fdb7a8db3b5..2834ca5768e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include "ctree.h"
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 94aa53b3872..ec160202be3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -684,6 +684,24 @@ out:
return ret;
}
+static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
+ struct btrfs_sector_sum *sector_sum,
+ u64 total_bytes, u64 sectorsize)
+{
+ u64 tmp = sectorsize;
+ u64 next_sector = sector_sum->bytenr;
+ struct btrfs_sector_sum *next = sector_sum + 1;
+
+ while ((tmp + total_bytes) < sums->len) {
+ if (next_sector + sectorsize != next->bytenr)
+ break;
+ tmp += sectorsize;
+ next_sector = next->bytenr;
+ next++;
+ }
+ return tmp;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -789,20 +807,32 @@ again:
goto insert;
}
- if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+ if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
csum_size) {
- u32 diff = (csum_offset + 1) * csum_size;
+ int extend_nr;
+ u64 tmp;
+ u32 diff;
+ u32 free_space;
- /*
- * is the item big enough already? we dropped our lock
- * before and need to recheck
- */
- if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
- goto csum;
+ if (btrfs_leaf_free_space(root, leaf) <
+ sizeof(struct btrfs_item) + csum_size * 2)
+ goto insert;
+
+ free_space = btrfs_leaf_free_space(root, leaf) -
+ sizeof(struct btrfs_item) - csum_size;
+ tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
+ root->sectorsize);
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ WARN_ON(tmp < 1);
+
+ extend_nr = max_t(int, 1, (int)tmp);
+ diff = (csum_offset + extend_nr) * csum_size;
+ diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
- if (diff != csum_size)
- goto insert;
+ diff = min(free_space, diff);
+ diff /= csum_size;
+ diff *= csum_size;
btrfs_extend_item(trans, root, path, diff);
goto csum;
@@ -812,19 +842,14 @@ insert:
btrfs_release_path(path);
csum_offset = 0;
if (found_next) {
- u64 tmp = total_bytes + root->sectorsize;
- u64 next_sector = sector_sum->bytenr;
- struct btrfs_sector_sum *next = sector_sum + 1;
+ u64 tmp;
- while (tmp < sums->len) {
- if (next_sector + root->sectorsize != next->bytenr)
- break;
- tmp += root->sectorsize;
- next_sector = next->bytenr;
- next++;
- }
- tmp = min(tmp, next_offset - file_key.offset);
+ tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
+ root->sectorsize);
tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp = min(tmp, (next_offset - file_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits);
+
tmp = max((u64)1, tmp);
tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
ins_size = csum_size * tmp;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b241fe9d2f..af1d0605a5c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,11 +30,11 @@
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
+#include <linux/btrfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
@@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
atomic_inc(&fs_info->defrag_running);
while(1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &fs_info->fs_state))
+ break;
+
if (!__need_auto_defrag(fs_info->tree_root))
break;
@@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = (write_bytes + pos - start_pos +
- root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
* although we have opened a file as writable, we have
* to stop this write operation to ensure FS consistency.
*/
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
mutex_unlock(&inode->i_mutex);
err = -EROFS;
goto out;
@@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags)) {
- btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We need to block on a committing transaction to keep us from
+ * throwing a ordered operation on to the list and causing
+ * something like sync to deadlock trying to flush out this
+ * inode.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+ btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
@@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
struct btrfs_trans_handle *trans;
+ bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
/*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
- * multi-task, and make the performance up.
+ * multi-task, and make the performance up. See
+ * btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
if (ret)
return ret;
@@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* range being left.
*/
atomic_inc(&root->log_batch);
- btrfs_wait_ordered_range(inode, start, end - start + 1);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ if (full_sync)
+ btrfs_wait_ordered_range(inode, start, end - start + 1);
atomic_inc(&root->log_batch);
/*
@@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret != BTRFS_NO_LOG_SYNC) {
if (ret > 0) {
+ /*
+ * If we didn't already wait for ordered extents we need
+ * to do that now.
+ */
+ if (!full_sync)
+ btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_sync_log(trans, root);
- if (ret == 0)
+ if (ret == 0) {
ret = btrfs_end_transaction(trans, root);
- else
+ } else {
+ if (!full_sync)
+ btrfs_wait_ordered_range(inode, start,
+ end -
+ start + 1);
ret = btrfs_commit_transaction(trans, root);
+ }
}
} else {
ret = btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0be7a8742a4..1f84fc09c1a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+ max_bitmaps = max(max_bitmaps, 1);
+
BUG_ON(ctl->total_bitmaps > max_bitmaps);
/*
@@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ unsigned long align)
{
struct btrfs_free_space *entry;
struct rb_node *node;
+ u64 ctl_off;
+ u64 tmp;
+ u64 align_off;
int ret;
if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
if (entry->bytes < *bytes)
continue;
+ /* make sure the space returned is big enough
+ * to match our requested alignment
+ */
+ if (*bytes >= align) {
+ ctl_off = entry->offset - ctl->start;
+ tmp = ctl_off + align - 1;;
+ do_div(tmp, align);
+ tmp = tmp * align + ctl->start;
+ align_off = tmp - entry->offset;
+ } else {
+ align_off = 0;
+ tmp = entry->offset;
+ }
+
+ if (entry->bytes < *bytes + align_off)
+ continue;
+
if (entry->bitmap) {
- ret = search_bitmap(ctl, entry, offset, bytes);
- if (!ret)
+ ret = search_bitmap(ctl, entry, &tmp, bytes);
+ if (!ret) {
+ *offset = tmp;
return entry;
+ }
continue;
}
- *offset = entry->offset;
- *bytes = entry->bytes;
+ *offset = tmp;
+ *bytes = entry->bytes - align_off;
return entry;
}
@@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
}
/*
- * some block groups are so tiny they can't be enveloped by a bitmap, so
- * don't even bother to create a bitmap for this
+ * The original block groups from mkfs can be really small, like 8
+ * megabytes, so don't bother with a bitmap for those entries. However
+ * some block groups can be smaller than what a bitmap would cover but
+ * are still large enough that they could overflow the 32k memory limit,
+ * so allow those block groups to still be allowed to have a bitmap
+ * entry.
*/
- if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
return false;
return true;
@@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
+ u64 align_gap = 0;
+ u64 align_gap_len = 0;
spin_lock(&ctl->tree_lock);
- entry = find_free_space(ctl, &offset, &bytes_search);
+ entry = find_free_space(ctl, &offset, &bytes_search,
+ block_group->full_stripe_len);
if (!entry)
goto out;
@@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
+
unlink_free_space(ctl, entry);
- entry->offset += bytes;
- entry->bytes -= bytes;
+ align_gap_len = offset - entry->offset;
+ align_gap = entry->offset;
+
+ entry->offset = offset + bytes;
+ WARN_ON(entry->bytes < bytes + align_gap_len);
+
+ entry->bytes -= bytes + align_gap_len;
if (!entry->bytes)
kmem_cache_free(btrfs_free_space_cachep, entry);
else
@@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
out:
spin_unlock(&ctl->tree_lock);
+ if (align_gap_len)
+ __btrfs_add_free_space(ctl, align_gap, align_gap_len);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 55c07b65037..c226daefd65 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,12 +39,13 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
@@ -54,6 +55,7 @@
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "backref.h"
struct btrfs_iget_args {
u64 ino;
@@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
u64 isize = i_size_read(inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
- u64 aligned_end = (end + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ u64 aligned_end = ALIGN(end, root->sectorsize);
u64 data_len = inline_len;
int ret;
@@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
return 1;
}
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
@@ -389,7 +391,7 @@ again:
* a compressed extent to 128k.
*/
total_compressed = min(total_compressed, max_uncompressed);
- num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
total_in = 0;
ret = 0;
@@ -488,15 +490,13 @@ cont:
* up to a block size boundary so the allocator does sane
* things
*/
- total_compressed = (total_compressed + blocksize - 1) &
- ~(blocksize - 1);
+ total_compressed = ALIGN(total_compressed, blocksize);
/*
* one last check to make sure the compression is really a
* win, compare the page count read with the blocks on disk
*/
- total_in = (total_in + PAGE_CACHE_SIZE - 1) &
- ~(PAGE_CACHE_SIZE - 1);
+ total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
if (total_compressed >= total_in) {
will_compress = 0;
} else {
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
if (list_empty(&async_cow->extents))
return 0;
-
+again:
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
struct async_extent, list);
@@ -648,6 +648,8 @@ retry:
async_extent->ram_size - 1,
btrfs_get_extent,
WB_SYNC_ALL);
+ else if (ret)
+ unlock_page(async_cow->locked_page);
kfree(async_extent);
cond_resched();
continue;
@@ -672,6 +674,7 @@ retry:
if (ret) {
int i;
+
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ retry:
kfree(async_extent->pages);
async_extent->nr_pages = 0;
async_extent->pages = NULL;
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+
if (ret == -ENOSPC)
goto retry;
- goto out_free; /* JDM: Requeue? */
+ goto out_free;
}
/*
@@ -696,10 +697,13 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- BUG_ON(!em); /* -ENOMEM */
+ if (!em)
+ goto out_free_reserve;
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
em->block_start = ins.objectid;
em->block_len = ins.offset;
@@ -726,6 +730,9 @@ retry:
async_extent->ram_size - 1, 0);
}
+ if (ret)
+ goto out_free_reserve;
+
ret = btrfs_add_ordered_extent_compress(inode,
async_extent->start,
ins.objectid,
@@ -733,7 +740,8 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto out_free_reserve;
/*
* clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages);
-
- BUG_ON(ret); /* -ENOMEM */
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
+ if (ret)
+ goto out;
cond_resched();
}
ret = 0;
out:
return ret;
+out_free_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_free:
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
kfree(async_extent);
- goto out;
+ goto again;
}
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
BUG_ON(btrfs_is_free_space_inode(inode));
- num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
disk_num_bytes = num_bytes;
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
em->orig_start = em->start;
ram_size = ins.offset;
em->len = ins.offset;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
em->block_start = ins.objectid;
em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ out_check:
em->block_start = disk_bytenr;
em->orig_block_len = disk_num_bytes;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_FILLING, &em->flags);
em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
- spin_lock(&root->fs_info->delalloc_lock);
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- root->fs_info->delalloc_bytes += len;
- if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
+ if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
&& do_list)
btrfs_free_reserved_data_space(inode, len);
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->delalloc_bytes -= len;
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes -= len;
-
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
- !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
* we don't create bios that span stripes or chunks
*/
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags)
{
@@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
length = bio->bi_size;
map_length = length;
- ret = btrfs_map_block(root->fs_info, READ, logical,
+ ret = btrfs_map_block(root->fs_info, rw, logical,
&map_length, NULL, 0);
/* Will always return 0 with map_multi == NULL */
BUG_ON(ret < 0);
@@ -1892,6 +1931,640 @@ out:
return ret;
}
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+ struct rb_node node;
+ struct old_sa_defrag_extent *old;
+ u64 root_id;
+ u64 inum;
+ u64 file_pos;
+ u64 extent_offset;
+ u64 num_bytes;
+ u64 generation;
+};
+
+struct old_sa_defrag_extent {
+ struct list_head list;
+ struct new_sa_defrag_extent *new;
+
+ u64 extent_offset;
+ u64 bytenr;
+ u64 offset;
+ u64 len;
+ int count;
+};
+
+struct new_sa_defrag_extent {
+ struct rb_root root;
+ struct list_head head;
+ struct btrfs_path *path;
+ struct inode *inode;
+ u64 file_pos;
+ u64 len;
+ u64 bytenr;
+ u64 disk_len;
+ u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+ struct sa_defrag_extent_backref *b2)
+{
+ if (b1->root_id < b2->root_id)
+ return -1;
+ else if (b1->root_id > b2->root_id)
+ return 1;
+
+ if (b1->inum < b2->inum)
+ return -1;
+ else if (b1->inum > b2->inum)
+ return 1;
+
+ if (b1->file_pos < b2->file_pos)
+ return -1;
+ else if (b1->file_pos > b2->file_pos)
+ return 1;
+
+ /*
+ * [------------------------------] ===> (a range of space)
+ * |<--->| |<---->| =============> (fs/file tree A)
+ * |<---------------------------->| ===> (fs/file tree B)
+ *
+ * A range of space can refer to two file extents in one tree while
+ * refer to only one file extent in another tree.
+ *
+ * So we may process a disk offset more than one time(two extents in A)
+ * and locate at the same extent(one extent in B), then insert two same
+ * backrefs(both refer to the extent in B).
+ */
+ return 0;
+}
+
+static void backref_insert(struct rb_root *root,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sa_defrag_extent_backref *entry;
+ int ret;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+ ret = backref_comp(backref, entry);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&backref->node, parent, p);
+ rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+ void *ctx)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_fs_info *fs_info;
+ struct old_sa_defrag_extent *old = ctx;
+ struct new_sa_defrag_extent *new = old->new;
+ struct btrfs_path *path = new->path;
+ struct btrfs_key key;
+ struct btrfs_root *root;
+ struct sa_defrag_extent_backref *backref;
+ struct extent_buffer *leaf;
+ struct inode *inode = new->inode;
+ int slot;
+ int ret;
+ u64 extent_offset;
+ u64 num_bytes;
+
+ if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+ inum == btrfs_ino(inode))
+ return 0;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ WARN_ON(1);
+ pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+ inum, offset, root_id);
+ return PTR_ERR(root);
+ }
+
+ key.objectid = inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ if (offset > (u64)-1 << 32)
+ key.offset = 0;
+ else
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ WARN_ON(1);
+ return ret;
+ }
+
+ while (1) {
+ cond_resched();
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ continue;
+ }
+
+ path->slots[0]++;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid > inum)
+ goto out;
+
+ if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+ continue;
+
+ extent_offset = btrfs_file_extent_offset(leaf, extent);
+ if (key.offset - extent_offset != offset)
+ continue;
+
+ num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+ if (extent_offset >= old->extent_offset + old->offset +
+ old->len || extent_offset + num_bytes <=
+ old->extent_offset + old->offset)
+ continue;
+
+ break;
+ }
+
+ backref = kmalloc(sizeof(*backref), GFP_NOFS);
+ if (!backref) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ backref->root_id = root_id;
+ backref->inum = inum;
+ backref->file_pos = offset + extent_offset;
+ backref->num_bytes = num_bytes;
+ backref->extent_offset = extent_offset;
+ backref->generation = btrfs_file_extent_generation(leaf, extent);
+ backref->old = old;
+ backref_insert(&new->root, backref);
+ old->count++;
+out:
+ btrfs_release_path(path);
+ WARN_ON(ret);
+ return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+ struct new_sa_defrag_extent *new)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+ struct old_sa_defrag_extent *old, *tmp;
+ int ret;
+
+ new->path = path;
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ ret = iterate_inodes_from_logical(old->bytenr, fs_info,
+ path, record_one_backref,
+ old);
+ BUG_ON(ret < 0 && ret != -ENOENT);
+
+ /* no backref to be processed for this extent */
+ if (!old->count) {
+ list_del(&old->list);
+ kfree(old);
+ }
+ }
+
+ if (list_empty(&new->head))
+ return false;
+
+ return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ u64 disk_bytenr)
+{
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
+ return 0;
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+ struct sa_defrag_extent_backref *prev,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct old_sa_defrag_extent *old = backref->old;
+ struct new_sa_defrag_extent *new = old->new;
+ struct inode *src_inode = new->inode;
+ struct inode *inode;
+ struct extent_state *cached = NULL;
+ int ret = 0;
+ u64 start;
+ u64 len;
+ u64 lock_start;
+ u64 lock_end;
+ bool merge = false;
+ int index;
+
+ if (prev && prev->root_id == backref->root_id &&
+ prev->inum == backref->inum &&
+ prev->file_pos + prev->num_bytes == backref->file_pos)
+ merge = true;
+
+ /* step 1: get root */
+ key.objectid = backref->root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(src_inode)->root->fs_info;
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ return PTR_ERR(root);
+ }
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ /* parse ENOENT to 0 */
+ return 0;
+ }
+
+ /* step 2: get inode */
+ key.objectid = backref->inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* step 3: relink backref */
+ lock_start = backref->file_pos;
+ lock_end = backref->file_pos + backref->num_bytes - 1;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ 0, &cached);
+
+ ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ goto out_unlock;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ key.objectid = backref->inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = backref->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out_free_path;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out_free_path;
+ }
+
+ extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+ backref->generation)
+ goto out_free_path;
+
+ btrfs_release_path(path);
+
+ start = backref->file_pos;
+ if (backref->extent_offset < old->extent_offset + old->offset)
+ start += old->extent_offset + old->offset -
+ backref->extent_offset;
+
+ len = min(backref->extent_offset + backref->num_bytes,
+ old->extent_offset + old->offset + old->len);
+ len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+ ret = btrfs_drop_extents(trans, root, inode, start,
+ start + len, 1);
+ if (ret)
+ goto out_free_path;
+again:
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (merge) {
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ if (ret < 0)
+ goto out_free_path;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+ if (relink_is_mergable(leaf, fi, new->bytenr) &&
+ extent_len + found_key.offset == start) {
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_len + len);
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+
+ ret = 1;
+ goto out_free_path;
+ } else {
+ merge = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+ btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+ btrfs_set_file_extent_num_bytes(leaf, item, len);
+ btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+
+ ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+ new->disk_len, 0,
+ backref->root_id, backref->inum,
+ new->file_pos, 0); /* start - extent_offset */
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ ret = 1;
+out_free_path:
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans, root);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ &cached, GFP_NOFS);
+ iput(inode);
+ return ret;
+}
+
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+ struct btrfs_path *path;
+ struct old_sa_defrag_extent *old, *tmp;
+ struct sa_defrag_extent_backref *backref;
+ struct sa_defrag_extent_backref *prev = NULL;
+ struct inode *inode;
+ struct btrfs_root *root;
+ struct rb_node *node;
+ int ret;
+
+ inode = new->inode;
+ root = BTRFS_I(inode)->root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return;
+
+ if (!record_extent_backrefs(path, new)) {
+ btrfs_free_path(path);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ while (1) {
+ node = rb_first(&new->root);
+ if (!node)
+ break;
+ rb_erase(node, &new->root);
+
+ backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+ ret = relink_extent_backref(path, prev, backref);
+ WARN_ON(ret < 0);
+
+ kfree(prev);
+
+ if (ret == 1)
+ prev = backref;
+ else
+ prev = NULL;
+ cond_resched();
+ }
+ kfree(prev);
+
+ btrfs_free_path(path);
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ list_del(&old->list);
+ kfree(old);
+ }
+out:
+ atomic_dec(&root->fs_info->defrag_running);
+ wake_up(&root->fs_info->transaction_wait);
+
+ kfree(new);
+}
+
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+ struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct old_sa_defrag_extent *old, *tmp;
+ struct new_sa_defrag_extent *new;
+ int ret;
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new)
+ return NULL;
+
+ new->inode = inode;
+ new->file_pos = ordered->file_offset;
+ new->len = ordered->len;
+ new->bytenr = ordered->start;
+ new->disk_len = ordered->disk_len;
+ new->compress_type = ordered->compress_type;
+ new->root = RB_ROOT;
+ INIT_LIST_HEAD(&new->head);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out_kfree;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = new->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free_path;
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ /* find out all the old extents for the file range */
+ while (1) {
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *l;
+ int slot;
+ u64 num_bytes;
+ u64 offset;
+ u64 end;
+ u64 disk_bytenr;
+ u64 extent_offset;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out_free_list;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid != btrfs_ino(inode))
+ break;
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+ if (key.offset >= new->file_pos + new->len)
+ break;
+
+ extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
+
+ num_bytes = btrfs_file_extent_num_bytes(l, extent);
+ if (key.offset + num_bytes < new->file_pos)
+ goto next;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
+ if (!disk_bytenr)
+ goto next;
+
+ extent_offset = btrfs_file_extent_offset(l, extent);
+
+ old = kmalloc(sizeof(*old), GFP_NOFS);
+ if (!old)
+ goto out_free_list;
+
+ offset = max(new->file_pos, key.offset);
+ end = min(new->file_pos + new->len, key.offset + num_bytes);
+
+ old->bytenr = disk_bytenr;
+ old->extent_offset = extent_offset;
+ old->offset = offset - key.offset;
+ old->len = end - offset;
+ old->new = new;
+ old->count = 0;
+ list_add_tail(&old->list, &new->head);
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ btrfs_free_path(path);
+ atomic_inc(&root->fs_info->defrag_running);
+
+ return new;
+
+out_free_list:
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ list_del(&old->list);
+ kfree(old);
+ }
+out_free_path:
+ btrfs_free_path(path);
+out_kfree:
+ kfree(new);
+ return NULL;
+}
+
/*
* helper function for btrfs_finish_ordered_io, this
* just reads in some of the csum leaves to prime them into ram
@@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
struct btrfs_trans_handle *trans = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
+ struct new_sa_defrag_extent *new = NULL;
int compress_type = 0;
int ret;
bool nolock;
@@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset + ordered_extent->len - 1,
0, &cached_state);
+ ret = test_range_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 1, cached_state);
+ if (ret) {
+ u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+ if (last_snapshot >= BTRFS_I(inode)->generation)
+ /* the inode is shared */
+ new = record_old_file_extents(inode, ordered_extent);
+
+ clear_extent_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+ }
+
if (nolock)
trans = btrfs_join_transaction_nolock(root);
else
@@ -2001,17 +2689,33 @@ out:
if (trans)
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret) {
clear_extent_uptodate(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len - 1, NULL, GFP_NOFS);
+ /*
+ * If the ordered extent had an IOERR or something else went
+ * wrong we need to return the space for this ordered extent
+ * back to the allocator.
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+ btrfs_free_reserved_extent(root, ordered_extent->start,
+ ordered_extent->disk_len);
+ }
+
+
/*
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
btrfs_remove_ordered_extent(inode, ordered_extent);
+ /* for snapshot-aware defrag */
+ if (new)
+ relink_file_extents(new);
+
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror)
{
- size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+ size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
char *kaddr;
@@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
}
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
/*
* This is called in transaction commit time. If there are no orphan
* files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*/
set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags);
+ atomic_inc(&root->orphan_inodes);
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
@@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
ret = btrfs_truncate(inode);
+ if (ret)
+ btrfs_orphan_del(NULL, inode);
} else {
nr_unlink++;
}
@@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
- btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
- btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_inode_mode(leaf, item, inode->i_mode);
- btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+ &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec, &token);
- btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
- btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
- btrfs_set_inode_sequence(leaf, item, inode->i_version);
- btrfs_set_inode_transid(leaf, item, trans->transid);
- btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
- btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
- btrfs_set_inode_block_group(leaf, item, 0);
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+ btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}
/*
@@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
- u64 mask = root->sectorsize - 1;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
@@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* extent just the way it is.
*/
if (root->ref_cows || root == root->fs_info->tree_root)
- btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
+ btrfs_drop_extent_cache(inode, ALIGN(new_size,
+ root->sectorsize), (u64)-1, 0);
/*
* This function is also used to drop the items in the log tree before
@@ -3407,10 +4116,9 @@ search_again:
if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
- extent_num_bytes = new_size -
- found_key.offset + root->sectorsize - 1;
- extent_num_bytes = extent_num_bytes &
- ~((u64)root->sectorsize - 1);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ root->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
@@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 mask = root->sectorsize - 1;
- u64 hole_start = (oldsize + mask) & ~mask;
- u64 block_end = (size + mask) & ~mask;
+ u64 hole_start = ALIGN(oldsize, root->sectorsize);
+ u64 block_end = ALIGN(size, root->sectorsize);
u64 last_byte;
u64 cur_offset;
u64 hole_size;
@@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
break;
}
last_byte = min(extent_map_end(em), block_end);
- last_byte = (last_byte + mask) & ~mask;
+ last_byte = ALIGN(last_byte , root->sectorsize);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
@@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
+
+ /* Disable nonlocked read DIO to avoid the end less truncate */
+ btrfs_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ btrfs_inode_resume_unlocked_dio(inode);
+
ret = btrfs_truncate(inode);
if (ret && inode->i_nlink)
btrfs_orphan_del(NULL, inode);
@@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+
rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
btrfs_orphan_del(NULL, inode);
@@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- trans = btrfs_start_transaction_lflush(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_orphan_del(NULL, inode);
btrfs_free_block_rsv(root, rsv);
@@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode)
break;
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
-
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root);
@@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
if (btrfs_test_opt(root, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
}
insert_inode_hash(inode);
@@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- err = btrfs_update_inode(trans, root, inode);
- if (err) {
- drop_inode = 1;
- goto out_unlock;
- }
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -5396,8 +6107,7 @@ again:
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, item);
- extent_end = (extent_start + size + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
}
if (start >= extent_end) {
@@ -5469,8 +6179,7 @@ again:
copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
size - extent_offset);
em->start = extent_start + extent_offset;
- em->len = (copy_size + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
if (compress_type) {
@@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
em->start = start;
em->orig_start = orig_start;
+ em->mod_start = start;
+ em->mod_len = len;
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
@@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 len = bh_result->b_size;
struct btrfs_trans_handle *trans;
int unlock_bits = EXTENT_LOCKED;
- int ret;
+ int ret = 0;
- if (create) {
- ret = btrfs_delalloc_reserve_space(inode, len);
- if (ret)
- return ret;
+ if (create)
unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
- } else {
+ else
len = min_t(u64, len, root->sectorsize);
- }
lockstart = start;
lockend = start + len - 1;
@@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
return -ENOTBLK;
- if (create) {
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_DELALLOC, NULL,
- &cached_state, GFP_NOFS);
- if (ret)
- goto unlock_err;
- }
-
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
- ret = 0;
goto unlock_err;
}
@@ -6148,6 +6846,15 @@ unlock:
*/
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockstart + len - 1, EXTENT_DELALLOC, NULL,
+ &cached_state, GFP_NOFS);
+ BUG_ON(ret);
}
/*
@@ -6156,24 +6863,9 @@ unlock:
* aren't using if there is any left over space.
*/
if (lockstart < lockend) {
- if (create && len < lockend - lockstart) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockstart + len - 1,
- unlock_bits | EXTENT_DEFRAG, 1, 0,
- &cached_state, GFP_NOFS);
- /*
- * Beside unlock, we also need to cleanup reserved space
- * for the left range by attaching EXTENT_DO_ACCOUNTING.
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- lockstart + len, lockend,
- unlock_bits | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
- } else {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, unlock_bits, 1, 0,
- &cached_state, GFP_NOFS);
- }
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
} else {
free_extent_state(cached_state);
}
@@ -6183,9 +6875,6 @@ unlock:
return 0;
unlock_err:
- if (create)
- unlock_bits |= EXTENT_DO_ACCOUNTING;
-
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
return ret;
@@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
int async_submit = 0;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+ ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(orig_bio);
return -EIO;
}
-
if (map_length >= orig_bio->bi_size) {
bio = orig_bio;
goto submit;
}
- async_submit = 1;
+ /* async crcs make it difficult to collect full stripe writes. */
+ if (btrfs_get_alloc_profile(root, 1) &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ async_submit = 0;
+ else
+ async_submit = 1;
+
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
if (!bio)
return -ENOMEM;
@@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_end_io = btrfs_end_dio_bio;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(root->fs_info, READ,
+ ret = btrfs_map_block(root->fs_info, rw,
start_sector << 9,
&map_length, NULL, 0);
if (ret) {
@@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = 0;
+ int flags = 0;
+ bool wakeup = true;
+ bool relock = false;
+ ssize_t ret;
if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
offset, nr_segs))
return 0;
- return __blockdev_direct_IO(rw, iocb, inode,
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, 0);
+ atomic_inc(&inode->i_dio_count);
+ smp_mb__after_atomic_inc();
+
+ if (rw & WRITE) {
+ count = iov_length(iov, nr_segs);
+ /*
+ * If the write DIO is beyond the EOF, we need update
+ * the isize, but it is protected by i_mutex. So we can
+ * not unlock the i_mutex at this case.
+ */
+ if (offset + count <= inode->i_size) {
+ mutex_unlock(&inode->i_mutex);
+ relock = true;
+ }
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags))) {
+ inode_dio_done(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ wakeup = false;
+ }
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (rw & WRITE) {
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ btrfs_delalloc_release_space(inode, count);
+ else if (ret >= 0 && (size_t)ret < count)
+ btrfs_delalloc_release_space(inode,
+ count - (size_t)ret);
+ else
+ btrfs_delalloc_release_metadata(inode, 0);
+ }
+out:
+ if (wakeup)
+ inode_dio_done(inode);
+ if (relock)
+ mutex_lock(&inode->i_mutex);
+
+ return ret;
}
#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
@@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
return;
}
lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
- ordered = btrfs_lookup_ordered_extent(inode,
- page_offset(page));
+ ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
if (ordered) {
/*
* IO on this page will never be started, so we need
@@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ /* the snap/subvol tree is on deleting */
if (btrfs_root_refs(&root->root_item) == 0 &&
- !btrfs_is_free_space_inode(inode))
+ root != root->fs_info->tree_root)
return 1;
else
return generic_drop_inode(inode);
@@ -7299,40 +8038,22 @@ fail:
static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
+ u64 delalloc_bytes;
struct inode *inode = dentry->d_inode;
u32 blocksize = inode->i_sb->s_blocksize;
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
stat->blksize = PAGE_CACHE_SIZE;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
- ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+ ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
-/*
- * If a file is moved, it will inherit the cow and compression flags of the new
- * directory.
- */
-static void fixup_inode_flags(struct inode *dir, struct inode *inode)
-{
- struct btrfs_inode *b_dir = BTRFS_I(dir);
- struct btrfs_inode *b_inode = BTRFS_I(inode);
-
- if (b_dir->flags & BTRFS_INODE_NODATACOW)
- b_inode->flags |= BTRFS_INODE_NODATACOW;
- else
- b_inode->flags &= ~BTRFS_INODE_NODATACOW;
-
- if (b_dir->flags & BTRFS_INODE_COMPRESS) {
- b_inode->flags |= BTRFS_INODE_COMPRESS;
- b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
- } else {
- b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
- BTRFS_INODE_NOCOMPRESS);
- }
-}
-
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- fixup_inode_flags(new_dir, old_inode);
-
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
@@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
-again:
+
spin_lock(&root->fs_info->delalloc_lock);
list_splice_init(&root->fs_info->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
@@ -7593,8 +8312,11 @@ again:
list_del_init(&binode->delalloc_inodes);
inode = igrab(&binode->vfs_inode);
- if (!inode)
+ if (!inode) {
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &binode->runtime_flags);
continue;
+ }
list_add_tail(&binode->delalloc_inodes,
&root->fs_info->delalloc_inodes);
@@ -7619,13 +8341,6 @@ again:
btrfs_wait_and_free_delalloc_work(work);
}
- spin_lock(&root->fs_info->delalloc_lock);
- if (!list_empty(&root->fs_info->delalloc_inodes)) {
- spin_unlock(&root->fs_info->delalloc_lock);
- goto again;
- }
- spin_unlock(&root->fs_info->delalloc_lock);
-
/* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
@@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
- 0, *alloc_hint, &ins, 1);
+ ret = btrfs_reserve_extent(trans, root,
+ min(num_bytes, 256ULL * 1024 * 1024),
+ min_size, 0, *alloc_hint, &ins, 1);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c3f09f71bed..c83086fdda0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -42,12 +42,12 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include <linux/btrfs.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return 0;
}
-static noinline int create_subvol(struct btrfs_root *root,
+static noinline int create_subvol(struct inode *dir,
struct dentry *dentry,
char *name, int namelen,
u64 *async_transid,
- struct btrfs_qgroup_inherit **inherit)
+ struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
- struct dentry *parent = dentry->d_parent;
- struct inode *dir;
+ struct btrfs_block_rsv block_rsv;
struct timespec cur_time = CURRENT_TIME;
int ret;
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ u64 qgroup_reserved;
uuid_le new_uuid;
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret)
return ret;
- dir = parent->d_inode;
-
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - inode item
- * 2 - refs
- * 1 - root item
- * 2 - dir items
+ * The same as the snapshot creation, please see the comment
+ * of create_snapshot().
*/
- trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 7, &qgroup_reserved);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
- ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
- inherit ? *inherit : NULL);
+ ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
if (ret)
goto fail;
@@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root,
BUG_ON(ret);
fail:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, root, 1);
@@ -527,13 +535,15 @@ fail:
if (!ret)
d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
-
+out:
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen, u64 *async_transid,
- bool readonly, struct btrfs_qgroup_inherit **inherit)
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+ struct dentry *dentry, char *name, int namelen,
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * 1 - parent dir inode
+ * 2 - dir entries
+ * 1 - root item
+ * 2 - root ref/backref
+ * 1 - root of snapshot
+ */
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv, 7,
+ &pending_snapshot->qgroup_reserved);
+ if (ret)
+ goto out;
+
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
- if (inherit) {
- pending_snapshot->inherit = *inherit;
- *inherit = NULL; /* take responsibility to free it */
- }
+ pending_snapshot->dir = dir;
+ pending_snapshot->inherit = inherit;
- trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}
- ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
- BUG_ON(ret);
-
spin_lock(&root->fs_info->trans_lock);
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
@@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
d_instantiate(dentry, inode);
ret = 0;
fail:
+ btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv,
+ pending_snapshot->qgroup_reserved);
+out:
kfree(pending_snapshot);
return ret;
}
@@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
u64 *async_transid, bool readonly,
- struct btrfs_qgroup_inherit **inherit)
+ struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
@@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry, name, namelen,
+ error = create_snapshot(snap_src, dir, dentry, name, namelen,
async_transid, readonly, inherit);
} else {
- error = create_subvol(BTRFS_I(dir)->root, dentry,
- name, namelen, async_transid, inherit);
+ error = create_subvol(dir, dentry, name, namelen,
+ async_transid, inherit);
}
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root,
while(1) {
ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, newer_than);
+ path, newer_than);
if (ret != 0)
goto none;
if (min_key.objectid != ino)
@@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!(inode->i_sb->s_flags & MS_ACTIVE))
break;
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
+
if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
extent_thresh, &last_len, &skip,
&defrag_end, range->flags &
@@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
int ret = 0;
int mod = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
*devstr = '\0';
devstr = vol_args->name;
devid = simple_strtoull(devstr, &end, 10);
+ if (!devid) {
+ ret = -EINVAL;
+ goto out_free;
+ }
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
@@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (!device) {
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
- ret = -EINVAL;
+ ret = -ENODEV;
goto out_free;
}
@@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
printk(KERN_INFO "btrfs: resizer unable to apply on "
"readonly device %llu\n",
(unsigned long long)devid);
- ret = -EINVAL;
+ ret = -EPERM;
goto out_free;
}
@@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (device->is_tgtdev_for_dev_replace) {
- ret = -EINVAL;
+ ret = -EPERM;
goto out_free;
}
@@ -1457,7 +1486,7 @@ out:
static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
char *name, unsigned long fd, int subvol,
u64 *transid, bool readonly,
- struct btrfs_qgroup_inherit **inherit)
+ struct btrfs_qgroup_inherit *inherit)
{
int namelen;
int ret = 0;
@@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol, ptr,
- readonly, &inherit);
+ readonly, inherit);
if (ret == 0 && ptr &&
copy_to_user(arg +
@@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode,
path->keep_locks = 1;
while(1) {
- ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+ ret = btrfs_search_forward(root, &key, &max_key, path,
sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 qgroup_reserved;
int namelen;
int ret;
int err = 0;
@@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out_up_write;
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode, two for dir entries, two for root
+ * ref/backref.
+ */
+ err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 5, &qgroup_reserved);
+ if (err)
+ goto out_up_write;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto out_up_write;
+ goto out_release;
}
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
@@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
}
out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans, root);
if (ret && !err)
err = ret;
inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
@@ -2171,6 +2217,12 @@ out_unlock:
shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
+
+ /* the last ref */
+ if (dest->cache_inode) {
+ iput(dest->cache_inode);
+ dest->cache_inode = NULL;
+ }
}
out_dput:
dput(dentry);
@@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EPERM;
goto out;
}
- ret = btrfs_defrag_root(root, 0);
+ ret = btrfs_defrag_root(root);
if (ret)
goto out;
- ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
+ ret = btrfs_defrag_root(root->fs_info->extent_root);
break;
case S_IFREG:
if (!(file->f_mode & FMODE_WRITE)) {
@@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
u64 transid;
int ret;
- trans = btrfs_attach_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
return PTR_ERR(trans);
@@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
struct inode_fs_paths *ipath = NULL;
struct btrfs_path *path;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_DAC_READ_SEARCH))
return -EPERM;
path = btrfs_alloc_path();
@@ -3914,6 +3966,65 @@ out:
return ret;
}
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ const char *label = root->fs_info->super_copy->label;
+ size_t len = strnlen(label, BTRFS_LABEL_SIZE);
+ int ret;
+
+ if (len == BTRFS_LABEL_SIZE) {
+ pr_warn("btrfs: label is too long, return the first %zu bytes\n",
+ --len);
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = copy_to_user(arg, label, len);
+ mutex_unlock(&root->fs_info->volume_mutex);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ char label[BTRFS_LABEL_SIZE];
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, arg, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+ pr_err("btrfs: unable to set label with more than %d bytes\n",
+ BTRFS_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ strcpy(super_block->label, label);
+ ret = btrfs_end_transaction(trans, root);
+
+out_unlock:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_qgroup_limit(file, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(root, argp);
+ case BTRFS_IOC_GET_FSLABEL:
+ return btrfs_ioctl_get_fslabel(file, argp);
+ case BTRFS_IOC_SET_FSLABEL:
+ return btrfs_ioctl_set_fslabel(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index dabca9cc8c2..00000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __IOCTL_
-#define __IOCTL_
-#include <linux/ioctl.h>
-
-#define BTRFS_IOCTL_MAGIC 0x94
-#define BTRFS_VOL_NAME_MAX 255
-
-/* this should be 4k */
-#define BTRFS_PATH_NAME_MAX 4087
-struct btrfs_ioctl_vol_args {
- __s64 fd;
- char name[BTRFS_PATH_NAME_MAX + 1];
-};
-
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
-
-#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
-#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
-#define BTRFS_FSID_SIZE 16
-#define BTRFS_UUID_SIZE 16
-
-#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
-
-struct btrfs_qgroup_limit {
- __u64 flags;
- __u64 max_rfer;
- __u64 max_excl;
- __u64 rsv_rfer;
- __u64 rsv_excl;
-};
-
-struct btrfs_qgroup_inherit {
- __u64 flags;
- __u64 num_qgroups;
- __u64 num_ref_copies;
- __u64 num_excl_copies;
- struct btrfs_qgroup_limit lim;
- __u64 qgroups[0];
-};
-
-struct btrfs_ioctl_qgroup_limit_args {
- __u64 qgroupid;
- struct btrfs_qgroup_limit lim;
-};
-
-#define BTRFS_SUBVOL_NAME_MAX 4039
-struct btrfs_ioctl_vol_args_v2 {
- __s64 fd;
- __u64 transid;
- __u64 flags;
- union {
- struct {
- __u64 size;
- struct btrfs_qgroup_inherit __user *qgroup_inherit;
- };
- __u64 unused[4];
- };
- char name[BTRFS_SUBVOL_NAME_MAX + 1];
-};
-
-/*
- * structure to report errors and progress to userspace, either as a
- * result of a finished scrub, a canceled scrub or a progress inquiry
- */
-struct btrfs_scrub_progress {
- __u64 data_extents_scrubbed; /* # of data extents scrubbed */
- __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
- __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
- __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
- __u64 read_errors; /* # of read errors encountered (EIO) */
- __u64 csum_errors; /* # of failed csum checks */
- __u64 verify_errors; /* # of occurences, where the metadata
- * of a tree block did not match the
- * expected values, like generation or
- * logical */
- __u64 no_csum; /* # of 4k data block for which no csum
- * is present, probably the result of
- * data written with nodatasum */
- __u64 csum_discards; /* # of csum for which no data was found
- * in the extent tree. */
- __u64 super_errors; /* # of bad super blocks encountered */
- __u64 malloc_errors; /* # of internal kmalloc errors. These
- * will likely cause an incomplete
- * scrub */
- __u64 uncorrectable_errors; /* # of errors where either no intact
- * copy was found or the writeback
- * failed */
- __u64 corrected_errors; /* # of errors corrected */
- __u64 last_physical; /* last physical address scrubbed. In
- * case a scrub was aborted, this can
- * be used to restart the scrub */
- __u64 unverified_errors; /* # of occurences where a read for a
- * full (64k) bio failed, but the re-
- * check succeeded for each 4k piece.
- * Intermittent error. */
-};
-
-#define BTRFS_SCRUB_READONLY 1
-struct btrfs_ioctl_scrub_args {
- __u64 devid; /* in */
- __u64 start; /* in */
- __u64 end; /* in */
- __u64 flags; /* in */
- struct btrfs_scrub_progress progress; /* out */
- /* pad to 1k */
- __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
-#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
-struct btrfs_ioctl_dev_replace_start_params {
- __u64 srcdevid; /* in, if 0, use srcdev_name instead */
- __u64 cont_reading_from_srcdev_mode; /* in, see #define
- * above */
- __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
- __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
-struct btrfs_ioctl_dev_replace_status_params {
- __u64 replace_state; /* out, see #define above */
- __u64 progress_1000; /* out, 0 <= x <= 1000 */
- __u64 time_started; /* out, seconds since 1-Jan-1970 */
- __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
- __u64 num_write_errors; /* out */
- __u64 num_uncorrectable_read_errors; /* out */
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
-struct btrfs_ioctl_dev_replace_args {
- __u64 cmd; /* in */
- __u64 result; /* out */
-
- union {
- struct btrfs_ioctl_dev_replace_start_params start;
- struct btrfs_ioctl_dev_replace_status_params status;
- }; /* in/out */
-
- __u64 spare[64];
-};
-
-struct btrfs_ioctl_dev_info_args {
- __u64 devid; /* in/out */
- __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
- __u64 bytes_used; /* out */
- __u64 total_bytes; /* out */
- __u64 unused[379]; /* pad to 4k */
- __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
-};
-
-struct btrfs_ioctl_fs_info_args {
- __u64 max_id; /* out */
- __u64 num_devices; /* out */
- __u8 fsid[BTRFS_FSID_SIZE]; /* out */
- __u64 reserved[124]; /* pad to 1k */
-};
-
-/* balance control ioctl modes */
-#define BTRFS_BALANCE_CTL_PAUSE 1
-#define BTRFS_BALANCE_CTL_CANCEL 2
-
-/*
- * this is packed, because it should be exactly the same as its disk
- * byte order counterpart (struct btrfs_disk_balance_args)
- */
-struct btrfs_balance_args {
- __u64 profiles;
- __u64 usage;
- __u64 devid;
- __u64 pstart;
- __u64 pend;
- __u64 vstart;
- __u64 vend;
-
- __u64 target;
-
- __u64 flags;
-
- __u64 unused[8];
-} __attribute__ ((__packed__));
-
-/* report balance progress to userspace */
-struct btrfs_balance_progress {
- __u64 expected; /* estimated # of chunks that will be
- * relocated to fulfill the request */
- __u64 considered; /* # of chunks we have considered so far */
- __u64 completed; /* # of chunks relocated so far */
-};
-
-#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
-#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
-#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
-
-struct btrfs_ioctl_balance_args {
- __u64 flags; /* in/out */
- __u64 state; /* out */
-
- struct btrfs_balance_args data; /* in/out */
- struct btrfs_balance_args meta; /* in/out */
- struct btrfs_balance_args sys; /* in/out */
-
- struct btrfs_balance_progress stat; /* out */
-
- __u64 unused[72]; /* pad to 1k */
-};
-
-#define BTRFS_INO_LOOKUP_PATH_MAX 4080
-struct btrfs_ioctl_ino_lookup_args {
- __u64 treeid;
- __u64 objectid;
- char name[BTRFS_INO_LOOKUP_PATH_MAX];
-};
-
-struct btrfs_ioctl_search_key {
- /* which root are we searching. 0 is the tree of tree roots */
- __u64 tree_id;
-
- /* keys returned will be >= min and <= max */
- __u64 min_objectid;
- __u64 max_objectid;
-
- /* keys returned will be >= min and <= max */
- __u64 min_offset;
- __u64 max_offset;
-
- /* max and min transids to search for */
- __u64 min_transid;
- __u64 max_transid;
-
- /* keys returned will be >= min and <= max */
- __u32 min_type;
- __u32 max_type;
-
- /*
- * how many items did userland ask for, and how many are we
- * returning
- */
- __u32 nr_items;
-
- /* align to 64 bits */
- __u32 unused;
-
- /* some extra for later */
- __u64 unused1;
- __u64 unused2;
- __u64 unused3;
- __u64 unused4;
-};
-
-struct btrfs_ioctl_search_header {
- __u64 transid;
- __u64 objectid;
- __u64 offset;
- __u32 type;
- __u32 len;
-};
-
-#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
-/*
- * the buf is an array of search headers where
- * each header is followed by the actual item
- * the type field is expanded to 32 bits for alignment
- */
-struct btrfs_ioctl_search_args {
- struct btrfs_ioctl_search_key key;
- char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
-};
-
-struct btrfs_ioctl_clone_range_args {
- __s64 src_fd;
- __u64 src_offset, src_length;
- __u64 dest_offset;
-};
-
-/* flags for the defrag range ioctl */
-#define BTRFS_DEFRAG_RANGE_COMPRESS 1
-#define BTRFS_DEFRAG_RANGE_START_IO 2
-
-struct btrfs_ioctl_space_info {
- __u64 flags;
- __u64 total_bytes;
- __u64 used_bytes;
-};
-
-struct btrfs_ioctl_space_args {
- __u64 space_slots;
- __u64 total_spaces;
- struct btrfs_ioctl_space_info spaces[0];
-};
-
-struct btrfs_data_container {
- __u32 bytes_left; /* out -- bytes not needed to deliver output */
- __u32 bytes_missing; /* out -- additional bytes needed for result */
- __u32 elem_cnt; /* out */
- __u32 elem_missed; /* out */
- __u64 val[0]; /* out */
-};
-
-struct btrfs_ioctl_ino_path_args {
- __u64 inum; /* in */
- __u64 size; /* in */
- __u64 reserved[4];
- /* struct btrfs_data_container *fspath; out */
- __u64 fspath; /* out */
-};
-
-struct btrfs_ioctl_logical_ino_args {
- __u64 logical; /* in */
- __u64 size; /* in */
- __u64 reserved[4];
- /* struct btrfs_data_container *inodes; out */
- __u64 inodes;
-};
-
-enum btrfs_dev_stat_values {
- /* disk I/O failure stats */
- BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
- BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
- BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
-
- /* stats for indirect indications for I/O failures */
- BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
- * contents is illegal: this is an
- * indication that the block was damaged
- * during read or write, or written to
- * wrong location or read from wrong
- * location */
- BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
- * been written */
-
- BTRFS_DEV_STAT_VALUES_MAX
-};
-
-/* Reset statistics after reading; needs SYS_ADMIN capability */
-#define BTRFS_DEV_STATS_RESET (1ULL << 0)
-
-struct btrfs_ioctl_get_dev_stats {
- __u64 devid; /* in */
- __u64 nr_items; /* in/out */
- __u64 flags; /* in/out */
-
- /* out values: */
- __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
-
- __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
-};
-
-#define BTRFS_QUOTA_CTL_ENABLE 1
-#define BTRFS_QUOTA_CTL_DISABLE 2
-#define BTRFS_QUOTA_CTL_RESCAN 3
-struct btrfs_ioctl_quota_ctl_args {
- __u64 cmd;
- __u64 status;
-};
-
-struct btrfs_ioctl_qgroup_assign_args {
- __u64 assign;
- __u64 src;
- __u64 dst;
-};
-
-struct btrfs_ioctl_qgroup_create_args {
- __u64 create;
- __u64 qgroupid;
-};
-struct btrfs_ioctl_timespec {
- __u64 sec;
- __u32 nsec;
-};
-
-struct btrfs_ioctl_received_subvol_args {
- char uuid[BTRFS_UUID_SIZE]; /* in */
- __u64 stransid; /* in */
- __u64 rtransid; /* out */
- struct btrfs_ioctl_timespec stime; /* in */
- struct btrfs_ioctl_timespec rtime; /* out */
- __u64 flags; /* in */
- __u64 reserved[16]; /* in */
-};
-
-struct btrfs_ioctl_send_args {
- __s64 send_fd; /* in */
- __u64 clone_sources_count; /* in */
- __u64 __user *clone_sources; /* in */
- __u64 parent_root; /* in */
- __u64 flags; /* in */
- __u64 reserved[4]; /* in */
-};
-
-#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
- struct btrfs_ioctl_vol_args)
-/* trans start and trans end are dangerous, and only for
- * use by applications that know how to avoid the
- * resulting deadlocks
- */
-#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
-#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
-#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
-
-#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
-#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
- struct btrfs_ioctl_vol_args)
-
-#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
- struct btrfs_ioctl_clone_range_args)
-
-#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
- struct btrfs_ioctl_defrag_range_args)
-#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
- struct btrfs_ioctl_search_args)
-#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
- struct btrfs_ioctl_ino_lookup_args)
-#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
-#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
- struct btrfs_ioctl_space_args)
-#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
-#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
- struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
- struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
-#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
-#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
- struct btrfs_ioctl_scrub_args)
-#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
-#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
- struct btrfs_ioctl_scrub_args)
-#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
- struct btrfs_ioctl_dev_info_args)
-#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
- struct btrfs_ioctl_fs_info_args)
-#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
- struct btrfs_ioctl_balance_args)
-#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
-#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
- struct btrfs_ioctl_balance_args)
-#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
- struct btrfs_ioctl_ino_path_args)
-#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
- struct btrfs_ioctl_ino_path_args)
-#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
- struct btrfs_ioctl_received_subvol_args)
-#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
-#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
- struct btrfs_ioctl_quota_ctl_args)
-#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
- struct btrfs_ioctl_qgroup_assign_args)
-#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
- struct btrfs_ioctl_qgroup_create_args)
-#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
- struct btrfs_ioctl_qgroup_limit_args)
-#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
- struct btrfs_ioctl_get_dev_stats)
-#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
- struct btrfs_ioctl_dev_replace_args)
-
-#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2a1762c6604..e95df435d89 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -113,11 +113,10 @@ again:
read_unlock(&eb->lock);
return;
}
- read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
- read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
goto again;
}
atomic_inc(&eb->read_locks);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e5ed5672960..dc08d77b717 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+ !(type == BTRFS_ORDERED_NOCOW))
+ entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = igrab(inode);
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
+ INIT_LIST_HEAD(&entry->log_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
+ WARN_ON(entry->csum_bytes_left < sum->len);
+ entry->csum_bytes_left -= sum->len;
+ if (entry->csum_bytes_left == 0)
+ wake_up(&entry->wait);
spin_unlock_irq(&tree->lock);
}
@@ -405,6 +413,66 @@ out:
return ret == 0;
}
+/* Needs to either be called under a log transaction or the log_mutex */
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ struct rb_node *n;
+ int index = log->log_transid % 2;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ spin_lock(&log->log_extents_lock[index]);
+ if (list_empty(&ordered->log_list)) {
+ list_add_tail(&ordered->log_list, &log->logged_list[index]);
+ atomic_inc(&ordered->refs);
+ }
+ spin_unlock(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+ &ordered->flags));
+ btrfs_put_ordered_extent(ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ btrfs_put_ordered_extent(ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
/*
* used to drop a reference on an ordered extent. This will free
* the extent if the last reference is dropped
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct list_head splice;
struct list_head works;
struct btrfs_delalloc_work *work, *next;
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
-again:
- list_splice_init(&root->fs_info->ordered_operations, &splice);
-
+ list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
-
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
-
inode = &btrfs_inode->vfs_inode;
list_del_init(&btrfs_inode->ordered_operations);
@@ -574,24 +640,22 @@ again:
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(inode);
-
- if (!wait && inode) {
- list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &root->fs_info->ordered_operations);
- }
-
if (!inode)
continue;
+
+ if (!wait)
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_extent_lock);
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
+ spin_lock(&root->fs_info->ordered_extent_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
- spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_tail(&splice,
- &root->fs_info->ordered_operations);
+ &cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_extent_lock);
ret = -ENOMEM;
goto out;
@@ -603,9 +667,6 @@ again:
cond_resched();
spin_lock(&root->fs_info->ordered_extent_lock);
}
- if (wait && !list_empty(&root->fs_info->ordered_operations))
- goto again;
-
spin_unlock(&root->fs_info->ordered_extent_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
@@ -974,6 +1035,7 @@ out:
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
u64 last_mod;
last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->ordered_extent_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &root->fs_info->ordered_operations);
+ &cur_trans->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f29d4bf5fbe..8eadfe406cd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
* has done its due diligence in updating
* the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
+ ordered extent */
struct btrfs_ordered_extent {
/* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
+ /* number of bytes that still need csumming */
+ u64 csum_bytes_left;
+
/*
* the end of the ordered extent which is behind it but
* didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* If we need to wait on this to be done */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
void ordered_data_exit(void);
#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a..920957ecb27 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_dev_extent_chunk_offset(l, dev_extent),
(unsigned long long)
btrfs_dev_extent_length(l, dev_extent));
+ break;
case BTRFS_DEV_STATS_KEY:
printk(KERN_INFO "\t\tdevice stats\n");
break;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a5c85623432..aee4b1cc3d9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,13 +23,13 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/btrfs.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
-#include "ioctl.h"
#include "backref.h"
/* TODO XXX FIXME
@@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
key.offset = qgroupid;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
key.offset = qgroup->qgroupid;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
key.offset = 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
{
struct btrfs_path *path;
struct btrfs_key key;
+ struct extent_buffer *leaf = NULL;
int ret;
-
- if (!root)
- return -EINVAL;
+ int nr = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- while (1) {
- key.objectid = 0;
- key.offset = 0;
- key.type = 0;
+ path->leave_spinning = 1;
- path->leave_spinning = 1;
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = 0;
+
+ while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret > 0) {
- if (path->slots[0] == 0)
- break;
- path->slots[0]--;
- } else if (ret < 0) {
+ if (ret < 0)
+ goto out;
+ leaf = path->nodes[0];
+ nr = btrfs_header_nritems(leaf);
+ if (!nr)
break;
- }
-
- ret = btrfs_del_item(trans, root, path);
+ /*
+ * delete the leaf one by one
+ * since the whole tree is going
+ * to be deleted.
+ */
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
goto out;
+
btrfs_release_path(path);
}
ret = 0;
@@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
int ret = 0;
spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return 0;
+ }
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
quota_root = fs_info->quota_root;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 00000000000..07222053c7d
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2099 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT 1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT 2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT 3
+
+
+#define RBIO_CACHE_SIZE 1024
+
+struct btrfs_raid_bio {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_bio *bbio;
+
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
+
+ /* while we're doing rmw on a stripe
+ * we put it into a hash table so we can
+ * lock the stripe and merge more rbios
+ * into it.
+ */
+ struct list_head hash_list;
+
+ /*
+ * LRU list for the stripe cache
+ */
+ struct list_head stripe_cache;
+
+ /*
+ * for scheduling work in the helper threads
+ */
+ struct btrfs_work work;
+
+ /*
+ * bio list and bio_list_lock are used
+ * to add more bios into the stripe
+ * in hopes of avoiding the full rmw
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /* also protected by the bio_list_lock, the
+ * plug list is used by the plugging code
+ * to collect partial bios while plugged. The
+ * stripe locking code also uses it to hand off
+ * the stripe lock to the next pending IO
+ */
+ struct list_head plug_list;
+
+ /*
+ * flags that tell us if it is safe to
+ * merge with this bio
+ */
+ unsigned long flags;
+
+ /* size of each individual stripe on disk */
+ int stripe_len;
+
+ /* number of data stripes (no p/q) */
+ int nr_data;
+
+ /*
+ * set if we're doing a parity rebuild
+ * for a read from higher up, which is handled
+ * differently from a parity rebuild as part of
+ * rmw
+ */
+ int read_rebuild;
+
+ /* first bad stripe */
+ int faila;
+
+ /* second bad stripe (for raid6 use) */
+ int failb;
+
+ /*
+ * number of pages needed to represent the full
+ * stripe
+ */
+ int nr_pages;
+
+ /*
+ * size of all the bios in the bio_list. This
+ * helps us decide if the rbio maps to a full
+ * stripe or not
+ */
+ int bio_list_bytes;
+
+ atomic_t refs;
+
+ /*
+ * these are two arrays of pointers. We allocate the
+ * rbio big enough to hold them both and setup their
+ * locations when the rbio is allocated
+ */
+
+ /* pointers to pages that we allocated for
+ * reading/writing stripes directly from the disk (including P/Q)
+ */
+ struct page **stripe_pages;
+
+ /*
+ * pointers to the pages in the bio_list. Stored
+ * here for faster lookup
+ */
+ struct page **bio_pages;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash_table *x;
+ struct btrfs_stripe_hash *cur;
+ struct btrfs_stripe_hash *h;
+ int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+ int i;
+ int table_size;
+
+ if (info->stripe_hash_table)
+ return 0;
+
+ /*
+ * The table is large, starting with order 4 and can go as high as
+ * order 7 in case lock debugging is turned on.
+ *
+ * Try harder to allocate and fallback to vmalloc to lower the chance
+ * of a failing mount.
+ */
+ table_size = sizeof(*table) + sizeof(*h) * num_entries;
+ table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!table) {
+ table = vzalloc(table_size);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&table->cache_lock);
+ INIT_LIST_HEAD(&table->stripe_cache);
+
+ h = table->table;
+
+ for (i = 0; i < num_entries; i++) {
+ cur = h + i;
+ INIT_LIST_HEAD(&cur->hash_list);
+ spin_lock_init(&cur->lock);
+ init_waitqueue_head(&cur->wait);
+ }
+
+ x = cmpxchg(&info->stripe_hash_table, NULL, table);
+ if (x) {
+ if (is_vmalloc_addr(x))
+ vfree(x);
+ else
+ kfree(x);
+ }
+ return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ char *s;
+ char *d;
+ int ret;
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ return;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (!rbio->bio_pages[i])
+ continue;
+
+ s = kmap(rbio->bio_pages[i]);
+ d = kmap(rbio->stripe_pages[i]);
+
+ memcpy(d, s, PAGE_CACHE_SIZE);
+
+ kunmap(rbio->bio_pages[i]);
+ kunmap(rbio->stripe_pages[i]);
+ SetPageUptodate(rbio->stripe_pages[i]);
+ }
+ set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+ u64 num = rbio->raid_map[0];
+
+ /*
+ * we shift down quite a bit. We're using byte
+ * addressing, and most of the lower bits are zeros.
+ * This tends to upset hash_64, and it consistently
+ * returns just one or two different values.
+ *
+ * shifting off the lower bits fixes things.
+ */
+ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+ int i;
+ struct page *s;
+ struct page *d;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+ return;
+
+ for (i = 0; i < dest->nr_pages; i++) {
+ s = src->stripe_pages[i];
+ if (!s || !PageUptodate(s)) {
+ continue;
+ }
+
+ d = dest->stripe_pages[i];
+ if (d)
+ __free_page(d);
+
+ dest->stripe_pages[i] = s;
+ src->stripe_pages[i] = NULL;
+ }
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination. The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+ struct btrfs_raid_bio *victim)
+{
+ bio_list_merge(&dest->bio_list, &victim->bio_list);
+ dest->bio_list_bytes += victim->bio_list_bytes;
+ bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash *h;
+ int freeit = 0;
+
+ /*
+ * check the bit again under the hash table lock.
+ */
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+ h = table->table + bucket;
+
+ /* hold the lock for the bucket because we may be
+ * removing it from the hash table
+ */
+ spin_lock(&h->lock);
+
+ /*
+ * hold the lock for the bio list because we need
+ * to make sure the bio list is empty
+ */
+ spin_lock(&rbio->bio_list_lock);
+
+ if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ list_del_init(&rbio->stripe_cache);
+ table->cache_size -= 1;
+ freeit = 1;
+
+ /* if the bio list isn't empty, this rbio is
+ * still involved in an IO. We take it out
+ * of the cache list, and drop the ref that
+ * was held for the list.
+ *
+ * If the bio_list was empty, we also remove
+ * the rbio from the hash_table, and drop
+ * the corresponding ref
+ */
+ if (bio_list_empty(&rbio->bio_list)) {
+ if (!list_empty(&rbio->hash_list)) {
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+ BUG_ON(!list_empty(&rbio->plug_list));
+ }
+ }
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock(&h->lock);
+
+ if (freeit)
+ __free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ __remove_rbio_from_cache(rbio);
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+ struct btrfs_raid_bio *rbio;
+
+ table = info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ while (!list_empty(&table->stripe_cache)) {
+ rbio = list_entry(table->stripe_cache.next,
+ struct btrfs_raid_bio,
+ stripe_cache);
+ __remove_rbio_from_cache(rbio);
+ }
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ if (!info->stripe_hash_table)
+ return;
+ btrfs_clear_rbio_cache(info);
+ if (is_vmalloc_addr(info->stripe_hash_table))
+ vfree(info->stripe_hash_table);
+ else
+ kfree(info->stripe_hash_table);
+ info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ /* bump our ref if we were not in the list before */
+ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+ atomic_inc(&rbio->refs);
+
+ if (!list_empty(&rbio->stripe_cache)){
+ list_move(&rbio->stripe_cache, &table->stripe_cache);
+ } else {
+ list_add(&rbio->stripe_cache, &table->stripe_cache);
+ table->cache_size += 1;
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+
+ if (table->cache_size > RBIO_CACHE_SIZE) {
+ struct btrfs_raid_bio *found;
+
+ found = list_entry(table->stripe_cache.prev,
+ struct btrfs_raid_bio,
+ stripe_cache);
+
+ if (found != rbio)
+ __remove_rbio_from_cache(found);
+ }
+
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+ return;
+}
+
+/*
+ * helper function to run the xor_blocks api. It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+ int src_off = 0;
+ int xor_src_cnt = 0;
+ void *dest = pages[src_cnt];
+
+ while(src_cnt > 0) {
+ xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+ xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+ src_cnt -= xor_src_cnt;
+ src_off += xor_src_cnt;
+ }
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long size = rbio->bio_list_bytes;
+ int ret = 1;
+
+ if (size != rbio->nr_data * rbio->stripe_len)
+ ret = 0;
+
+ BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ret = __rbio_is_full(rbio);
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+ return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+ struct btrfs_raid_bio *cur)
+{
+ if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+ test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+ return 0;
+
+ /*
+ * we can't merge with cached rbios, since the
+ * idea is that when we merge the destination
+ * rbio is going to run our IO for us. We can
+ * steal from cached rbio's though, other functions
+ * handle that.
+ */
+ if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+ test_bit(RBIO_CACHE_BIT, &cur->flags))
+ return 0;
+
+ if (last->raid_map[0] !=
+ cur->raid_map[0])
+ return 0;
+
+ /* reads can't merge with writes */
+ if (last->read_rebuild !=
+ cur->read_rebuild) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+ return NULL;
+
+ index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+ PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock. rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet. The rbio is given
+ * the lock and 0 is returned. The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner. The rbio is freed and the IO will
+ * start automatically along with the existing rbio. 1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list. When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission. If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *pending;
+ unsigned long flags;
+ DEFINE_WAIT(wait);
+ struct btrfs_raid_bio *freeit = NULL;
+ struct btrfs_raid_bio *cache_drop = NULL;
+ int ret = 0;
+ int walk = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ list_for_each_entry(cur, &h->hash_list, hash_list) {
+ walk++;
+ if (cur->raid_map[0] == rbio->raid_map[0]) {
+ spin_lock(&cur->bio_list_lock);
+
+ /* can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ atomic_dec(&cur->refs);
+
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
+
+ goto lockit;
+ }
+
+ /* can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * we couldn't merge with the running
+ * rbio, see if we can merge with the
+ * pending ones. We don't have to
+ * check for rmw_locked because there
+ * is no way they are inside finish_rmw
+ * right now
+ */
+ list_for_each_entry(pending, &cur->plug_list,
+ plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /* no merging, put us on the tail of the plug list,
+ * our rbio will be started with the currently
+ * running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
+ }
+ }
+lockit:
+ atomic_inc(&rbio->refs);
+ list_add(&rbio->hash_list, &h->hash_list);
+out:
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (cache_drop)
+ remove_rbio_from_cache(cache_drop);
+ if (freeit)
+ __free_raid_bio(freeit);
+ return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed. If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bucket;
+ struct btrfs_stripe_hash *h;
+ unsigned long flags;
+ int keep_cache = 0;
+
+ bucket = rbio_bucket(rbio);
+ h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+ if (list_empty(&rbio->plug_list))
+ cache_rbio(rbio);
+
+ spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ if (!list_empty(&rbio->hash_list)) {
+ /*
+ * if we're still cached and there is no other IO
+ * to perform, just leave this rbio here for others
+ * to steal from later
+ */
+ if (list_empty(&rbio->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ keep_cache = 1;
+ clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ BUG_ON(!bio_list_empty(&rbio->bio_list));
+ goto done;
+ }
+
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+
+ /*
+ * we use the plug list to hold all the rbios
+ * waiting for the chance to lock this stripe.
+ * hand the lock over to one of them.
+ */
+ if (!list_empty(&rbio->plug_list)) {
+ struct btrfs_raid_bio *next;
+ struct list_head *head = rbio->plug_list.next;
+
+ next = list_entry(head, struct btrfs_raid_bio,
+ plug_list);
+
+ list_del_init(&rbio->plug_list);
+
+ list_add(&next->hash_list, &h->hash_list);
+ atomic_inc(&next->refs);
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (next->read_rebuild)
+ async_read_rebuild(next);
+ else {
+ steal_rbio(rbio, next);
+ async_rmw_stripe(next);
+ }
+
+ goto done_nolock;
+ } else if (waitqueue_active(&h->wait)) {
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+ wake_up(&h->wait);
+ goto done_nolock;
+ }
+ }
+done:
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+ if (!keep_cache)
+ remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ WARN_ON(atomic_read(&rbio->refs) < 0);
+ if (!atomic_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+ kfree(rbio->raid_map);
+ kfree(rbio->bbio);
+ kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ unlock_stripe(rbio);
+ __free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+ struct bio *cur = bio_list_get(&rbio->bio_list);
+ struct bio *next;
+ free_raid_bio(rbio);
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ if (uptodate)
+ set_bit(BIO_UPTODATE, &cur->bi_flags);
+ bio_endio(cur, err);
+ cur = next;
+ }
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ err = 0;
+
+ /* OK, we have read all the stripes we need to. */
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ err = -EIO;
+
+ rbio_orig_end_io(rbio, err, 0);
+ return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else. This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+ int index, int pagenr, int bio_list_only)
+{
+ int chunk_page;
+ struct page *p = NULL;
+
+ chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ p = rbio->bio_pages[chunk_page];
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ if (p || bio_list_only)
+ return p;
+
+ return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+ unsigned long nr = stripe_len * nr_stripes;
+ return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio. Not
+ * this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ int nr_data = 0;
+ int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+ void *p;
+
+ rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+ GFP_NOFS);
+ if (!rbio) {
+ kfree(raid_map);
+ kfree(bbio);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bio_list_init(&rbio->bio_list);
+ INIT_LIST_HEAD(&rbio->plug_list);
+ spin_lock_init(&rbio->bio_list_lock);
+ INIT_LIST_HEAD(&rbio->stripe_cache);
+ INIT_LIST_HEAD(&rbio->hash_list);
+ rbio->bbio = bbio;
+ rbio->raid_map = raid_map;
+ rbio->fs_info = root->fs_info;
+ rbio->stripe_len = stripe_len;
+ rbio->nr_pages = num_pages;
+ rbio->faila = -1;
+ rbio->failb = -1;
+ atomic_set(&rbio->refs, 1);
+
+ /*
+ * the stripe_pages and bio_pages array point to the extra
+ * memory we allocated past the end of the rbio
+ */
+ p = rbio + 1;
+ rbio->stripe_pages = p;
+ rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+
+ if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+ nr_data = bbio->num_stripes - 2;
+ else
+ nr_data = bbio->num_stripes - 1;
+
+ rbio->nr_data = nr_data;
+ return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+ for (; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ }
+ return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO
+ * this will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct page *page,
+ int stripe_nr,
+ unsigned long page_index,
+ unsigned long bio_max_len)
+{
+ struct bio *last = bio_list->tail;
+ u64 last_end = 0;
+ int ret;
+ struct bio *bio;
+ struct btrfs_bio_stripe *stripe;
+ u64 disk_start;
+
+ stripe = &rbio->bbio->stripes[stripe_nr];
+ disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+ /* if the device is missing, just fail this stripe */
+ if (!stripe->dev->bdev)
+ return fail_rbio_index(rbio, stripe_nr);
+
+ /* see if we can add this page onto our existing bio */
+ if (last) {
+ last_end = (u64)last->bi_sector << 9;
+ last_end += last->bi_size;
+
+ /*
+ * we can't merge these if they are from different
+ * devices or if they are not contiguous
+ */
+ if (last_end == disk_start && stripe->dev->bdev &&
+ test_bit(BIO_UPTODATE, &last->bi_flags) &&
+ last->bi_bdev == stripe->dev->bdev) {
+ ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+ if (ret == PAGE_CACHE_SIZE)
+ return 0;
+ }
+ }
+
+ /* put a new bio on the list */
+ bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_size = 0;
+ bio->bi_bdev = stripe->dev->bdev;
+ bio->bi_sector = disk_start >> 9;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_list_add(bio_list, bio);
+ return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk. This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction. The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+ __raid56_parity_recover(rbio);
+ } else {
+ finish_rmw(rbio);
+ }
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+ int index;
+ index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+ index += page;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result. This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ struct bio *bio;
+ u64 start;
+ unsigned long stripe_offset;
+ unsigned long page_index;
+ struct page *p;
+ int i;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ bio_list_for_each(bio, &rbio->bio_list) {
+ start = (u64)bio->bi_sector << 9;
+ stripe_offset = start - rbio->raid_map[0];
+ page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ rbio->bio_pages[page_index + i] = p;
+ }
+ }
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations. We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_bio *bbio = rbio->bbio;
+ void *pointers[bbio->num_stripes];
+ int stripe_len = rbio->stripe_len;
+ int nr_data = rbio->nr_data;
+ int stripe;
+ int pagenr;
+ int p_stripe = -1;
+ int q_stripe = -1;
+ struct bio_list bio_list;
+ struct bio *bio;
+ int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (bbio->num_stripes - rbio->nr_data == 1) {
+ p_stripe = bbio->num_stripes - 1;
+ } else if (bbio->num_stripes - rbio->nr_data == 2) {
+ p_stripe = bbio->num_stripes - 2;
+ q_stripe = bbio->num_stripes - 1;
+ } else {
+ BUG();
+ }
+
+ /* at this point we either have a full stripe,
+ * or we've read the full stripe from the drive.
+ * recalculate the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ atomic_set(&rbio->bbio->error, 0);
+
+ /*
+ * now that we've set rmw_locked, run through the
+ * bio list one last time and map the page pointers
+ *
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ index_rbio_pages(rbio);
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *p;
+ /* first collect one page from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ p = page_in_rbio(rbio, stripe, pagenr, 0);
+ pointers[stripe] = kmap(p);
+ }
+
+ /* then add the parity stripe */
+ p = rbio_pstripe_page(rbio, pagenr);
+ SetPageUptodate(p);
+ pointers[stripe++] = kmap(p);
+
+ if (q_stripe != -1) {
+
+ /*
+ * raid6, add the qstripe and call the
+ * library function to fill in our p/q
+ */
+ p = rbio_qstripe_page(rbio, pagenr);
+ SetPageUptodate(p);
+ pointers[stripe++] = kmap(p);
+
+ raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ }
+
+
+ for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+ kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ }
+
+ /*
+ * time to start writing. Make bios for everything from the
+ * higher layers (the bio_list in our rbio) and our p/q. Ignore
+ * everything else.
+ */
+ for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *page;
+ if (stripe < rbio->nr_data) {
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (!page)
+ continue;
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+
+ ret = rbio_add_io_page(rbio, &bio_list,
+ page, stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
+ BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_write_end_io;
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(WRITE, bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * helper to find the stripe number for a given bio. Used to figure out which
+ * stripe has failed. This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 physical = bio->bi_sector;
+ u64 stripe_start;
+ int i;
+ struct btrfs_bio_stripe *stripe;
+
+ physical <<= 9;
+
+ for (i = 0; i < rbio->bbio->num_stripes; i++) {
+ stripe = &rbio->bbio->stripes[i];
+ stripe_start = stripe->physical;
+ if (physical >= stripe_start &&
+ physical < stripe_start + rbio->stripe_len) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping). Used to figure out which stripe has
+ * failed. This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 logical = bio->bi_sector;
+ u64 stripe_start;
+ int i;
+
+ logical <<= 9;
+
+ for (i = 0; i < rbio->nr_data; i++) {
+ stripe_start = rbio->raid_map[i];
+ if (logical >= stripe_start &&
+ logical < stripe_start + rbio->stripe_len) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+ /* we already know this stripe is bad, move on */
+ if (rbio->faila == failed || rbio->failb == failed)
+ goto out;
+
+ if (rbio->faila == -1) {
+ /* first failure on this rbio */
+ rbio->faila = failed;
+ atomic_inc(&rbio->bbio->error);
+ } else if (rbio->failb == -1) {
+ /* second failure on this rbio */
+ rbio->failb = failed;
+ atomic_inc(&rbio->bbio->error);
+ } else {
+ ret = -EIO;
+ }
+out:
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ int failed = find_bio_stripe(rbio, bio);
+
+ if (failed < 0)
+ return -EIO;
+
+ return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * this sets each page in the bio uptodate. It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+ int i;
+ struct page *p;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ SetPageUptodate(p);
+ }
+}
+
+/*
+ * end io for the read phase of the rmw cycle. All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ err = 0;
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ goto cleanup;
+
+ /*
+ * this will normally call finish_rmw to start our write
+ * but if there are any failed stripes we'll reconstruct
+ * from parity first
+ */
+ validate_rbio_for_rmw(rbio);
+ return;
+
+cleanup:
+
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ rbio->work.flags = 0;
+ rbio->work.func = rmw_work;
+
+ btrfs_queue_worker(&rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+ rbio->work.flags = 0;
+ rbio->work.func = read_rebuild_work;
+
+ btrfs_queue_worker(&rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+/*
+ * the stripe must be locked by the caller. It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct btrfs_bio *bbio = rbio->bbio;
+ struct bio_list bio_list;
+ int ret;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ index_rbio_pages(rbio);
+
+ atomic_set(&rbio->bbio->error, 0);
+ /*
+ * build a list of bios to read all the missing parts of this
+ * stripe
+ */
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ struct page *page;
+ /*
+ * we want to find all the pages missing from
+ * the rbio and read them from the disk. If
+ * page_in_rbio finds a page in the bio list
+ * we don't need to read it off the stripe.
+ */
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (page)
+ continue;
+
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ /*
+ * the bio cache may have handed us an uptodate
+ * page. If so, be happy and use it
+ */
+ if (PageUptodate(page))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&bbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_rmw_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return 0;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return -EIO;
+
+finish:
+ validate_rbio_for_rmw(rbio);
+ return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret)
+ return ret;
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ finish_rmw(rbio);
+ return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ async_rmw_stripe(rbio);
+ return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come into create
+ * a full stripe. So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+ /* head off into rmw land if we don't have a full stripe */
+ if (!rbio_is_full(rbio))
+ return partial_stripe_write(rbio);
+ return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list. When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+ struct blk_plug_cb cb;
+ struct btrfs_fs_info *info;
+ struct list_head rbio_list;
+ struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
+ u64 a_sector = ra->bio_list.head->bi_sector;
+ u64 b_sector = rb->bio_list.head->bi_sector;
+
+ if (a_sector < b_sector)
+ return -1;
+ if (a_sector > b_sector)
+ return 1;
+ return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *last = NULL;
+
+ /*
+ * sort our plug list then try to merge
+ * everything we can in hopes of creating full
+ * stripes.
+ */
+ list_sort(NULL, &plug->rbio_list, plug_cmp);
+ while (!list_empty(&plug->rbio_list)) {
+ cur = list_entry(plug->rbio_list.next,
+ struct btrfs_raid_bio, plug_list);
+ list_del_init(&cur->plug_list);
+
+ if (rbio_is_full(cur)) {
+ /* we have a full stripe, send it down */
+ full_stripe_write(cur);
+ continue;
+ }
+ if (last) {
+ if (rbio_can_merge(last, cur)) {
+ merge_rbio(last, cur);
+ __free_raid_bio(cur);
+ continue;
+
+ }
+ __raid56_parity_write(last);
+ }
+ last = cur;
+ }
+ if (last) {
+ __raid56_parity_write(last);
+ }
+ kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(work, struct btrfs_plug_cb, work);
+ run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+ if (from_schedule) {
+ plug->work.flags = 0;
+ plug->work.func = unplug_work;
+ btrfs_queue_worker(&plug->info->rmw_workers,
+ &plug->work);
+ return;
+ }
+ run_plug(plug);
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb;
+
+ rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ if (IS_ERR(rbio)) {
+ kfree(raid_map);
+ kfree(bbio);
+ return PTR_ERR(rbio);
+ }
+ bio_list_add(&rbio->bio_list, bio);
+ rbio->bio_list_bytes = bio->bi_size;
+
+ /*
+ * don't plug on full rbios, just get them out the door
+ * as quickly as we can
+ */
+ if (rbio_is_full(rbio))
+ return full_stripe_write(rbio);
+
+ cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+ sizeof(*plug));
+ if (cb) {
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+ if (!plug->info) {
+ plug->info = root->fs_info;
+ INIT_LIST_HEAD(&plug->rbio_list);
+ }
+ list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ } else {
+ return __raid56_parity_write(rbio);
+ }
+ return 0;
+}
+
+/*
+ * all parity reconstruction happens here. We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+ int pagenr, stripe;
+ void **pointers;
+ int faila = -1, failb = -1;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int err;
+ int i;
+
+ pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+ GFP_NOFS);
+ if (!pointers) {
+ err = -ENOMEM;
+ goto cleanup_io;
+ }
+
+ faila = rbio->faila;
+ failb = rbio->failb;
+
+ if (rbio->read_rebuild) {
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
+
+ index_rbio_pages(rbio);
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ /* setup our array of pointers with pages
+ * from each stripe
+ */
+ for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->read_rebuild &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ pointers[stripe] = kmap(page);
+ }
+
+ /* all raid6 handling here */
+ if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+ RAID6_Q_STRIPE) {
+
+ /*
+ * single failure, rebuild from parity raid5
+ * style
+ */
+ if (failb < 0) {
+ if (faila == rbio->nr_data) {
+ /*
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * TODO, we should redo the xor here.
+ */
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
+ */
+ goto pstripe;
+ }
+
+ /* make sure our ps and qs are in order */
+ if (faila > failb) {
+ int tmp = failb;
+ failb = faila;
+ faila = tmp;
+ }
+
+ /* if the q stripe is failed, do a pstripe reconstruction
+ * from the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want
+ */
+ if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
+
+ if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->bbio->num_stripes,
+ PAGE_SIZE, faila, pointers);
+ } else {
+ raid6_2data_recov(rbio->bbio->num_stripes,
+ PAGE_SIZE, faila, failb,
+ pointers);
+ }
+ } else {
+ void *p;
+
+ /* rebuild from P stripe here (raid5 or raid6) */
+ BUG_ON(failb != -1);
+pstripe:
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila],
+ pointers[rbio->nr_data],
+ PAGE_CACHE_SIZE);
+
+ /* rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+ pointers[stripe] = pointers[stripe + 1];
+ pointers[rbio->nr_data - 1] = p;
+
+ /* xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ }
+ /* if we're doing this rebuild as part of an rmw, go through
+ * and set all of our private rbio pages in the
+ * failed stripes as uptodate. This way finish_rmw will
+ * know they can be trusted. If this was a read reconstruction,
+ * other endio functions will fiddle the uptodate bits
+ */
+ if (!rbio->read_rebuild) {
+ for (i = 0; i < nr_pages; i++) {
+ if (faila != -1) {
+ page = rbio_stripe_page(rbio, faila, i);
+ SetPageUptodate(page);
+ }
+ if (failb != -1) {
+ page = rbio_stripe_page(rbio, failb, i);
+ SetPageUptodate(page);
+ }
+ }
+ }
+ for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->read_rebuild &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ kunmap(page);
+ }
+ }
+
+ err = 0;
+cleanup:
+ kfree(pointers);
+
+cleanup_io:
+
+ if (rbio->read_rebuild) {
+ if (err == 0)
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ rbio_orig_end_io(rbio, err, err == 0);
+ } else if (err == 0) {
+ rbio->faila = -1;
+ rbio->failb = -1;
+ finish_rmw(rbio);
+ } else {
+ rbio_orig_end_io(rbio, err, 0);
+ }
+}
+
+/*
+ * This is called only for stripes we've read from disk to
+ * reconstruct the parity.
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ /*
+ * we only read stripe pages off the disk, set them
+ * up to date if there were no errors
+ */
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ rbio_orig_end_io(rbio, -EIO, 0);
+ else
+ __raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct btrfs_bio *bbio = rbio->bbio;
+ struct bio_list bio_list;
+ int ret;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ atomic_set(&rbio->bbio->error, 0);
+
+ /*
+ * read everything that hasn't failed. Thanks to the
+ * stripe cache, it is possible that some or all of these
+ * pages are going to be uptodate.
+ */
+ for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+ if (rbio->faila == stripe ||
+ rbio->failb == stripe)
+ continue;
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ struct page *p;
+
+ /*
+ * the rmw code may have already read this
+ * page in
+ */
+ p = rbio_stripe_page(rbio, stripe, pagenr);
+ if (PageUptodate(p))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list,
+ rbio_stripe_page(rbio, stripe, pagenr),
+ stripe, pagenr, rbio->stripe_len);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * we might have no bios to read just because the pages
+ * were up to date, or we might have no bios to read because
+ * the devices were gone.
+ */
+ if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+ __raid_recover_end_io(rbio);
+ goto out;
+ } else {
+ goto cleanup;
+ }
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&bbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_recover_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+out:
+ return 0;
+
+cleanup:
+ if (rbio->read_rebuild)
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers. This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, int mirror_num)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ if (IS_ERR(rbio)) {
+ return PTR_ERR(rbio);
+ }
+
+ rbio->read_rebuild = 1;
+ bio_list_add(&rbio->bio_list, bio);
+ rbio->bio_list_bytes = bio->bi_size;
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ BUG();
+ kfree(rbio);
+ return -EIO;
+ }
+
+ /*
+ * reconstruct from the q stripe if they are
+ * asking for mirror 3
+ */
+ if (mirror_num == 3)
+ rbio->failb = bbio->num_stripes - 2;
+
+ ret = lock_stripe_add(rbio);
+
+ /*
+ * __raid56_parity_recover will end the bio with
+ * any errors it hits. We don't want to return
+ * its error value up the stack because our caller
+ * will end up calling bio_endio with any nonzero
+ * return
+ */
+ if (ret == 0)
+ __raid56_parity_recover(rbio);
+ /*
+ * our rbio has been added to the list of
+ * rbios that will be handled after the
+ * currently lock owner is done
+ */
+ return 0;
+
+}
+
+static void rmw_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 00000000000..ea5d73bfdfb
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_RAID56__
+#define __BTRFS_RAID56__
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 1;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+ else
+ return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+ return map->num_stripes - nr_parity_stripes(map);
+}
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
+ ((x) == RAID6_Q_STRIPE))
+
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, int mirror_num);
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 17c306bf177..50695dc5e2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
}
}
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 67783e03d12..53c3501fa4c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "raid56.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_device *extent_dev;
int extent_mirror_num;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ if (num >= nr_data_stripes(map)) {
+ return 0;
+ }
+ }
+
nstripes = length;
offset = 0;
do_div(nstripes, map->stripe_len);
@@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_root *root = sctx->dev_root;
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EIO;
gen = root->fs_info->last_trans_committed;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f4ab7a9260e..f7a8b861058 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -85,6 +85,7 @@ struct send_ctx {
u32 send_max_size;
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
struct vfsmount *mnt;
@@ -3709,6 +3710,39 @@ out:
return ret;
}
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+ u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+ p = fs_path_alloc(sctx);
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(sctx, p);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
goto out;
}
- if (!clone_root) {
+ if (clone_root) {
+ ret = send_clone(sctx, offset, len, clone_root);
+ } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ ret = send_update_extent(sctx, offset, len);
+ } else {
while (pos < len) {
l = len - pos;
if (l > BTRFS_SEND_READ_SIZE)
@@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
pos += ret;
}
ret = 0;
- } else {
- ret = send_clone(sctx, offset, len, clone_root);
}
-
out:
return ret;
}
@@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
struct btrfs_fs_info *fs_info;
struct btrfs_ioctl_send_args *arg = NULL;
struct btrfs_key key;
- struct file *filp = NULL;
struct send_ctx *sctx = NULL;
u32 i;
u64 *clone_sources_tmp = NULL;
@@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
+ if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ ret = -EINVAL;
+ goto out;
+ }
+
sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
if (!sctx) {
ret = -ENOMEM;
@@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
INIT_LIST_HEAD(&sctx->name_cache_list);
+ sctx->flags = arg->flags;
+
sctx->send_filp = fget(arg->send_fd);
if (IS_ERR(sctx->send_filp)) {
ret = PTR_ERR(sctx->send_filp);
@@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
out:
- if (filp)
- fput(filp);
kfree(arg);
vfree(clone_sources_tmp);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 1bf4f32fd4e..8bb18f7ccaa 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_UTIMES,
BTRFS_SEND_C_END,
+ BTRFS_SEND_C_UPDATE_EXTENT,
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d8982e9601d..68a29a1ea06 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,13 +41,13 @@
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
+#include <linux/btrfs.h>
#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "xattr.h"
#include "volumes.h"
@@ -63,8 +63,7 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
-static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
- char nbuf[16])
+static const char *btrfs_decode_error(int errno, char nbuf[16])
{
char *errstr = NULL;
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
* today we only save the error info into ram. Long term we'll
* also send it down to the disk
*/
- fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
}
static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (sb->s_flags & MS_RDONLY)
return;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
sb->s_flags |= MS_RDONLY;
printk(KERN_INFO "btrfs is forced readonly\n");
/*
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
struct super_block *sb = fs_info->sb;
char nbuf[16];
const char *errstr;
- va_list args;
- va_start(args, fmt);
/*
* Special case: if the error is EROFS, and we're already
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ errstr = btrfs_decode_error(errno, nbuf);
if (fmt) {
- struct va_format vaf = {
- .fmt = fmt,
- .va = &args,
- };
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
sb->s_id, function, line, errstr, &vaf);
+ va_end(args);
} else {
printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
save_error_info(fs_info);
btrfs_handle_error(fs_info);
}
- va_end(args);
}
static const char * const logtypes[] = {
@@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
char nbuf[16];
const char *errstr;
- errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+ errstr = btrfs_decode_error(errno, nbuf);
btrfs_printk(root->fs_info,
"%s:%d: Aborting unused transaction(%s).\n",
function, line, errstr);
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
- if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
+ errstr = btrfs_decode_error(errno, nbuf);
+ if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
s_id, function, line, &vaf, errstr);
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
+ /* Fallthrough */
case Opt_compress:
case Opt_compress_type:
if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_alloc_start:
num = match_strdup(&args[0]);
if (num) {
+ mutex_lock(&info->chunk_mutex);
info->alloc_start = memparse(num, NULL);
+ mutex_unlock(&info->chunk_mutex);
kfree(num);
printk(KERN_INFO
"btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
btrfs_wait_ordered_extents(root, 0);
- trans = btrfs_attach_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT)
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
new_pool_size);
}
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts, int flags)
+{
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (flags & MS_RDONLY))) {
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+ if (flags & MS_RDONLY)
+ sync_filesystem(fs_info->sb);
+ }
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts)
+{
+ /*
+ * We need cleanup all defragable inodes if the autodefragment is
+ * close or the fs is R/O.
+ */
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (fs_info->sb->s_flags & MS_RDONLY))) {
+ btrfs_cleanup_defrag_inodes(fs_info);
+ }
+
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
+ btrfs_remount_prepare(fs_info, old_opts, *flags);
+
ret = btrfs_parse_options(root, data);
if (ret) {
ret = -EINVAL;
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
fs_info->thread_pool_size, old_thread_pool_size);
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
- return 0;
+ goto out;
if (*flags & MS_RDONLY) {
/*
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
sb->s_flags &= ~MS_RDONLY;
}
-
+out:
+ btrfs_remount_cleanup(fs_info, old_opts);
return 0;
restore:
@@ -1289,10 +1326,13 @@ restore:
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
+ mutex_lock(&fs_info->chunk_mutex);
fs_info->alloc_start = old_alloc_start;
+ mutex_unlock(&fs_info->chunk_mutex);
btrfs_resize_thread_pool(fs_info,
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
+ btrfs_remount_cleanup(fs_info, old_opts);
return ret;
}
@@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_sb(sb)->tree_root;
- trans = btrfs_attach_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_inode;
- err = btrfs_interface_init();
+ err = btrfs_delayed_ref_init();
if (err)
goto free_auto_defrag;
+ err = btrfs_interface_init();
+ if (err)
+ goto free_delayed_ref;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
@@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_delayed_ref:
+ btrfs_delayed_ref_exit();
free_auto_defrag:
btrfs_auto_defrag_exit();
free_delayed_inode:
@@ -1720,6 +1766,7 @@ free_compress:
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
+ btrfs_delayed_ref_exit();
btrfs_auto_defrag_exit();
btrfs_delayed_inode_exit();
ordered_data_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index daac9ae6d73..5b326cd60a4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,7 +21,6 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
-#include <linux/module.h>
#include <linux/kobject.h>
#include "ctree.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4c0067c4f76..e52da6fb116 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(transaction->delayed_refs.root.rb_node);
- memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)
root->commit_root = btrfs_root_node(root);
}
+static inline int can_join_transaction(struct btrfs_transaction *trans,
+ int type)
+{
+ return !(trans->in_commit &&
+ type != TRANS_JOIN &&
+ type != TRANS_JOIN_NOLOCK);
+}
+
/*
* either allocate a new transaction or hop into the existing one
*/
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
@@ -86,6 +93,10 @@ loop:
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
+ if (!can_join_transaction(cur_trans, type)) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EBUSY;
+ }
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
@@ -113,7 +124,7 @@ loop:
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
goto loop;
- } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
return -EROFS;
@@ -155,8 +166,12 @@ loop:
spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
+ atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
+ atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
+ init_waitqueue_head(&cur_trans->delayed_refs.wait);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ INIT_LIST_HEAD(&cur_trans->ordered_operations);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
int ret;
u64 qgroup_reserved = 0;
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -359,8 +374,11 @@ again:
do {
ret = join_transaction(root, type);
- if (ret == -EBUSY)
+ if (ret == -EBUSY) {
wait_current_trans(root);
+ if (unlikely(type == TRANS_ATTACH))
+ ret = -ENOENT;
+ }
} while (ret == -EBUSY);
if (ret < 0) {
@@ -382,9 +400,10 @@ again:
h->block_rsv = NULL;
h->orig_rsv = NULL;
h->aborted = 0;
- h->qgroup_reserved = qgroup_reserved;
+ h->qgroup_reserved = 0;
h->delayed_ref_elem.seq = 0;
h->type = type;
+ h->allocating_chunk = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
@@ -400,6 +419,7 @@ again:
h->block_rsv = &root->fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
}
+ h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
@@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
return start_transaction(root, 0, TRANS_USERSPACE, 0);
}
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current the transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function return -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in the memory, not fully on disk. If you hope there is no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ * btrfs_attach_transaction_barrier()
+ */
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_ATTACH, 0);
}
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is similar to the above function, the differentia is this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+ btrfs_wait_for_commit(root, 0);
+
+ return trans;
+}
+
/* wait for a transaction commit to be fully complete */
static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
@@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- while (count < 2) {
+ while (count < 1) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (cur &&
@@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
count++;
}
+
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_run_delayed_iputs(root);
if (trans->aborted ||
- root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
err = -EIO;
- }
assert_qgroups_uptodate(trans);
- memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return err;
}
@@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
+ blk_finish_plug(&plug);
return werr;
}
@@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
}
/*
- * defrag a given btree. If cacheonly == 1, this won't read from the disk,
- * otherwise every leaf in the btree is read and defragged.
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
*/
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+int btrfs_defrag_root(struct btrfs_root *root)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_defrag_leaves(trans, root, cacheonly);
+ ret = btrfs_defrag_leaves(trans, root);
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(info->tree_root);
@@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
break;
+
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
}
root->defrag_running = 0;
return ret;
@@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct inode *parent_inode;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
- struct dentry *parent;
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = pending->error = -ENOMEM;
- goto path_alloc_fail;
+ return ret;
}
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
@@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
+ trans->bytes_reserved = trans->block_rsv->reserved;
dentry = pending->dentry;
- parent = dget_parent(dentry);
- parent_inode = parent->d_inode;
+ parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
@@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret)
btrfs_abort_transaction(trans, root, ret);
fail:
- dput(parent);
trans->block_rsv = rsv;
+ trans->bytes_reserved = 0;
no_free_objectid:
kfree(new_root_item);
root_item_alloc_fail:
btrfs_free_path(path);
-path_alloc_fail:
- btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
return ret;
}
@@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_async_commit {
struct btrfs_trans_handle *newtrans;
struct btrfs_root *root;
- struct delayed_work work;
+ struct work_struct work;
};
static void do_async_commit(struct work_struct *work)
{
struct btrfs_async_commit *ac =
- container_of(work, struct btrfs_async_commit, work.work);
+ container_of(work, struct btrfs_async_commit, work);
/*
* We've got freeze protection passed with the transaction.
@@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
if (!ac)
return -ENOMEM;
- INIT_DELAYED_WORK(&ac->work, do_async_commit);
+ INIT_WORK(&ac->work, do_async_commit);
ac->root = root;
ac->newtrans = btrfs_join_transaction(root);
if (IS_ERR(ac->newtrans)) {
@@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1, _THIS_IP_);
- schedule_delayed_work(&ac->work, 0);
+ schedule_work(&ac->work);
/* wait for transaction to start and unblock */
if (wait_for_unblock)
@@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int err)
{
struct btrfs_transaction *cur_trans = trans->transaction;
+ DEFINE_WAIT(wait);
WARN_ON(trans->use_count > 1);
@@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->trans_lock);
list_del_init(&cur_trans->list);
if (cur_trans == root->fs_info->running_transaction) {
+ root->fs_info->trans_no_join = 1;
+ spin_unlock(&root->fs_info->trans_lock);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ spin_lock(&root->fs_info->trans_lock);
root->fs_info->running_transaction = NULL;
- root->fs_info->trans_no_join = 0;
}
spin_unlock(&root->fs_info->trans_lock);
@@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
}
if (flush_on_commit || snap_pending) {
- btrfs_start_delalloc_inodes(root, 1);
+ ret = btrfs_start_delalloc_inodes(root, 1);
+ if (ret)
+ return ret;
btrfs_wait_ordered_extents(root, 1);
}
@@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
* it here and no for sure that nothing new will be added
* to the list
*/
- btrfs_run_ordered_operations(root, 1);
+ ret = btrfs_run_ordered_operations(trans, root, 1);
- return 0;
+ return ret;
}
/*
@@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
int should_grow = 0;
unsigned long now = get_seconds();
- ret = btrfs_run_ordered_operations(root, 0);
+ ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
- goto cleanup_transaction;
+ btrfs_end_transaction(trans, root);
+ return ret;
}
/* Stop the commit early if ->aborted is set */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- goto cleanup_transaction;
+ btrfs_end_transaction(trans, root);
+ return ret;
}
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
- if (ret)
- goto cleanup_transaction;
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
cur_trans = trans->transaction;
@@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_create_pending_block_groups(trans, root);
ret = btrfs_run_delayed_refs(trans, root, 0);
- if (ret)
- goto cleanup_transaction;
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
spin_lock(&cur_trans->commit_lock);
if (cur_trans->in_commit) {
@@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
// WARN_ON(1);
if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0e8aa1e6c28..3c8e0d25c8e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
+ struct list_head ordered_operations;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
@@ -68,6 +69,7 @@ struct btrfs_trans_handle {
struct btrfs_block_rsv *orig_rsv;
short aborted;
short adding_csums;
+ bool allocating_chunk;
enum btrfs_trans_type type;
/*
* this root is only needed to validate that the root passed to
@@ -82,11 +84,13 @@ struct btrfs_trans_handle {
struct btrfs_pending_snapshot {
struct dentry *dentry;
+ struct inode *dir;
struct btrfs_root *root;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
+ u64 qgroup_reserved;
/* extra metadata reseration for relocation */
int error;
bool readonly;
@@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+ struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_old_snapshots(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3b580ee8ab1..94e05c1f118 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,13 +23,14 @@
#include "transaction.h"
#include "locking.h"
-/* defrag all the leaves in a given btree. If cache_only == 1, don't read
- * things from disk, otherwise read all the leaves and try to get key order to
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
* better reflect disk order
*/
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int cache_only)
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u64 last_ret = 0;
u64 min_trans = 0;
- if (cache_only)
- goto out;
-
if (root->fs_info->extent_root == root) {
/*
* there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
path->keep_locks = 1;
- if (cache_only)
- min_trans = root->defrag_trans_start;
- ret = btrfs_search_forward(root, &key, NULL, path,
- cache_only, min_trans);
+ ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
min_trans);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
- cache_only, &last_ret,
+ &last_ret,
&root->defrag_progress);
if (ret) {
WARN_ON(ret == -EAGAIN);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9027bb1e746..c7ef569eb22 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_pin_extent_for_log_replay(wc->trans,
- log->fs_info->extent_root,
+ btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
eb->start, eb->len);
if (btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *key)
{
int found_type;
- u64 mask = root->sectorsize - 1;
u64 extent_end;
u64 start = key->offset;
u64 saved_nbytes;
@@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = start + btrfs_file_extent_num_bytes(eb, item);
else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_inline_len(eb, item);
- extent_end = (start + size + mask) & ~mask;
+ extent_end = ALIGN(start + size, root->sectorsize);
} else {
ret = 0;
goto out;
@@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
unsigned long log_transid = 0;
mutex_lock(&root->log_mutex);
+ log_transid = root->log_transid;
index1 = root->log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
ret = -EAGAIN;
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
- log_transid = root->log_transid;
if (log_transid % 2 == 0)
mark = EXTENT_DIRTY;
else
@@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out;
@@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = 0;
goto out;
@@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_logged_extents(log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- ret = walk_log_tree(trans, log, &wc);
- BUG_ON(ret);
+ if (trans) {
+ ret = walk_log_tree(trans, log, &wc);
+ BUG_ON(ret);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
+ /*
+ * We may have short-circuited the log tree with the full commit logic
+ * and left ordered extents on our list, so clear these out to keep us
+ * from leaking inodes and memory.
+ */
+ btrfs_free_logged_extents(log, 0);
+ btrfs_free_logged_extents(log, 1);
+
free_extent_buffer(log->node);
kfree(log);
}
@@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, trans->transid);
+ path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
@@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct btrfs_ordered_extent *ordered;
struct list_head ordered_sums;
struct btrfs_map_token token;
struct btrfs_key key;
- u64 csum_offset = em->mod_start - em->start;
- u64 csum_len = em->mod_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
+ u64 csum_offset;
+ u64 csum_len;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
+ int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+insert:
INIT_LIST_HEAD(&ordered_sums);
btrfs_init_map_token(&token);
key.objectid = btrfs_ino(inode);
@@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+
+ /*
+ * If we are overwriting an inline extent with a real one then we need
+ * to just delete the inline extent as it may not be large enough to
+ * have the entire file_extent_item.
+ */
+ if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ ret = btrfs_del_item(trans, log, path);
+ btrfs_release_path(path);
+ if (ret) {
+ path->really_keep_locks = 0;
+ return ret;
+ }
+ goto insert;
+ }
+
btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
&token);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
csum_len = block_len;
}
+ /*
+ * First check and see if our csums are on our outstanding ordered
+ * extents.
+ */
+again:
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ struct btrfs_ordered_sum *sum;
+
+ if (!mod_len)
+ break;
+
+ if (ordered->inode != inode)
+ continue;
+
+ if (ordered->file_offset + ordered->len <= mod_start ||
+ mod_start + mod_len <= ordered->file_offset)
+ continue;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this
+ * ordered extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered->file_offset + ordered->len >=
+ mod_start + mod_len)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered->file_offset + ordered->len <
+ mod_start + mod_len) {
+ mod_len = (mod_start + mod_len) -
+ (ordered->file_offset + ordered->len);
+ mod_start = ordered->file_offset +
+ ordered->len;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+ &ordered->flags))
+ continue;
+ atomic_inc(&ordered->refs);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ /*
+ * we've dropped the lock, we must either break or
+ * start over after this.
+ */
+
+ wait_event(ordered->wait, ordered->csum_bytes_left == 0);
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ ret = btrfs_csum_file_blocks(trans, log, sum);
+ if (ret) {
+ btrfs_put_ordered_extent(ordered);
+ goto unlocked;
+ }
+ }
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+unlocked:
+
+ if (!mod_len || ret)
+ return ret;
+
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+
/* block start is already adjusted for the file extent offset. */
ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
em->block_start + csum_offset,
@@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
u64 test_gen;
int ret = 0;
+ int num = 0;
INIT_LIST_HEAD(&extents);
@@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
+
+ /*
+ * Just an arbitrary number, this can be really CPU intensive
+ * once we start getting a lot of extents, and really once we
+ * have a bunch of extents we just want to commit since it will
+ * be faster.
+ */
+ if (++num > 32768) {
+ list_del_init(&tree->modified_extents);
+ ret = -EFBIG;
+ goto process;
+ }
+
if (em->generation <= test_gen)
continue;
/* Need a ref to keep it from getting evicted from cache */
atomic_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
list_add_tail(&em->list, &extents);
+ num++;
}
list_sort(NULL, &extents, extent_cmp);
+process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
+ btrfs_get_logged_extents(log, inode);
+
/*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
@@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
while (1) {
ins_nr = 0;
ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, trans->transid);
+ path, trans->transid);
if (ret != 0)
break;
again:
@@ -3656,6 +3797,8 @@ log_extents:
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
+ if (err)
+ btrfs_free_logged_extents(log, log->log_transid);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- WARN_ON(ret != -ENOSPC);
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 1;
}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 99be4c138db..ddc61cad008 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,7 +5,7 @@
*/
#include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include "ulist.h"
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cbb7f4b167..35bb2d4ed29 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
@@ -32,6 +34,7 @@
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
+#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
new_device->can_discard = 0;
+ spin_lock_init(&new_device->io_lock);
list_replace_rcu(&device->dev_list, &new_device->dev_list);
call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
struct block_device *bdev;
- struct buffer_head *bh;
- int ret;
+ struct page *page;
+ void *p;
+ int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
+ u64 bytenr;
+ pgoff_t index;
+ /*
+ * we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
mutex_lock(&uuid_mutex);
- ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
- if (ret)
+
+ bdev = blkdev_get_by_path(path, flags, holder);
+
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
goto error;
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ }
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+ goto error_bdev_put;
+
+ /* make sure our super fits in the page */
+ if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+ goto error_bdev_put;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_CACHE_SHIFT;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+ goto error_bdev_put;
+
+ /* pull in the page with our super */
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_NOFS);
+
+ if (IS_ERR_OR_NULL(page))
+ goto error_bdev_put;
+
+ p = kmap(page);
+
+ /* align our pointer to the offset of the super block */
+ disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+ goto error_unmap;
+
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
+
if (disk_super->label[0]) {
if (disk_super->label[BTRFS_LABEL_SIZE - 1])
disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
} else {
printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
}
+
printk(KERN_CONT "devid %llu transid %llu %s\n",
(unsigned long long)devid, (unsigned long long)transid, path);
+
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
- brelse(bh);
+
+error_unmap:
+ kunmap(page);
+ page_cache_release(page);
+
+error_bdev_put:
blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
u64 devid;
u64 num_devices;
u8 *dev_uuid;
+ unsigned seq;
int ret = 0;
bool clear_super = false;
mutex_lock(&uuid_mutex);
- all_avail = root->fs_info->avail_data_alloc_bits |
- root->fs_info->avail_system_alloc_bits |
- root->fs_info->avail_metadata_alloc_bits;
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ all_avail = root->fs_info->avail_data_alloc_bits |
+ root->fs_info->avail_system_alloc_bits |
+ root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace);
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto out;
}
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+ root->fs_info->fs_devices->rw_devices <= 2) {
+ printk(KERN_ERR "btrfs: unable to go below two "
+ "devices on raid5\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+ root->fs_info->fs_devices->rw_devices <= 3) {
+ printk(KERN_ERR "btrfs: unable to go below three "
+ "devices on raid6\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
@@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
chunk_used = btrfs_block_group_used(&cache->item);
if (bargs->usage == 0)
- user_thresh = 0;
+ user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
else
@@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
return 0;
if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
- factor = 2;
- else
- factor = 1;
- factor = num_stripes / factor;
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+ factor = num_stripes / 2;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+ factor = num_stripes - 1;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+ factor = num_stripes - 2;
+ } else {
+ factor = num_stripes;
+ }
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
@@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
int mixed = 0;
int ret;
u64 num_devices;
+ unsigned seq;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
else
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10);
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6);
if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10;
- if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (fs_info->avail_system_alloc_bits & allowed) &&
- !(bctl->sys.target & allowed)) ||
- ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (fs_info->avail_metadata_alloc_bits & allowed) &&
- !(bctl->meta.target & allowed))) {
- if (bctl->flags & BTRFS_BALANCE_FORCE) {
- printk(KERN_INFO "btrfs: force reducing metadata "
- "integrity\n");
- } else {
- printk(KERN_ERR "btrfs: balance will reduce metadata "
- "integrity, use force if you want this\n");
- ret = -EINVAL;
- goto out;
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6;
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ printk(KERN_INFO "btrfs: force reducing metadata "
+ "integrity\n");
+ } else {
+ printk(KERN_ERR "btrfs: balance will reduce metadata "
+ "integrity, use force if you want this\n");
+ ret = -EINVAL;
+ goto out;
+ }
}
- }
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
int num_tolerated_disk_barrier_failures;
@@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
mutex_lock(&fs_info->balance_mutex);
atomic_dec(&fs_info->balance_running);
- if (bargs) {
- memset(bargs, 0, sizeof(*bargs));
- update_ioctl_balance_args(fs_info, 0, bargs);
- }
-
- if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
- balance_need_close(fs_info)) {
- __cancel_balance(fs_info);
- }
-
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
}
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
wake_up(&fs_info->balance_wait_q);
return ret;
@@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
}
struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
- { 2, 1, 0, 4, 2, 2 /* raid10 */ },
- { 1, 1, 2, 2, 2, 2 /* raid1 */ },
- { 1, 2, 1, 1, 1, 2 /* dup */ },
- { 1, 1, 0, 2, 1, 1 /* raid0 */ },
- { 1, 1, 1, 1, 1, 1 /* single */ },
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
};
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+ /* TODO allow them to set a preferred stripe size */
+ return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ u64 features;
+
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ return;
+
+ features = btrfs_super_incompat_flags(info->super_copy);
+ if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+ return;
+
+ features |= BTRFS_FEATURE_INCOMPAT_RAID56;
+ btrfs_set_super_incompat_flags(info->super_copy, features);
+ printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+}
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
struct map_lookup **map_ret,
@@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_device_info *devices_info = NULL;
u64 total_avail;
int num_stripes; /* total number of stripes to allocate */
+ int data_stripes; /* number of stripes that count for
+ block group size */
int sub_stripes; /* sub_stripes info for map */
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
@@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 max_chunk_size;
u64 stripe_size;
u64 num_bytes;
+ u64 raid_stripe_len = BTRFS_STRIPE_LEN;
int ndevs;
int i;
int j;
@@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
continue;
+ if (ndevs == fs_devices->rw_devices) {
+ WARN(1, "%s: found more than %llu devices\n",
+ __func__, fs_devices->rw_devices);
+ break;
+ }
devices_info[ndevs].dev_offset = dev_offset;
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
++ndevs;
- WARN_ON(ndevs > fs_devices->rw_devices);
}
/*
@@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = devices_info[ndevs-1].max_avail;
num_stripes = ndevs * dev_stripes;
- if (stripe_size * ndevs > max_chunk_size * ncopies) {
- stripe_size = max_chunk_size * ncopies;
- do_div(stripe_size, ndevs);
+ /*
+ * this will have to be fixed for RAID1 and RAID10 over
+ * more drives
+ */
+ data_stripes = num_stripes / ncopies;
+
+ if (type & BTRFS_BLOCK_GROUP_RAID5) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 1;
+ }
+ if (type & BTRFS_BLOCK_GROUP_RAID6) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 2;
+ }
+
+ /*
+ * Use the number of data stripes to figure out how big this chunk
+ * is really going to be in terms of logical address space,
+ * and compare that answer with the max chunk size
+ */
+ if (stripe_size * data_stripes > max_chunk_size) {
+ u64 mask = (1ULL << 24) - 1;
+ stripe_size = max_chunk_size;
+ do_div(stripe_size, data_stripes);
+
+ /* bump the answer up to a 16MB boundary */
+ stripe_size = (stripe_size + mask) & ~mask;
+
+ /* but don't go higher than the limits we found
+ * while searching for free extents
+ */
+ if (stripe_size > devices_info[ndevs-1].max_avail)
+ stripe_size = devices_info[ndevs-1].max_avail;
}
do_div(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- do_div(stripe_size, BTRFS_STRIPE_LEN);
- stripe_size *= BTRFS_STRIPE_LEN;
+ do_div(stripe_size, raid_stripe_len);
+ stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
@@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = BTRFS_STRIPE_LEN;
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
+ map->stripe_len = raid_stripe_len;
+ map->io_align = raid_stripe_len;
+ map->io_width = raid_stripe_len;
map->type = type;
map->sub_stripes = sub_stripes;
*map_ret = map;
- num_bytes = stripe_size * (num_stripes / ncopies);
+ num_bytes = stripe_size * data_stripes;
*stripe_size_out = stripe_size;
*num_bytes_out = num_bytes;
@@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- free_extent_map(em);
- if (ret)
- goto error;
-
- ret = btrfs_make_block_group(trans, extent_root, 0, type,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, num_bytes);
- if (ret)
+ if (ret) {
+ free_extent_map(em);
goto error;
+ }
for (i = 0; i < map->num_stripes; ++i) {
struct btrfs_device *device;
@@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
info->chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, dev_offset, stripe_size);
- if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
- goto error;
- }
+ if (ret)
+ goto error_dev_extent;
+ }
+
+ ret = btrfs_make_block_group(trans, extent_root, 0, type,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, num_bytes);
+ if (ret) {
+ i = map->num_stripes - 1;
+ goto error_dev_extent;
}
+ free_extent_map(em);
+ check_raid56_incompat_flag(extent_root->fs_info, type);
+
kfree(devices_info);
return 0;
+error_dev_extent:
+ for (; i >= 0; i--) {
+ struct btrfs_device *device;
+ int err;
+
+ device = map->stripes[i].dev;
+ err = btrfs_free_dev_extent(trans, device, start);
+ if (err) {
+ btrfs_abort_transaction(trans, extent_root, err);
+ break;
+ }
+ }
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* One for our allocation */
+ free_extent_map(em);
+ /* One for the tree reference */
+ free_extent_map(em);
error:
kfree(map);
kfree(devices_info);
@@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- fs_info->avail_metadata_alloc_bits;
- alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+ alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
&stripe_size, chunk_offset, alloc_profile);
if (ret)
@@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
sys_chunk_offset = chunk_offset + chunk_size;
- alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- fs_info->avail_system_alloc_bits;
- alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+ alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
&sys_chunk_size, &sys_stripe_size,
sys_chunk_offset, alloc_profile);
@@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ ret = 3;
else
ret = 1;
free_extent_map(em);
@@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return ret;
}
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ unsigned long len = root->sectorsize;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ len = map->stripe_len * nr_data_stripes(map);
+ }
+ free_extent_map(em);
+ return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
+ ret = 1;
+ free_extent_map(em);
+ return ret;
+}
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first, int num,
int optimal, int dev_replace_is_ongoing)
@@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return optimal;
}
+static inline int parity_smaller(u64 a, u64 b)
+{
+ return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+ struct btrfs_bio_stripe s;
+ int i;
+ u64 l;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < bbio->num_stripes - 1; i++) {
+ if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ s = bbio->stripes[i];
+ l = raid_map[i];
+ bbio->stripes[i] = bbio->stripes[i+1];
+ raid_map[i] = raid_map[i+1];
+ bbio->stripes[i+1] = s;
+ raid_map[i+1] = l;
+ again = 1;
+ }
+ }
+ }
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
- int mirror_num)
+ int mirror_num, u64 **raid_map_ret)
{
struct extent_map *em;
struct map_lookup *map;
@@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr;
u64 stripe_nr_orig;
u64 stripe_nr_end;
+ u64 stripe_len;
+ u64 *raid_map = NULL;
int stripe_index;
int i;
int ret = 0;
@@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
int num_alloc_stripes;
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
+ u64 raid56_full_stripe_start = (u64)-1;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
+ if (mirror_num > map->num_stripes)
+ mirror_num = 0;
+
+ stripe_len = map->stripe_len;
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, stripe_len);
- stripe_offset = stripe_nr * map->stripe_len;
+ stripe_offset = stripe_nr * stripe_len;
BUG_ON(offset < stripe_offset);
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
- if (rw & REQ_DISCARD)
+ /* if we're here for raid56, we need to know the stripe aligned start */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+ raid56_full_stripe_start = offset;
+
+ /* allow a write of a full stripe, but make sure we don't
+ * allow straddling of stripes
+ */
+ do_div(raid56_full_stripe_start, full_stripe_len);
+ raid56_full_stripe_start *= full_stripe_len;
+ }
+
+ if (rw & REQ_DISCARD) {
+ /* we don't discard raid56 yet */
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
*length = min_t(u64, em->len - offset, *length);
- else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- /* we limit the length of each bio to what fits in a stripe */
- *length = min_t(u64, em->len - offset,
- map->stripe_len - stripe_offset);
+ } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ u64 max_len;
+ /* For writes to RAID[56], allow a full stripeset across all disks.
+ For other RAID types and for RAID[56] reads, just allow a single
+ stripe (on a single disk). */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ (rw & REQ_WRITE)) {
+ max_len = stripe_len * nr_data_stripes(map) -
+ (offset - raid56_full_stripe_start);
+ } else {
+ /* we limit the length of each bio to what fits in a stripe */
+ max_len = stripe_len - stripe_offset;
+ }
+ *length = min_t(u64, em->len - offset, max_len);
} else {
*length = em->len - offset;
}
+ /* This is for when we're called from btrfs_merge_bio_hook() and all
+ it cares about is the length */
if (!bbio_ret)
goto out;
@@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 physical_of_found = 0;
ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0);
+ logical, &tmp_length, &tmp_bbio, 0, NULL);
if (ret) {
WARN_ON(tmp_bbio != NULL);
goto out;
@@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = 1;
stripe_index = 0;
stripe_nr_orig = stripe_nr;
- stripe_nr_end = (offset + *length + map->stripe_len - 1) &
- (~(map->stripe_len - 1));
+ stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
do_div(stripe_nr_end, map->stripe_len);
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + *length);
+
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
@@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
+
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ u64 tmp;
+
+ if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+ && raid_map_ret) {
+ int i, rot;
+
+ /* push stripe_nr back to the start of the full stripe */
+ stripe_nr = raid56_full_stripe_start;
+ do_div(stripe_nr, stripe_len);
+
+ stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+ /* RAID[56] write or recovery. Return all stripes */
+ num_stripes = map->num_stripes;
+ max_errors = nr_parity_stripes(map);
+
+ raid_map = kmalloc(sizeof(u64) * num_stripes,
+ GFP_NOFS);
+ if (!raid_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Work out the disk rotation on this stripe-set */
+ tmp = stripe_nr;
+ rot = do_div(tmp, num_stripes);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
+
+ *length = map->stripe_len;
+ stripe_index = 0;
+ stripe_offset = 0;
+ } else {
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ if (mirror_num > 1)
+ stripe_index = nr_data_stripes(map) +
+ mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ tmp = stripe_nr + stripe_index;
+ stripe_index = do_div(tmp, map->num_stripes);
+ }
} else {
/*
* after this do_div call, stripe_nr is the number of stripes
@@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_DUP)) {
max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
}
}
@@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
+ if (raid_map) {
+ sort_parity_stripes(bbio, raid_map);
+ *raid_map_ret = raid_map;
+ }
out:
if (dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
@@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num);
+ mirror_num, NULL);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 bytenr;
u64 length;
u64 stripe_nr;
+ u64 rmap_len;
int i, j, nr = 0;
read_lock(&em_tree->lock);
@@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map = (struct map_lookup *)em->bdev;
length = em->len;
+ rmap_len = map->stripe_len;
+
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
do_div(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
do_div(length, map->num_stripes);
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ do_div(length, nr_data_stripes(map));
+ rmap_len = map->stripe_len * nr_data_stripes(map);
+ }
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
- }
- bytenr = chunk_start + stripe_nr * map->stripe_len;
+ } /* else if RAID[56], multiply by nr_data_stripes().
+ * Alternatively, just use rmap_len below instead of
+ * map->stripe_len */
+
+ bytenr = chunk_start + stripe_nr * rmap_len;
WARN_ON(nr >= map->num_stripes);
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
@@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
*logical = buf;
*naddrs = nr;
- *stripe_len = map->stripe_len;
+ *stripe_len = rmap_len;
free_extent_map(em);
return 0;
@@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio->bi_bdev = (struct block_device *)
(unsigned long)bbio->mirror_num;
/* only send an error to the higher layers if it is
- * beyond the tolerance of the multi-bio
+ * beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
err = -EIO;
@@ -4668,13 +5079,18 @@ struct async_sched {
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-static noinline void schedule_bio(struct btrfs_root *root,
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
struct btrfs_device *device,
int rw, struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
+ if (device->missing || !device->bdev) {
+ bio_endio(bio, -EIO);
+ return;
+ }
+
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
@@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
#endif
bio->bi_bdev = dev->bdev;
if (async)
- schedule_bio(root, dev, rw, bio);
+ btrfs_schedule_bio(root, dev, rw, bio);
else
btrfsic_submit_bio(rw, bio);
}
@@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_sector << 9;
u64 length = 0;
u64 map_length;
+ u64 *raid_map = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
@@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
length = bio->bi_size;
map_length = length;
- ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- mirror_num);
- if (ret)
+ ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+ mirror_num, &raid_map);
+ if (ret) /* -ENOMEM */
return ret;
total_devs = bbio->num_stripes;
+ bbio->orig_bio = first_bio;
+ bbio->private = first_bio->bi_private;
+ bbio->end_io = first_bio->bi_end_io;
+ atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+ if (raid_map) {
+ /* In this case, map_length has been set to the length of
+ a single stripe; not the whole write */
+ if (rw & WRITE) {
+ return raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
+ } else {
+ return raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
+ }
+ }
+
if (map_length < length) {
printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
"len %llu\n", (unsigned long long)logical,
@@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG();
}
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
while (dev_nr < total_devs) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac75..062d8604d35 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
#include <linux/bio.h>
#include <linux/sort.h>
+#include <linux/btrfs.h>
#include "async-thread.h"
-#include "ioctl.h"
#define BTRFS_STRIPE_LEN (64 * 1024)
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
int btrfs_scratch_superblock(struct btrfs_device *device);
-
+void btrfs_schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
diff --git a/fs/buffer.c b/fs/buffer.c
index 8e18281b407..b4dcb34c963 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
}
EXPORT_SYMBOL(init_buffer);
+inline void touch_buffer(struct buffer_head *bh)
+{
+ trace_block_touch_buffer(bh);
+ mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
+
static int sleep_on_buffer(void *word)
{
io_schedule();
@@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
{
WARN_ON_ONCE(!buffer_uptodate(bh));
+ trace_block_dirty_buffer(bh);
+
/*
* Very *carefully* optimize the it-is-already-dirty case.
*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d4f81edd9a5..a60ea977af6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- int rc, bytes;
+ int rc = req->r_result;
+ int bytes = le32_to_cpu(msg->hdr.data_len);
int i;
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- rc = le32_to_cpu(replyhead->result);
- bytes = le32_to_cpu(msg->hdr.data_len);
-
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
/* unlock all pages, zeroing any data we didn't read */
@@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, 0,
ci->i_truncate_seq, ci->i_truncate_size,
- NULL, false, 1, 0);
+ NULL, false, 0);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
&ci->i_layout, snapc,
page_off, len,
ci->i_truncate_seq, ci->i_truncate_size,
- &inode->i_mtime,
- &page, 1, 0, 0, true);
+ &inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
SetPageError(page);
@@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- struct ceph_osd_op *op;
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned wrote;
struct page *page;
int i;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- __s32 rc = -EIO;
- u64 bytes = 0;
+ int rc = req->r_result;
+ u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- op = (void *)(replyhead + 1);
- rc = le32_to_cpu(replyhead->result);
- bytes = le64_to_cpu(op->extent.length);
-
if (rc >= 0) {
/*
* Assume we wrote the pages we originally sent. The
@@ -741,8 +725,6 @@ retry:
struct page *page;
int want;
u64 offset, len;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
long writeback_stat;
next = 0;
@@ -838,7 +820,7 @@ get_more_pages:
snapc, do_sync,
ci->i_truncate_seq,
ci->i_truncate_size,
- &inode->i_mtime, true, 1, 0);
+ &inode->i_mtime, true, 0);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
@@ -906,10 +888,8 @@ get_more_pages:
/* revise final length, page count */
req->r_num_pages = locked_pages;
- reqhead = req->r_request->front.iov_base;
- op = (void *)(reqhead + 1);
- op->extent.length = cpu_to_le64(len);
- op->payload_len = cpu_to_le32(len);
+ req->r_request_ops[0].extent.length = cpu_to_le64(len);
+ req->r_request_ops[0].payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ae2be696eb5..78e2f575247 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ retry:
if (flags & CEPH_CAP_FLAG_AUTH)
ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap)
+ else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p to cap_dirty_migrating\n", inode);
+ list_move(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
- int file_wanted, used;
+ int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
@@ -1563,9 +1571,14 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ cap_used = used;
+ if (ci->i_auth_cap && cap != ci->i_auth_cap)
+ cap_used &= ~ci->i_auth_cap->issued;
+
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+ dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued),
+ ceph_cap_string(cap_used),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
@@ -1593,7 +1606,7 @@ retry_locked:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & used) == 0) {
+ if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
@@ -1670,8 +1683,8 @@ ack:
sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+ want, retain, flushing, NULL);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
dout("mds wanted %s -> %s\n",
ceph_cap_string(le32_to_cpu(grant->wanted)),
ceph_cap_string(wanted));
- grant->wanted = cpu_to_le32(wanted);
+ /* imported cap may not have correct mds_wanted */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+ check_caps = 1;
}
cap->seq = seq;
@@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
+ if (op == CEPH_CAP_OP_IMPORT)
+ ceph_add_cap_releases(mdsc, session);
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 11b57c2c8f1..bf338d9b67e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
+ if (err)
+ goto out_err;
+
err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = finish_no_open(file, dn);
} else {
dout("atomic_open finish_open on dn %p\n", dn);
+ if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+ *opened |= FILE_CREATED;
+ }
err = finish_open(file, dentry, ceph_open, opened);
}
@@ -535,7 +541,7 @@ more:
ci->i_snap_realm->cached_context,
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
- &mtime, false, 2, page_align);
+ &mtime, false, page_align);
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f5ed767806d..4a989345b37 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
&ceph_sb_to_client(inode->i_sb)->client->osdc;
u64 len = 1, olen;
u64 tmp;
- struct ceph_object_layout ol;
struct ceph_pg pgid;
int r;
@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EFAULT;
down_read(&osdc->map_sem);
- r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+ r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
if (r < 0)
@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+ ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
osdc->osdmap);
- pgid = ol.ol_pgid;
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7a3dfe0a9a8..442880d099c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -233,6 +233,30 @@ bad:
}
/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
+{
+ if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (*p == end) {
+ info->has_create_ino = false;
+ } else {
+ info->has_create_ino = true;
+ info->ino = ceph_decode_64(p);
+ }
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
- else
+ else if (info->head->op == CEPH_MDS_OP_READDIR)
return parse_reply_info_dir(p, end, info, features);
+ else if (info->head->op == CEPH_MDS_OP_CREATE)
+ return parse_reply_info_create(p, end, info, features);
+ else
+ return -EIO;
}
/*
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
- if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+ if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+ req->r_op == CEPH_MDS_OP_LSSNAP) &&
rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ff4188bf619..c2a19fbbe51 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in *dir_in;
u8 dir_complete, dir_end;
};
+
+ /* for create results */
+ struct {
+ bool has_create_ino;
+ u64 ino;
+ };
};
/* encoded blob describing snapshot contexts for certain
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a3..0d3c9240c61 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
return ERR_PTR(-ENOMEM);
ceph_decode_16_safe(p, end, version, bad);
+ if (version > 3) {
+ pr_warning("got mdsmap version %d > 3, failing", version);
+ goto bad;
+ }
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
m->m_num_data_pg_pools = n;
- m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+ m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
goto badmem;
- ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+ ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
- m->m_data_pg_pools[i] = ceph_decode_32(p);
- m->m_cas_pg_pool = ceph_decode_32(p);
+ m->m_data_pg_pools[i] = ceph_decode_64(p);
+ m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c80..89fa4a940a0 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
case CEPH_MDS_STATE_BOOT: return "up:boot";
case CEPH_MDS_STATE_STANDBY: return "up:standby";
case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
case CEPH_MDS_STATE_CREATING: return "up:creating";
case CEPH_MDS_STATE_STARTING: return "up:starting";
/* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUP: return "lookup";
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+ case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+ case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
case CEPH_MDS_OP_READDIR: return "readdir";
case CEPH_MDS_OP_MKNOD: return "mknod";
case CEPH_MDS_OP_LINK: return "link";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e86aa994812..9fe17c6c287 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
+ *
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
*/
buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
- buf->f_frsize = PAGE_CACHE_SIZE;
/* leave fsid little-endian, regardless of host endianness */
fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f053bbd1886..c7b309723dc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
+#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
- const char *data,
- loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
- char *data,
- loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be990..9b6b2b6dd16 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,9 +29,94 @@ struct ceph_vxattr {
size_t name_size; /* strlen(name) + 1 (for '\0') */
size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
size_t size);
- bool readonly;
+ bool readonly, hidden;
+ bool (*exists_cb)(struct ceph_inode_info *ci);
};
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+ size_t s;
+ char *p = (char *)&ci->i_layout;
+
+ for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+ if (*p)
+ return true;
+ return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ const char *pool_name;
+
+ dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ down_read(&osdc->map_sem);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name)
+ ret = snprintf(val, size,
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+ pool_name);
+ else
+ ret = snprintf(val, size,
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+ (unsigned long long)pool);
+
+ up_read(&osdc->map_sem);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ const char *pool_name;
+
+ down_read(&osdc->map_sem);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name)
+ ret = snprintf(val, size, "%s", pool_name);
+ else
+ ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+ up_read(&osdc->map_sem);
+ return ret;
+}
+
/* directories */
static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
(long)ci->i_rctime.tv_nsec);
}
-#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
-#define XATTR_NAME_CEPH(_type, _name) \
- { \
- .name = CEPH_XATTR_NAME(_type, _name), \
- .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
- .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
- .readonly = true, \
- }
+#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2) \
+ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
+
+#define XATTR_NAME_CEPH(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .readonly = true, \
+ .hidden = false, \
+ .exists_cb = NULL, \
+ }
+#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
+ { \
+ .name = CEPH_XATTR_NAME2(_type, _name, _field), \
+ .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+ .readonly = false, \
+ .hidden = true, \
+ .exists_cb = ceph_vxattrcb_layout_exists, \
+ }
static struct ceph_vxattr ceph_dir_vxattrs[] = {
+ {
+ .name = "ceph.dir.layout",
+ .name_size = sizeof("ceph.dir.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .readonly = false,
+ .hidden = false,
+ .exists_cb = ceph_vxattrcb_layout_exists,
+ },
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(dir, layout, object_size),
+ XATTR_LAYOUT_FIELD(dir, layout, pool),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_NAME_CEPH(dir, rsubdirs),
XATTR_NAME_CEPH(dir, rbytes),
XATTR_NAME_CEPH(dir, rctime),
- { 0 } /* Required table terminator */
+ { .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
/* files */
-static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- int ret;
-
- ret = snprintf(val, size,
- "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- return ret;
-}
-
static struct ceph_vxattr ceph_file_vxattrs[] = {
- XATTR_NAME_CEPH(file, layout),
- /* The following extended attribute name is deprecated */
{
- .name = XATTR_CEPH_PREFIX "layout",
- .name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
- .getxattr_cb = ceph_vxattrcb_file_layout,
- .readonly = true,
+ .name = "ceph.file.layout",
+ .name_size = sizeof("ceph.file.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .readonly = false,
+ .hidden = false,
+ .exists_cb = ceph_vxattrcb_layout_exists,
},
- { 0 } /* Required table terminator */
+ XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(file, layout, object_size),
+ XATTR_LAYOUT_FIELD(file, layout, pool),
+ { .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_file_vxattrs_name_size; /* total size of all names */
@@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
size_t size = 0;
for (vxattr = vxattrs; vxattr->name; vxattr++)
- size += vxattr->name_size;
+ if (!vxattr->hidden)
+ size += vxattr->name_size;
return size;
}
@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- /* let's see if a virtual xattr was requested */
- vxattr = ceph_match_vxattr(inode, name);
-
spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
+ /* let's see if a virtual xattr was requested */
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ goto out;
+ }
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
spin_lock(&ci->i_ceph_lock);
- if (vxattr && vxattr->readonly) {
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
-
err = __build_xattrs(inode);
if (err < 0)
goto out;
@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
get_xattr:
err = -ENODATA; /* == ENOATTR */
xattr = __get_xattr(ci, name);
- if (!xattr) {
- if (vxattr)
- err = vxattr->getxattr_cb(ci, value, size);
+ if (!xattr)
goto out;
- }
err = -ERANGE;
if (size && size < xattr->val_len)
@@ -664,23 +763,30 @@ list_xattr:
vir_namelen = ceph_vxattrs_name_size(vxattrs);
/* adding 1 byte per each variable due to the null termination */
- namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+ namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
err = -ERANGE;
- if (size && namelen > size)
+ if (size && vir_namelen + namelen > size)
goto out;
- err = namelen;
+ err = namelen + vir_namelen;
if (size == 0)
goto out;
names = __copy_xattr_names(ci, names);
/* virtual xattr names, too */
- if (vxattrs)
+ err = namelen;
+ if (vxattrs) {
for (i = 0; vxattrs[i].name; i++) {
- len = sprintf(names, "%s", vxattrs[i].name);
- names += len + 1;
+ if (!vxattrs[i].hidden &&
+ !(vxattrs[i].exists_cb &&
+ !vxattrs[i].exists_cb(ci))) {
+ len = sprintf(names, "%s", vxattrs[i].name);
+ names += len + 1;
+ err += len + 1;
+ }
}
+ }
out:
spin_unlock(&ci->i_ceph_lock);
@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (vxattr && vxattr->readonly)
return -EOPNOTSUPP;
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
+
/* preallocate memory for xattr name, value, index node */
err = -ENOMEM;
newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -838,6 +948,7 @@ retry:
do_sync:
spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
err = ceph_sync_setxattr(dentry, name, value, size, flags);
out:
kfree(newname);
@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
if (vxattr && vxattr->readonly)
return -EOPNOTSUPP;
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
+
err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
retry:
@@ -931,6 +1046,7 @@ retry:
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
err = ceph_send_removexattr(dentry, name);
out:
return err;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 4bad7b16271..1a052c0eee8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -564,6 +564,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
dentry = ERR_PTR(-ENOENT);
break;
}
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
/* skip separators */
while (*s == sep)
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 00e12f2d626..7353bc5d73d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1909,8 +1909,11 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
} while (rc == -EAGAIN);
for (i = 0; i < wdata->nr_pages; i++) {
- if (rc != 0)
+ if (rc != 0) {
SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ page_cache_release(wdata->pages[i]);
+ }
unlock_page(wdata->pages[i]);
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4474a57f30a..54125e04fd0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1031,7 +1031,7 @@ static int cifs_parse_security_flavors(char *value,
switch (match_token(value, cifs_secflavor_tokens, args)) {
case Opt_sec_krb5:
- vol->secFlg |= CIFSSEC_MAY_KRB5;
+ vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
break;
case Opt_sec_krb5i:
vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c16d2a018ab..8c0d8557731 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -43,6 +43,7 @@
#include "cifs_fs_sb.h"
#include "fscache.h"
+
static inline int cifs_convert_flags(unsigned int flags)
{
if ((flags & O_ACCMODE) == O_RDONLY)
@@ -72,10 +73,15 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
else if ((flags & O_ACCMODE) == O_RDWR)
posix_flags = SMB_O_RDWR;
- if (flags & O_CREAT)
+ if (flags & O_CREAT) {
posix_flags |= SMB_O_CREAT;
- if (flags & O_EXCL)
- posix_flags |= SMB_O_EXCL;
+ if (flags & O_EXCL)
+ posix_flags |= SMB_O_EXCL;
+ } else if (flags & O_EXCL)
+ cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag"
+ "but not O_CREAT on file open. Ignoring O_EXCL",
+ current->comm, current->tgid);
+
if (flags & O_TRUNC)
posix_flags |= SMB_O_TRUNC;
/* be safe and imply O_SYNC for O_DSYNC */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 2f2e0da1a6b..92e68b33fff 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -635,7 +635,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
brelse(bitmap_bh);
printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
", computed = %llu, %llu\n",
- EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+ EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 6dda04f05ef..d8cd1f0f466 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -334,7 +334,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
*
* For non-htree, ext4_llseek already chooses the proper max offset.
*/
-loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
+static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int dx_dir = is_dx_dir(inode);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6e16c186795..4a01ba31526 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1309,6 +1309,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
+ struct percpu_counter s_extent_cache_cnt;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index f768f4a98a2..95796a1b752 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -147,11 +147,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
int nr_to_scan);
-static int ext4_es_reclaim_extents_count(struct super_block *sb);
int __init ext4_init_es(void)
{
- ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+ ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+ sizeof(struct extent_status),
+ 0, (SLAB_RECLAIM_ACCOUNT), NULL);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
@@ -302,8 +303,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
/*
* We don't count delayed extent because we never try to reclaim them
*/
- if (!ext4_es_is_delayed(es))
+ if (!ext4_es_is_delayed(es)) {
EXT4_I(inode)->i_es_lru_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
return es;
}
@@ -314,6 +317,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
if (!ext4_es_is_delayed(es)) {
BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
EXT4_I(inode)->i_es_lru_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
}
kmem_cache_free(ext4_es_cachep, es);
@@ -674,10 +678,11 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk = 0;
- trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan);
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
if (!nr_to_scan)
- return ext4_es_reclaim_extents_count(sbi->s_sb);
+ return ret;
INIT_LIST_HEAD(&scanned);
@@ -705,9 +710,10 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
}
list_splice_tail(&scanned, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
- trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk);
- return ext4_es_reclaim_extents_count(sbi->s_sb);
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ return ret;
}
void ext4_es_register_shrinker(struct super_block *sb)
@@ -751,25 +757,6 @@ void ext4_es_lru_del(struct inode *inode)
spin_unlock(&sbi->s_es_lru_lock);
}
-static int ext4_es_reclaim_extents_count(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *ei;
- struct list_head *cur;
- int nr_cached = 0;
-
- spin_lock(&sbi->s_es_lru_lock);
- list_for_each(cur, &sbi->s_es_lru) {
- ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
- nr_cached += ei->i_es_lru_nr;
- read_unlock(&ei->i_es_lock);
- }
- spin_unlock(&sbi->s_es_lru_lock);
- trace_ext4_es_reclaim_extents_count(sb, nr_cached);
- return nr_cached;
-}
-
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
int nr_to_scan)
{
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9c4f4b1c97f..9ea0cde3fa9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2512,12 +2512,8 @@ static int ext4_nonda_switch(struct super_block *sb)
/*
* Start pushing delalloc when 1/2 of free blocks are dirty.
*/
- if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
- !writeback_in_progress(sb->s_bdi) &&
- down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- }
+ if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+ try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6540ebe058e..7bb713a46fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3419,7 +3419,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
win = offs;
ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_B2C(sbi, win);
+ EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -4565,7 +4565,7 @@ do_more:
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- count_clusters = EXT4_B2C(sbi, count);
+ count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) {
err = -EIO;
@@ -4807,11 +4807,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_B2C(sbi, blocks_freed));
+ EXT4_NUM_B2C(sbi, blocks_freed));
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(EXT4_B2C(sbi, blocks_freed),
+ atomic_add(EXT4_NUM_B2C(sbi, blocks_freed),
&sbi->s_flex_groups[flex_group].free_clusters);
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c7f4d758466..b2c8ee56eb9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1247,7 +1247,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_inode_table_set(sb, gdp, group_data->inode_table);
ext4_free_group_clusters_set(sb, gdp,
- EXT4_B2C(sbi, group_data->free_blocks_count));
+ EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
if (ext4_has_group_desc_csum(sb))
ext4_itable_unused_set(sb, gdp,
@@ -1349,7 +1349,7 @@ static void ext4_update_super(struct super_block *sb,
/* Update the free space counts */
percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_B2C(sbi, free_blocks));
+ EXT4_NUM_B2C(sbi, free_blocks));
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
@@ -1360,7 +1360,7 @@ static void ext4_update_super(struct super_block *sb,
sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, group_data[0].group);
- atomic_add(EXT4_B2C(sbi, free_blocks),
+ atomic_add(EXT4_NUM_B2C(sbi, free_blocks),
&sbi->s_flex_groups[flex_group].free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
&sbi->s_flex_groups[flex_group].free_inodes);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 620cf5615ba..5e6c8783619 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -783,6 +783,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -1247,6 +1248,11 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quota options when quota turned on");
return -1;
}
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
qname = match_strdup(args);
if (!qname) {
ext4_msg(sb, KERN_ERR,
@@ -1544,6 +1550,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
"quota options when quota turned on");
return -1;
}
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
sbi->s_jquota_fmt = m->mount_opt;
#endif
} else {
@@ -1592,6 +1605,12 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
#ifdef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+ ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
+ "feature is enabled");
+ return 0;
+ }
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -3161,7 +3180,7 @@ int ext4_calculate_overhead(struct super_block *sb)
}
/* Add the journal blocks as well */
if (sbi->s_journal)
- overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen);
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
sbi->s_overhead = overhead;
smp_wmb();
@@ -3688,6 +3707,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!err) {
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
}
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
+ }
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount3;
@@ -3711,13 +3733,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
- sb->s_qcop = &ext4_qctl_operations;
sb->dq_op = &ext4_quota_operations;
-
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
- /* Use qctl operations for hidden quota files. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
sb->s_qcop = &ext4_qctl_sysfile_operations;
- }
+ else
+ sb->s_qcop = &ext4_qctl_operations;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
@@ -3913,6 +3933,16 @@ no_journal:
if (err)
goto failed_mount7;
+#ifdef CONFIG_QUOTA
+ /* Enable quota usage during mount. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !(sb->s_flags & MS_RDONLY)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto failed_mount8;
+ }
+#endif /* CONFIG_QUOTA */
+
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
@@ -3930,16 +3960,6 @@ no_journal:
} else
descr = "out journal";
-#ifdef CONFIG_QUOTA
- /* Enable quota usage during mount. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !(sb->s_flags & MS_RDONLY)) {
- err = ext4_enable_quotas(sb);
- if (err)
- goto failed_mount8;
- }
-#endif /* CONFIG_QUOTA */
-
if (test_opt(sb, DISCARD)) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q))
@@ -3993,6 +4013,7 @@ failed_mount3:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
@@ -4538,6 +4559,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!old_opts.s_qf_names[i]) {
for (j = 0; j < i; j++)
kfree(old_opts.s_qf_names[j]);
+ kfree(orig_data);
return -ENOMEM;
}
} else
@@ -4816,9 +4838,12 @@ static int ext4_release_dquot(struct dquot *dquot)
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
+ struct super_block *sb = dquot->dq_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
/* Are we journaling quotas? */
- if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
- EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 310972b72a6..21f46fb3a10 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, wbc);
+ int ret;
+
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
+ trace_writeback_write_inode_start(inode, wbc);
+ ret = inode->i_sb->s_op->write_inode(inode, wbc);
+ trace_writeback_write_inode(inode, wbc);
+ return ret;
+ }
return 0;
}
@@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
WARN_ON(!(inode->i_state & I_SYNC));
+ trace_writeback_single_inode_start(inode, wbc, nr_to_write);
+
ret = do_writepages(mapping, wbc);
/*
@@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* dirty the inode itself
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ trace_writeback_dirty_inode_start(inode, flags);
+
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);
+
+ trace_writeback_dirty_inode(inode, flags);
}
/*
@@ -1332,47 +1344,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
EXPORT_SYMBOL(writeback_inodes_sb);
/**
- * writeback_inodes_sb_if_idle - start writeback if none underway
+ * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
* @sb: the superblock
- * @reason: reason why some writeback work was initiated
+ * @nr: the number of pages to write
+ * @reason: the reason of writeback
*
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
+int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+ unsigned long nr,
+ enum wb_reason reason)
{
- if (!writeback_in_progress(sb->s_bdi)) {
- down_read(&sb->s_umount);
- writeback_inodes_sb(sb, reason);
- up_read(&sb->s_umount);
+ if (writeback_in_progress(sb->s_bdi))
return 1;
- } else
+
+ if (!down_read_trylock(&sb->s_umount))
return 0;
+
+ writeback_inodes_sb_nr(sb, nr, reason);
+ up_read(&sb->s_umount);
+ return 1;
}
-EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
/**
- * writeback_inodes_sb_nr_if_idle - start writeback if none underway
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
* @sb: the superblock
- * @nr: the number of pages to write
* @reason: reason why some writeback work was initiated
*
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Implement by try_to_writeback_inodes_sb_nr()
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
- unsigned long nr,
- enum wb_reason reason)
+int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
- if (!writeback_in_progress(sb->s_bdi)) {
- down_read(&sb->s_umount);
- writeback_inodes_sb_nr(sb, nr, reason);
- up_read(&sb->s_umount);
- return 1;
- } else
- return 0;
+ return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
-EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb);
/**
* sync_inodes_sb - sync sb inode pages
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b7e2385c6e9..d6ee5aed56b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -382,7 +382,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
if (err < 0) {
jbd2_free_handle(handle);
current->journal_info = NULL;
- handle = ERR_PTR(err);
+ return ERR_PTR(err);
}
handle->h_type = type;
handle->h_line_no = line_no;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index a2717408c47..0796c45d0d4 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -11,7 +11,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/nfs_fs.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/kthread.h>
@@ -220,10 +220,19 @@ reclaimer(void *ptr)
{
struct nlm_host *host = (struct nlm_host *) ptr;
struct nlm_wait *block;
+ struct nlm_rqst *req;
struct file_lock *fl, *next;
u32 nsmstate;
struct net *net = host->net;
+ req = kmalloc(sizeof(*req), GFP_KERNEL);
+ if (!req) {
+ printk(KERN_ERR "lockd: reclaimer unable to alloc memory."
+ " Locks for %s won't be reclaimed!\n",
+ host->h_name);
+ return 0;
+ }
+
allow_signal(SIGKILL);
down_write(&host->h_rwsem);
@@ -253,7 +262,7 @@ restart:
*/
if (signalled())
continue;
- if (nlmclnt_reclaim(host, fl) != 0)
+ if (nlmclnt_reclaim(host, fl, req) != 0)
continue;
list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
if (host->h_nsmstate != nsmstate) {
@@ -279,5 +288,6 @@ restart:
/* Release host handle after use */
nlmclnt_release_host(host);
lockd_down(net);
+ kfree(req);
return 0;
}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 366277190b8..7e529c3c45c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -618,17 +618,15 @@ out_unlock:
* RECLAIM: Try to reclaim a lock
*/
int
-nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
+nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
+ struct nlm_rqst *req)
{
- struct nlm_rqst reqst, *req;
int status;
- req = &reqst;
memset(req, 0, sizeof(*req));
locks_init_lock(&req->a_args.lock.fl);
locks_init_lock(&req->a_res.lock.fl);
req->a_host = host;
- req->a_flags = 0;
/* Set up the argument struct */
nlmclnt_setlockargs(req, fl);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdd75d44dd..969d589c848 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -13,6 +13,7 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/mutex.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3c2cfc68363..1812f026960 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtsock.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d17bb62b06d..97e87415b14 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -13,7 +13,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/sunrpc/svc.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/nfsd/nfsfh.h>
#include <linux/nfsd/export.h>
#include <linux/lockd/lockd.h>
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 862a2f16db6..5f7b053720e 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -128,10 +128,13 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
struct super_block *pipefs_sb;
int ret = 0;
+ sunrpc_init_cache_detail(cd);
pipefs_sb = rpc_get_sb_net(net);
if (pipefs_sb) {
ret = nfs_cache_register_sb(pipefs_sb, cd);
rpc_put_sb_net(net);
+ if (ret)
+ sunrpc_destroy_cache_detail(cd);
}
return ret;
}
@@ -151,14 +154,5 @@ void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
nfs_cache_unregister_sb(pipefs_sb, cd);
rpc_put_sb_net(net);
}
-}
-
-void nfs_cache_init(struct cache_detail *cd)
-{
- sunrpc_init_cache_detail(cd);
-}
-
-void nfs_cache_destroy(struct cache_detail *cd)
-{
sunrpc_destroy_cache_detail(cd);
}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 317db95e37f..4116d2c3f52 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -23,8 +23,6 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
-extern void nfs_cache_init(struct cache_detail *cd);
-extern void nfs_cache_destroy(struct cache_detail *cd);
extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
extern int nfs_cache_register_sb(struct super_block *sb,
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index ca4b11ec87a..94552709229 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/dns_resolver.h>
#include "dns_resolve.h"
@@ -42,6 +43,7 @@ EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
#include <linux/seq_file.h>
#include <linux/inet.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
@@ -142,7 +144,7 @@ static int nfs_dns_upcall(struct cache_detail *cd,
ret = nfs_cache_upcall(cd, key->hostname);
if (ret)
- ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+ ret = sunrpc_cache_pipe_upcall(cd, ch);
return ret;
}
@@ -351,60 +353,47 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
}
EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
+static struct cache_detail nfs_dns_resolve_template = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_request = nfs_dns_request,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_update,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+
int nfs_dns_resolver_cache_init(struct net *net)
{
- int err = -ENOMEM;
+ int err;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct cache_detail *cd;
- struct cache_head **tbl;
- cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
- if (cd == NULL)
- goto err_cd;
-
- tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *),
- GFP_KERNEL);
- if (tbl == NULL)
- goto err_tbl;
-
- cd->owner = THIS_MODULE,
- cd->hash_size = NFS_DNS_HASHTBL_SIZE,
- cd->hash_table = tbl,
- cd->name = "dns_resolve",
- cd->cache_put = nfs_dns_ent_put,
- cd->cache_upcall = nfs_dns_upcall,
- cd->cache_parse = nfs_dns_parse,
- cd->cache_show = nfs_dns_show,
- cd->match = nfs_dns_match,
- cd->init = nfs_dns_ent_init,
- cd->update = nfs_dns_ent_update,
- cd->alloc = nfs_dns_ent_alloc,
-
- nfs_cache_init(cd);
- err = nfs_cache_register_net(net, cd);
+ nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net);
+ if (IS_ERR(nn->nfs_dns_resolve))
+ return PTR_ERR(nn->nfs_dns_resolve);
+
+ err = nfs_cache_register_net(net, nn->nfs_dns_resolve);
if (err)
goto err_reg;
- nn->nfs_dns_resolve = cd;
return 0;
err_reg:
- nfs_cache_destroy(cd);
- kfree(cd->hash_table);
-err_tbl:
- kfree(cd);
-err_cd:
+ cache_destroy_net(nn->nfs_dns_resolve, net);
return err;
}
void nfs_dns_resolver_cache_destroy(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct cache_detail *cd = nn->nfs_dns_resolve;
- nfs_cache_unregister_net(net, cd);
- nfs_cache_destroy(cd);
- kfree(cd->hash_table);
- kfree(cd);
+ nfs_cache_unregister_net(net, nn->nfs_dns_resolve);
+ cache_destroy_net(nn->nfs_dns_resolve, net);
}
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b586fe9af47..1f941674b08 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -237,6 +237,8 @@ nfs_find_actor(struct inode *inode, void *opaque)
if (NFS_FILEID(inode) != fattr->fileid)
return 0;
+ if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode))
+ return 0;
if (nfs_compare_fh(NFS_FH(inode), fh))
return 0;
if (is_bad_inode(inode) || NFS_STALE(inode))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 47d10087239..ac4fc9a8fdb 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -6,6 +6,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_idmap.h>
#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/bc_xprt.h>
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 194c4841033..49eeb044c10 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data)
task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
}
@@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data)
task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
}
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 8c07241fe52..b8da95548d3 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -36,7 +36,7 @@
* Default data server connection timeout and retrans vaules.
* Set by module paramters dataserver_timeo and dataserver_retrans.
*/
-#define NFS4_DEF_DS_TIMEO 60
+#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
#define NFS4_DEF_DS_RETRANS 5
/*
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index b720064bcd7..1fe284f01f8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -31,6 +31,7 @@
#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
+#include <linux/sunrpc/addr.h>
#include "internal.h"
#include "nfs4session.h"
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 1e09eb78543..0dd766079e1 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/vfs.h>
#include <linux/inet.h>
#include "internal.h"
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eae83bf96c6..b2671cb0f90 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -93,6 +93,8 @@ static int nfs4_map_errors(int err)
return err;
switch (err) {
case -NFS4ERR_RESOURCE:
+ case -NFS4ERR_LAYOUTTRYLATER:
+ case -NFS4ERR_RECALLCONFLICT:
return -EREMOTEIO;
case -NFS4ERR_WRONGSEC:
return -EPERM;
@@ -1158,6 +1160,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
data->o_arg.fmode);
iput(inode);
out:
+ nfs_release_seqid(data->o_arg.seqid);
return state;
err_put_inode:
iput(inode);
@@ -6045,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
struct nfs4_state *state = NULL;
+ unsigned long timeo, giveup;
dprintk("--> %s\n", __func__);
@@ -6056,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
goto out;
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
- task->tk_status = -NFS4ERR_DELAY;
+ timeo = rpc_get_timeout(task->tk_client);
+ giveup = lgp->args.timestamp + timeo;
+ if (time_after(giveup, jiffies))
+ task->tk_status = -NFS4ERR_DELAY;
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
@@ -6129,11 +6136,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
static void nfs4_layoutget_release(void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
size_t max_pages = max_response_pages(server);
dprintk("--> %s\n", __func__);
nfs4_free_pages(lgp->args.layout.pages, max_pages);
+ pnfs_put_layout_hdr(NFS_I(inode)->layout);
put_nfs_open_context(lgp->args.ctx);
kfree(calldata);
dprintk("<-- %s\n", __func__);
@@ -6148,7 +6157,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
struct pnfs_layout_segment *
nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
{
- struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
size_t max_pages = max_response_pages(server);
struct rpc_task *task;
struct rpc_message msg = {
@@ -6174,10 +6184,15 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
return ERR_PTR(-ENOMEM);
}
lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+ lgp->args.timestamp = jiffies;
lgp->res.layoutp = &lgp->args.layout;
lgp->res.seq_res.sr_slot = NULL;
nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
+
+ /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+ pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return ERR_CAST(task);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6be70f622b6..48ac5aad625 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1181,7 +1181,7 @@ pnfs_update_layout(struct inode *ino,
struct nfs_client *clp = server->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
- bool first = false;
+ bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
goto out;
@@ -1215,10 +1215,9 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
- if (list_empty(&lo->plh_segs))
- first = true;
-
+ first = list_empty(&lo->plh_layouts) ? true : false;
spin_unlock(&ino->i_lock);
+
if (first) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
@@ -1422,13 +1421,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
int pnfs_write_done_resend_to_mds(struct inode *inode,
struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops)
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor pgio;
LIST_HEAD(failed);
/* Resend all requests through the MDS */
nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
+ pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1463,7 +1464,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
/*
@@ -1578,13 +1580,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
int pnfs_read_done_resend_to_mds(struct inode *inode,
struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops)
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor pgio;
LIST_HEAD(failed);
/* Resend all requests through the MDS */
nfs_pageio_init_read(&pgio, inode, compl_ops);
+ pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1615,7 +1619,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 97cb358bb88..94ba8041774 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -230,9 +230,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops);
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq);
int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops);
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
/* nfs4_deviceid_flags */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a9dc5fc2995..17b32b72245 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -31,6 +31,7 @@
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/metrics.h>
#include <linux/sunrpc/xprtsock.h>
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index d26a32f5b53..1f1f38f0c5d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -335,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
struct inode *old_dir = data->old_dir;
struct inode *new_dir = data->new_dir;
struct dentry *old_dentry = data->old_dentry;
- struct dentry *new_dentry = data->new_dentry;
if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
rpc_restart_call_prepare(task);
return;
}
- if (task->tk_status != 0) {
+ if (task->tk_status != 0)
nfs_cancel_async_unlink(old_dentry);
- return;
- }
-
- d_drop(old_dentry);
- d_drop(new_dentry);
}
/**
@@ -549,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
error = rpc_wait_for_completion_task(task);
if (error == 0)
error = task->tk_status;
+ switch (error) {
+ case 0:
+ /* The rename succeeded */
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ d_move(dentry, sdentry);
+ break;
+ case -ERESTARTSYS:
+ /* The result of the rename is unknown. Play it safe by
+ * forcing a new lookup */
+ d_drop(dentry);
+ d_drop(sdentry);
+ }
rpc_put_task(task);
out_dput:
dput(sdentry);
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 93cc9d34c45..87fd1410b73 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -12,6 +12,10 @@
/*
* Representation of a reply cache entry.
+ *
+ * Note that we use a sockaddr_in6 to hold the address instead of the more
+ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage
+ * is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
struct hlist_node c_hash;
@@ -20,11 +24,13 @@ struct svc_cacherep {
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
c_secure : 1; /* req came from port < 1024 */
- struct sockaddr_in c_addr;
+ struct sockaddr_in6 c_addr;
__be32 c_xid;
u32 c_prot;
u32 c_proc;
u32 c_vers;
+ unsigned int c_len;
+ __wsum c_csum;
unsigned long c_timestamp;
union {
struct kvec u_vec;
@@ -46,8 +52,7 @@ enum {
enum {
RC_DROPIT,
RC_REPLY,
- RC_DOIT,
- RC_INTR
+ RC_DOIT
};
/*
@@ -67,6 +72,12 @@ enum {
*/
#define RC_DELAY (HZ/5)
+/* Cache entries expire after this time period */
+#define RC_EXPIRE (120 * HZ)
+
+/* Checksum this amount of the request */
+#define RC_CSUMLEN (256U)
+
int nfsd_reply_cache_init(void);
void nfsd_reply_cache_shutdown(void);
int nfsd_cache_lookup(struct svc_rqst *);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5681c5906f0..5f38ea36e26 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -67,11 +67,6 @@ static void expkey_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
-static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
-}
-
static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
struct svc_expkey *old);
static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *);
@@ -245,7 +240,7 @@ static struct cache_detail svc_expkey_cache_template = {
.hash_size = EXPKEY_HASHMAX,
.name = "nfsd.fh",
.cache_put = expkey_put,
- .cache_upcall = expkey_upcall,
+ .cache_request = expkey_request,
.cache_parse = expkey_parse,
.cache_show = expkey_show,
.match = expkey_match,
@@ -315,6 +310,7 @@ static void svc_export_put(struct kref *ref)
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
+ kfree(exp->ex_uuid);
kfree(exp);
}
@@ -337,11 +333,6 @@ static void svc_export_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
-static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
-}
-
static struct svc_export *svc_export_update(struct svc_export *new,
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -674,6 +665,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
+ new->ex_uuid = NULL;
new->cd = item->cd;
}
@@ -715,7 +707,7 @@ static struct cache_detail svc_export_cache_template = {
.hash_size = EXPORT_HASHMAX,
.name = "nfsd.export",
.cache_put = svc_export_put,
- .cache_upcall = svc_export_upcall,
+ .cache_request = svc_export_request,
.cache_parse = svc_export_parse,
.cache_show = svc_export_show,
.match = svc_export_match,
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 497584c7036..d620e7f8142 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -9,7 +9,7 @@
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <asm/uaccess.h>
#include "state.h"
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 0ce12346df9..4832fd819f8 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -140,12 +140,6 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
-idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
-{
- return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
-}
-
-static int
idtoname_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -192,7 +186,7 @@ static struct cache_detail idtoname_cache_template = {
.hash_size = ENT_HASHMAX,
.name = "nfs4.idtoname",
.cache_put = ent_put,
- .cache_upcall = idtoname_upcall,
+ .cache_request = idtoname_request,
.cache_parse = idtoname_parse,
.cache_show = idtoname_show,
.warn_no_listener = warn_no_idmapd,
@@ -321,12 +315,6 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
-nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
-{
- return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
-}
-
-static int
nametoid_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -365,7 +353,7 @@ static struct cache_detail nametoid_cache_template = {
.hash_size = ENT_HASHMAX,
.name = "nfs4.nametoid",
.cache_put = ent_put,
- .cache_upcall = nametoid_upcall,
+ .cache_request = nametoid_request,
.cache_parse = nametoid_parse,
.cache_show = nametoid_show,
.warn_no_listener = warn_no_idmapd,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9d1c5dba2bb..ae73175e6e6 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -993,14 +993,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!buf)
return nfserr_jukebox;
+ p = buf;
status = nfsd4_encode_fattr(&cstate->current_fh,
cstate->current_fh.fh_export,
- cstate->current_fh.fh_dentry, buf,
- &count, verify->ve_bmval,
+ cstate->current_fh.fh_dentry, &p,
+ count, verify->ve_bmval,
rqstp, 0);
/* this means that nfsd4_encode_fattr() ran out of space */
- if (status == nfserr_resource && count == 0)
+ if (status == nfserr_resource)
status = nfserr_not_same;
if (status)
goto out_kfree;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4914af4a817..899ca26dd19 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1185,6 +1185,12 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
static int
nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
{
+ /* XXX: The usermode helper s not working in container yet. */
+ if (net != &init_net) {
+ WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
+ "tracking in a container!\n");
+ return -EINVAL;
+ }
return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9e7103b6e0a..16d39c6c4fb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -40,7 +40,7 @@
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sunrpc/svcauth_gss.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include "xdr4.h"
#include "vfs.h"
#include "current_stateid.h"
@@ -261,33 +261,46 @@ static inline int get_new_stid(struct nfs4_stid *stid)
return new_stid;
}
-static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct
+kmem_cache *slab)
{
- stateid_t *s = &stid->sc_stateid;
+ struct idr *stateids = &cl->cl_stateids;
+ static int min_stateid = 0;
+ struct nfs4_stid *stid;
int new_id;
- stid->sc_type = type;
+ stid = kmem_cache_alloc(slab, GFP_KERNEL);
+ if (!stid)
+ return NULL;
+
+ if (!idr_pre_get(stateids, GFP_KERNEL))
+ goto out_free;
+ if (idr_get_new_above(stateids, stid, min_stateid, &new_id))
+ goto out_free;
stid->sc_client = cl;
- s->si_opaque.so_clid = cl->cl_clientid;
- new_id = get_new_stid(stid);
- s->si_opaque.so_id = (u32)new_id;
+ stid->sc_type = 0;
+ stid->sc_stateid.si_opaque.so_id = new_id;
+ stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
- s->si_generation = 0;
-}
-
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
-{
- struct idr *stateids = &cl->cl_stateids;
+ stid->sc_stateid.si_generation = 0;
- if (!idr_pre_get(stateids, GFP_KERNEL))
- return NULL;
/*
- * Note: if we fail here (or any time between now and the time
- * we actually get the new idr), we won't need to undo the idr
- * preallocation, since the idr code caps the number of
- * preallocated entries.
+ * It shouldn't be a problem to reuse an opaque stateid value.
+ * I don't think it is for 4.1. But with 4.0 I worry that, for
+ * example, a stray write retransmission could be accepted by
+ * the server when it should have been rejected. Therefore,
+ * adopt a trick from the sctp code to attempt to maximize the
+ * amount of time until an id is reused, by ensuring they always
+ * "increase" (mod INT_MAX):
*/
- return kmem_cache_alloc(slab, GFP_KERNEL);
+
+ min_stateid = new_id+1;
+ if (min_stateid == INT_MAX)
+ min_stateid = 0;
+ return stid;
+out_free:
+ kfree(stid);
+ return NULL;
}
static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
@@ -316,7 +329,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
- init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+ dp->dl_stid.sc_type = NFS4_DELEG_STID;
/*
* delegation seqid's are never incremented. The 4.1 special
* meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -337,13 +350,21 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
return dp;
}
+static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab)
+{
+ struct idr *stateids = &s->sc_client->cl_stateids;
+
+ idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+ kmem_cache_free(slab, s);
+}
+
void
nfs4_put_delegation(struct nfs4_delegation *dp)
{
if (atomic_dec_and_test(&dp->dl_count)) {
dprintk("NFSD: freeing dp %p\n",dp);
put_nfs4_file(dp->dl_file);
- kmem_cache_free(deleg_slab, dp);
+ free_stid(&dp->dl_stid, deleg_slab);
num_delegations--;
}
}
@@ -360,9 +381,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
static void unhash_stid(struct nfs4_stid *s)
{
- struct idr *stateids = &s->sc_client->cl_stateids;
-
- idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+ s->sc_type = 0;
}
/* Called under the state lock. */
@@ -519,7 +538,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
static void free_generic_stateid(struct nfs4_ol_stateid *stp)
{
- kmem_cache_free(stateid_slab, stp);
+ free_stid(&stp->st_stid, stateid_slab);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -905,7 +924,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
new = __alloc_session(slotsize, numslots);
if (!new) {
- nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
+ nfsd4_put_drc_mem(slotsize, numslots);
return NULL;
}
init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
@@ -1048,7 +1067,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
static inline void
free_client(struct nfs4_client *clp)
{
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
lockdep_assert_held(&nn->client_lock);
while (!list_empty(&clp->cl_sessions)) {
@@ -1060,6 +1079,7 @@ free_client(struct nfs4_client *clp)
}
free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
+ idr_destroy(&clp->cl_stateids);
kfree(clp);
}
@@ -1258,7 +1278,12 @@ static void gen_confirm(struct nfs4_client *clp)
static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
{
- return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+ struct nfs4_stid *ret;
+
+ ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+ if (!ret || !ret->sc_type)
+ return NULL;
+ return ret;
}
static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
@@ -1844,11 +1869,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
/* cache solo and embedded create sessions under the state lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
-out:
nfs4_unlock_state();
+out:
dprintk("%s returns %d\n", __func__, ntohl(status));
return status;
out_free_conn:
+ nfs4_unlock_state();
free_conn(conn);
out_free_session:
__free_session(new);
@@ -2443,9 +2469,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
struct nfs4_openowner *oo = open->op_openowner;
- struct nfs4_client *clp = oo->oo_owner.so_client;
- init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
+ stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_lockowners);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
@@ -4031,7 +4056,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct
stp = nfs4_alloc_stateid(clp);
if (stp == NULL)
return NULL;
- init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
+ stp->st_stid.sc_type = NFS4_LOCK_STID;
list_add(&stp->st_perfile, &fp->fi_stateids);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
stp->st_stateowner = &lo->lo_owner;
@@ -4913,16 +4938,6 @@ nfs4_state_start_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
- /*
- * FIXME: For now, we hang most of the pernet global stuff off of
- * init_net until nfsd is fully containerized. Eventually, we'll
- * need to pass a net pointer into this function, take a reference
- * to that instead and then do most of the rest of this on a per-net
- * basis.
- */
- if (net != &init_net)
- return -EINVAL;
-
ret = nfs4_state_create_net(net);
if (ret)
return ret;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 8ca6d17f6cf..01168865dd3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2024,12 +2024,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
*
- * @countp is the buffer size in _words_; upon successful return this becomes
- * replaced with the number of words written.
+ * countp is the buffer size in _words_
*/
__be32
nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
+ struct dentry *dentry, __be32 **buffer, int count, u32 *bmval,
struct svc_rqst *rqstp, int ignore_crossmnt)
{
u32 bmval0 = bmval[0];
@@ -2038,12 +2037,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
struct kstat stat;
struct svc_fh tempfh;
struct kstatfs statfs;
- int buflen = *countp << 2;
+ int buflen = count << 2;
__be32 *attrlenp;
u32 dummy;
u64 dummy64;
u32 rdattr_err = 0;
- __be32 *p = buffer;
+ __be32 *p = *buffer;
__be32 status;
int err;
int aclsupport = 0;
@@ -2447,7 +2446,7 @@ out_acl:
}
*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
- *countp = p - buffer;
+ *buffer = p;
status = nfs_ok;
out:
@@ -2459,7 +2458,6 @@ out_nfserr:
status = nfserrno(err);
goto out;
out_resource:
- *countp = 0;
status = nfserr_resource;
goto out;
out_serverfault:
@@ -2478,7 +2476,7 @@ static inline int attributes_need_mount(u32 *bmval)
static __be32
nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
- const char *name, int namlen, __be32 *p, int *buflen)
+ const char *name, int namlen, __be32 **p, int buflen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -2584,10 +2582,9 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
p = xdr_encode_array(p, name, namlen); /* name length & name */
- nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen);
+ nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen);
switch (nfserr) {
case nfs_ok:
- p += buflen;
break;
case nfserr_resource:
nfserr = nfserr_toosmall;
@@ -2714,10 +2711,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
- resp->p, &buflen, getattr->ga_bmval,
+ &resp->p, buflen, getattr->ga_bmval,
resp->rqstp, 0);
- if (!nfserr)
- resp->p += buflen;
return nfserr;
}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da3dbd0f897..62c1ee128ae 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -9,22 +9,22 @@
*/
#include <linux/slab.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
#include "nfsd.h"
#include "cache.h"
-/* Size of reply cache. Common values are:
- * 4.3BSD: 128
- * 4.4BSD: 256
- * Solaris2: 1024
- * DEC Unix: 512-4096
- */
-#define CACHESIZE 1024
+#define NFSDDBG_FACILITY NFSDDBG_REPCACHE
+
#define HASHSIZE 64
static struct hlist_head * cache_hash;
static struct list_head lru_head;
-static int cache_disabled = 1;
+static struct kmem_cache *drc_slab;
+static unsigned int num_drc_entries;
+static unsigned int max_drc_entries;
/*
* Calculate the hash index from an XID.
@@ -37,6 +37,14 @@ static inline u32 request_hash(u32 xid)
}
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
+static void cache_cleaner_func(struct work_struct *unused);
+static int nfsd_reply_cache_shrink(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+struct shrinker nfsd_reply_cache_shrinker = {
+ .shrink = nfsd_reply_cache_shrink,
+ .seeks = 1,
+};
/*
* locking for the reply cache:
@@ -44,30 +52,86 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
* Otherwise, it when accessing _prev or _next, the lock must be held.
*/
static DEFINE_SPINLOCK(cache_lock);
+static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-int nfsd_reply_cache_init(void)
+/*
+ * Put a cap on the size of the DRC based on the amount of available
+ * low memory in the machine.
+ *
+ * 64MB: 8192
+ * 128MB: 11585
+ * 256MB: 16384
+ * 512MB: 23170
+ * 1GB: 32768
+ * 2GB: 46340
+ * 4GB: 65536
+ * 8GB: 92681
+ * 16GB: 131072
+ *
+ * ...with a hard cap of 256k entries. In the worst case, each entry will be
+ * ~1k, so the above numbers should give a rough max of the amount of memory
+ * used in k.
+ */
+static unsigned int
+nfsd_cache_size_limit(void)
+{
+ unsigned int limit;
+ unsigned long low_pages = totalram_pages - totalhigh_pages;
+
+ limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
+ return min_t(unsigned int, limit, 256*1024);
+}
+
+static struct svc_cacherep *
+nfsd_reply_cache_alloc(void)
{
struct svc_cacherep *rp;
- int i;
- INIT_LIST_HEAD(&lru_head);
- i = CACHESIZE;
- while (i) {
- rp = kmalloc(sizeof(*rp), GFP_KERNEL);
- if (!rp)
- goto out_nomem;
- list_add(&rp->c_lru, &lru_head);
+ rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
+ if (rp) {
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
+ INIT_LIST_HEAD(&rp->c_lru);
INIT_HLIST_NODE(&rp->c_hash);
- i--;
}
+ return rp;
+}
- cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+static void
+nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+{
+ if (rp->c_type == RC_REPLBUFF)
+ kfree(rp->c_replvec.iov_base);
+ hlist_del(&rp->c_hash);
+ list_del(&rp->c_lru);
+ --num_drc_entries;
+ kmem_cache_free(drc_slab, rp);
+}
+
+static void
+nfsd_reply_cache_free(struct svc_cacherep *rp)
+{
+ spin_lock(&cache_lock);
+ nfsd_reply_cache_free_locked(rp);
+ spin_unlock(&cache_lock);
+}
+
+int nfsd_reply_cache_init(void)
+{
+ register_shrinker(&nfsd_reply_cache_shrinker);
+ drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
+ 0, 0, NULL);
+ if (!drc_slab)
+ goto out_nomem;
+
+ cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
if (!cache_hash)
goto out_nomem;
- cache_disabled = 0;
+ INIT_LIST_HEAD(&lru_head);
+ max_drc_entries = nfsd_cache_size_limit();
+ num_drc_entries = 0;
+
return 0;
out_nomem:
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
@@ -79,27 +143,33 @@ void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
+ unregister_shrinker(&nfsd_reply_cache_shrinker);
+ cancel_delayed_work_sync(&cache_cleaner);
+
while (!list_empty(&lru_head)) {
rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
- if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
- kfree(rp->c_replvec.iov_base);
- list_del(&rp->c_lru);
- kfree(rp);
+ nfsd_reply_cache_free_locked(rp);
}
- cache_disabled = 1;
-
kfree (cache_hash);
cache_hash = NULL;
+
+ if (drc_slab) {
+ kmem_cache_destroy(drc_slab);
+ drc_slab = NULL;
+ }
}
/*
- * Move cache entry to end of LRU list
+ * Move cache entry to end of LRU list, and queue the cleaner to run if it's
+ * not already scheduled.
*/
static void
lru_put_end(struct svc_cacherep *rp)
{
+ rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &lru_head);
+ schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
/*
@@ -112,82 +182,214 @@ hash_refile(struct svc_cacherep *rp)
hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
}
+static inline bool
+nfsd_cache_entry_expired(struct svc_cacherep *rp)
+{
+ return rp->c_state != RC_INPROG &&
+ time_after(jiffies, rp->c_timestamp + RC_EXPIRE);
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static void
+prune_cache_entries(void)
+{
+ struct svc_cacherep *rp, *tmp;
+
+ list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+ if (!nfsd_cache_entry_expired(rp) &&
+ num_drc_entries <= max_drc_entries)
+ break;
+ nfsd_reply_cache_free_locked(rp);
+ }
+
+ /*
+ * Conditionally rearm the job. If we cleaned out the list, then
+ * cancel any pending run (since there won't be any work to do).
+ * Otherwise, we rearm the job or modify the existing one to run in
+ * RC_EXPIRE since we just ran the pruner.
+ */
+ if (list_empty(&lru_head))
+ cancel_delayed_work(&cache_cleaner);
+ else
+ mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
+}
+
+static void
+cache_cleaner_func(struct work_struct *unused)
+{
+ spin_lock(&cache_lock);
+ prune_cache_entries();
+ spin_unlock(&cache_lock);
+}
+
+static int
+nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned int num;
+
+ spin_lock(&cache_lock);
+ if (sc->nr_to_scan)
+ prune_cache_entries();
+ num = num_drc_entries;
+ spin_unlock(&cache_lock);
+
+ return num;
+}
+
+/*
+ * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+ */
+static __wsum
+nfsd_cache_csum(struct svc_rqst *rqstp)
+{
+ int idx;
+ unsigned int base;
+ __wsum csum;
+ struct xdr_buf *buf = &rqstp->rq_arg;
+ const unsigned char *p = buf->head[0].iov_base;
+ size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
+ RC_CSUMLEN);
+ size_t len = min(buf->head[0].iov_len, csum_len);
+
+ /* rq_arg.head first */
+ csum = csum_partial(p, len, 0);
+ csum_len -= len;
+
+ /* Continue into page array */
+ idx = buf->page_base / PAGE_SIZE;
+ base = buf->page_base & ~PAGE_MASK;
+ while (csum_len) {
+ p = page_address(buf->pages[idx]) + base;
+ len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ csum = csum_partial(p, len, csum);
+ csum_len -= len;
+ base = 0;
+ ++idx;
+ }
+ return csum;
+}
+
+/*
+ * Search the request hash for an entry that matches the given rqstp.
+ * Must be called with cache_lock held. Returns the found entry or
+ * NULL on failure.
+ */
+static struct svc_cacherep *
+nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+{
+ struct svc_cacherep *rp;
+ struct hlist_head *rh;
+ __be32 xid = rqstp->rq_xid;
+ u32 proto = rqstp->rq_prot,
+ vers = rqstp->rq_vers,
+ proc = rqstp->rq_proc;
+
+ rh = &cache_hash[request_hash(xid)];
+ hlist_for_each_entry(rp, rh, c_hash) {
+ if (xid == rp->c_xid && proc == rp->c_proc &&
+ proto == rp->c_prot && vers == rp->c_vers &&
+ rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum &&
+ rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) &&
+ rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr))
+ return rp;
+ }
+ return NULL;
+}
+
/*
* Try to find an entry matching the current call in the cache. When none
- * is found, we grab the oldest unlocked entry off the LRU list.
- * Note that no operation within the loop may sleep.
+ * is found, we try to grab the oldest expired entry off the LRU list. If
+ * a suitable one isn't there, then drop the cache_lock and allocate a
+ * new one, then search again in case one got inserted while this thread
+ * didn't hold the lock.
*/
int
nfsd_cache_lookup(struct svc_rqst *rqstp)
{
- struct hlist_head *rh;
- struct svc_cacherep *rp;
+ struct svc_cacherep *rp, *found;
__be32 xid = rqstp->rq_xid;
u32 proto = rqstp->rq_prot,
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
+ __wsum csum;
unsigned long age;
int type = rqstp->rq_cachetype;
int rtn;
rqstp->rq_cacherep = NULL;
- if (cache_disabled || type == RC_NOCACHE) {
+ if (type == RC_NOCACHE) {
nfsdstats.rcnocache++;
return RC_DOIT;
}
+ csum = nfsd_cache_csum(rqstp);
+
spin_lock(&cache_lock);
rtn = RC_DOIT;
- rh = &cache_hash[request_hash(xid)];
- hlist_for_each_entry(rp, rh, c_hash) {
- if (rp->c_state != RC_UNUSED &&
- xid == rp->c_xid && proc == rp->c_proc &&
- proto == rp->c_prot && vers == rp->c_vers &&
- time_before(jiffies, rp->c_timestamp + 120*HZ) &&
- memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) {
- nfsdstats.rchits++;
- goto found_entry;
+ rp = nfsd_cache_search(rqstp, csum);
+ if (rp)
+ goto found_entry;
+
+ /* Try to use the first entry on the LRU */
+ if (!list_empty(&lru_head)) {
+ rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
+ if (nfsd_cache_entry_expired(rp) ||
+ num_drc_entries >= max_drc_entries) {
+ lru_put_end(rp);
+ prune_cache_entries();
+ goto setup_entry;
}
}
- nfsdstats.rcmisses++;
- /* This loop shouldn't take more than a few iterations normally */
- {
- int safe = 0;
- list_for_each_entry(rp, &lru_head, c_lru) {
- if (rp->c_state != RC_INPROG)
- break;
- if (safe++ > CACHESIZE) {
- printk("nfsd: loop in repcache LRU list\n");
- cache_disabled = 1;
- goto out;
- }
+ /* Drop the lock and allocate a new entry */
+ spin_unlock(&cache_lock);
+ rp = nfsd_reply_cache_alloc();
+ if (!rp) {
+ dprintk("nfsd: unable to allocate DRC entry!\n");
+ return RC_DOIT;
}
+ spin_lock(&cache_lock);
+ ++num_drc_entries;
+
+ /*
+ * Must search again just in case someone inserted one
+ * after we dropped the lock above.
+ */
+ found = nfsd_cache_search(rqstp, csum);
+ if (found) {
+ nfsd_reply_cache_free_locked(rp);
+ rp = found;
+ goto found_entry;
}
- /* All entries on the LRU are in-progress. This should not happen */
- if (&rp->c_lru == &lru_head) {
- static int complaints;
-
- printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
- if (++complaints > 5) {
- printk(KERN_WARNING "nfsd: disabling repcache.\n");
- cache_disabled = 1;
- }
- goto out;
- }
+ /*
+ * We're keeping the one we just allocated. Are we now over the
+ * limit? Prune one off the tip of the LRU in trade for the one we
+ * just allocated if so.
+ */
+ if (num_drc_entries >= max_drc_entries)
+ nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
+ struct svc_cacherep, c_lru));
+setup_entry:
+ nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
rp->c_xid = xid;
rp->c_proc = proc;
- memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr));
+ rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
+ rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
rp->c_prot = proto;
rp->c_vers = vers;
- rp->c_timestamp = jiffies;
+ rp->c_len = rqstp->rq_arg.len;
+ rp->c_csum = csum;
hash_refile(rp);
+ lru_put_end(rp);
/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
@@ -200,9 +402,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
return rtn;
found_entry:
+ nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
age = jiffies - rp->c_timestamp;
- rp->c_timestamp = jiffies;
lru_put_end(rp);
rtn = RC_DROPIT;
@@ -231,7 +433,7 @@ found_entry:
break;
default:
printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- rp->c_state = RC_UNUSED;
+ nfsd_reply_cache_free_locked(rp);
}
goto out;
@@ -256,11 +458,11 @@ found_entry:
void
nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
- struct svc_cacherep *rp;
+ struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
int len;
- if (!(rp = rqstp->rq_cacherep) || cache_disabled)
+ if (!rp)
return;
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
@@ -268,7 +470,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
/* Don't cache excessive amounts of data and XDR failures */
if (!statp || len > (256 >> 2)) {
- rp->c_state = RC_UNUSED;
+ nfsd_reply_cache_free(rp);
return;
}
@@ -282,21 +484,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
cachv = &rp->c_replvec;
cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
if (!cachv->iov_base) {
- spin_lock(&cache_lock);
- rp->c_state = RC_UNUSED;
- spin_unlock(&cache_lock);
+ nfsd_reply_cache_free(rp);
return;
}
cachv->iov_len = len << 2;
memcpy(cachv->iov_base, statp, len << 2);
break;
+ case RC_NOCACHE:
+ nfsd_reply_cache_free(rp);
+ return;
}
spin_lock(&cache_lock);
lru_put_end(rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
- rp->c_timestamp = jiffies;
spin_unlock(&cache_lock);
return;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2db7021b01a..13a21c8fca4 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -10,7 +10,7 @@
#include <linux/sunrpc/svcsock.h>
#include <linux/lockd/lockd.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
@@ -125,11 +125,11 @@ static const struct file_operations transaction_ops = {
.llseek = default_llseek,
};
-static int exports_open(struct inode *inode, struct file *file)
+static int exports_net_open(struct net *net, struct file *file)
{
int err;
struct seq_file *seq;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
err = seq_open(file, &nfs_exports_op);
if (err)
@@ -140,8 +140,26 @@ static int exports_open(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations exports_operations = {
- .open = exports_open,
+static int exports_proc_open(struct inode *inode, struct file *file)
+{
+ return exports_net_open(current->nsproxy->net_ns, file);
+}
+
+static const struct file_operations exports_proc_operations = {
+ .open = exports_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .owner = THIS_MODULE,
+};
+
+static int exports_nfsd_open(struct inode *inode, struct file *file)
+{
+ return exports_net_open(inode->i_sb->s_fs_info, file);
+}
+
+static const struct file_operations exports_nfsd_operations = {
+ .open = exports_nfsd_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
@@ -220,6 +238,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
struct sockaddr *sap = (struct sockaddr *)&address;
size_t salen = sizeof(address);
char *fo_path;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
/* sanity check */
if (size == 0)
@@ -232,7 +251,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
if (qword_get(&buf, fo_path, size) < 0)
return -EINVAL;
- if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)
+ if (rpc_pton(net, fo_path, size, sap, salen) == 0)
return -EINVAL;
return nlmsvc_unlock_all_by_ip(sap);
@@ -317,6 +336,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
int len;
struct auth_domain *dom;
struct knfsd_fh fh;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
if (size == 0)
return -EINVAL;
@@ -352,7 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
if (!dom)
return -ENOMEM;
- len = exp_rootfh(&init_net, dom, path, &fh, maxsize);
+ len = exp_rootfh(net, dom, path, &fh, maxsize);
auth_domain_put(dom);
if (len)
return len;
@@ -396,7 +416,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
int rv;
- struct net *net = &init_net;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
if (size > 0) {
int newthreads;
@@ -447,7 +467,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
int len;
int npools;
int *nthreads;
- struct net *net = &init_net;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
npools = nfsd_nrpools(net);
@@ -510,7 +530,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
unsigned minor;
ssize_t tlen = 0;
char *sep;
- struct net *net = &init_net;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (size>0) {
@@ -534,7 +554,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
else
num = simple_strtol(vers, &minorp, 0);
if (*minorp == '.') {
- if (num < 4)
+ if (num != 4)
return -EINVAL;
minor = simple_strtoul(minorp+1, NULL, 0);
if (minor == 0)
@@ -792,7 +812,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size,
static ssize_t write_ports(struct file *file, char *buf, size_t size)
{
ssize_t rv;
- struct net *net = &init_net;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
rv = __write_ports(file, buf, size, net);
@@ -827,7 +847,7 @@ int nfsd_max_blksize;
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
- struct net *net = &init_net;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (size > 0) {
@@ -923,7 +943,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
*/
static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
{
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
}
@@ -939,7 +960,8 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
*/
static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
{
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
}
@@ -995,7 +1017,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
{
ssize_t rv;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
mutex_lock(&nfsd_mutex);
rv = __write_recoverydir(file, buf, size, nn);
@@ -1013,7 +1036,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
{
static struct tree_descr nfsd_files[] = {
- [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+ [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
[NFSD_FO_UnlockIP] = {"unlock_ip",
@@ -1037,20 +1060,35 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
#endif
/* last one */ {""}
};
- return simple_fill_super(sb, 0x6e667364, nfsd_files);
+ struct net *net = data;
+ int ret;
+
+ ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
+ if (ret)
+ return ret;
+ sb->s_fs_info = get_net(net);
+ return 0;
}
static struct dentry *nfsd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_single(fs_type, flags, data, nfsd_fill_super);
+ return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+}
+
+static void nfsd_umount(struct super_block *sb)
+{
+ struct net *net = sb->s_fs_info;
+
+ kill_litter_super(sb);
+ put_net(net);
}
static struct file_system_type nfsd_fs_type = {
.owner = THIS_MODULE,
.name = "nfsd",
.mount = nfsd_mount,
- .kill_sb = kill_litter_super,
+ .kill_sb = nfsd_umount,
};
#ifdef CONFIG_PROC_FS
@@ -1061,7 +1099,8 @@ static int create_proc_exports_entry(void)
entry = proc_mkdir("fs/nfs", NULL);
if (!entry)
return -ENOMEM;
- entry = proc_create("exports", 0, entry, &exports_operations);
+ entry = proc_create("exports", 0, entry,
+ &exports_proc_operations);
if (!entry)
return -ENOMEM;
return 0;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index be7af509930..262df5ccbf5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -652,7 +652,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
/* Check whether we have this call in the cache. */
switch (nfsd_cache_lookup(rqstp)) {
- case RC_INTR:
case RC_DROPIT:
return 0;
case RC_REPLY:
@@ -703,8 +702,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
int ret;
- struct net *net = &init_net;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
mutex_lock(&nfsd_mutex);
if (nn->nfsd_serv == NULL) {
@@ -721,7 +719,7 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
int nfsd_pool_stats_release(struct inode *inode, struct file *file)
{
int ret = seq_release(inode, file);
- struct net *net = &init_net;
+ struct net *net = inode->i_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
/* this function really, really should have been called svc_put() */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 0889bfb43dc..546f8983ecf 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -563,7 +563,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 *buffer, int *countp,
+ struct dentry *dentry, __be32 **buffer, int countp,
u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
diff --git a/fs/open.c b/fs/open.c
index 62f907e3bc3..e3441f58d2e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
#include <linux/fs_struct.h>
#include <linux/ima.h>
#include <linux/dnotify.h>
+#include <linux/compat.h>
#include "internal.h"
@@ -140,6 +141,13 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
return do_sys_truncate(path, length);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
+{
+ return do_sys_truncate(path, length);
+}
+#endif
+
static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
@@ -195,6 +203,13 @@ SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
return ret;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+{
+ return do_sys_ftruncate(fd, length, 1);
+}
+#endif
+
/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ae6dbe828b..a698eff457f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,6 +15,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/compat.h>
#include "read_write.h"
#include <asm/uaccess.h>
@@ -247,6 +248,13 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
return retval;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
+{
+ return sys_lseek(fd, offset, whence);
+}
+#endif
+
#ifdef __ARCH_WANT_SYS_LLSEEK
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned long, offset_low, loff_t __user *, result,
@@ -278,7 +286,6 @@ out_putf:
}
#endif
-
/*
* rw_verify_area doesn't like huge counts. We limit
* them to something that fits in "int" so that others
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0e606b12a59..32b644f0369 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -383,10 +383,10 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
}
-#ifdef COMPAT
+#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
- const struct itimerspec __user *, utmr,
- struct itimerspec __user *, otmr)
+ const struct compat_itimerspec __user *, utmr,
+ struct compat_itimerspec __user *, otmr)
{
struct itimerspec new, old;
int ret;
@@ -402,12 +402,12 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
}
COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
- struct itimerspec __user *, otmr)
+ struct compat_itimerspec __user *, otmr)
{
struct itimerspec kotmr;
int ret = do_timerfd_gettime(ufd, &kotmr);
if (ret)
return ret;
- return put_compat_itimerspec(otmr, &t) ? -EFAULT: 0;
+ return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0;
}
#endif