aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Makefile4
-rw-r--r--fs/adfs/dir_f.c12
-rw-r--r--fs/affs/amigaffs.c27
-rw-r--r--fs/affs/file.c26
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/affs/super.c6
-rw-r--r--fs/afs/cmservice.c16
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/flock.c1
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/fsclient.c42
-rw-r--r--fs/afs/inode.c47
-rw-r--r--fs/afs/internal.h15
-rw-r--r--fs/afs/misc.c18
-rw-r--r--fs/afs/proc.c5
-rw-r--r--fs/afs/rotate.c2
-rw-r--r--fs/afs/rxrpc.c6
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/vl_list.c1
-rw-r--r--fs/afs/vl_probe.c82
-rw-r--r--fs/afs/vl_rotate.c7
-rw-r--r--fs/afs/vlclient.c24
-rw-r--r--fs/afs/write.c13
-rw-r--r--fs/afs/yfsclient.c50
-rw-r--r--fs/aio.c10
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/binfmt_flat.c20
-rw-r--r--fs/block_dev.c184
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/block-group.c70
-rw-r--r--fs/btrfs/btrfs_inode.h30
-rw-r--r--fs/btrfs/compression.c35
-rw-r--r--fs/btrfs/compression.h35
-rw-r--r--fs/btrfs/ctree.c212
-rw-r--r--fs/btrfs/ctree.h109
-rw-r--r--fs/btrfs/delalloc-space.c123
-rw-r--r--fs/btrfs/delayed-inode.c6
-rw-r--r--fs/btrfs/dev-replace.c118
-rw-r--r--fs/btrfs/disk-io.c173
-rw-r--r--fs/btrfs/disk-io.h9
-rw-r--r--fs/btrfs/extent-io-tree.h3
-rw-r--r--fs/btrfs/extent-tree.c244
-rw-r--r--fs/btrfs/extent_io.c224
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c318
-rw-r--r--fs/btrfs/free-space-cache.c25
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/inode.c817
-rw-r--r--fs/btrfs/ioctl.c96
-rw-r--r--fs/btrfs/locking.c45
-rw-r--r--fs/btrfs/locking.h78
-rw-r--r--fs/btrfs/ordered-data.c113
-rw-r--r--fs/btrfs/ordered-data.h24
-rw-r--r--fs/btrfs/print-tree.c50
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/reada.c30
-rw-r--r--fs/btrfs/reflink.c46
-rw-r--r--fs/btrfs/relocation.c11
-rw-r--r--fs/btrfs/root-tree.c13
-rw-r--r--fs/btrfs/scrub.c130
-rw-r--r--fs/btrfs/send.c365
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/space-info.c323
-rw-r--r--fs/btrfs/space-info.h2
-rw-r--r--fs/btrfs/struct-funcs.c10
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/btrfs/sysfs.c249
-rw-r--r--fs/btrfs/sysfs.h11
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c7
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-checker.c19
-rw-r--r--fs/btrfs/tree-log.c294
-rw-r--r--fs/btrfs/tree-log.h32
-rw-r--r--fs/btrfs/volumes.c429
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/buffer.c18
-rw-r--r--fs/ceph/caps.c14
-rw-r--r--fs/ceph/debugfs.c4
-rw-r--r--fs/ceph/dir.c33
-rw-r--r--fs/ceph/file.c7
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/quota.c4
-rw-r--r--fs/ceph/super.h73
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifssmb.c2
-rw-r--r--fs/cifs/connect.c12
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/compat.c132
-rw-r--r--fs/configfs/dir.c4
-rw-r--r--fs/crypto/crypto.c4
-rw-r--r--fs/crypto/fname.c60
-rw-r--r--fs/crypto/fscrypt_private.h10
-rw-r--r--fs/crypto/hooks.c80
-rw-r--r--fs/crypto/inline_crypt.c7
-rw-r--r--fs/crypto/keyring.c9
-rw-r--r--fs/crypto/keysetup.c182
-rw-r--r--fs/crypto/keysetup_v1.c8
-rw-r--r--fs/crypto/policy.c209
-rw-r--r--fs/d_path.c6
-rw-r--r--fs/dax.c15
-rw-r--r--fs/debugfs/file.c4
-rw-r--r--fs/direct-io.c88
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c66
-rw-r--r--fs/dlm/config.h4
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lowcomms.c329
-rw-r--r--fs/dlm/midcomms.c136
-rw-r--r--fs/dlm/midcomms.h3
-rw-r--r--fs/efivarfs/super.c3
-rw-r--r--fs/erofs/data.c2
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/zdata.c48
-rw-r--r--fs/erofs/zmap.c6
-rw-r--r--fs/eventpoll.c78
-rw-r--r--fs/exec.c138
-rw-r--r--fs/exfat/cache.c11
-rw-r--r--fs/exfat/exfat_fs.h3
-rw-r--r--fs/exfat/inode.c2
-rw-r--r--fs/exfat/namei.c13
-rw-r--r--fs/exfat/super.c5
-rw-r--r--fs/ext2/balloc.c6
-rw-r--r--fs/ext2/file.c6
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h6
-rw-r--r--fs/ext4/ialloc.c119
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/super.c16
-rw-r--r--fs/f2fs/data.c3
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/f2fs.h27
-rw-r--r--fs/f2fs/namei.c7
-rw-r--r--fs/f2fs/node.c7
-rw-r--r--fs/f2fs/segment.c8
-rw-r--r--fs/f2fs/super.c15
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/file.c2
-rw-r--r--fs/fs-writeback.c112
-rw-r--r--fs/fs_context.c2
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/fsopen.c2
-rw-r--r--fs/fuse/file.c25
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/gfs2/bmap.c4
-rw-r--r--fs/gfs2/log.c31
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/trans.c1
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/internal.h3
-rw-r--r--fs/io-wq.c221
-rw-r--r--fs/io-wq.h4
-rw-r--r--fs/io_uring.c2414
-rw-r--r--fs/iomap/buffered-io.c194
-rw-r--r--fs/iomap/direct-io.c49
-rw-r--r--fs/iomap/seek.c4
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/kernel_read_file.c189
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/locks.c6
-rw-r--r--fs/namei.c4
-rw-r--r--fs/namespace.c29
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c47
-rw-r--r--fs/nfs/fs_context.c217
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs42proc.c10
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4idmap.c4
-rw-r--r--fs/nfs/nfs4proc.c43
-rw-r--r--fs/nfs/nfs4state.c14
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/super.c9
-rw-r--r--fs/nfs_common/nfsacl.c2
-rw-r--r--fs/nfsd/blocklayout.c8
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c2
-rw-r--r--fs/nfsd/nfs4state.c14
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsproc.c2
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/nilfs2/bmap.c2
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/segment.c19
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c28
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/pipe.c73
-rw-r--r--fs/proc/base.c3
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c108
-rw-r--r--fs/pstore/zone.c1
-rw-r--r--fs/quota/Kconfig5
-rw-r--r--fs/quota/Makefile1
-rw-r--r--fs/quota/compat.c120
-rw-r--r--fs/quota/compat.h34
-rw-r--r--fs/quota/quota.c117
-rw-r--r--fs/quota/quota_v2.c1
-rw-r--r--fs/read_write.c370
-rw-r--r--fs/reiserfs/inode.c9
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--fs/seq_file.c2
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/splice.c85
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysfs/file.c55
-rw-r--r--fs/ubifs/dir.c40
-rw-r--r--fs/ubifs/lprops.c4
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c7
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c61
-rw-r--r--fs/udf/misc.c6
-rw-r--r--fs/udf/namei.c7
-rw-r--r--fs/udf/partition.c2
-rw-r--r--fs/udf/super.c47
-rw-r--r--fs/udf/symlink.c4
-rw-r--r--fs/udf/udf_i.h6
-rw-r--r--fs/ufs/util.h12
-rw-r--r--fs/vboxsf/dir.c2
-rw-r--r--fs/vboxsf/super.c4
-rw-r--r--fs/vboxsf/utils.c2
-rw-r--r--fs/xattr.c22
-rw-r--r--fs/xfs/kmem.c22
-rw-r--r--fs/xfs/kmem.h7
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr.c14
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c47
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h29
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h6
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c35
-rw-r--r--fs/xfs/libxfs/xfs_format.h211
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c65
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c130
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h15
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c8
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h1
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c6
-rw-r--r--fs/xfs/libxfs/xfs_shared.h3
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c21
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/scrub/agheader.c30
-rw-r--r--fs/xfs/scrub/agheader_repair.c24
-rw-r--r--fs/xfs/scrub/inode.c31
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_list.c6
-rw-r--r--fs/xfs/xfs_bmap_util.c18
-rw-r--r--fs/xfs/xfs_buf.c208
-rw-r--r--fs/xfs/xfs_buf.h17
-rw-r--r--fs/xfs/xfs_buf_item.c264
-rw-r--r--fs/xfs/xfs_buf_item.h12
-rw-r--r--fs/xfs/xfs_buf_item_recover.c2
-rw-r--r--fs/xfs/xfs_dquot.c66
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_file.c29
-rw-r--r--fs/xfs/xfs_icache.c19
-rw-r--r--fs/xfs/xfs_inode.c83
-rw-r--r--fs/xfs/xfs_inode.h38
-rw-r--r--fs/xfs/xfs_inode_item.c61
-rw-r--r--fs/xfs/xfs_inode_item.h5
-rw-r--r--fs/xfs/xfs_inode_item_recover.c76
-rw-r--r--fs/xfs/xfs_ioctl.c7
-rw-r--r--fs/xfs/xfs_log_recover.c60
-rw-r--r--fs/xfs/xfs_mount.c32
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_ondisk.h38
-rw-r--r--fs/xfs/xfs_qm.c13
-rw-r--r--fs/xfs/xfs_qm.h4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c18
-rw-r--r--fs/xfs/xfs_quota.h8
-rw-r--r--fs/xfs/xfs_rtalloc.c13
-rw-r--r--fs/xfs/xfs_super.c28
-rw-r--r--fs/xfs/xfs_trace.h29
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_dquot.c6
311 files changed, 8458 insertions, 6338 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 92cd1d80218d..6ecf863bfa2f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -213,7 +213,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fall through */
+ fallthrough;
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
inode = file_inode(vma->vm_file);
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 74df32be4c6a..e34fa20acf61 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (ret)
return ret;
- if (v9ses->cache)
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+ if (!v9ses->cache) {
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
+ }
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..7bb2a05fda1f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,8 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o init.o
+ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+ kernel_read_file.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 30d526fecc3f..05e963402e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -18,11 +18,11 @@ static inline unsigned int adfs_readval(unsigned char *p, int len)
switch (len) {
case 4: val |= p[3] << 24;
- /* fall through */
+ fallthrough;
case 3: val |= p[2] << 16;
- /* fall through */
+ fallthrough;
case 2: val |= p[1] << 8;
- /* fall through */
+ fallthrough;
default: val |= p[0];
}
return val;
@@ -32,11 +32,11 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val)
{
switch (len) {
case 4: p[3] = val >> 24;
- /* fall through */
+ fallthrough;
case 3: p[2] = val >> 16;
- /* fall through */
+ fallthrough;
case 2: p[1] = val >> 8;
- /* fall through */
+ fallthrough;
default: p[0] = val;
}
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index f708c45d5f66..29f11e10a7c7 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -420,24 +420,51 @@ affs_mode_to_prot(struct inode *inode)
u32 prot = AFFS_I(inode)->i_protect;
umode_t mode = inode->i_mode;
+ /*
+ * First, clear all RWED bits for owner, group, other.
+ * Then, recalculate them afresh.
+ *
+ * We'll always clear the delete-inhibit bit for the owner, as that is
+ * the classic single-user mode AmigaOS protection bit and we need to
+ * stay compatible with all scenarios.
+ *
+ * Since multi-user AmigaOS is an extension, we'll only set the
+ * delete-allow bit if any of the other bits in the same user class
+ * (group/other) are used.
+ */
+ prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD
+ | FIBF_NOWRITE | FIBF_NODELETE
+ | FIBF_GRP_EXECUTE | FIBF_GRP_READ
+ | FIBF_GRP_WRITE | FIBF_GRP_DELETE
+ | FIBF_OTR_EXECUTE | FIBF_OTR_READ
+ | FIBF_OTR_WRITE | FIBF_OTR_DELETE);
+
+ /* Classic single-user AmigaOS flags. These are inverted. */
if (!(mode & 0100))
prot |= FIBF_NOEXECUTE;
if (!(mode & 0400))
prot |= FIBF_NOREAD;
if (!(mode & 0200))
prot |= FIBF_NOWRITE;
+
+ /* Multi-user extended flags. Not inverted. */
if (mode & 0010)
prot |= FIBF_GRP_EXECUTE;
if (mode & 0040)
prot |= FIBF_GRP_READ;
if (mode & 0020)
prot |= FIBF_GRP_WRITE;
+ if (mode & 0070)
+ prot |= FIBF_GRP_DELETE;
+
if (mode & 0001)
prot |= FIBF_OTR_EXECUTE;
if (mode & 0004)
prot |= FIBF_OTR_READ;
if (mode & 0002)
prot |= FIBF_OTR_WRITE;
+ if (mode & 0007)
+ prot |= FIBF_OTR_DELETE;
AFFS_I(inode)->i_protect = prot;
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a26a0f96c119..d91b0133d95d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,6 +429,24 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int affs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
+ return ret;
+}
+
static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,affs_get_block);
@@ -438,7 +456,7 @@ const struct address_space_operations affs_aops = {
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
- .write_end = generic_write_end,
+ .write_end = affs_write_end,
.direct_IO = affs_direct_IO,
.bmap = _affs_bmap
};
@@ -795,6 +813,12 @@ done:
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
err_first_bh:
unlock_page(page);
put_page(page);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a346cf7659f1..044412110b52 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -93,7 +93,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
case ST_ROOT:
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;
- /* fall through */
+ fallthrough;
case ST_USERDIR:
if (be32_to_cpu(tail->stype) == ST_USERDIR ||
affs_test_opt(sbi->s_flags, SF_SETMODE)) {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 47107c6712a6..a100cd9950c8 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -474,7 +474,7 @@ got_root:
case MUFS_INTLFFS:
case MUFS_DCFFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall thru */
+ fallthrough;
case FS_INTLFFS:
case FS_DCFFS:
affs_set_opt(sbi->s_flags, SF_INTL);
@@ -486,7 +486,7 @@ got_root:
break;
case MUFS_OFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall through */
+ fallthrough;
case FS_OFS:
affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= SB_NOEXEC;
@@ -494,7 +494,7 @@ got_root:
case MUFS_DCOFS:
case MUFS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall through */
+ fallthrough;
case FS_DCOFS:
case FS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_INTL);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index bef413818af7..a4e9e6e07e93 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -252,7 +252,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the FID array and its count in two steps */
- /* fall through */
+ fallthrough;
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, true);
@@ -271,7 +271,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
afs_extract_to_buf(call, call->count * 3 * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract FID array");
ret = afs_extract_data(call, true);
@@ -297,7 +297,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the callback array and its count in two steps */
- /* fall through */
+ fallthrough;
case 3:
_debug("extract CB count");
ret = afs_extract_data(call, true);
@@ -312,7 +312,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract discard %zu/%u",
iov_iter_count(call->iter), call->count2 * 3 * 4);
@@ -391,7 +391,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract UUID");
ret = afs_extract_data(call, false);
@@ -503,7 +503,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract UUID");
ret = afs_extract_data(call, false);
@@ -618,7 +618,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the FID array and its count in two steps */
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, true);
@@ -637,7 +637,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract FID array");
ret = afs_extract_data(call, false);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6f6ed1605cfe..371d1488cc54 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -311,7 +311,7 @@ int afs_page_filler(void *data, struct page *page)
case -ENOBUFS:
_debug("cache said ENOBUFS");
- /* fall through */
+ fallthrough;
default:
go_on:
req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index ffb8575345ca..cb3054c7843e 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -376,7 +376,6 @@ again:
spin_unlock(&vnode->lock);
return;
- /* Fall through */
default:
/* Looks like a lock request was withdrawn. */
spin_unlock(&vnode->lock);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 5d9ef517cf81..e7e98ad63a91 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -161,8 +161,8 @@ responded:
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index acb4d0ca2649..1d95ed9dd86e 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -320,7 +320,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->tmp_u = htonl(0);
afs_extract_to_tmp(call);
}
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -348,7 +348,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->bvec[0].bv_page = req->pages[req->index];
iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -375,7 +375,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
call->unmarshall = 3;
- /* Fall through */
+ fallthrough;
case 3:
_debug("extract discard %zu/%llu",
@@ -388,7 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
no_more_data:
call->unmarshall = 4;
afs_extract_to_buf(call, (21 + 3 + 6) * 4);
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 4:
@@ -1343,7 +1343,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
case 0:
call->unmarshall++;
afs_extract_to_buf(call, 12 * 4);
- /* Fall through */
+ fallthrough;
/* extract the returned status record */
case 1:
@@ -1356,7 +1356,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs);
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* extract the volume name length */
case 2:
@@ -1371,7 +1371,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the volume name */
case 3:
@@ -1385,7 +1385,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("volname '%s'", p);
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message length */
case 4:
@@ -1400,7 +1400,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message */
case 5:
@@ -1415,7 +1415,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day length */
case 6:
@@ -1430,7 +1430,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day */
case 7:
@@ -1682,7 +1682,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the capabilities word count */
case 1:
@@ -1696,7 +1696,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
call->count2 = count;
afs_extract_discard(call, count * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract capabilities words */
case 2:
@@ -1776,7 +1776,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file status count and array in two steps */
case 1:
@@ -1794,7 +1794,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_counts:
afs_extract_to_buf(call, 21 * sizeof(__be32));
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract status array %u", call->count);
@@ -1824,7 +1824,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* Extract the callback count and array in two steps */
case 3:
@@ -1841,7 +1841,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_cbs:
afs_extract_to_buf(call, 3 * sizeof(__be32));
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract CB array");
@@ -1870,7 +1870,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
afs_extract_to_buf(call, 6 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
ret = afs_extract_data(call, false);
@@ -1974,7 +1974,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -1992,7 +1992,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
acl->size = call->count2;
afs_extract_begin(call, acl->data, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -2002,7 +2002,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
afs_extract_to_buf(call, (21 + 6) * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 3:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1d13d2e882ad..0fe8844b4bee 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
- struct inode *inode = &op->file[0].vnode->vfs_inode;
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
+ loff_t old_i_size = i_size_read(inode);
+
+ op->setattr.old_i_size = old_i_size;
+ afs_vnode_commit_status(op, vp);
+ /* inode->i_size has now been changed. */
+
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t size = op->setattr.attr->ia_size;
+ if (size > old_i_size)
+ pagecache_isize_extended(inode, old_i_size, size);
+ }
+}
+
+static void afs_setattr_edit_file(struct afs_operation *op)
+{
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
- afs_vnode_commit_status(op, &op->file[0]);
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
- loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
- if (size > i_size)
- pagecache_isize_extended(inode, i_size, size);
- truncate_pagecache(inode, size);
+ loff_t size = op->setattr.attr->ia_size;
+ loff_t i_size = op->setattr.old_i_size;
+
+ if (size < i_size)
+ truncate_pagecache(inode, size);
}
}
@@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = {
.issue_afs_rpc = afs_fs_setattr,
.issue_yfs_rpc = yfs_fs_setattr,
.success = afs_setattr_success,
+ .edit_dir = afs_setattr_edit_file,
};
/*
@@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ /* Prevent any new writebacks from starting whilst we do this. */
+ down_write(&vnode->validate_lock);
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
- if (IS_ERR(op))
- return PTR_ERR(op);
+ if (IS_ERR(op)) {
+ ret = PTR_ERR(op);
+ goto out_unlock;
+ }
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
@@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
- return afs_do_sync_operation(op);
+ ret = afs_do_sync_operation(op);
+
+out_unlock:
+ up_write(&vnode->validate_lock);
+ _leave(" = %d", ret);
+ return ret;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 792ac711985e..e5f0446f27e5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -401,22 +401,24 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */
#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
+#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
atomic_t usage;
+ unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT as ktime/64 */
+ unsigned int rtt; /* RTT in uS */
u32 abort_code;
short error;
- bool have_result;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
+ unsigned short flags;
+#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */
+#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */
+#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */
+#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
u16 port;
@@ -810,6 +812,7 @@ struct afs_operation {
} store;
struct {
struct iattr *attr;
+ loff_t old_i_size;
} setattr;
struct afs_acl *acl;
struct yfs_acl *yacl;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 5334f1bd2bca..1d1a8debe472 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -120,42 +120,42 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
if (e->error == -ETIMEDOUT ||
e->error == -ETIME)
return;
- /* Fall through */
+ fallthrough;
case -ETIMEDOUT:
case -ETIME:
if (e->error == -ENOMEM ||
e->error == -ENONET)
return;
- /* Fall through */
+ fallthrough;
case -ENOMEM:
case -ENONET:
if (e->error == -ERFKILL)
return;
- /* Fall through */
+ fallthrough;
case -ERFKILL:
if (e->error == -EADDRNOTAVAIL)
return;
- /* Fall through */
+ fallthrough;
case -EADDRNOTAVAIL:
if (e->error == -ENETUNREACH)
return;
- /* Fall through */
+ fallthrough;
case -ENETUNREACH:
if (e->error == -EHOSTUNREACH)
return;
- /* Fall through */
+ fallthrough;
case -EHOSTUNREACH:
if (e->error == -EHOSTDOWN)
return;
- /* Fall through */
+ fallthrough;
case -EHOSTDOWN:
if (e->error == -ECONNREFUSED)
return;
- /* Fall through */
+ fallthrough;
case -ECONNREFUSED:
if (e->error == -ECONNRESET)
return;
- /* Fall through */
+ fallthrough;
case -ECONNRESET: /* Responded, but call expired. */
if (e->responded)
return;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e817fc740ba0..e8babb62ed44 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -310,6 +310,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
alist->preferred == i ? '>' : '-',
&alist->addrs[i].transport);
}
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+ vlserver->probe.flags, vlserver->probe.error,
+ vlserver->probe.abort_code,
+ atomic_read(&vlserver->probe_outstanding));
return 0;
}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 6a0935cb822f..d83f13c44b92 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_operation *op)
case -ETIME:
if (op->error != -EDESTADDRREQ)
goto iterate_address;
- /* Fall through */
+ fallthrough;
case -ERFKILL:
case -EADDRNOTAVAIL:
case -ENETUNREACH:
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8fc8fb406a5a..8be709cb8542 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -568,7 +568,7 @@ static void afs_deliver_to_call(struct afs_call *call)
case -EIO:
pr_err("kAFS: Call %u in bad state %u\n",
call->debug_id, state);
- /* Fall through */
+ fallthrough;
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
@@ -669,7 +669,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
ret = call->ret0;
call->ret0 = 0;
- /* Fall through */
+ fallthrough;
case -ECONNABORTED:
ac->responded = true;
break;
@@ -872,7 +872,7 @@ void afs_send_empty_reply(struct afs_call *call)
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
RX_USER_ABORT, -ENOMEM, "KOO");
- /* Fall through */
+ fallthrough;
default:
_leave(" [error]");
return;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b552357b1d13..3a40ee752c1e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -456,7 +456,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 8fea54eba0c2..38b2ba1d9ec0 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -21,6 +21,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index e3aa013c2177..d1c7068b4346 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -11,15 +11,33 @@
#include "internal.h"
#include "protocol_yfs.h"
-static bool afs_vl_probe_done(struct afs_vlserver *server)
+
+/*
+ * Handle the completion of a set of probes.
+ */
+static void afs_finished_vl_probe(struct afs_vlserver *server)
{
- if (!atomic_dec_and_test(&server->probe_outstanding))
- return false;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
+ server->rtt = UINT_MAX;
+ clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ }
- wake_up_var(&server->probe_outstanding);
clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
- return true;
+}
+
+/*
+ * Handle the completion of a probe RPC call.
+ */
+static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
+{
+ if (atomic_dec_and_test(&server->probe_outstanding)) {
+ afs_finished_vl_probe(server);
+ wake_up = true;
+ }
+
+ if (wake_up)
+ wake_up_all(&server->probe_wq);
}
/*
@@ -45,15 +63,20 @@ void afs_vlserver_probe_result(struct afs_call *call)
server->probe.error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
server->probe.abort_code = call->abort_code;
server->probe.error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- server->probe.local_failure = true;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ case -EKEYEXPIRED:
+ case -EKEYREVOKED:
+ case -EKEYREJECTED:
+ server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE;
+ if (server->probe.error == 0)
+ server->probe.error = ret;
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
case -ERFKILL:
@@ -67,12 +90,12 @@ void afs_vlserver_probe_result(struct afs_call *call)
default:
clear_bit(index, &alist->responded);
set_bit(index, &alist->failed);
- if (!server->probe.responded &&
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
server->probe.error == -ETIME))
server->probe.error = ret;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
}
@@ -81,39 +104,36 @@ responded:
clear_bit(index, &alist->failed);
if (call->service_id == YFS_VL_SERVICE) {
- server->probe.is_yfs = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
alist->preferred = index;
- have_result = true;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED;
set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+ set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ have_result = true;
out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
server_index, index, &alist->addrs[index].transport, rtt_us, ret);
- have_result |= afs_vl_probe_done(server);
- if (have_result) {
- server->probe.have_result = true;
- wake_up_var(&server->probe.have_result);
- wake_up_all(&server->probe_wq);
- }
+ afs_done_one_vl_probe(server, have_result);
}
/*
@@ -151,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
in_progress = true;
} else {
afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_done_one_vl_probe(server, false);
}
}
- if (!in_progress)
- afs_vl_probe_done(server);
return in_progress;
}
@@ -193,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
{
struct wait_queue_entry *waits;
struct afs_vlserver *server;
- unsigned int rtt = UINT_MAX;
+ unsigned int rtt = UINT_MAX, rtt_s;
bool have_responders = false;
int pref = -1, i;
@@ -205,7 +224,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
server = vllist->servers[i].server;
if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
__clear_bit(i, &untried);
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
have_responders = true;
}
}
@@ -231,7 +250,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
goto stop;
if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
still_probing = true;
@@ -249,10 +268,11 @@ stop:
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded &&
- server->probe.rtt < rtt) {
+ rtt_s = READ_ONCE(server->rtt);
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) &&
+ rtt_s < rtt) {
pref = i;
- rtt = server->probe.rtt;
+ rtt = rtt_s;
}
remove_wait_queue(&server->probe_wq, &waits[i]);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index f405ca8b240a..c0458c903b31 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -192,7 +192,8 @@ pick_server:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) || !s->probe.responded)
+ if (!test_bit(i, &vc->untried) ||
+ !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
if (s->probe.rtt < rtt) {
vc->index = i;
@@ -262,10 +263,14 @@ no_more_servers:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
+ e.responded = true;
afs_prioritise_error(&e, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
+ error = e.error;
+
failed_set_error:
vc->error = error;
failed:
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index fd82850cd424..dc9327332f06 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -196,7 +196,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
/* Extract the returned uuid, uniquifier, nentries and
* blkaddrs size */
- /* Fall through */
+ fallthrough;
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -221,7 +221,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
count = min(call->count, 4U);
afs_extract_to_buf(call, count * sizeof(__be32));
- /* Fall through - and extract entries */
+ fallthrough; /* and extract entries */
case 2:
ret = afs_extract_data(call, call->count > 4);
if (ret < 0)
@@ -324,7 +324,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through - and extract the capabilities word count */
+ fallthrough; /* and extract the capabilities word count */
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -337,7 +337,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
call->unmarshall++;
afs_extract_discard(call, count * sizeof(__be32));
- /* Fall through - and extract capabilities words */
+ fallthrough; /* and extract capabilities words */
case 2:
ret = afs_extract_data(call, false);
if (ret < 0)
@@ -436,7 +436,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
/* Extract the returned uuid, uniquifier, fsEndpoints count and
* either the first fsEndpoint type or the volEndpoints
* count if there are no fsEndpoints. */
- /* Fall through */
+ fallthrough;
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -475,7 +475,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall = 2;
- /* Fall through - and extract fsEndpoints[] entries */
+ fallthrough; /* and extract fsEndpoints[] entries */
case 2:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -526,7 +526,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
* extract the type of the next endpoint when we extract the
* data of the current one, but this is the first...
*/
- /* Fall through */
+ fallthrough;
case 3:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -552,7 +552,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall = 4;
- /* Fall through - and extract volEndpoints[] entries */
+ fallthrough; /* and extract volEndpoints[] entries */
case 4:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -587,7 +587,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_discard(call, 0);
call->unmarshall = 5;
- /* Fall through - Done */
+ fallthrough; /* Done */
case 5:
ret = afs_extract_data(call, false);
if (ret < 0)
@@ -663,7 +663,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through - and extract the cell name length */
+ fallthrough; /* and extract the cell name length */
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -685,7 +685,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_begin(call, cell_name, namesz);
call->unmarshall++;
- /* Fall through - and extract cell name */
+ fallthrough; /* and extract cell name */
case 2:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -694,7 +694,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_discard(call, call->count2);
call->unmarshall++;
- /* Fall through - and extract padding */
+ fallthrough; /* and extract padding */
case 3:
ret = afs_extract_data(call, false);
if (ret < 0)
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a121c247d95a..da12abd6db21 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -609,7 +609,7 @@ no_more:
default:
pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- /* Fall through */
+ fallthrough;
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping,
int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
pgoff_t start, end, next;
int ret;
_enter("");
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ down_read(&vnode->validate_lock);
+ else if (!down_read_trylock(&vnode->validate_lock))
+ return 0;
+
if (wbc->range_cyclic) {
start = mapping->writeback_index;
end = -1;
@@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping,
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
+ up_read(&vnode->validate_lock);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 8c24fdc899e3..3b1239b7e90d 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -373,7 +373,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->offset = req->pos & (PAGE_SIZE - 1);
afs_extract_to_tmp64(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -401,7 +401,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
call->bvec[0].bv_page = req->pages[req->index];
iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -428,7 +428,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
call->unmarshall = 3;
- /* Fall through */
+ fallthrough;
case 3:
_debug("extract discard %zu/%llu",
@@ -444,7 +444,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
sizeof(struct yfs_xdr_YFSFetchStatus) +
sizeof(struct yfs_xdr_YFSCallBack) +
sizeof(struct yfs_xdr_YFSVolSync));
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 4:
@@ -461,7 +461,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->file_size = vp->scb.status.size;
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
break;
@@ -1262,7 +1262,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
case 0:
call->unmarshall++;
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
- /* Fall through */
+ fallthrough;
/* extract the returned status record */
case 1:
@@ -1275,7 +1275,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs);
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* extract the volume name length */
case 2:
@@ -1290,7 +1290,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the volume name */
case 3:
@@ -1304,7 +1304,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("volname '%s'", p);
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message length */
case 4:
@@ -1319,7 +1319,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message */
case 5:
@@ -1334,7 +1334,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day length */
case 6:
@@ -1349,7 +1349,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day */
case 7:
@@ -1363,7 +1363,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("motd '%s'", p);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 8:
break;
@@ -1622,7 +1622,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file status count and array in two steps */
case 1:
@@ -1640,7 +1640,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_counts:
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract status array %u", call->count);
@@ -1670,7 +1670,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* Extract the callback count and array in two steps */
case 3:
@@ -1687,7 +1687,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_cbs:
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract CB array");
@@ -1716,7 +1716,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
ret = afs_extract_data(call, false);
@@ -1727,7 +1727,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
xdr_decode_YFSVolSync(&bp, &op->volsync);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 6:
break;
@@ -1804,7 +1804,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file ACL length */
case 1:
@@ -1826,7 +1826,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_discard(call, size);
}
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file ACL */
case 2:
@@ -1836,7 +1836,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the volume ACL length */
case 3:
@@ -1858,7 +1858,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_discard(call, size);
}
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the volume ACL */
case 4:
@@ -1871,7 +1871,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
sizeof(struct yfs_xdr_YFSFetchStatus) +
sizeof(struct yfs_xdr_YFSVolSync));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 5:
@@ -1886,7 +1886,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
xdr_decode_YFSVolSync(&bp, &op->volsync);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 6:
break;
diff --git a/fs/aio.c b/fs/aio.c
index 5736bff48e9e..c45c20d87538 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
*iovec = NULL;
return ret;
}
-#ifdef CONFIG_COMPAT
- if (compat)
- return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
- iter);
-#endif
- return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
+
+ return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
@@ -1511,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
* may be already running. Just fail this IO with EINTR.
*/
ret = -EINTR;
- /*FALLTHRU*/
+ fallthrough;
default:
req->ki_complete(req, ret, 0);
}
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 74c886f7c51c..5ced859dac53 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = kernel_write(file, data, bytes, &file->f_pos);
+ wr = __kernel_write(file, data, bytes, NULL);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f2f9086ebe98..b9c658e0548e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- len = data_len + extra;
+ len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm,
vm_munmap(textpos, text_len);
goto err;
}
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(unsigned long),
+ FLAT_DATA_ALIGN);
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
data_len + bss_len + stack_len, datapos);
@@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
- len = text_len + data_len + extra;
+ len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm,
}
realdatastart = textpos + ntohl(hdr->data_start);
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(u32),
+ FLAT_DATA_ALIGN);
reloc = (__be32 __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
@@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm,
(text_len + full_data
- sizeof(struct flat_hdr)),
0);
- if (datapos != realdatastart)
- memmove((void *)datapos, (void *)realdatastart,
- full_data);
+ memmove((void *) datapos, (void *) realdatastart,
+ full_data);
#else
/*
* This is used on MMU systems mainly for testing.
@@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm,
if (IS_ERR_VALUE(result)) {
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
- vm_munmap(textpos, text_len + data_len + extra);
+ vm_munmap(textpos, text_len + data_len + extra +
+ MAX_SHARED_LIBS * sizeof(u32));
goto err;
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8ae833e00443..9e84b1928b94 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ struct block_device *claimed_bdev = NULL;
+ int err;
+
+ /*
+ * If we don't hold exclusive handle for the device, upgrade to it
+ * while we discard the buffer cache to avoid discarding buffers
+ * under live filesystem.
+ */
+ if (!(mode & FMODE_EXCL)) {
+ claimed_bdev = bdev->bd_contains;
+ err = bd_prepare_to_claim(bdev, claimed_bdev,
+ truncate_bdev_range);
+ if (err)
+ return err;
+ }
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (claimed_bdev)
+ bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ return 0;
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
static void set_init_blocksize(struct block_device *bdev)
{
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
@@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev)
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
+ spin_lock_init(&bdev->bd_size_lock);
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev)
return bdev;
}
-EXPORT_SYMBOL(bdget);
-
/**
* bdgrab -- Grab a reference to an already referenced block device
* @bdev: Block device to grab a reference to.
@@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev)
}
EXPORT_SYMBOL(bdgrab);
+struct block_device *bdget_part(struct hd_struct *part)
+{
+ return bdget(part_devt(part));
+}
+
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk,
{
loff_t disk_size, bdev_size;
+ spin_lock(&bdev->bd_size_lock);
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
@@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ }
+ spin_unlock(&bdev->bd_size_lock);
+
+ if (bdev_size > disk_size) {
+ if (__invalidate_device(bdev, false))
pr_warn("VFS: busy inodes on resized disk %s\n",
disk->disk_name);
}
- bdev->bd_invalidated = 0;
}
/**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
*
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
*/
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
- int ret = 0;
-
- if (disk->fops->revalidate_disk)
- ret = disk->fops->revalidate_disk(disk);
+ struct block_device *bdev;
/*
* Hidden disks don't have associated bdev so there's no point in
- * revalidating it.
+ * revalidating them.
*/
- if (!(disk->flags & GENHD_FL_HIDDEN)) {
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (!bdev)
- return ret;
+ if (disk->flags & GENHD_FL_HIDDEN)
+ return;
- mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev, ret == 0);
- mutex_unlock(&bdev->bd_mutex);
+ bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ check_disk_size_change(disk, bdev, verbose);
bdput(bdev);
}
- return ret;
}
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
- struct gendisk *disk = bdev->bd_disk;
- const struct block_device_operations *bdops = disk->fops;
- unsigned int events;
-
- events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return 0;
-
- if (__invalidate_device(bdev, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
- bdev->bd_invalidated = 1;
- if (bdops->revalidate_disk)
- bdops->revalidate_disk(bdev->bd_disk);
- return 1;
-}
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
- inode_lock(bdev->bd_inode);
- i_size_write(bdev->bd_inode, size);
- inode_unlock(bdev->bd_inode);
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ spin_unlock(&bdev->bd_size_lock);
}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
@@ -1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
+ clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+
rescan:
ret = blk_drop_partitions(bdev);
if (ret)
@@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
struct gendisk *disk;
int ret;
int partno;
- int perm = 0;
bool first_open = false, unblock_events = true, need_restart;
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- if (!for_part) {
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0)
- return ret;
- }
-
restart:
need_restart = false;
ret = -ENXIO;
@@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
}
if (!ret) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bd_set_nr_sectors(bdev, get_capacity(disk));
set_init_blocksize(bdev);
}
@@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
@@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
ret = -ENXIO;
goto out_clear;
}
- bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
@@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
@@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* RETURNS:
* 0 on success, -errno on failure.
*/
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- int res;
+ int ret, perm = 0;
- res =__blkdev_get(bdev, mode, holder, 0);
- if (res)
- bdput(bdev);
- return res;
+ if (mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret)
+ goto bdput;
+
+ ret =__blkdev_get(bdev, mode, holder, 0);
+ if (ret)
+ goto bdput;
+ return 0;
+
+bdput:
+ bdput(bdev);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get);
/**
* blkdev_get_by_path - open a block device by name
@@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode))
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
return -ETXTBSY;
if (!iov_iter_count(from))
@@ -1969,7 +1967,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
int error;
@@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EINVAL;
/* Invalidate the page cache, including dirty pages. */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ return error;
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
@@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
* the caller will be given -EBUSY. The third argument is
* inclusive, so the rounding here is safe.
*/
- return invalidate_inode_pages2_range(mapping,
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..68b95ad82126 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ea1c28ccb44f..b3268f4ea5f3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
while (!list_empty(&pending_edge)) {
struct btrfs_backref_node *upper;
struct btrfs_backref_node *lower;
- struct rb_node *rb_node;
edge = list_first_entry(&pending_edge,
struct btrfs_backref_edge, list[UPPER]);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 613920c17ac1..c0f1d6818df7 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
}
static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -1798,7 +1792,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
@@ -1874,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static int read_block_group_item(struct btrfs_block_group *cache,
+static void read_block_group_item(struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct btrfs_key *key)
{
@@ -1888,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache,
sizeof(bgi));
cache->used = btrfs_stack_block_group_used(&bgi);
cache->flags = btrfs_stack_block_group_flags(&bgi);
-
- return 0;
}
static int read_one_block_group(struct btrfs_fs_info *info,
@@ -1908,9 +1899,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- ret = read_block_group_item(cache, path, key);
- if (ret < 0)
- goto error;
+ read_block_group_item(cache, path, key);
+
+ set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2034,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2055,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -2096,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
return;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2110,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2132,6 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return -ENOMEM;
cache->length = size;
+ set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
@@ -2783,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
@@ -2959,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2992,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -3000,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3336,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->block_group_cache_lock);
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
-
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c47b6c6fea9f..92dd86bceae3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,14 +21,18 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -212,6 +216,11 @@ struct btrfs_inode {
struct inode vfs_inode;
};
+static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
+{
+ return inode->root->fs_info->sectorsize;
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -324,23 +333,6 @@ struct btrfs_dio_private {
u8 csums[];
};
-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1ab56a734e70..eeface30facd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -29,41 +29,6 @@
#include "extent_io.h"
#include "extent_map.h"
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
-void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
-
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
-void lzo_free_workspace(struct list_head *ws);
-
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
-
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9f3dbe372631..8001b700ea3a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 70e49d8d4f6c..113da62dc17f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type)
btrfs_csums[csum_type].name;
}
-size_t __const btrfs_get_num_csums(void)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
@@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1297,6 +1306,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1370,7 +1381,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1378,6 +1388,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
if (tm)
__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
@@ -1442,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1481,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1653,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1851,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1887,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1902,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2065,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2073,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2119,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2128,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2597,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2736,11 +2755,13 @@ again:
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2871,7 +2892,8 @@ cow_done:
} else {
if (!btrfs_tree_read_lock_atomic(b)) {
btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -3160,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother with the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3203,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3271,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3327,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3457,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3726,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3735,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3747,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3959,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -3970,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -3984,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4232,8 +4330,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4478,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4653,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4671,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4697,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4730,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
data_end -= data_size[i];
+ btrfs_set_token_item_offset(&token, item, data_end);
btrfs_set_token_item_size(&token, item, data_size[i]);
}
@@ -4773,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -5111,7 +5229,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5375,7 +5493,9 @@ again:
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5410,7 +5530,9 @@ again:
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c7e466f27a9..aac3d6f4e35b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,6 +374,7 @@ struct btrfs_path {
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_DONE = 2,
};
-void btrfs_init_async_reclaim_work(struct work_struct *work);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
struct reloc_control;
@@ -541,11 +542,6 @@ enum {
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -565,6 +561,19 @@ enum {
BTRFS_FS_DISCARD_RUNNING,
};
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -912,6 +921,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -935,6 +945,9 @@ struct btrfs_fs_info {
*/
int send_in_progress;
+ /* Type of exclusive operation running */
+ unsigned long exclusive_operation;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1181,24 +1194,40 @@ struct btrfs_root {
#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
@@ -2262,7 +2299,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __const btrfs_get_num_csums(void);
+size_t __attribute_const__ btrfs_get_num_csums(void);
/*
@@ -2518,13 +2555,14 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr);
+ u64 objectid, u64 offset, u64 bytenr, bool strict);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum {
*
* Can be interruped by fatal signal.
*/
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
/*
@@ -2619,7 +2659,7 @@ enum btrfs_flush_state {
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
@@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+ int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2930,11 +2969,15 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes);
+ u64 *ram_bytes, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
@@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
@@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0e354e9e57d0..bacee09b7bfd 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf1595a42a98..5aba81e16113 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- fs_info->nodesize, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
}
/* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ setup_items_for_insert(root, path, keys, data_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index db93909b25e0..4a0243cb9d97 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -64,10 +64,6 @@
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
@@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding) {
+ if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
@@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
-}
-
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ btrfs_rm_dev_replace_free_srcdev(src_device);
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+ return 0;
}
/*
@@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data)
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ae25f632157..8e3438672a82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
#endif
/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- return ret;
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
}
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
u64 found_start;
int found_level;
@@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ u8 val[BTRFS_CSUM_SIZE] = { 0 };
read_extent_buffer(eb, &val, 0, csum_size);
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
+ "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+ CSUM_FMT_VALUE(csum_size, val),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
goto err;
}
@@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info,
return 1;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(fs_info, BTRFS_I(inode));
@@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
-{
- return extent_read_full_page(page, btree_get_extent, 0);
-}
-
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
@@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page)
}
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
@@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
btrfs_put_root(root);
return ERR_CAST(leaf);
@@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *root;
while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
- btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
- root->root_key.objectid, root->root_key.offset,
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(root->root_key.objectid, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
+ IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
+ if (IS_ERR(tree_root->node)) {
handle_error = true;
+ ret = PTR_ERR(tree_root->node);
+ tree_root->node = NULL;
+ btrfs_warn(fs_info, "couldn't read tree root");
+ continue;
- if (IS_ERR(tree_root->node)) {
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
- ret = -EUCLEAN;
- }
-
- btrfs_warn(fs_info, "failed to read tree root");
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+ ret = -EIO;
+ btrfs_warn(fs_info, "error while reading tree root");
continue;
}
@@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+ btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type(disk_super);
@@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3418,6 +3357,8 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -3481,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_CAST(page);
super = page_address(page);
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -4055,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4551,6 +4497,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
@@ -4685,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
-
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 00dc39d47ed3..fee69ced58b4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 219a09a2b734..9800a8306368 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -40,6 +40,7 @@ struct io_failure_record;
enum {
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
@@ -48,6 +49,7 @@ enum {
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
@@ -61,7 +63,6 @@ struct extent_io_tree {
u8 owner;
spinlock_t lock;
- const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6fe176fdfb..3b21fee13e77 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -400,12 +400,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -414,12 +413,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -429,8 +427,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -1178,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
@@ -1398,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * on how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -2306,7 +2323,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
@@ -2348,9 +2366,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2375,7 +2397,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
@@ -2386,7 +2408,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
@@ -2845,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2931,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get something like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get something like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -2963,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -2972,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -2988,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
@@ -3005,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3079,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3103,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
@@ -3118,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
@@ -3165,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3914,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
+static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
@@ -3950,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
@@ -4199,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
@@ -4500,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
@@ -4522,8 +4679,8 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
- btrfs_tree_lock(buf);
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4569,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4585,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4602,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6def411b2eba..60f5f68d892d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
@@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
@@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (!btrfs_submit_read_repair(inode, bio, offset, page,
start - page_offset(page),
start, end, mirror,
- tree->ops->submit_bio_hook)) {
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page)
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page,
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
- end_bio_extent_readpage, mirror_num,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -4552,7 +4516,7 @@ next:
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
@@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4898,7 +4861,7 @@ out_free:
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e. unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5622,6 +5584,36 @@ unlock_exit:
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
offset = offset_in_page(start);
@@ -5655,9 +5643,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5677,7 +5665,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
offset = offset_in_page(start);
@@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *src = (char *)srcv;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
char *kaddr;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst,
char *kaddr;
unsigned long i = dst_offset >> PAGE_SHIFT;
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
+
WARN_ON(src->len != dst_len);
offset = offset_in_page(dst_offset);
@@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
dst_off_in_page = offset_in_page(dst_offset);
@@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 00a88f2eb5ab..f39d02e7f7ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- submit_bio_hook_t *submit_bio_hook;
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
-
-
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
@@ -102,7 +90,7 @@ struct extent_buffer {
int blocking_writers;
atomic_t blocking_readers;
- bool lock_nested;
+ bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num);
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
@@ -241,9 +232,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7d5ec71615b8..8f4f2bd6d9b9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
+ count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
+ disk_bytenr, csum, nblocks);
if (count)
goto found;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb824c7cb7c7..0ff659455b1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1057,11 +1057,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1571,7 +1564,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
if (!nowait)
@@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+ * 2.1. A worker thread from the last bio completed. In this
+ * case we need to mark the btrfs_dio_data that it is
+ * async in order to call generic_write_sync() properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+ * 2.2 The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges and races between logging and
- * completion of ordered extents for adjancent ranges - both races
- * could lead to file extent items in the log with overlapping ranges.
- * Do this while holding the inode lock, to avoid races with other
- * tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
+ (full_sync || list_empty(&ctx.ordered_extents)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
@@ -2286,6 +2335,12 @@ out:
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
return 0;
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
+ if (extent_info->disk_offset == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- clone_info->file_offset, clone_len);
+ extent_info->file_offset, replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0)
return 0;
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
@@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* returned by __btrfs_drop_extents() without having
* changed anything in the file.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (extent_info && !extent_info->is_new_extent &&
+ ret && ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
@@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
cur_offset = drop_end;
@@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!clone_info) {
+ if (!extent_info) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* than 16Mb would force the full fsync any way (when
* try_release_extent_mapping() is invoked during page cache truncation.
*/
- if (clone_info)
+ if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
if (ret) {
@@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
@@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
btrfs_free_path(path);
if (ret)
@@ -3044,7 +3119,7 @@ enum {
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode,
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode,
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
@@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ef0fd7afb0b1..af0013d3df63 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
@@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -1186,7 +1183,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
@@ -1347,13 +1343,14 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state);
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 8b1f5c8897b7..6b9faf3b0e96 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51fcd82d41c0..936c3137c646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,7 +6,6 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -31,6 +30,7 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -59,9 +59,10 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -1610,7 +1603,7 @@ next_slot:
goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
- extent_offset, disk_bytenr);
+ extent_offset, disk_bytenr, false);
if (ret) {
/*
* ret could be -EIO if the above fails to read
@@ -2161,11 +2154,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
struct inode *inode = private_data;
- blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
@@ -2186,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2248,16 +2237,15 @@ out:
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2360,7 +2348,7 @@ again:
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2551,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
}
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
@@ -2571,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
- return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
- &stack_fi, oe->qgroup_rsv);
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ oe->qgroup_rsv);
}
/*
@@ -2669,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_ordered_extent_file_extent(trans, inode,
- ordered_extent);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2686,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -2755,7 +2742,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(inode, ordered_extent);
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2775,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
@@ -2787,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -2836,9 +2823,8 @@ zeroit:
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror)
{
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
@@ -3058,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
@@ -3398,7 +3383,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -4054,7 +4038,7 @@ out_end_trans:
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (err) {
@@ -4586,7 +4570,7 @@ again:
&cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -4851,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5308,15 +5289,15 @@ static void inode_tree_add(struct inode *inode)
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -6314,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6377,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -6392,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -6543,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6572,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
@@ -6582,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -6595,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
*/
path->leave_spinning = 1;
+ path->recurse = btrfs_is_free_space_inode(inode);
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -6628,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
@@ -6646,12 +6621,11 @@ next:
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6702,10 +6676,8 @@ next:
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6729,29 +6701,28 @@ not_found:
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
- BUG_ON(!em); /* Error is always set */
return em;
}
@@ -6953,6 +6924,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks for (nowait == false) case.
@@ -6967,7 +6940,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+ u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
@@ -7045,8 +7018,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
* Do the same check as in btrfs_cross_ref_exist but without the
* unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
+ if (!strict &&
+ (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
backref_offset = btrfs_file_extent_offset(leaf, fi);
@@ -7082,7 +7056,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr);
+ key.offset - backref_offset, disk_bytenr,
+ strict);
if (ret) {
ret = 0;
goto out;
@@ -7110,7 +7085,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7159,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7248,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7303,7 +7255,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
+ &orig_block_len, &ram_bytes, false) == 1 &&
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
@@ -7332,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
@@ -7343,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
+
+ /*
+ * We used current->journal_info here to see if we were sync, but
+ * there's a lot of tests in the enospc machinery to not do flushing if
+ * we have a journal_info set, so we need to clear this out and re-set
+ * it in iomap_end.
+ */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
}
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->sync = sync;
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
+ }
+ iomap->private = dio_data;
+
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7432,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
}
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
free_extent_map(em);
@@ -7470,8 +7457,63 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, start,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
+ length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ /*
+ * We're all done, we can re-set the current->journal_info now safely
+ * for our endio.
+ */
+ if (dio_data->sync) {
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
+ kfree(dio_data);
+ iomap->private = NULL;
+
return ret;
}
@@ -7495,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}
- dio_end_io(dip->dio_bio);
+ bio_endio(dip->dio_bio);
kfree(dip);
}
@@ -7619,10 +7661,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
- blk_status_t ret;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7731,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
return dip;
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7765,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7773,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}
if (!write && csum) {
@@ -7845,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return;
+ return BLK_QC_T_NONE;
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -7889,37 +7919,59 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned flags)
+{
+ /*
+ * Now if we're still in the context of our submitter we know we can't
+ * safely run generic_write_sync(), so clear our flag here so that the
+ * caller knows to follow up with a sync.
+ */
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+ current->journal_info = NULL;
+ return error;
+ }
+
+ if (error)
+ return error;
+
+ if (size) {
+ iocb->ki_flags |= IOCB_DSYNC;
+ return generic_write_sync(iocb, size);
+ }
+
+ return 0;
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+static const struct iomap_dio_ops btrfs_sync_dops = {
+ .submit_io = btrfs_submit_direct,
+ .end_io = btrfs_maybe_fsync_end_io,
+};
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
if (check_direct_IO(fs_info, iter, offset))
return 0;
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
@@ -7927,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * We are actually a sync iocb, so we need our fancy endio to know
+ * if we need to sync.
+ */
+ if (current->journal_info)
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
+ else
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, offset, dio_data.reserve,
- true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(BTRFS_I(inode),
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
+
if (relock)
inode_lock(inode);
@@ -8003,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
- return extent_read_full_page(page, btrfs_get_extent, 0);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ unsigned long bio_flags = 0;
+ struct bio *bio = NULL;
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+ if (bio)
+ ret = submit_one_bio(bio, 0, bio_flags);
+ return ret;
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8092,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping,
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -8111,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
@@ -8121,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
@@ -8143,7 +8168,7 @@ again:
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8182,7 +8207,7 @@ again:
* bit of its io_tree, and free the qgroup reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8284,7 +8309,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8615,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8644,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
@@ -8652,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
- btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
- btrfs_put_root(BTRFS_I(inode)->root);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -8781,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -8943,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -8999,46 +9000,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -9106,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9260,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9307,23 +9264,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9389,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9429,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
+ if (*nr != U64_MAX) {
+ (*nr)--;
+ if (*nr == 0)
+ goto out;
+ }
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -9456,18 +9400,15 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &nr, true);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -9490,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
+ ret = start_delalloc_inodes(root, &nr, false);
btrfs_put_root(root);
if (ret < 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -9569,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -9634,11 +9569,15 @@ out_unlock:
return err;
}
-static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
struct inode *inode, struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int ret;
@@ -9655,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
if (ret < 0)
- return ret;
- return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
- &stack_fi, ret);
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
+ file_offset, &stack_fi, ret);
+ if (ret)
+ return ERR_PTR(ret);
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.qgroup_reserved = ret;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return trans;
}
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9681,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -9700,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -9717,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
@@ -9786,8 +9742,10 @@ next:
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
@@ -9866,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
@@ -10073,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
@@ -10136,7 +10093,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -10226,7 +10183,7 @@ out:
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -10284,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -10307,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd3511c5ca81..ab408a23ba32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags)
return 0;
}
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
@@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -742,7 +755,7 @@ fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
err = btrfs_commit_transaction(trans);
if (err && !ret)
@@ -856,7 +869,7 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -1306,7 +1319,7 @@ again:
break;
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
mnt_drop_write_file(file);
return ret;
}
@@ -2086,9 +2099,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and return -EFAULT there if there's a
+ * problem. Otherwise we'll fault in the pages and then copy the
+ * buffer in properly the next time through.
+ */
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2096,10 +2114,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * *sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2184,6 +2206,11 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
+ if (ret)
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -3112,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -3129,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3158,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -3169,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3200,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -3218,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3448,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3504,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3722,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -3937,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -3983,7 +4004,6 @@ again:
}
locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4038,10 +4058,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The exclusive operation
+ * should be finished after reset_balance_state.
*/
need_unlock = false;
@@ -4060,7 +4080,7 @@ out_bargs:
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
return ret;
@@ -4883,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f75612e18a82..66e02ebdd340 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,8 +57,8 @@
* performance reasons.
*
*
- * Lock nesting
- * ------------
+ * Lock recursion
+ * --------------
*
* A write operation on a tree might indirectly start a look up on the same
* tree. This can happen when btrfs_cow_block locks the tree and needs to
@@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
@@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
@@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*
* The rwlock is held upon exit.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse)
{
u64 start_ns = 0;
@@ -263,8 +264,9 @@ again:
* depends on this as it may be called on a partly
* (write-)locked tree.
*/
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
+ WARN_ON(!recurse);
+ BUG_ON(eb->lock_recursed);
+ eb->lock_recursed = true;
read_unlock(&eb->lock);
trace_btrfs_tree_read_lock(eb, start_ns);
return;
@@ -279,6 +281,11 @@ again:
trace_btrfs_tree_read_lock(eb, start_ns);
}
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
+
/*
* Lock extent buffer for read, optimistically expecting that there are no
* contending blocking writers. If there are, don't wait.
@@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*
* The rwlock is held for write upon exit.
*/
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -434,6 +441,11 @@ again:
trace_btrfs_tree_lock(eb, start_ns);
}
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
/*
* Release the write lock, either blocking or spinning (ie. there's no need
* for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
@@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d715846c10b8..3ea81ed3320b 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -16,11 +16,81 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
struct btrfs_path;
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
@@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse);
+
+static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ return __btrfs_read_lock_root_node(root, false);
+}
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ebac13389e7e..87bac9ecdf4c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
+ trace_btrfs_ordered_extent_add(inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -377,17 +378,16 @@ out:
* test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
* to make sure this function only returns 1 once for a given ordered extent.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -408,7 +408,7 @@ have_entry:
}
if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
@@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality: it should be set.
+ * If it isn't, we don't want to deref/assert under the spin
+ * lock, so check whether trans is set there and ASSERT() after
+ * unlocking, so that a developer will notice if it isn't set.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
@@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -775,17 +803,45 @@ out:
}
/*
+ * Adds all ordered extents not yet marked as logged to the given list. The list
+ * ends up sorted by file_offset; a reference is taken on each extent added.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,20 +859,21 @@ out:
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
return 0;
@@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr &&
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
+ i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
+ num_sectors = ordered_sum->len >> blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
@@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
break;
}
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d61ea9c880a3..c3a2325e64a4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,6 +56,12 @@ enum {
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
};
struct btrfs_ordered_extent {
@@ -104,6 +110,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* used for fast fsyncs */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
}
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
@@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 61f44e78e3c9..7695c4783d33 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,44 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(u64 objectid, char *buf)
+{
+ int i;
+
+ if (objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", objectid);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid);
+ return buf;
+}
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
@@ -95,9 +133,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -112,8 +151,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38fd75ad..78b99385a503 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(u64 objectid, char *buf);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c0f350c3a0cf..580899bdb991 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 243a2e44526e..9d4f5316a7e8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work)
kfree(rmw);
}
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
- int i;
+ struct btrfs_device *device;
-again:
do {
enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+ return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ int i;
+ u64 enqueued = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ enqueued += reada_start_for_fsdevs(fs_devices);
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ enqueued += reada_start_for_fsdevs(seed_devs);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
if (enqueued == 0)
return;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 5cd02514cf4d..99aa87c08912 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -45,7 +45,7 @@ out:
return ret;
}
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
@@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode,
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- file_offset, block_size);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
if (ret)
goto out;
- page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(inode->i_mapping));
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
set_page_extent_mapped(page);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
- 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -134,9 +134,9 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- file_offset, block_size, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
out:
extent_changeset_free(data_reserved);
@@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(dst, new_key->offset,
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
inline_data, size, datal,
comp_type);
goto out;
@@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -231,8 +231,8 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -439,7 +439,7 @@ process_slot:
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
+ struct btrfs_replace_extent_info clone_info;
/*
* a | --- range to clone ---| b
@@ -462,8 +462,8 @@ process_slot:
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path, drop_start,
+ clone_info.is_new_extent = false;
+ ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
if (ret)
@@ -520,6 +520,8 @@ process_slot:
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
ret = 0;
@@ -533,7 +535,7 @@ process_slot:
btrfs_release_path(path);
path->leave_spinning = 0;
- ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+ ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4ba1ab9cc76d..3602806d71bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1206,7 +1206,8 @@ again:
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1274,7 +1275,8 @@ again:
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
+ slot, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
if (ret < 0)
@@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index c89697486366..702dc5441f03 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a6cb9db512e..cf63f1e27a27 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
@@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
@@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
return 0;
}
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
+
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
+ }
+}
+
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
- lockdep_assert_held(&fs_info->scrub_lock);
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
- if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL);
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
- flags, is_dev_replace ? 1 : max_active, 4);
- if (!fs_info->scrub_workers)
- goto fail_scrub_workers;
-
- ASSERT(fs_info->scrub_wr_completion_workers == NULL);
- fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
- if (!fs_info->scrub_wr_completion_workers)
- goto fail_scrub_wr_completion_workers;
+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
+ is_dev_replace ? 1 : max_active, 4);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
- ASSERT(fs_info->scrub_parity_workers == NULL);
- fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_parity_workers)
- goto fail_scrub_parity_workers;
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ max_active, 2);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
- } else {
- refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
}
- return 0;
+ /* Another thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = 0;
+ btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
- return -ENOMEM;
+ return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
- goto out_free_ctx;
+ goto out;
}
if (!is_dev_replace && !readonly &&
@@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
ret = -EROFS;
- goto out_free_ctx;
+ goto out;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
- goto out_free_ctx;
+ goto out;
}
down_read(&fs_info->dev_replace.rwsem);
@@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
- goto out_free_ctx;
+ goto out;
}
up_read(&fs_info->dev_replace.rwsem);
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out_free_ctx;
- }
-
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
-
- fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
- }
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
return ret;
-
+out:
+ scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d9813a5b075a..340c76a12ce1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -122,8 +122,6 @@ struct send_ctx {
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -278,11 +276,6 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx)
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
}
/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- f1 (ino 257)
+ * |----- f2 (ino 258)
+ * |----- d1/ (ino 259)
+ * |----- d2/ (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- d1 (ino 258)
+ * |----- f2/ (ino 259)
+ * |----- f2_link/ (ino 260)
+ * | |----- f1 (ino 257)
+ * |
+ * |----- d2 (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on to the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists, it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
* This does all the move/link/unlink/rmdir magic.
*/
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
@@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the paths used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * than the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct name_cache_entry *nce;
struct waiting_dir_move *wdm;
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
@@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
}
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directory's inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inode's refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
if (ret < 0)
@@ -4799,7 +4911,25 @@ out:
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
+ int ret;
+
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
@@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
@@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5033,8 +5150,8 @@ out:
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
@@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
-
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, void *ctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx)
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7378,7 +7466,6 @@ out:
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f7034f..de91488b7cd0 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -13,7 +13,6 @@
#define BTRFS_SEND_STREAM_VERSION 1
#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 475968ccbd1d..64099565ab8f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
@@ -476,28 +465,6 @@ again:
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
- u64 async_pages;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly, so increase the amount to reclaim by 2x in order to
+ * make sure we're flushing enough delalloc to hopefully reclaim
+ * some metadata reservations.
+ */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ }
trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ btrfs_start_delalloc_roots(fs_info, items);
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -596,14 +551,6 @@ skip_async:
}
spin_unlock(&space_info->lock);
- loops++;
- if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- } else {
- time_left = schedule_timeout_killable(1);
- if (time_left)
- break;
- }
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
@@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes_needed;
u64 reclaim_bytes = 0;
+ u64 bytes_needed = 0;
u64 cur_free_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
+ if (ticket)
+ bytes_needed = ticket->bytes;
if (bytes_needed > cur_free_bytes)
bytes_needed -= cur_free_bytes;
@@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
goto commit;
/*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
+ * See if there is some space in the delayed insertion reserve for this
+ * reservation. If the space_info's don't match (like for DATA or
+ * SYSTEM) then just go enospc, reclaiming this space won't recover any
+ * space to satisfy those reservations.
*/
if (space_info != delayed_rsv->space_info)
goto enospc;
@@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ shrink_delalloc(fs_info, space_info, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
@@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * FLUSH_DELAYED_REFS
+ * The above two cases generate delayed refs that will affect
+ * ->total_bytes_pinned. However this counter can be inconsistent with
+ * reality if there are outstanding delayed refs. This is because we adjust
+ * the counter based solely on the current set of delayed refs and disregard
+ * any on-disk state which might include more refs. So for example, if we
+ * have an extent with 2 references, but we only drop 1, we'll see that there
+ * is a negative delayed ref count for the extent and assume that the space
+ * will be freed, and thus increase ->total_bytes_pinned.
+ *
+ * Running the delayed refs gives us the actual real view of what will be
+ * freed at the transaction commit time. This stage will not actually free
+ * space for us, it just makes sure that may_commit_transaction() has all of
+ * the information it needs to make the right decision.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by the previous
+ * two stages. We will not commit the transaction if we don't think we're
+ * likely to satisfy our request, which means if our current free space +
+ * total_bytes_pinned < reservation we will not commit. This is why the
+ * previous states are actually important, to make sure we know for sure
+ * whether committing the transaction will allow us to make progress.
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_WAIT,
+ RUN_DELAYED_IPUTS,
+ FLUSH_DELAYED_REFS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ int flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state]);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
} while (flush_state < states_nr);
}
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
@@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
@@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
@@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
@@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
- flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
return ret;
}
+
+/**
+ * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
+ * @fs_info: the filesystem
+ * @bytes: the number of bytes we need
+ * @flush: how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c3c64019950a..5646393b928c 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 079b059818e9..c46be27be700 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,16 +7,6 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
{
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e529ddb35b87..8840a4fa81eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -625,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -1870,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
@@ -2162,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, &fs_info->space_info, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2192,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c8df2edafd85..279d9262b676 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -14,6 +14,7 @@
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
@@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
NULL
};
@@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
BTRFS_ATTR(, checksum, btrfs_checksum_show);
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
NULL,
};
@@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
- btrfs_reset_fs_info_ptr(fs_info);
-
sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
@@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(fsid_kobj, btrfs_attrs);
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->devices_kobj)
- return -EINVAL;
-
- if (one_device) {
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
-
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- return 0;
+ if (device->bdev) {
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(devices_kobj, disk_kobj->name);
}
- list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
-
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
@@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
+ int ret;
unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
- nofs_flag = memalloc_nofs_save();
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
- if (one_device && one_device != dev)
- continue;
+ nofs_flag = memalloc_nofs_save();
- if (dev->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ if (device->bdev) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->devices_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
}
+ }
- init_completion(&dev->kobj_unregister);
- error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devinfo_kobj, "%llu",
- dev->devid);
- if (error) {
- kobject_put(&dev->devid_kobj);
- break;
- }
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
}
+
+out:
memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
- return error;
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
@@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index cf839c46a131..bacef43f7267 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,15 +13,12 @@ enum btrfs_feature_set {
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b5978e..df54cdfdc250 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, &key, &value_len, 1);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 894a63a92236..e6719f7db386 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
}
/*
@@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 20c6ac1a5de7..52ada47aff50 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -292,6 +292,8 @@ loop:
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -1636,6 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -2164,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(trans);
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d60b055b8695..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -85,6 +85,13 @@ struct btrfs_transaction {
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -105,6 +112,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
+#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 517b44300a05..f0ffd5ee77bd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf,
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
inode_item_err(leaf, slot,
- "invalid inode generation: has %llu expect [0, %llu]",
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
}
@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_root_item ri;
+ struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (ret < 0)
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
generic_err(leaf, slot,
- "invalid root item size, have %u expect %zu",
- btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
}
+ /*
+ * For legacy root item, the members starting at generation_v2 will be
+ * all filled with 0.
+ * And since we allow generation_v2 as 0, it will still pass the check.
+ */
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- sizeof(ri));
+ btrfs_item_size_nr(leaf, slot));
/* Generation related */
if (btrfs_root_generation(&ri) >
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 696dd861cc3c..56cbc1706b6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,6 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -3449,11 +3445,13 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
@@ -3613,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3632,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
if (min_key.objectid != ino || min_key.type != key_type)
goto done;
+
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
@@ -4080,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4092,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
@@ -4138,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
int ret;
int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4340,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -4357,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4432,8 +4483,32 @@ process:
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extent completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4839,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
+ ctx);
btrfs_add_delayed_iput(inode);
}
}
@@ -4881,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -4897,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5110,8 +5185,6 @@ next_key:
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
@@ -5290,7 +5363,7 @@ log_extents:
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5299,31 +5372,8 @@ log_extents:
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
@@ -5337,19 +5387,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5589,7 +5654,7 @@ process_leaf:
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
+ log_mode, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
@@ -5733,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ LOG_INODE_ALL, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
ret = 1;
@@ -5784,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation > last_committed)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5840,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation > fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5948,8 +6012,6 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -6002,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6098,15 +6160,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6369,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6403,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fall back to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..731bd9c029f5 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,8 +16,11 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ee96c5869f57..58b9c419a2b6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
@@ -290,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -317,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
@@ -355,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -405,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
@@ -432,7 +434,8 @@ static struct btrfs_device *__alloc_device(void)
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
return dev;
}
@@ -592,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path,
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -940,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_warn_in_rcu(device->fs_info,
- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
- "device fsid %pU devid %llu moved old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -1034,28 +1037,21 @@ error:
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ int step, struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
+ &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
@@ -1093,10 +1089,22 @@ again:
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1148,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
+ lockdep_assert_held(&uuid_mutex);
+
if (--fs_devices->opened > 0)
- return 0;
+ return;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = false;
-
- return 0;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1196,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
if (fs_devices->open_devices == 0)
return -EINVAL;
@@ -1960,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device(
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
@@ -1998,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
{
struct btrfs_super_block *disk_super;
int copy_num;
@@ -2039,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2143,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2164,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
+ list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
@@ -2220,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(fs_info, srcdev->bdev,
- srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2235,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2245,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
@@ -2265,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2374,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
@@ -2398,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
- fs_devices->seed = seed_devices;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2510,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2524,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2612,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
- /* add sysfs device entry */
- btrfs_sysfs_add_devices_dir(fs_devices, device);
-
/*
* we've got more storage, clear any full flags on the space
* infos
@@ -2622,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
@@ -2647,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_prepare_sprout
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2656,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2691,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
error_sysfs:
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2717,7 +2711,7 @@ error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -4044,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -4180,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4191,7 +4185,7 @@ out:
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4293,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4375,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -4462,6 +4456,7 @@ int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
@@ -4486,6 +4481,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4493,7 +4489,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -6459,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
bool seed)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
+
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
+ }
- while (fs_devices) {
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
@@ -6471,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
return device;
}
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6483,8 +6485,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation; however, we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6521,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
+ dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
@@ -6717,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6739,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6746,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
@@ -7178,17 +7188,22 @@ error:
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
- fs_devices = fs_devices->seed;
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7214,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
-int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7232,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
}
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7485,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7583,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
+ struct btrfs_fs_devices *devs;
+
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
+ struct btrfs_fs_devices, seed_list);
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5eea93916fbf..bf27ac07d315 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,7 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
@@ -246,7 +246,7 @@ struct btrfs_fs_devices {
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
+ struct list_head seed_list;
bool seeding;
int opened;
@@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
@@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
diff --git a/fs/buffer.c b/fs/buffer.c
index d468ed9981e0..5a28a6aa7f16 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1958,7 +1958,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
*/
set_buffer_new(bh);
set_buffer_unwritten(bh);
- /* FALLTHRU */
+ fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) ||
offset >= i_size_read(inode))
@@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
-#if 0
- /* Not really sure about this - do we need this ? */
- if (page->mapping->a_ops->invalidatepage)
- page->mapping->a_ops->invalidatepage(page, offset);
-#endif
unlock_page(page);
return 0; /* don't care */
}
@@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
- do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..034b3f4fdd3a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- cw.ino = inode->i_ino;
+ cw.ino = ceph_ino(inode);
cw.tgid = current->tgid;
cw.need = need;
cw.want = want;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 97539b497e4c..3e3fcda9b276 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
- seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+ seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
spin_lock(&mdsc->caps_list_lock);
list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
- seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
ceph_cap_string(cw->need),
ceph_cap_string(cw->want));
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 060bdcc5ce32..d72e4a12bb69 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb,
- d_inode(dentry)->i_ino),
+ dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
d_inode(dentry)->i_mode >> 12)) {
dput(dentry);
err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* always start with . and .. */
if (ctx->pos == 0) {
dout("readdir off 0 -> '.'\n");
- if (!dir_emit(ctx, ".", 1,
- ceph_translate_ino(inode->i_sb, inode->i_ino),
+ if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
}
if (ctx->pos == 1) {
- ino_t ino = parent_ino(file->f_path.dentry);
+ u64 ino;
+ struct dentry *dentry = file->f_path.dentry;
+
+ spin_lock(&dentry->d_lock);
+ ino = ceph_present_inode(dentry->d_parent->d_inode);
+ spin_unlock(&dentry->d_lock);
+
dout("readdir off 1 -> '..'\n");
- if (!dir_emit(ctx, "..", 2,
- ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12))
+ if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
}
@@ -507,9 +508,6 @@ more:
}
for (; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- struct ceph_vino vino;
- ino_t ino;
- u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -519,13 +517,10 @@ more:
rde->name_len, rde->name, &rde->inode.in);
BUG_ON(!rde->inode.in);
- ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
- vino.ino = le64_to_cpu(rde->inode.in->ino);
- vino.snap = le64_to_cpu(rde->inode.in->snapid);
- ino = ceph_vino_to_ino(vino);
if (!dir_emit(ctx, rde->name, rde->name_len,
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
+ ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+ le32_to_cpu(rde->inode.in->mode) >> 12)) {
dout("filldir stopping us...\n");
return 0;
}
@@ -1161,7 +1156,7 @@ retry:
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
- dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+ dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
@@ -1745,7 +1740,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
case -ENOENT:
if (d_really_is_negative(dentry))
valid = 1;
- /* Fallthrough */
+ fallthrough;
default:
break;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d51c3f2fdca0..3f4c993dfc6f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -252,7 +252,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFREG:
ceph_fscache_register_inode_cookie(inode);
ceph_fscache_file_set_cookie(inode, file);
- /* fall through */
+ fallthrough;
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
S_ISDIR(inode->i_mode));
@@ -630,8 +630,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
} else {
struct dentry *dn;
- dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
- vino.ino, dir->i_ino, dentry->d_name.name);
+ dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
@@ -2507,6 +2507,7 @@ const struct file_operations ceph_file_fops = {
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
+ .setlease = simple_nosetlease,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..d163fa96cb40 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
*/
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ ci->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
return 0;
}
@@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+ ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW)
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
return inode;
}
@@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
}
generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+ stat->ino = ceph_present_inode(inode);
/*
* btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bc9e95937d7c..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
struct cap_wait {
struct list_head list;
- unsigned long ino;
+ u64 ino;
pid_t tgid;
int need;
int want;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct super_block *sb = mdsc->fsc->sb;
+ struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
return true;
/* if root is the real CephFS root, we don't have quota realms */
- if (sb->s_root->d_inode &&
- (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+ if (root && ceph_ino(root) == CEPH_INO_ROOT)
return false;
/* otherwise, we can't know for sure */
return true;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c3c964b1c54..a3995ebe0623 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
return ceph_inode(inode)->i_vino;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * i_ino (kernel inode) st_ino (userspace)
- * i386 32 32
- * x86_64+ino32 64 32
- * x86_64 64 64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
{
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
@@ -475,34 +467,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
}
/*
- * kernel i_ino value
+ * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
+ * some arches. We generally do not use this value inside the ceph driver, but
+ * we do want to set it to something, so that generic vfs code has an
+ * appropriate value for tracepoints and the like.
*/
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
{
-#if BITS_PER_LONG == 32
- return ceph_ino_to_ino32(vino.ino);
-#else
+ if (sizeof(ino_t) == sizeof(u32))
+ return ceph_ino_to_ino32(vino.ino);
return (ino_t)vino.ino;
-#endif
-}
-
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
- ino = ceph_ino_to_ino32(ino);
- return ino;
}
-#endif
-
/* for printf-style formatting */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
@@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode)
{
return ceph_inode(inode)->i_vino.ino;
}
+
static inline u64 ceph_snap(struct inode *inode)
{
return ceph_inode(inode)->i_vino.snap;
}
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ return ceph_ino_to_ino32(ino);
+ return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+ return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
ci->i_vino.snap == pvino->snap;
}
+
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
+ /*
+ * NB: The hashval will be run through the fs/inode.c hash function
+ * anyway, so there is no need to squash the inode number down to
+ * 32-bits first. Just use low-order bits on arches with 32-bit long.
+ */
+ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b296964b8afa..b565d83ba89e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -2031,4 +2031,19 @@ static inline bool is_smb1_server(struct TCP_Server_Info *server)
return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0;
}
+static inline bool is_tcon_dfs(struct cifs_tcon *tcon)
+{
+ /*
+ * For SMB1, see MS-CIFS 2.4.55 SMB_COM_TREE_CONNECT_ANDX (0x75) and MS-CIFS 3.3.4.4 DFS
+ * Subsystem Notifies That a Share Is a DFS Share.
+ *
+ * For SMB2+, see MS-SMB2 2.2.10 SMB2 TREE_CONNECT Response and MS-SMB2 3.3.4.14 Server
+ * Application Updates a Share.
+ */
+ if (!tcon || !tcon->ses || !tcon->ses->server)
+ return false;
+ return is_smb1_server(tcon->ses->server) ? tcon->Flags & SMB_SHARE_IS_IN_DFS :
+ tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT);
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 0e763d2dcf16..0496934feecb 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -581,7 +581,7 @@ should_set_ext_sec_flag(enum securityEnum sectype)
if (global_secflags &
(CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
return true;
- /* Fallthrough */
+ fallthrough;
default:
return false;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a275ee399dce..a5731dd6e656 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1378,25 +1378,25 @@ static int cifs_parse_security_flavors(char *value,
return 1;
case Opt_sec_krb5i:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_sec_krb5:
vol->sectype = Kerberos;
break;
case Opt_sec_ntlmsspi:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_sec_ntlmssp:
vol->sectype = RawNTLMSSP;
break;
case Opt_sec_ntlmi:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_ntlm:
vol->sectype = NTLM;
break;
case Opt_sec_ntlmv2i:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_sec_ntlmv2:
vol->sectype = NTLMv2;
break;
@@ -2187,7 +2187,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->password = NULL;
break;
}
- /* Fallthrough - to Opt_pass below.*/
+ fallthrough; /* to Opt_pass below */
case Opt_pass:
/* Obtain the value string */
value = strchr(data, '=');
@@ -4909,7 +4909,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
if (!tcon)
continue;
/* Make sure that requests go through new root servers */
- if (tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT)) {
+ if (is_tcon_dfs(tcon)) {
put_root_ses(root_ses);
set_root_ses(cifs_sb, ses, &root_ses);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3989d08396ac..1f75b25e559a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1017,6 +1017,8 @@ handle_mnt_opt:
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true,
full_path, fid);
+ if (rc == -EREMOTE)
+ rc = 0;
if (rc) {
cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n",
__func__, rc);
@@ -1025,6 +1027,8 @@ handle_mnt_opt:
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
full_path, fid);
+ if (rc == -EREMOTE)
+ rc = 0;
if (rc) {
cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
__func__, rc);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 69cd5856621b..de564368a887 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -798,7 +798,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if ((server->sec_kerberos || server->sec_mskerberos) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
- /* Fallthrough */
+ fallthrough;
default:
return Unspecified;
}
@@ -815,7 +815,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
default:
break;
}
- /* Fallthrough - to attempt LANMAN authentication next */
+ fallthrough; /* to attempt LANMAN authentication next */
case CIFS_NEGFLAVOR_LANMAN:
switch (requested) {
case LANMAN:
@@ -823,7 +823,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
case Unspecified:
if (global_secflags & CIFSSEC_MAY_LANMAN)
return LANMAN;
- /* Fallthrough */
+ fallthrough;
default:
return Unspecified;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 32f90dc82c84..d44df8f95bcd 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rqst[1].rq_iov = si_iov;
rqst[1].rq_nvec = 1;
- len = sizeof(ea) + ea_name_len + ea_value_len + 1;
+ len = sizeof(*ea) + ea_name_len + ea_value_len + 1;
ea = kzalloc(len, GFP_KERNEL);
if (ea == NULL) {
rc = -ENOMEM;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 667d70aa335f..96c172d94fba 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1101,7 +1101,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if ((server->sec_kerberos || server->sec_mskerberos) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
- /* Fallthrough */
+ fallthrough;
default:
return Unspecified;
}
diff --git a/fs/compat.c b/fs/compat.c
deleted file mode 100644
index 436d228cf71c..000000000000
--- a/fs/compat.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/compat.c
- *
- * Kernel compatibililty routines for e.g. 32 bit syscall support
- * on 64 bit kernels.
- *
- * Copyright (C) 2002 Stephen Rothwell, IBM Corporation
- * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
- * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
- * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
- */
-
-#include <linux/compat.h>
-#include <linux/nfs4_mount.h>
-#include <linux/syscalls.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include "internal.h"
-
-struct compat_nfs_string {
- compat_uint_t len;
- compat_uptr_t data;
-};
-
-static inline void compat_nfs_string(struct nfs_string *dst,
- struct compat_nfs_string *src)
-{
- dst->data = compat_ptr(src->data);
- dst->len = src->len;
-}
-
-struct compat_nfs4_mount_data_v1 {
- compat_int_t version;
- compat_int_t flags;
- compat_int_t rsize;
- compat_int_t wsize;
- compat_int_t timeo;
- compat_int_t retrans;
- compat_int_t acregmin;
- compat_int_t acregmax;
- compat_int_t acdirmin;
- compat_int_t acdirmax;
- struct compat_nfs_string client_addr;
- struct compat_nfs_string mnt_path;
- struct compat_nfs_string hostname;
- compat_uint_t host_addrlen;
- compat_uptr_t host_addr;
- compat_int_t proto;
- compat_int_t auth_flavourlen;
- compat_uptr_t auth_flavours;
-};
-
-static int do_nfs4_super_data_conv(void *raw_data)
-{
- int version = *(compat_uint_t *) raw_data;
-
- if (version == 1) {
- struct compat_nfs4_mount_data_v1 *raw = raw_data;
- struct nfs4_mount_data *real = raw_data;
-
- /* copy the fields backwards */
- real->auth_flavours = compat_ptr(raw->auth_flavours);
- real->auth_flavourlen = raw->auth_flavourlen;
- real->proto = raw->proto;
- real->host_addr = compat_ptr(raw->host_addr);
- real->host_addrlen = raw->host_addrlen;
- compat_nfs_string(&real->hostname, &raw->hostname);
- compat_nfs_string(&real->mnt_path, &raw->mnt_path);
- compat_nfs_string(&real->client_addr, &raw->client_addr);
- real->acdirmax = raw->acdirmax;
- real->acdirmin = raw->acdirmin;
- real->acregmax = raw->acregmax;
- real->acregmin = raw->acregmin;
- real->retrans = raw->retrans;
- real->timeo = raw->timeo;
- real->wsize = raw->wsize;
- real->rsize = raw->rsize;
- real->flags = raw->flags;
- real->version = raw->version;
- }
-
- return 0;
-}
-
-#define NFS4_NAME "nfs4"
-
-COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
- const char __user *, dir_name,
- const char __user *, type, compat_ulong_t, flags,
- const void __user *, data)
-{
- char *kernel_type;
- void *options;
- char *kernel_dev;
- int retval;
-
- kernel_type = copy_mount_string(type);
- retval = PTR_ERR(kernel_type);
- if (IS_ERR(kernel_type))
- goto out;
-
- kernel_dev = copy_mount_string(dev_name);
- retval = PTR_ERR(kernel_dev);
- if (IS_ERR(kernel_dev))
- goto out1;
-
- options = copy_mount_options(data);
- retval = PTR_ERR(options);
- if (IS_ERR(options))
- goto out2;
-
- if (kernel_type && options) {
- if (!strcmp(kernel_type, NFS4_NAME)) {
- retval = -EINVAL;
- if (do_nfs4_super_data_conv(options))
- goto out3;
- }
- }
-
- retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
-
- out3:
- kfree(options);
- out2:
- kfree(kernel_dev);
- out1:
- kfree(kernel_type);
- out:
- return retval;
-}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index cb733652ecca..ca2273727225 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1688,11 +1688,11 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case 1:
offset += file->f_pos;
- /* fall through */
+ fallthrough;
case 0:
if (offset >= 0)
break;
- /* fall through */
+ fallthrough;
default:
return -EINVAL;
}
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 9212325763b0..4ef3f714046a 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- if (inode)
+ if (inode && inode->i_ino)
printk("%sfscrypt (%s, inode %lu): %pV\n",
level, inode->i_sb->s_id, inode->i_ino, &vaf);
+ else if (inode)
+ printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf);
else
printk("%sfscrypt: %pV\n", level, &vaf);
va_end(args);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 011830f84d8d..1fbe6c24d705 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -61,15 +61,6 @@ struct fscrypt_nokey_name {
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
-static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
-{
- struct sha256_state sctx;
-
- sha256_init(&sctx);
- sha256_update(&sctx, data, data_len);
- sha256_final(&sctx, result);
-}
-
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst)
return cp - dst;
}
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret)
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
- const struct fscrypt_info *ci = inode->i_crypt_info;
- int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) &
+ int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
u32 encrypted_len;
@@ -260,8 +251,6 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
- * @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
* @crypto_str: (output) buffer to allocate
@@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_alloc_buffer(const struct inode *inode,
- u32 max_encrypted_len,
+int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
struct fscrypt_str *crypto_str)
{
const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
@@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
} else {
memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
/* Compute strong hash of remaining part of name. */
- fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
- iname->len - sizeof(nokey_name.bytes),
- nokey_name.sha256);
+ sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
@@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
* directory's encryption key, then @iname is the plaintext, so we encrypt it to
* get the disk_name.
*
- * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
- * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
- * impossible in this case, so we fail them with ENOKEY.
+ * Else, for keyless @lookup operations, @iname should be a no-key name, so we
+ * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will
+ * be impossible in this case, so we fail them with ENOKEY.
*
* If successful, fscrypt_free_filename() must be called later to clean up.
*
@@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(dir, iname->len,
+ if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
+ iname->len,
dir->i_sb->s_cop->max_namelen,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
@@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
}
if (!lookup)
return -ENOKEY;
- fname->is_ciphertext_name = true;
+ fname->is_nokey_name = true;
/*
* We don't have the key and we are doing a lookup; decode the
@@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
{
const struct fscrypt_nokey_name *nokey_name =
(const void *)fname->crypto_buf.name;
- u8 sha256[SHA256_DIGEST_SIZE];
+ u8 digest[SHA256_DIGEST_SIZE];
if (likely(fname->disk_name.name)) {
if (de_name_len != fname->disk_name.len)
@@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
return false;
if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
return false;
- fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
- de_name_len - sizeof(nokey_name->bytes), sha256);
- return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
+ sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), digest);
+ return !memcmp(digest, nokey_name->sha256, sizeof(digest));
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);
@@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);
* Validate dentries in encrypted directories to make sure we aren't potentially
* caching stale dentries after a key has been added.
*/
-static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *dir;
int err;
@@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
/*
* Plaintext names are always valid, since fscrypt doesn't support
- * reverting to ciphertext names without evicting the directory's inode
+ * reverting to no-key names without evicting the directory's inode
* -- which implies eviction of the dentries in the directory.
*/
- if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+ if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
return 1;
/*
- * Ciphertext name; valid if the directory's key is still unavailable.
+ * No-key name; valid if the directory's key is still unavailable.
*
- * Although fscrypt forbids rename() on ciphertext names, we still must
- * use dget_parent() here rather than use ->d_parent directly. That's
+ * Although fscrypt forbids rename() on no-key names, we still must use
+ * dget_parent() here rather than use ->d_parent directly. That's
* because a corrupted fs image may contain directory hard links, which
* the VFS handles by moving the directory's dentry tree in the dcache
* each time ->lookup() finds the directory and it already has a dentry
@@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return valid;
}
+EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8117a61b6f55..4f5806a3b73d 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
return NULL;
}
-#undef fscrypt_policy
union fscrypt_policy {
u8 version;
struct fscrypt_policy_v1 v1;
@@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
/* fname.c */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret);
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
const struct fscrypt_master_key *mk);
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk);
+
/* keysetup_v1.c */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
@@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
const union fscrypt_context *ctx_u,
int ctx_size);
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir);
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 09fb8aa0f2e9..20b0df47fe6a 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
if (err)
return err;
- /* ... in case we looked up ciphertext name before key was added */
- if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name before key was added */
+ if (dentry->d_flags & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (!fscrypt_has_permitted_context(dir, inode))
@@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
return err;
- /* ... in case we looked up ciphertext name(s) before key was added */
- if ((old_dentry->d_flags | new_dentry->d_flags) &
- DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name(s) before key was added */
+ if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (old_dir != new_dir) {
@@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
- if (fname->is_ciphertext_name) {
+ if (fname->is_nokey_name) {
spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
+ dentry->d_flags |= DCACHE_NOKEY_NAME;
spin_unlock(&dentry->d_lock);
d_set_d_op(dentry, &fscrypt_d_ops);
}
@@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode,
return 0;
}
-int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
- unsigned int max_len,
- struct fscrypt_str *disk_link)
+/**
+ * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink
+ * @dir: directory in which the symlink is being created
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @max_len: space the filesystem has available to store the symlink target
+ * @disk_link: (out) the on-disk symlink target being prepared
+ *
+ * This function computes the size the symlink target will require on-disk,
+ * stores it in @disk_link->len, and validates it against @max_len. An
+ * encrypted symlink may be longer than the original.
+ *
+ * Additionally, @disk_link->name is set to @target if the symlink will be
+ * unencrypted, but left NULL if the symlink will be encrypted. For encrypted
+ * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
+ * on-disk target later. (The reason for the two-step process is that some
+ * filesystems need to know the size of the symlink target before creating the
+ * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
+ *
+ * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
+ * -ENOKEY if the encryption key is missing, or another -errno code if a problem
+ * occurred while setting up the encryption key.
+ */
+int fscrypt_prepare_symlink(struct inode *dir, const char *target,
+ unsigned int len, unsigned int max_len,
+ struct fscrypt_str *disk_link)
{
- int err;
+ const union fscrypt_policy *policy;
/*
* To calculate the size of the encrypted symlink target we need to know
* the amount of NUL padding, which is determined by the flags set in
* the encryption policy which will be inherited from the directory.
- * The easiest way to get access to this is to just load the directory's
- * fscrypt_info, since we'll need it to create the dir_entry anyway.
- *
- * Note: in test_dummy_encryption mode, @dir may be unencrypted.
*/
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- if (!fscrypt_has_encryption_key(dir))
- return -ENOKEY;
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL) {
+ /* Not encrypted */
+ disk_link->name = (unsigned char *)target;
+ disk_link->len = len + 1;
+ if (disk_link->len > max_len)
+ return -ENAMETOOLONG;
+ return 0;
+ }
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
/*
* Calculate the size of the encrypted symlink and verify it won't
@@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(dir, len,
+ if (!fscrypt_fname_encrypted_size(policy, len,
max_len - sizeof(struct fscrypt_symlink_data),
&disk_link->len))
return -ENAMETOOLONG;
@@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
disk_link->name = NULL;
return 0;
}
-EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink);
+EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
unsigned int len, struct fscrypt_str *disk_link)
@@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
struct fscrypt_symlink_data *sd;
unsigned int ciphertext_len;
- err = fscrypt_require_key(inode);
- if (err)
- return err;
+ /*
+ * fscrypt_prepare_new_inode() should have already set up the new
+ * symlink inode's encryption key. We don't wait until now to do it,
+ * since we may be in a filesystem transaction now.
+ */
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode)))
+ return -ENOKEY;
if (disk_link->name) {
/* filesystem-provided buffer */
@@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
if (cstr.len + sizeof(*sd) - 1 > max_size)
return ERR_PTR(-EUCLEAN);
- err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
if (err)
return ERR_PTR(err);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index faa25541ccb6..89bffa82ed74 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
crypto_cfg.data_unit_size = sb->s_blocksize;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
num_devs = fscrypt_get_num_devices(sb);
- devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS);
+ devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
if (!devs)
return -ENOMEM;
fscrypt_get_devices(sb, num_devs, devs);
@@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
struct fscrypt_blk_crypto_key *blk_key;
int err;
int i;
- unsigned int flags;
- blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS);
+ blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
if (!blk_key)
return -ENOMEM;
@@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
}
queue_refs++;
- flags = memalloc_nofs_save();
err = blk_crypto_start_using_key(&blk_key->base,
blk_key->devs[i]);
- memalloc_nofs_restore(flags);
if (err) {
fscrypt_err(inode,
"error %d starting to use blk-crypto", err);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index e74f239c4428..53cc552a7b8f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb,
struct list_head *pos;
size_t busy_count = 0;
unsigned long ino;
+ char ino_str[50] = "";
spin_lock(&mk->mk_decrypted_inodes_lock);
@@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb,
}
spin_unlock(&mk->mk_decrypted_inodes_lock);
+ /* If the inode is currently being created, ino may still be 0. */
+ if (ino)
+ snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino);
+
fscrypt_warn(NULL,
- "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu",
+ "%s: %zu inode(s) still busy after removing key with %s %*phN%s",
sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
- ino);
+ ino_str);
return -EBUSY;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index fea6226afc2b..d3c3e5d9b41f 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -10,6 +10,7 @@
#include <crypto/skcipher.h>
#include <linux/key.h>
+#include <linux/random.h>
#include "fscrypt_private.h"
@@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk)
+{
+ WARN_ON(ci->ci_inode->i_ino == 0);
+ WARN_ON(!mk->mk_ino_hash_key_initialized);
+
+ ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
+ &mk->mk_ino_hash_key);
+}
+
static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk)
{
@@ -254,13 +265,20 @@ unlock:
return err;
}
- ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
- &mk->mk_ino_hash_key);
+ /*
+ * New inodes may not have an inode number assigned yet.
+ * Hashing their inode number is delayed until later.
+ */
+ if (ci->ci_inode->i_ino == 0)
+ WARN_ON(!(ci->ci_inode->i_state & I_CREATING));
+ else
+ fscrypt_hash_inode_number(ci, mk);
return 0;
}
static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
- struct fscrypt_master_key *mk)
+ struct fscrypt_master_key *mk,
+ bool need_dirhash_key)
{
int err;
@@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) {
+ if (need_dirhash_key) {
err = fscrypt_derive_dirhash_key(ci, mk);
if (err)
return err;
@@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
+ bool need_dirhash_key,
struct key **master_key_ret)
{
struct key *key;
@@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
break;
case FSCRYPT_POLICY_V2:
- err = fscrypt_setup_v2_file_key(ci, mk);
+ err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
break;
default:
WARN_ON(1);
@@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci)
kmem_cache_free(fscrypt_info_cachep, ci);
}
-int fscrypt_get_encryption_info(struct inode *inode)
+static int
+fscrypt_setup_encryption_info(struct inode *inode,
+ const union fscrypt_policy *policy,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
+ bool need_dirhash_key)
{
struct fscrypt_info *crypt_info;
- union fscrypt_context ctx;
struct fscrypt_mode *mode;
struct key *master_key = NULL;
int res;
- if (fscrypt_has_encryption_key(inode))
- return 0;
-
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res < 0) {
- const union fscrypt_context *dummy_ctx =
- fscrypt_get_dummy_context(inode->i_sb);
-
- if (IS_ENCRYPTED(inode) || !dummy_ctx) {
- fscrypt_warn(inode,
- "Error %d getting encryption context",
- res);
- return res;
- }
- /* Fake up a context for an unencrypted directory */
- res = fscrypt_context_size(dummy_ctx);
- memcpy(&ctx, dummy_ctx, res);
- }
-
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
crypt_info->ci_inode = inode;
-
- res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res);
- if (res) {
- fscrypt_warn(inode,
- "Unrecognized or corrupt encryption context");
- goto out;
- }
-
- memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
- FSCRYPT_FILE_NONCE_SIZE);
-
- if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
- res = -EINVAL;
- goto out;
- }
+ crypt_info->ci_policy = *policy;
+ memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
mode = select_encryption_mode(&crypt_info->ci_policy, inode);
if (IS_ERR(mode)) {
@@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode)
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key,
+ &master_key);
if (res)
goto out;
/*
- * Multiple tasks may race to set ->i_crypt_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * For existing inodes, multiple tasks may race to set ->i_crypt_info.
+ * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
* fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
* RELEASE barrier so that other tasks can ACQUIRE it.
*/
@@ -550,14 +541,113 @@ out:
up_read(&mk->mk_secret_sem);
key_put(master_key);
}
+ put_crypt_info(crypt_info);
+ return res;
+}
+
+/**
+ * fscrypt_get_encryption_info() - set up an inode's encryption key
+ * @inode: the inode to set up the key for. Must be encrypted.
+ *
+ * Set up ->i_crypt_info, if it hasn't already been done.
+ *
+ * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * generally this shouldn't be called from within a filesystem transaction.
+ *
+ * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
+ * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * distinguish these cases.) Also can return another -errno code.
+ */
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ int res;
+ union fscrypt_context ctx;
+ union fscrypt_policy policy;
+
+ if (fscrypt_has_encryption_key(inode))
+ return 0;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ fscrypt_warn(inode, "Error %d getting encryption context", res);
+ return res;
+ }
+
+ res = fscrypt_policy_from_context(&policy, &ctx, res);
+ if (res) {
+ fscrypt_warn(inode,
+ "Unrecognized or corrupt encryption context");
+ return res;
+ }
+
+ if (!fscrypt_supported_policy(&policy, inode))
+ return -EINVAL;
+
+ res = fscrypt_setup_encryption_info(inode, &policy,
+ fscrypt_context_nonce(&ctx),
+ IS_CASEFOLDED(inode) &&
+ S_ISDIR(inode->i_mode));
if (res == -ENOKEY)
res = 0;
- put_crypt_info(crypt_info);
return res;
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
/**
+ * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
+ * @dir: a possibly-encrypted directory
+ * @inode: the new inode. ->i_mode must be set already.
+ * ->i_ino doesn't need to be set yet.
+ * @encrypt_ret: (output) set to %true if the new inode will be encrypted
+ *
+ * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * encrypting the name of the new file. Also, if the new inode will be
+ * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ *
+ * This isn't %GFP_NOFS-safe, and therefore it should be called before starting
+ * any filesystem transaction to create the inode. For this reason, ->i_ino
+ * isn't required to be set yet, as the filesystem may not have set it yet.
+ *
+ * This doesn't persist the new inode's encryption context. That still needs to
+ * be done later by calling fscrypt_set_context().
+ *
+ * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
+ * -errno code
+ */
+int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
+ bool *encrypt_ret)
+{
+ const union fscrypt_policy *policy;
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL)
+ return 0;
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
+
+ if (WARN_ON_ONCE(inode->i_mode == 0))
+ return -EINVAL;
+
+ /*
+ * Only regular files, directories, and symlinks are encrypted.
+ * Special files like device nodes and named pipes aren't.
+ */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ return 0;
+
+ *encrypt_ret = true;
+
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ return fscrypt_setup_encryption_info(inode, policy, nonce,
+ IS_CASEFOLDED(dir) &&
+ S_ISDIR(inode->i_mode));
+}
+EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
+
+/**
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index a3cb52572b05..2762c5350432 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key,
goto out;
}
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
if (!req) {
res = -ENOMEM;
goto out;
@@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix,
const struct user_key_payload *ukp;
const struct fscrypt_key *payload;
- description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ description = kasprintf(GFP_KERNEL, "%s%*phN", prefix,
FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
return ERR_PTR(-ENOMEM);
@@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
return dk;
/* Nope, allocate one. */
- dk = kzalloc(sizeof(*dk), GFP_NOFS);
+ dk = kzalloc(sizeof(*dk), GFP_KERNEL);
if (!dk)
return ERR_PTR(-ENOMEM);
refcount_set(&dk->dk_refcount, 1);
@@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
* This cannot be a stack buffer because it will be passed to the
* scatterlist crypto API during derive_key_aes().
*/
- derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+ derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL);
if (!derived_key)
return -ENOMEM;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 2d73fd39ad96..4441d9944b9e 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+static const union fscrypt_policy *
+fscrypt_get_dummy_policy(struct super_block *sb)
+{
+ if (!sb->s_cop->get_dummy_policy)
+ return NULL;
+ return sb->s_cop->get_dummy_policy(sb);
+}
+
static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
32, 32))
return false;
+ /*
+ * IV_INO_LBLK_32 hashes the inode number, so in principle it can
+ * support any ino_bits. However, currently the inode number is gotten
+ * from inode::i_ino which is 'unsigned long'. So for now the
+ * implementation limit is 32 bits.
+ */
if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- /* This uses hashed inode numbers, so ino_bits doesn't matter. */
!supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- INT_MAX, 32))
+ 32, 32))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
}
/**
- * fscrypt_new_context_from_policy() - create a new fscrypt_context from
- * an fscrypt_policy
+ * fscrypt_new_context() - create a new fscrypt_context
* @ctx_u: output context
* @policy_u: input policy
+ * @nonce: nonce to use
*
* Create an fscrypt_context for an inode that is being assigned the given
- * encryption policy. A new nonce is randomly generated.
+ * encryption policy. @nonce must be a new random nonce.
*
* Return: the size of the new context in bytes.
*/
-static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
- const union fscrypt_policy *policy_u)
+static int fscrypt_new_context(union fscrypt_context *ctx_u,
+ const union fscrypt_policy *policy_u,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE])
{
memset(ctx_u, 0, sizeof(*ctx_u));
@@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_descriptor,
policy->master_key_descriptor,
sizeof(ctx->master_key_descriptor));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
case FSCRYPT_POLICY_V2: {
@@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
}
@@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
static int set_encryption_policy(struct inode *inode,
const union fscrypt_policy *policy)
{
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
union fscrypt_context ctx;
int ctxsize;
int err;
@@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode,
return -EINVAL;
}
- ctxsize = fscrypt_new_context_from_policy(&ctx, policy);
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ ctxsize = fscrypt_new_context(&ctx, policy, nonce);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}
@@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
+/*
+ * Return the encryption policy that new files in the directory will inherit, or
+ * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also
+ * ensure that its key is set up, so that the new filename can be encrypted.
+ */
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
+{
+ int err;
+
+ if (IS_ENCRYPTED(dir)) {
+ err = fscrypt_require_key(dir);
+ if (err)
+ return ERR_PTR(err);
+ return &dir->i_crypt_info->ci_policy;
+ }
+
+ return fscrypt_get_dummy_policy(dir->i_sb);
+}
+
/**
- * fscrypt_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- * @fs_data: private data given by FS.
- * @preload: preload child i_crypt_info if true
+ * fscrypt_set_context() - Set the fscrypt context of a new inode
+ * @inode: a new inode
+ * @fs_data: private data given by FS and passed to ->set_context()
+ *
+ * This should be called after fscrypt_prepare_new_inode(), generally during a
+ * filesystem transaction. Everything here must be %GFP_NOFS-safe.
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_inherit_context(struct inode *parent, struct inode *child,
- void *fs_data, bool preload)
+int fscrypt_set_context(struct inode *inode, void *fs_data)
{
+ struct fscrypt_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
- struct fscrypt_info *ci;
- int res;
-
- res = fscrypt_get_encryption_info(parent);
- if (res < 0)
- return res;
- ci = fscrypt_get_info(parent);
- if (ci == NULL)
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
return -ENOKEY;
- ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy);
-
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data);
- if (res)
- return res;
- return preload ? fscrypt_get_encryption_info(child): 0;
+ ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+
+ /*
+ * This may be the first time the inode number is available, so do any
+ * delayed key setup that requires the inode number.
+ */
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ const struct fscrypt_master_key *mk =
+ ci->ci_master_key->payload.data[0];
+
+ fscrypt_hash_inode_number(ci, mk);
+ }
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
-EXPORT_SYMBOL(fscrypt_inherit_context);
+EXPORT_SYMBOL_GPL(fscrypt_set_context);
/**
* fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
* @sb: the filesystem on which test_dummy_encryption is being specified
- * @arg: the argument to the test_dummy_encryption option.
- * If no argument was specified, then @arg->from == NULL.
- * @dummy_ctx: the filesystem's current dummy context (input/output, see below)
+ * @arg: the argument to the test_dummy_encryption option. May be NULL.
+ * @dummy_policy: the filesystem's current dummy policy (input/output, see
+ * below)
*
* Handle the test_dummy_encryption mount option by creating a dummy encryption
- * context, saving it in @dummy_ctx, and adding the corresponding dummy
- * encryption key to the filesystem. If the @dummy_ctx is already set, then
+ * policy, saving it in @dummy_policy, and adding the corresponding dummy
+ * encryption key to the filesystem. If the @dummy_policy is already set, then
* instead validate that it matches @arg. Don't support changing it via
* remount, as that is difficult to do safely.
*
- * The reason we use an fscrypt_context rather than an fscrypt_policy is because
- * we mustn't generate a new nonce each time we access a dummy-encrypted
- * directory, as that would change the way filenames are encrypted.
- *
- * Return: 0 on success (dummy context set, or the same context is already set);
- * -EEXIST if a different dummy context is already set;
+ * Return: 0 on success (dummy policy set, or the same policy is already set);
+ * -EEXIST if a different dummy policy is already set;
* or another -errno value.
*/
-int fscrypt_set_test_dummy_encryption(struct super_block *sb,
- const substring_t *arg,
- struct fscrypt_dummy_context *dummy_ctx)
+int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
+ struct fscrypt_dummy_policy *dummy_policy)
{
- const char *argstr = "v2";
- const char *argstr_to_free = NULL;
struct fscrypt_key_specifier key_spec = { 0 };
int version;
- union fscrypt_context *ctx = NULL;
+ union fscrypt_policy *policy = NULL;
int err;
- if (arg->from) {
- argstr = argstr_to_free = match_strdup(arg);
- if (!argstr)
- return -ENOMEM;
- }
+ if (!arg)
+ arg = "v2";
- if (!strcmp(argstr, "v1")) {
- version = FSCRYPT_CONTEXT_V1;
+ if (!strcmp(arg, "v1")) {
+ version = FSCRYPT_POLICY_V1;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
memset(key_spec.u.descriptor, 0x42,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
- } else if (!strcmp(argstr, "v2")) {
- version = FSCRYPT_CONTEXT_V2;
+ } else if (!strcmp(arg, "v2")) {
+ version = FSCRYPT_POLICY_V2;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
/* key_spec.u.identifier gets filled in when adding the key */
} else {
@@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
goto out;
}
- if (dummy_ctx->ctx) {
- /*
- * Note: if we ever make test_dummy_encryption support
- * specifying other encryption settings, such as the encryption
- * modes, we'll need to compare those settings here.
- */
- if (dummy_ctx->ctx->version == version)
- err = 0;
- else
- err = -EEXIST;
- goto out;
- }
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
+ policy = kzalloc(sizeof(*policy), GFP_KERNEL);
+ if (!policy) {
err = -ENOMEM;
goto out;
}
@@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
if (err)
goto out;
- ctx->version = version;
- switch (ctx->version) {
- case FSCRYPT_CONTEXT_V1:
- ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor,
+ policy->version = version;
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
break;
- case FSCRYPT_CONTEXT_V2:
- ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier,
+ case FSCRYPT_POLICY_V2:
+ policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v2.master_key_identifier, key_spec.u.identifier,
FSCRYPT_KEY_IDENTIFIER_SIZE);
break;
default:
@@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
err = -EINVAL;
goto out;
}
- dummy_ctx->ctx = ctx;
- ctx = NULL;
+
+ if (dummy_policy->policy) {
+ if (fscrypt_policies_equal(policy, dummy_policy->policy))
+ err = 0;
+ else
+ err = -EEXIST;
+ goto out;
+ }
+ dummy_policy->policy = policy;
+ policy = NULL;
err = 0;
out:
- kfree(ctx);
- kfree(argstr_to_free);
+ kfree(policy);
return err;
}
EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
@@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
struct super_block *sb)
{
- const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb);
+ const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb);
+ int vers;
- if (!ctx)
+ if (!policy)
return;
- seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version);
+
+ vers = policy->version;
+ if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */
+ vers = 1;
+
+ seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers);
}
EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);
diff --git a/fs/d_path.c b/fs/d_path.c
index 0f1fc1743302..a69e2cd36e6e 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -102,6 +102,8 @@ restart:
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
struct mount *parent = READ_ONCE(mnt->mnt_parent);
+ struct mnt_namespace *mnt_ns;
+
/* Escaped? */
if (dentry != vfsmnt->mnt_root) {
bptr = *buffer;
@@ -116,7 +118,9 @@ restart:
vfsmnt = &mnt->mnt;
continue;
}
- if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns))
+ mnt_ns = READ_ONCE(mnt->mnt_ns);
+ /* open-coded is_mounted() to use local mnt_ns */
+ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
error = 1; // absolute root
else
error = 2; // detached or not attached yet
diff --git a/fs/dax.c b/fs/dax.c
index 95341af1a966..6ad346352a8c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1037,18 +1037,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
-int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
- struct iomap *iomap)
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
pgoff_t pgoff;
long rc, id;
void *kaddr;
bool page_aligned = false;
-
+ unsigned offset = offset_in_page(pos);
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
- IS_ALIGNED(size, PAGE_SIZE))
+ (size == PAGE_SIZE))
page_aligned = true;
rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
@@ -1058,8 +1058,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
id = dax_read_lock();
if (page_aligned)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff,
- size >> PAGE_SHIFT);
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
@@ -1072,7 +1071,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
dax_flush(iomap->dax_dev, kaddr + offset, size);
}
dax_read_unlock(id);
- return 0;
+ return size;
}
static loff_t
@@ -1367,7 +1366,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret = dax_load_hole(&xas, mapping, &entry, vmf);
goto finish_iomap;
}
- /*FALLTHRU*/
+ fallthrough;
default:
WARN_ON_ONCE(1);
error = -EIO;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index b167d2d02148..a768a09430c3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -177,7 +177,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
goto out;
if (!fops_get(real_fops)) {
-#ifdef MODULE
+#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING)
goto out;
@@ -312,7 +312,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
goto out;
if (!fops_get(real_fops)) {
-#ifdef MODULE
+#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING)
goto out;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 183299892465..d53fa92a1ab6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
-/**
- * dio_end_io - handle the end io action for the given bio
- * @bio: The direct io bio thats being completed
- *
- * This is meant to be called by any filesystem that uses their own dio_submit_t
- * so that the DIO specific endio actions are dealt with after the filesystem
- * has done it's completion work.
- */
-void dio_end_io(struct bio *bio)
-{
- struct dio *dio = bio->bi_private;
-
- if (dio->is_async)
- dio_bio_end_aio(bio);
- else
- dio_bio_end_io(bio);
-}
-EXPORT_SYMBOL_GPL(dio_end_io);
-
static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
@@ -1165,22 +1146,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* the early prefetch in the caller enough time.
*/
- if (align & blocksize_mask) {
- if (bdev)
- blkbits = blksize_bits(bdev_logical_block_size(bdev));
- blocksize_mask = (1 << blkbits) - 1;
- if (align & blocksize_mask)
- goto out;
- }
-
/* watch out for a 0 len io from a tricksy fs */
if (iov_iter_rw(iter) == READ && !count)
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
- retval = -ENOMEM;
if (!dio)
- goto out;
+ return -ENOMEM;
/*
* Believe it or not, zeroing out the page array caused a .5%
* performance regression in a database benchmark. So, we take
@@ -1189,32 +1161,32 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
memset(dio, 0, offsetof(struct dio, pages));
dio->flags = flags;
- if (dio->flags & DIO_LOCKING) {
- if (iov_iter_rw(iter) == READ) {
- struct address_space *mapping =
- iocb->ki_filp->f_mapping;
-
- /* will be released by direct_io_worker */
- inode_lock(inode);
-
- retval = filemap_write_and_wait_range(mapping, offset,
- end - 1);
- if (retval) {
- inode_unlock(inode);
- kmem_cache_free(dio_cache, dio);
- goto out;
- }
- }
+ if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
+ /* will be released by direct_io_worker */
+ inode_lock(inode);
}
/* Once we sampled i_size check for reads beyond EOF */
dio->i_size = i_size_read(inode);
if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
- if (dio->flags & DIO_LOCKING)
- inode_unlock(inode);
- kmem_cache_free(dio_cache, dio);
retval = 0;
- goto out;
+ goto fail_dio;
+ }
+
+ if (align & blocksize_mask) {
+ if (bdev)
+ blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ blocksize_mask = (1 << blkbits) - 1;
+ if (align & blocksize_mask)
+ goto fail_dio;
+ }
+
+ if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+ if (retval)
+ goto fail_dio;
}
/*
@@ -1258,14 +1230,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
retval = sb_init_dio_done_wq(dio->inode->i_sb);
}
- if (retval) {
- /*
- * We grab i_mutex only for reads so we don't have
- * to release it here
- */
- kmem_cache_free(dio_cache, dio);
- goto out;
- }
+ if (retval)
+ goto fail_dio;
}
/*
@@ -1368,7 +1334,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else
BUG_ON(retval != -EIOCBQUEUED);
-out:
+ return retval;
+
+fail_dio:
+ if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ)
+ inode_unlock(inode);
+
+ kmem_cache_free(dio_cache, dio);
return retval;
}
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..ee92634196a8 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -4,6 +4,7 @@ menuconfig DLM
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
+ select SRCU
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 47f0b98b707f..49c5f9407098 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item,
CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
- int *info_field, int check_zero,
+ int *info_field, bool (*check_cb)(unsigned int x),
const char *buf, size_t len)
{
unsigned int x;
@@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
if (rc)
return rc;
- if (check_zero && !x)
+ if (check_cb && check_cb(x))
return -EINVAL;
*cl_field = x;
@@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
return len;
}
-#define CLUSTER_ATTR(name, check_zero) \
+#define CLUSTER_ATTR(name, check_cb) \
static ssize_t cluster_##name##_store(struct config_item *item, \
const char *buf, size_t len) \
{ \
struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
- check_zero, buf, len); \
+ check_cb, buf, len); \
} \
static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
@@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
} \
CONFIGFS_ATTR(cluster_, name);
-CLUSTER_ATTR(tcp_port, 1);
-CLUSTER_ATTR(buffer_size, 1);
-CLUSTER_ATTR(rsbtbl_size, 1);
-CLUSTER_ATTR(recover_timer, 1);
-CLUSTER_ATTR(toss_secs, 1);
-CLUSTER_ATTR(scan_secs, 1);
-CLUSTER_ATTR(log_debug, 0);
-CLUSTER_ATTR(log_info, 0);
-CLUSTER_ATTR(protocol, 0);
-CLUSTER_ATTR(mark, 0);
-CLUSTER_ATTR(timewarn_cs, 1);
-CLUSTER_ATTR(waitwarn_us, 0);
-CLUSTER_ATTR(new_rsb_count, 0);
-CLUSTER_ATTR(recover_callbacks, 0);
+static bool dlm_check_zero(unsigned int x)
+{
+ return !x;
+}
+
+static bool dlm_check_buffer_size(unsigned int x)
+{
+ return (x < DEFAULT_BUFFER_SIZE);
+}
+
+CLUSTER_ATTR(tcp_port, dlm_check_zero);
+CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
+CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
+CLUSTER_ATTR(recover_timer, dlm_check_zero);
+CLUSTER_ATTR(toss_secs, dlm_check_zero);
+CLUSTER_ATTR(scan_secs, dlm_check_zero);
+CLUSTER_ATTR(log_debug, NULL);
+CLUSTER_ATTR(log_info, NULL);
+CLUSTER_ATTR(protocol, NULL);
+CLUSTER_ATTR(mark, NULL);
+CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
+CLUSTER_ATTR(waitwarn_us, NULL);
+CLUSTER_ATTR(new_rsb_count, NULL);
+CLUSTER_ATTR(recover_callbacks, NULL);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
@@ -221,6 +231,7 @@ struct dlm_space {
struct list_head members;
struct mutex members_lock;
int members_count;
+ struct dlm_nodes *nds;
};
struct dlm_comms {
@@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
sp->members_count = 0;
+ sp->nds = nds;
return &sp->group;
fail:
@@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
static void release_space(struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
+ kfree(sp->nds);
kfree(sp);
}
@@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq)
return 0;
}
-int dlm_comm_mark(int nodeid, unsigned int *mark)
+void dlm_comm_mark(int nodeid, unsigned int *mark)
{
struct dlm_comm *cm;
cm = get_comm(nodeid);
- if (!cm)
- return -ENOENT;
+ if (!cm) {
+ *mark = dlm_config.ci_mark;
+ return;
+ }
- *mark = cm->mark;
- put_comm(cm);
+ if (cm->mark)
+ *mark = cm->mark;
+ else
+ *mark = dlm_config.ci_mark;
- return 0;
+ put_comm(cm);
}
int dlm_our_nodeid(void)
@@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
/* Config file defaults */
#define DEFAULT_TCP_PORT 21064
-#define DEFAULT_BUFFER_SIZE 4096
#define DEFAULT_RSBTBL_SIZE 1024
#define DEFAULT_RECOVER_TIMER 5
#define DEFAULT_TOSS_SECS 10
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f62996cad561..c210250a2581 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -12,6 +12,8 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
+#define DEFAULT_BUFFER_SIZE 4096
+
struct dlm_config_node {
int nodeid;
int weight;
@@ -46,7 +48,7 @@ void dlm_config_exit(void);
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out);
int dlm_comm_seq(int nodeid, uint32_t *seq);
-int dlm_comm_mark(int nodeid, unsigned int *mark);
+void dlm_comm_mark(int nodeid, unsigned int *mark);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 18d81599522f..002123efc6b0 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5817,7 +5817,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
break;
case -EAGAIN:
error = 0;
- /* fall through */
+ fallthrough;
default:
__put_lkb(ls, lkb);
goto out;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5050fe05769b..79f56f16bc2c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -65,40 +65,6 @@
#define MAX_SEND_MSG_COUNT 25
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-struct cbuf {
- unsigned int base;
- unsigned int len;
- unsigned int mask;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
- cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
- return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
- cb->base = cb->len = 0;
- cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
- cb->len -= n;
- cb->base += n;
- cb->base &= cb->mask;
-}
-
-static bool cbuf_empty(struct cbuf *cb)
-{
- return cb->len == 0;
-}
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
@@ -117,8 +83,6 @@ struct connection {
int (*rx_action) (struct connection *); /* What to do when active */
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
- struct page *rx_page;
- struct cbuf cb;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -126,6 +90,10 @@ struct connection {
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
+ unsigned char *rx_buf;
+ int rx_buflen;
+ int rx_leftover;
+ struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
-static DEFINE_MUTEX(connections_lock);
-static struct kmem_cache *con_cache;
+static DEFINE_SPINLOCK(connections_lock);
+DEFINE_STATIC_SRCU(connections_srcu);
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
@@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid)
static struct connection *__find_con(int nodeid)
{
- int r;
+ int r, idx;
struct connection *con;
r = nodeid_hash(nodeid);
- hlist_for_each_entry(con, &connection_hash[r], list) {
- if (con->nodeid == nodeid)
+ idx = srcu_read_lock(&connections_srcu);
+ hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
+ if (con->nodeid == nodeid) {
+ srcu_read_unlock(&connections_srcu, idx);
return con;
+ }
}
+ srcu_read_unlock(&connections_srcu, idx);
+
return NULL;
}
@@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid)
* If 'allocation' is zero then we don't attempt to create a new
* connection structure for this node.
*/
-static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
+static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
- struct connection *con = NULL;
+ struct connection *con, *tmp;
int r;
con = __find_con(nodeid);
if (con || !alloc)
return con;
- con = kmem_cache_zalloc(con_cache, alloc);
+ con = kzalloc(sizeof(*con), alloc);
if (!con)
return NULL;
- r = nodeid_hash(nodeid);
- hlist_add_head(&con->list, &connection_hash[r]);
+ con->rx_buflen = dlm_config.ci_buffer_size;
+ con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
+ if (!con->rx_buf) {
+ kfree(con);
+ return NULL;
+ }
con->nodeid = nodeid;
mutex_init(&con->sock_mutex);
@@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
con->rx_action = zerocon->rx_action;
}
+ r = nodeid_hash(nodeid);
+
+ spin_lock(&connections_lock);
+ /* Because multiple workqueues/threads call this function, it can
+ * race on multiple CPUs. Instead of locking the hot path __find_con(),
+ * we check again, under protection of connections_lock, for the rare
+ * case of a recently added node. If one is found we abort our
+ * connection creation and return the existing connection.
+ */
+ tmp = __find_con(nodeid);
+ if (tmp) {
+ spin_unlock(&connections_lock);
+ kfree(con->rx_buf);
+ kfree(con);
+ return tmp;
+ }
+
+ hlist_add_head_rcu(&con->list, &connection_hash[r]);
+ spin_unlock(&connections_lock);
+
return con;
}
/* Loop round all connections */
static void foreach_conn(void (*conn_func)(struct connection *c))
{
- int i;
- struct hlist_node *n;
+ int i, idx;
struct connection *con;
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list)
conn_func(con);
}
-}
-
-static struct connection *nodeid2con(int nodeid, gfp_t allocation)
-{
- struct connection *con;
-
- mutex_lock(&connections_lock);
- con = __nodeid2con(nodeid, allocation);
- mutex_unlock(&connections_lock);
-
- return con;
+ srcu_read_unlock(&connections_srcu, idx);
}
static struct dlm_node_addr *find_node_addr(int nodeid)
@@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other,
/* Will only re-enter once. */
close_connection(con->othercon, false, true, true);
}
- if (con->rx_page) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
- }
+ con->rx_leftover = 0;
con->retries = 0;
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
@@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con)
shutdown_connection(con);
}
+static int con_realloc_receive_buf(struct connection *con, int newlen)
+{
+ unsigned char *newbuf;
+
+ newbuf = kmalloc(newlen, GFP_NOFS);
+ if (!newbuf)
+ return -ENOMEM;
+
+ /* copy any leftover from last receive */
+ if (con->rx_leftover)
+ memmove(newbuf, con->rx_buf, con->rx_leftover);
+
+ /* swap to new buffer space */
+ kfree(con->rx_buf);
+ con->rx_buflen = newlen;
+ con->rx_buf = newbuf;
+
+ return 0;
+}
+
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
- int ret = 0;
- struct msghdr msg = {};
- struct kvec iov[2];
- unsigned len;
- int r;
int call_again_soon = 0;
- int nvec;
+ struct msghdr msg;
+ struct kvec iov;
+ int ret, buflen;
mutex_lock(&con->sock_mutex);
@@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con)
ret = -EAGAIN;
goto out_close;
}
+
if (con->nodeid == 0) {
ret = -EINVAL;
goto out_close;
}
- if (con->rx_page == NULL) {
- /*
- * This doesn't need to be atomic, but I think it should
- * improve performance if it is.
- */
- con->rx_page = alloc_page(GFP_ATOMIC);
- if (con->rx_page == NULL)
+ /* realloc if we get new buffer size to read out */
+ buflen = dlm_config.ci_buffer_size;
+ if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
+ ret = con_realloc_receive_buf(con, buflen);
+ if (ret < 0)
goto out_resched;
- cbuf_init(&con->cb, PAGE_SIZE);
}
- /*
- * iov[0] is the bit of the circular buffer between the current end
- * point (cb.base + cb.len) and the end of the buffer.
- */
- iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
- iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
- iov[1].iov_len = 0;
- nvec = 1;
-
- /*
- * iov[1] is the bit of the circular buffer between the start of the
- * buffer and the start of the currently used section (cb.base)
+ /* calculate the new buffer parameters, taking into account the
+ * leftover bytes from the last receive
*/
- if (cbuf_data(&con->cb) >= con->cb.base) {
- iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
- iov[1].iov_len = con->cb.base;
- iov[1].iov_base = page_address(con->rx_page);
- nvec = 2;
- }
- len = iov[0].iov_len + iov[1].iov_len;
- iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len);
+ iov.iov_base = con->rx_buf + con->rx_leftover;
+ iov.iov_len = con->rx_buflen - con->rx_leftover;
- r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL);
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
if (ret <= 0)
goto out_close;
- else if (ret == len)
+ else if (ret == iov.iov_len)
call_again_soon = 1;
- cbuf_add(&con->cb, ret);
- ret = dlm_process_incoming_buffer(con->nodeid,
- page_address(con->rx_page),
- con->cb.base, con->cb.len,
- PAGE_SIZE);
- if (ret < 0) {
- log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d",
- ret, page_address(con->rx_page), con->cb.base,
- con->cb.len, r);
- cbuf_eat(&con->cb, r);
- } else {
- cbuf_eat(&con->cb, ret);
- }
+ /* new buflen: bytes just read plus leftover from the last receive */
+ buflen = ret + con->rx_leftover;
+ ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
+ if (ret < 0)
+ goto out_close;
- if (cbuf_empty(&con->cb) && !call_again_soon) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
+ /* calculate the leftover bytes after processing and move them to the
+ * beginning of the receive buffer, so that on the next receive the
+ * full message is at the start address of the receive buffer.
+ */
+ con->rx_leftover = buflen - ret;
+ if (con->rx_leftover) {
+ memmove(con->rx_buf, con->rx_buf + ret,
+ con->rx_leftover);
+ call_again_soon = true;
}
if (call_again_soon)
goto out_resched;
+
mutex_unlock(&con->sock_mutex);
return 0;
@@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con)
int nodeid;
struct connection *newcon;
struct connection *addcon;
+ unsigned int mark;
- mutex_lock(&connections_lock);
if (!dlm_allow_conn) {
- mutex_unlock(&connections_lock);
return -1;
}
- mutex_unlock(&connections_lock);
mutex_lock_nested(&con->sock_mutex, 0);
@@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con)
return -1;
}
+ dlm_comm_mark(nodeid, &mark);
+ sock_set_mark(newsock->sk, mark);
+
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This
@@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con)
struct connection *othercon = newcon->othercon;
if (!othercon) {
- othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
+ othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
mutex_unlock(&newcon->sock_mutex);
result = -ENOMEM;
goto accept_err;
}
+
+ othercon->rx_buflen = dlm_config.ci_buffer_size;
+ othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS);
+ if (!othercon->rx_buf) {
+ mutex_unlock(&newcon->sock_mutex);
+ kfree(othercon);
+ log_print("failed to allocate incoming socket receive buffer");
+ result = -ENOMEM;
+ goto accept_err;
+ }
+
othercon->nodeid = nodeid;
othercon->rx_action = receive_from_sock;
mutex_init(&othercon->sock_mutex);
@@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
/* Some odd races can cause double-connects, ignore them */
@@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con)
if (result < 0)
goto socket_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto bind_err;
-
sock_set_mark(sock->sk, mark);
con->rx_action = receive_from_sock;
@@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
if (con->retries++ > MAX_CONNECT_RETRIES)
goto out;
@@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con)
if (result < 0)
goto out_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto out_err;
-
sock_set_mark(sock->sk, mark);
memset(&saddr, 0, sizeof(saddr));
@@ -1238,6 +1229,14 @@ static void init_local(void)
}
}
+static void deinit_local(void)
+{
+ int i;
+
+ for (i = 0; i < dlm_local_count; i++)
+ kfree(dlm_local_addr[i]);
+}
+
/* Initialise SCTP socket and bind to all interfaces */
static int sctp_listen_for_all(void)
{
@@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work)
send_to_sock(con);
}
-
-/* Discard all entries on the write queues */
-static void clean_writequeues(void)
-{
- foreach_conn(clean_one_writequeue);
-}
-
static void work_stop(void)
{
if (recv_workqueue)
@@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con)
con->shutdown_action(con);
}
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ kfree(con->rx_buf);
+ kfree(con);
+}
+
static void free_conn(struct connection *con)
{
close_connection(con, true, true, true);
- if (con->othercon)
- kmem_cache_free(con_cache, con->othercon);
- hlist_del(&con->list);
- kmem_cache_free(con_cache, con);
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
+ call_rcu(&con->othercon->rcu, connection_release);
+ }
+ clean_one_writequeue(con);
+ call_rcu(&con->rcu, connection_release);
}
static void work_flush(void)
{
- int ok;
+ int ok, idx;
int i;
- struct hlist_node *n;
struct connection *con;
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
do {
ok = 1;
foreach_conn(stop_conn);
@@ -1635,9 +1635,10 @@ static void work_flush(void)
flush_workqueue(recv_workqueue);
if (send_workqueue)
flush_workqueue(send_workqueue);
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_safe(con, n,
- &connection_hash[i], list) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i],
+ list) {
ok &= test_bit(CF_READ_PENDING, &con->flags);
ok &= test_bit(CF_WRITE_PENDING, &con->flags);
if (con->othercon) {
@@ -1648,6 +1649,7 @@ static void work_flush(void)
}
}
}
+ srcu_read_unlock(&connections_srcu, idx);
} while (!ok);
}
@@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void)
/* Set all the flags to prevent any
socket activity.
*/
- mutex_lock(&connections_lock);
dlm_allow_conn = 0;
- mutex_unlock(&connections_lock);
+
+ if (recv_workqueue)
+ flush_workqueue(recv_workqueue);
+ if (send_workqueue)
+ flush_workqueue(send_workqueue);
+
foreach_conn(shutdown_conn);
work_flush();
- clean_writequeues();
foreach_conn(free_conn);
work_stop();
-
- kmem_cache_destroy(con_cache);
+ deinit_local();
}
int dlm_lowcomms_start(void)
@@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- error = -ENOMEM;
- con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
- __alignof__(struct connection), 0,
- NULL);
- if (!con_cache)
- goto fail;
-
error = work_start();
if (error)
- goto fail_destroy;
+ goto fail;
dlm_allow_conn = 1;
@@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
con = nodeid2con(0,0);
- if (con) {
- close_connection(con, false, true, true);
- kmem_cache_free(con_cache, con);
- }
-fail_destroy:
- kmem_cache_destroy(con_cache);
+ if (con)
+ free_conn(con);
fail:
return error;
}
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 921322d133e3..fde3a6afe4be 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -22,114 +22,84 @@
* into packets and sends them to the comms layer.
*/
+#include <asm/unaligned.h>
+
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "lock.h"
#include "midcomms.h"
-
-static void copy_from_cb(void *dst, const void *base, unsigned offset,
- unsigned len, unsigned limit)
-{
- unsigned copy = len;
-
- if ((copy + offset) > limit)
- copy = limit - offset;
- memcpy(dst, base + offset, copy);
- len -= copy;
- if (len)
- memcpy(dst + copy, base, len);
-}
-
/*
* Called from the low-level comms layer to process a buffer of
* commands.
- *
- * Only complete messages are processed here, any "spare" bytes from
- * the end of a buffer are saved and tacked onto the front of the next
- * message that comes in. I doubt this will happen very often but we
- * need to be able to cope with it and I don't want the task to be waiting
- * for packets to come in when there is useful work to be done.
*/
-int dlm_process_incoming_buffer(int nodeid, const void *base,
- unsigned offset, unsigned len, unsigned limit)
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
- union {
- unsigned char __buf[DLM_INBUF_LEN];
- /* this is to force proper alignment on some arches */
- union dlm_packet p;
- } __tmp;
- union dlm_packet *p = &__tmp.p;
- int ret = 0;
- int err = 0;
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
uint16_t msglen;
- uint32_t lockspace;
-
- while (len > sizeof(struct dlm_header)) {
-
- /* Copy just the header to check the total length. The
- message may wrap around the end of the buffer back to the
- start, so we need to use a temp buffer and copy_from_cb. */
-
- copy_from_cb(p, base, offset, sizeof(struct dlm_header),
- limit);
-
- msglen = le16_to_cpu(p->header.h_length);
- lockspace = p->header.h_lockspace;
+ int ret = 0;
- err = -EINVAL;
- if (msglen < sizeof(struct dlm_header))
- break;
- if (p->header.h_cmd == DLM_MSG) {
- if (msglen < sizeof(struct dlm_message))
- break;
- } else {
- if (msglen < sizeof(struct dlm_rcom))
- break;
- }
- err = -E2BIG;
- if (msglen > dlm_config.ci_buffer_size) {
- log_print("message size %d from %d too big, buf len %d",
- msglen, nodeid, len);
- break;
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ /* no message should be more than this otherwise we
+ * cannot deliver this message to upper layers
+ */
+ msglen = get_unaligned_le16(&hd->h_length);
+ if (msglen > DEFAULT_BUFFER_SIZE) {
+ log_print("received invalid length header: %u, will abort message parsing",
+ msglen);
+ return -EBADMSG;
}
- err = 0;
-
- /* If only part of the full message is contained in this
- buffer, then do nothing and wait for lowcomms to call
- us again later with more data. We return 0 meaning
- we've consumed none of the input buffer. */
+ /* the caller takes care that the leftover is
+ * parsed on the next call, with more data
+ */
if (msglen > len)
break;
- /* Allocate a larger temp buffer if the full message won't fit
- in the buffer on the stack (which should work for most
- ordinary messages). */
-
- if (msglen > sizeof(__tmp) && p == &__tmp.p) {
- p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
- if (p == NULL)
- return ret;
- }
+ switch (hd->h_cmd) {
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("dlm msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- copy_from_cb(p, base, offset, msglen, limit);
+ break;
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("dlm rcom msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- BUG_ON(lockspace != p->header.h_lockspace);
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message",
+ hd->h_cmd);
+ goto skip;
+ }
+ /* for aligned memory access, we copy the current message to
+ * the beginning of the buffer, which holds already-parsed
+ * data. This should provide aligned access for the upper layers
+ * because the start address of the buffer is aligned.
+ * This memmove can be removed when the upper layer
+ * is capable of unaligned memory access.
+ */
+ memmove(buf, ptr, msglen);
+ dlm_receive_buffer((union dlm_packet *)buf, nodeid);
+
+skip:
ret += msglen;
- offset += msglen;
- offset &= (limit - 1);
len -= msglen;
-
- dlm_receive_buffer(p, nodeid);
+ ptr += msglen;
}
- if (p != &__tmp.p)
- kfree(p);
-
- return err ? err : ret;
+ return ret;
}
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 2e122e81c8d0..61e90a921849 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -12,8 +12,7 @@
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
-int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
- unsigned len, unsigned limit);
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 28bb5689333a..15880a68faad 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+ /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */
+ strreplace(name, '/', '!');
+
inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
is_removable);
if (!inode)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 459ecb42cbd3..347be146884c 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -224,7 +224,7 @@ submit_bio_retry:
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)blknr <<
LOG_SECTORS_PER_BLOCK;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0);
}
err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ddaa516c008a..b9a09806512a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
enum {
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_cache_strategy,
Opt_err
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index c8c381eadcd6..5bde77d70852 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 6c939def00f9..50912a5420b4 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend {
struct z_erofs_collector clt;
struct erofs_map_blocks map;
+ bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
@@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ enum z_erofs_cache_alloctype type)
{
const struct z_erofs_pcluster *pcl = clt->pcl;
const unsigned int clusterpages = BIT(pcl->clusterbits);
@@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page,
- struct list_head *pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -620,8 +619,7 @@ restart_now:
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy);
hitted:
/*
@@ -653,7 +651,7 @@ retry:
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
- erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
+ alloc_page(GFP_NOFS | __GFP_NOFAIL);
newpage->mapping = Z_EROFS_MAPPING_STAGING;
err = z_erofs_attach_page(clt, newpage,
@@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
}
static void z_erofs_submit_queue(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
@@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
+ z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
/* since bio will be NULL, no need to initialize last_index */
pgoff_t last_index;
unsigned int nr_bios = 0;
@@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb,
do {
struct page *page;
- int err;
page = pickup_page_for_submission(pcl, i++, pagepool,
MNGD_MAPPING(sbi),
@@ -1216,11 +1214,12 @@ submit_bio_retry:
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
bio->bi_opf = REQ_OP_READ;
+ if (f->readahead)
+ bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (err < PAGE_SIZE)
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
goto submit_bio_retry;
last_index = cur;
@@ -1248,14 +1247,14 @@ submit_bio_retry:
}
static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_collector *clt,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
+ z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page)
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
@@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page)
return err;
}
-static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
- unsigned int nr)
-{
- return nr <= sbi->ctx.max_sync_decompress_pages;
-}
-
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- bool sync = should_decompress_synchronously(sbi, readahead_count(rac));
+ unsigned int nr_pages = readahead_count(rac);
+ bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct page *page, *head = NULL;
LIST_HEAD(pagepool);
- trace_erofs_readpages(inode, readahead_index(rac),
- readahead_count(rac), false);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ f.readahead = true;
f.headoffset = readahead_pos(rac);
while ((page = readahead_page(rac))) {
@@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
@@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
if (f.map.mpage)
put_page(f.map.mpage);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7d40d78ea864..ae325541884e 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -359,7 +359,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
map->m_flags &= ~EROFS_MAP_ZIPPED;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
map->m_la = (lcn << lclusterbits) | m->clusterofs;
break;
@@ -416,7 +416,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
if (endoff >= m.clusterofs)
map->m_flags &= ~EROFS_MAP_ZIPPED;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
if (endoff >= m.clusterofs) {
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
@@ -433,7 +433,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
end = (m.lcn << lclusterbits) | m.clusterofs;
map->m_flags |= EROFS_MAP_FULL_MAPPED;
m.delta[0] = 1;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
/* get the correspoinding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e0decff22ae2..4df61129566d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,8 +218,7 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- struct list_head visited_list_link;
- int visited;
+ u64 gen;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
@@ -274,6 +273,8 @@ static long max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);
+static u64 loop_check_gen = 0;
+
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
@@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
-/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
-static LIST_HEAD(visited_list);
-
/*
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
@@ -1450,7 +1448,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi)
{
- const char *name;
+ struct name_snapshot n;
struct wakeup_source *ws;
if (!epi->ep->ws) {
@@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi)
return -ENOMEM;
}
- name = epi->ffd.file->f_path.dentry->d_name.name;
- ws = wakeup_source_register(NULL, name);
+ take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
+ ws = wakeup_source_register(NULL, n.name.name);
+ release_dentry_name_snapshot(&n);
if (!ws)
return -ENOMEM;
@@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hooks for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
@@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1595,9 +1596,6 @@ error_remove_epi:
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
@@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
- ep->visited = 1;
- list_add(&ep->visited_list_link, &visited_list);
+ ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->visited)
+ if (ep_tovisit->gen == loop_check_gen)
continue;
error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
@@ -1995,9 +1992,9 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
* during ep_insert().
*/
if (list_empty(&epi->ffd.file->f_tfile_llink)) {
- get_file(epi->ffd.file);
- list_add(&epi->ffd.file->f_tfile_llink,
- &tfile_check_list);
+ if (get_file_rcu(epi->ffd.file))
+ list_add(&epi->ffd.file->f_tfile_llink,
+ &tfile_check_list);
}
}
}
@@ -2019,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- int ret;
- struct eventpoll *ep_cur, *ep_next;
-
- ret = ep_call_nested(&poll_loop_ncalls,
+ return ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
- /* clear visited list */
- list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
- visited_list_link) {
- ep_cur->visited = 0;
- list_del(&ep_cur->visited_list_link);
- }
- return ret;
}
static void clear_tfile_check_list(void)
@@ -2195,11 +2182,13 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
+ ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);