Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs.c | 61
-rw-r--r-- fs/9p/v9fs.h | 3
-rw-r--r-- fs/9p/vfs_super.c | 6
-rw-r--r-- fs/Kconfig | 1
-rw-r--r-- fs/affs/super.c | 42
-rw-r--r-- fs/afs/Makefile | 3
-rw-r--r-- fs/afs/cmservice.c | 16
-rw-r--r-- fs/afs/dir.c | 1
-rw-r--r-- fs/afs/file.c | 1
-rw-r--r-- fs/afs/inode.c | 7
-rw-r--r-- fs/afs/internal.h | 16
-rw-r--r-- fs/afs/main.c | 2
-rw-r--r-- fs/afs/mntpt.c | 1
-rw-r--r-- fs/afs/rxrpc.c | 18
-rw-r--r-- fs/afs/security.c | 5
-rw-r--r-- fs/afs/super.c | 46
-rw-r--r-- fs/afs/xattr.c | 121
-rw-r--r-- fs/aio.c | 15
-rw-r--r-- fs/autofs4/dev-ioctl.c | 2
-rw-r--r-- fs/befs/btree.c | 15
-rw-r--r-- fs/befs/linuxvfs.c | 24
-rw-r--r-- fs/bfs/inode.c | 2
-rw-r--r-- fs/binfmt_elf.c | 79
-rw-r--r-- fs/binfmt_flat.c | 41
-rw-r--r-- fs/block_dev.c | 31
-rw-r--r-- fs/btrfs/acl.c | 13
-rw-r--r-- fs/btrfs/backref.c | 10
-rw-r--r-- fs/btrfs/btrfs_inode.h | 3
-rw-r--r-- fs/btrfs/check-integrity.c | 57
-rw-r--r-- fs/btrfs/compression.c | 139
-rw-r--r-- fs/btrfs/compression.h | 48
-rw-r--r-- fs/btrfs/ctree.c | 42
-rw-r--r-- fs/btrfs/ctree.h | 90
-rw-r--r-- fs/btrfs/delayed-ref.c | 29
-rw-r--r-- fs/btrfs/delayed-ref.h | 6
-rw-r--r-- fs/btrfs/dev-replace.c | 4
-rw-r--r-- fs/btrfs/dir-item.c | 76
-rw-r--r-- fs/btrfs/disk-io.c | 248
-rw-r--r-- fs/btrfs/disk-io.h | 10
-rw-r--r-- fs/btrfs/export.c | 5
-rw-r--r-- fs/btrfs/extent-tree.c | 502
-rw-r--r-- fs/btrfs/extent_io.c | 293
-rw-r--r-- fs/btrfs/extent_io.h | 86
-rw-r--r-- fs/btrfs/file-item.c | 45
-rw-r--r-- fs/btrfs/file.c | 92
-rw-r--r-- fs/btrfs/free-space-tree.c | 38
-rw-r--r-- fs/btrfs/hash.c | 5
-rw-r--r-- fs/btrfs/inode-map.c | 4
-rw-r--r-- fs/btrfs/inode.c | 525
-rw-r--r-- fs/btrfs/ioctl.c | 18
-rw-r--r-- fs/btrfs/lzo.c | 33
-rw-r--r-- fs/btrfs/ordered-data.c | 17
-rw-r--r-- fs/btrfs/ordered-data.h | 4
-rw-r--r-- fs/btrfs/print-tree.c | 7
-rw-r--r-- fs/btrfs/props.c | 7
-rw-r--r-- fs/btrfs/qgroup.c | 225
-rw-r--r-- fs/btrfs/qgroup.h | 9
-rw-r--r-- fs/btrfs/raid56.c | 58
-rw-r--r-- fs/btrfs/reada.c | 1
-rw-r--r-- fs/btrfs/relocation.c | 17
-rw-r--r-- fs/btrfs/root-tree.c | 7
-rw-r--r-- fs/btrfs/scrub.c | 237
-rw-r--r-- fs/btrfs/send.c | 146
-rw-r--r-- fs/btrfs/super.c | 75
-rw-r--r-- fs/btrfs/sysfs.c | 41
-rw-r--r-- fs/btrfs/tests/extent-io-tests.c | 2
-rw-r--r-- fs/btrfs/transaction.c | 25
-rw-r--r-- fs/btrfs/tree-log.c | 43
-rw-r--r-- fs/btrfs/volumes.c | 87
-rw-r--r-- fs/btrfs/volumes.h | 7
-rw-r--r-- fs/btrfs/xattr.c | 2
-rw-r--r-- fs/btrfs/zlib.c | 20
-rw-r--r-- fs/buffer.c | 210
-rw-r--r-- fs/cachefiles/internal.h | 4
-rw-r--r-- fs/cachefiles/namei.c | 2
-rw-r--r-- fs/cachefiles/rdwr.c | 4
-rw-r--r-- fs/ceph/acl.c | 1
-rw-r--r-- fs/ceph/addr.c | 21
-rw-r--r-- fs/ceph/cache.c | 92
-rw-r--r-- fs/ceph/caps.c | 40
-rw-r--r-- fs/ceph/dir.c | 5
-rw-r--r-- fs/ceph/export.c | 4
-rw-r--r-- fs/ceph/file.c | 2
-rw-r--r-- fs/ceph/inode.c | 23
-rw-r--r-- fs/ceph/locks.c | 25
-rw-r--r-- fs/ceph/mds_client.c | 8
-rw-r--r-- fs/ceph/super.c | 47
-rw-r--r-- fs/ceph/super.h | 4
-rw-r--r-- fs/ceph/xattr.c | 3
-rw-r--r-- fs/cifs/Kconfig | 90
-rw-r--r-- fs/cifs/Makefile | 7
-rw-r--r-- fs/cifs/cifs_unicode.c | 10
-rw-r--r-- fs/cifs/cifs_unicode.h | 2
-rw-r--r-- fs/cifs/cifsfs.c | 15
-rw-r--r-- fs/cifs/cifsglob.h | 20
-rw-r--r-- fs/cifs/cifssmb.c | 7
-rw-r--r-- fs/cifs/connect.c | 41
-rw-r--r-- fs/cifs/file.c | 18
-rw-r--r-- fs/cifs/inode.c | 1
-rw-r--r-- fs/cifs/ioctl.c | 2
-rw-r--r-- fs/cifs/link.c | 4
-rw-r--r-- fs/cifs/misc.c | 13
-rw-r--r-- fs/cifs/smb1ops.c | 9
-rw-r--r-- fs/cifs/smb2maperror.c | 4
-rw-r--r-- fs/cifs/smb2ops.c | 207
-rw-r--r-- fs/cifs/smb2pdu.c | 126
-rw-r--r-- fs/cifs/smb2proto.h | 6
-rw-r--r-- fs/cifs/smb2transport.c | 28
-rw-r--r-- fs/cifs/transport.c | 7
-rw-r--r-- fs/cifs/xattr.c | 2
-rw-r--r-- fs/coda/file.c | 4
-rw-r--r-- fs/compat_ioctl.c | 25
-rw-r--r-- fs/configfs/item.c | 8
-rw-r--r-- fs/configfs/symlink.c | 3
-rw-r--r-- fs/crypto/Kconfig | 1
-rw-r--r-- fs/crypto/bio.c | 2
-rw-r--r-- fs/crypto/crypto.c | 23
-rw-r--r-- fs/crypto/fname.c | 9
-rw-r--r-- fs/crypto/fscrypt_private.h | 9
-rw-r--r-- fs/crypto/keyinfo.c | 173
-rw-r--r-- fs/crypto/policy.c | 9
-rw-r--r-- fs/dax.c | 20
-rw-r--r-- fs/dcache.c | 73
-rw-r--r-- fs/debugfs/file.c | 2
-rw-r--r-- fs/debugfs/inode.c | 14
-rw-r--r-- fs/direct-io.c | 25
-rw-r--r-- fs/efivarfs/super.c | 1
-rw-r--r-- fs/eventfd.c | 6
-rw-r--r-- fs/eventpoll.c | 74
-rw-r--r-- fs/exec.c | 35
-rw-r--r-- fs/exofs/dir.c | 2
-rw-r--r-- fs/ext2/acl.c | 43
-rw-r--r-- fs/ext2/dir.c | 2
-rw-r--r-- fs/ext2/ext2.h | 2
-rw-r--r-- fs/ext2/file.c | 5
-rw-r--r-- fs/ext2/inode.c | 1
-rw-r--r-- fs/ext2/super.c | 16
-rw-r--r-- fs/ext2/xattr.c | 48
-rw-r--r-- fs/ext4/acl.c | 21
-rw-r--r-- fs/ext4/ext4.h | 63
-rw-r--r-- fs/ext4/ext4_jbd2.h | 23
-rw-r--r-- fs/ext4/extents.c | 3
-rw-r--r-- fs/ext4/file.c | 42
-rw-r--r-- fs/ext4/fsmap.c | 4
-rw-r--r-- fs/ext4/fsync.c | 2
-rw-r--r-- fs/ext4/ialloc.c | 76
-rw-r--r-- fs/ext4/indirect.c | 3
-rw-r--r-- fs/ext4/inline.c | 2
-rw-r--r-- fs/ext4/inode.c | 92
-rw-r--r-- fs/ext4/ioctl.c | 10
-rw-r--r-- fs/ext4/mballoc.c | 145
-rw-r--r-- fs/ext4/mballoc.h | 6
-rw-r--r-- fs/ext4/migrate.c | 2
-rw-r--r-- fs/ext4/move_extent.c | 2
-rw-r--r-- fs/ext4/namei.c | 131
-rw-r--r-- fs/ext4/page-io.c | 15
-rw-r--r-- fs/ext4/readpage.c | 4
-rw-r--r-- fs/ext4/super.c | 113
-rw-r--r-- fs/ext4/sysfs.c | 2
-rw-r--r-- fs/ext4/xattr.c | 1699
-rw-r--r-- fs/ext4/xattr.h | 35
-rw-r--r-- fs/f2fs/Makefile | 2
-rw-r--r-- fs/f2fs/acl.c | 4
-rw-r--r-- fs/f2fs/checkpoint.c | 43
-rw-r--r-- fs/f2fs/data.c | 225
-rw-r--r-- fs/f2fs/dir.c | 3
-rw-r--r-- fs/f2fs/extent_cache.c | 12
-rw-r--r-- fs/f2fs/f2fs.h | 206
-rw-r--r-- fs/f2fs/file.c | 186
-rw-r--r-- fs/f2fs/gc.c | 40
-rw-r--r-- fs/f2fs/inline.c | 22
-rw-r--r-- fs/f2fs/inode.c | 17
-rw-r--r-- fs/f2fs/namei.c | 71
-rw-r--r-- fs/f2fs/node.c | 67
-rw-r--r-- fs/f2fs/node.h | 6
-rw-r--r-- fs/f2fs/segment.c | 238
-rw-r--r-- fs/f2fs/segment.h | 4
-rw-r--r-- fs/f2fs/super.c | 707
-rw-r--r-- fs/f2fs/sysfs.c | 365
-rw-r--r-- fs/fcntl.c | 304
-rw-r--r-- fs/file.c | 22
-rw-r--r-- fs/file_table.c | 1
-rw-r--r-- fs/filesystems.c | 7
-rw-r--r-- fs/fs-writeback.c | 20
-rw-r--r-- fs/fs_pin.c | 4
-rw-r--r-- fs/gfs2/bmap.c | 2
-rw-r--r-- fs/gfs2/dir.c | 10
-rw-r--r-- fs/gfs2/glock.c | 135
-rw-r--r-- fs/gfs2/glock.h | 7
-rw-r--r-- fs/gfs2/glops.c | 56
-rw-r--r-- fs/gfs2/incore.h | 7
-rw-r--r-- fs/gfs2/inode.c | 19
-rw-r--r-- fs/gfs2/log.c | 3
-rw-r--r-- fs/gfs2/lops.c | 18
-rw-r--r-- fs/gfs2/main.c | 1
-rw-r--r-- fs/gfs2/meta_io.c | 2
-rw-r--r-- fs/gfs2/ops_fstype.c | 6
-rw-r--r-- fs/gfs2/rgrp.c | 6
-rw-r--r-- fs/gfs2/super.c | 33
-rw-r--r-- fs/gfs2/sys.c | 26
-rw-r--r-- fs/gfs2/xattr.c | 4
-rw-r--r-- fs/hfsplus/posix_acl.c | 30
-rw-r--r-- fs/hugetlbfs/inode.c | 83
-rw-r--r-- fs/inode.c | 25
-rw-r--r-- fs/iomap.c | 107
-rw-r--r-- fs/isofs/inode.c | 59
-rw-r--r-- fs/isofs/isofs.h | 3
-rw-r--r-- fs/jbd2/commit.c | 16
-rw-r--r-- fs/jbd2/journal.c | 4
-rw-r--r-- fs/jbd2/transaction.c | 42
-rw-r--r-- fs/jfs/acl.c | 24
-rw-r--r-- fs/jfs/jfs_logmgr.c | 2
-rw-r--r-- fs/jfs/jfs_metapage.c | 11
-rw-r--r-- fs/jfs/jfs_metapage.h | 1
-rw-r--r-- fs/jfs/resize.c | 4
-rw-r--r-- fs/jfs/super.c | 4
-rw-r--r-- fs/libfs.c | 6
-rw-r--r-- fs/locks.c | 97
-rw-r--r-- fs/mbcache.c | 52
-rw-r--r-- fs/minix/dir.c | 2
-rw-r--r-- fs/minix/itree_common.c | 2
-rw-r--r-- fs/mount.h | 5
-rw-r--r-- fs/mpage.c | 5
-rw-r--r-- fs/namei.c | 16
-rw-r--r-- fs/namespace.c | 76
-rw-r--r-- fs/ncpfs/mmap.c | 2
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 4
-rw-r--r-- fs/nfs/callback_xdr.c | 1
-rw-r--r-- fs/nfs/dir.c | 51
-rw-r--r-- fs/nfs/internal.h | 1
-rw-r--r-- fs/nfs/nfs4proc.c | 9
-rw-r--r-- fs/nfs/nfs4state.c | 2
-rw-r--r-- fs/nfs/super.c | 17
-rw-r--r-- fs/nfsd/blocklayout.c | 4
-rw-r--r-- fs/nfsd/export.c | 4
-rw-r--r-- fs/nfsd/nfs3xdr.c | 2
-rw-r--r-- fs/nfsd/nfs4callback.c | 6
-rw-r--r-- fs/nfsd/nfs4xdr.c | 2
-rw-r--r-- fs/nfsd/nfsfh.h | 24
-rw-r--r-- fs/nfsd/vfs.c | 73
-rw-r--r-- fs/nilfs2/segbuf.c | 2
-rw-r--r-- fs/nilfs2/segment.c | 5
-rw-r--r-- fs/notify/fsnotify.c | 8
-rw-r--r-- fs/nsfs.c | 3
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 6
-rw-r--r-- fs/ocfs2/cluster/netdebug.c | 1
-rw-r--r-- fs/ocfs2/dlmglue.c | 4
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 5
-rw-r--r-- fs/ocfs2/stackglue.c | 2
-rw-r--r-- fs/ocfs2/super.c | 2
-rw-r--r-- fs/ocfs2/xattr.c | 23
-rw-r--r-- fs/omfs/inode.c | 33
-rw-r--r-- fs/open.c | 4
-rw-r--r-- fs/orangefs/orangefs-bufmap.c | 12
-rw-r--r-- fs/orangefs/super.c | 15
-rw-r--r-- fs/overlayfs/Kconfig | 20
-rw-r--r-- fs/overlayfs/copy_up.c | 422
-rw-r--r-- fs/overlayfs/dir.c | 74
-rw-r--r-- fs/overlayfs/inode.c | 247
-rw-r--r-- fs/overlayfs/namei.c | 393
-rw-r--r-- fs/overlayfs/overlayfs.h | 70
-rw-r--r-- fs/overlayfs/ovl_entry.h | 36
-rw-r--r-- fs/overlayfs/readdir.c | 53
-rw-r--r-- fs/overlayfs/super.c | 252
-rw-r--r-- fs/overlayfs/util.c | 350
-rw-r--r-- fs/pipe.c | 3
-rw-r--r-- fs/pnode.c | 212
-rw-r--r-- fs/proc/base.c | 45
-rw-r--r-- fs/proc/generic.c | 32
-rw-r--r-- fs/proc/internal.h | 8
-rw-r--r-- fs/proc/kcore.c | 2
-rw-r--r-- fs/proc/proc_sysctl.c | 69
-rw-r--r-- fs/proc/task_mmu.c | 5
-rw-r--r-- fs/pstore/inode.c | 36
-rw-r--r-- fs/pstore/internal.h | 5
-rw-r--r-- fs/pstore/platform.c | 71
-rw-r--r-- fs/pstore/pmsg.c | 10
-rw-r--r-- fs/pstore/ram.c | 16
-rw-r--r-- fs/quota/dquot.c | 16
-rw-r--r-- fs/ramfs/inode.c | 32
-rw-r--r-- fs/read_write.c | 234
-rw-r--r-- fs/reiserfs/bitmap.c | 14
-rw-r--r-- fs/reiserfs/journal.c | 2
-rw-r--r-- fs/reiserfs/super.c | 4
-rw-r--r-- fs/reiserfs/xattr_acl.c | 17
-rw-r--r-- fs/select.c | 48
-rw-r--r-- fs/signalfd.c | 2
-rw-r--r-- fs/splice.c | 2
-rw-r--r-- fs/statfs.c | 60
-rw-r--r-- fs/super.c | 5
-rw-r--r-- fs/sync.c | 2
-rw-r--r-- fs/sysv/dir.c | 2
-rw-r--r-- fs/timerfd.c | 43
-rw-r--r-- fs/tracefs/inode.c | 2
-rw-r--r-- fs/ubifs/crypto.c | 7
-rw-r--r-- fs/ubifs/dir.c | 32
-rw-r--r-- fs/ubifs/file.c | 27
-rw-r--r-- fs/ubifs/journal.c | 36
-rw-r--r-- fs/ubifs/key.h | 1
-rw-r--r-- fs/ubifs/super.c | 11
-rw-r--r-- fs/ubifs/tnc.c | 140
-rw-r--r-- fs/ubifs/tnc_commit.c | 4
-rw-r--r-- fs/ubifs/ubifs.h | 6
-rw-r--r-- fs/ubifs/xattr.c | 39
-rw-r--r-- fs/udf/file.c | 12
-rw-r--r-- fs/udf/inode.c | 4
-rw-r--r-- fs/udf/super.c | 2
-rw-r--r-- fs/udf/udftime.c | 98
-rw-r--r-- fs/ufs/balloc.c | 44
-rw-r--r-- fs/ufs/dir.c | 2
-rw-r--r-- fs/ufs/inode.c | 74
-rw-r--r-- fs/ufs/super.c | 73
-rw-r--r-- fs/ufs/ufs_fs.h | 9
-rw-r--r-- fs/ufs/util.c | 17
-rw-r--r-- fs/ufs/util.h | 9
-rw-r--r-- fs/userfaultfd.c | 71
-rw-r--r-- fs/xfs/Kconfig | 13
-rw-r--r-- fs/xfs/Makefile | 3
-rw-r--r-- fs/xfs/kmem.h | 10
-rw-r--r-- fs/xfs/libxfs/xfs_ag_resv.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c | 8
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_alloc_btree.c | 26
-rw-r--r-- fs/xfs/libxfs/xfs_attr.c | 28
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_attr_remote.c | 13
-rw-r--r-- fs/xfs/libxfs/xfs_attr_sf.h | 10
-rw-r--r-- fs/xfs/libxfs/xfs_bit.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 72
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.c | 34
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 58
-rw-r--r-- fs/xfs/libxfs/xfs_btree.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_cksum.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_da_btree.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_da_btree.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.c | 28
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.h | 64
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_block.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_data.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_leaf.c | 18
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_node.c | 10
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_priv.h | 10
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_sf.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 113
-rw-r--r-- fs/xfs/libxfs/xfs_fs.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 53
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.h | 5
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 36
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 7
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.h | 31
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h | 256
-rw-r--r-- fs/xfs/libxfs/xfs_log_recover.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_quota_defs.h | 4
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.c | 20
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_refcount_btree.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_rmap.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_rmap.h | 11
-rw-r--r-- fs/xfs/libxfs/xfs_rmap_btree.c | 34
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_symlink_remote.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_trans_resv.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_types.h | 46
-rw-r--r-- fs/xfs/uuid.c | 63
-rw-r--r-- fs/xfs/uuid.h | 35
-rw-r--r-- fs/xfs/xfs.h | 4
-rw-r--r-- fs/xfs/xfs_acl.c | 6
-rw-r--r-- fs/xfs/xfs_acl.h | 1
-rw-r--r-- fs/xfs/xfs_aops.c | 20
-rw-r--r-- fs/xfs/xfs_attr.h | 3
-rw-r--r-- fs/xfs/xfs_attr_list.c | 63
-rw-r--r-- fs/xfs/xfs_bmap_item.c | 17
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 164
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 4
-rw-r--r-- fs/xfs/xfs_buf.c | 71
-rw-r--r-- fs/xfs/xfs_buf.h | 1
-rw-r--r-- fs/xfs/xfs_buf_item.c | 21
-rw-r--r-- fs/xfs/xfs_dir2_readdir.c | 341
-rw-r--r-- fs/xfs/xfs_discard.c | 4
-rw-r--r-- fs/xfs/xfs_dquot.c | 71
-rw-r--r-- fs/xfs/xfs_error.c | 319
-rw-r--r-- fs/xfs/xfs_error.h | 44
-rw-r--r-- fs/xfs/xfs_file.c | 412
-rw-r--r-- fs/xfs/xfs_fsops.c | 16
-rw-r--r-- fs/xfs/xfs_fsops.h | 4
-rw-r--r-- fs/xfs/xfs_globals.c | 5
-rw-r--r-- fs/xfs/xfs_icache.c | 61
-rw-r--r-- fs/xfs/xfs_icache.h | 4
-rw-r--r-- fs/xfs/xfs_inode.c | 25
-rw-r--r-- fs/xfs/xfs_inode.h | 7
-rw-r--r-- fs/xfs/xfs_inode_item.c | 8
-rw-r--r-- fs/xfs/xfs_ioctl.c | 27
-rw-r--r-- fs/xfs/xfs_ioctl.h | 10
-rw-r--r-- fs/xfs/xfs_ioctl32.h | 6
-rw-r--r-- fs/xfs/xfs_iomap.c | 26
-rw-r--r-- fs/xfs/xfs_iops.c | 6
-rw-r--r-- fs/xfs/xfs_itable.c | 2
-rw-r--r-- fs/xfs/xfs_itable.h | 2
-rw-r--r-- fs/xfs/xfs_linux.h | 23
-rw-r--r-- fs/xfs/xfs_log.c | 87
-rw-r--r-- fs/xfs/xfs_log.h | 2
-rw-r--r-- fs/xfs/xfs_log_cil.c | 91
-rw-r--r-- fs/xfs/xfs_log_priv.h | 3
-rw-r--r-- fs/xfs/xfs_log_recover.c | 55
-rw-r--r-- fs/xfs/xfs_message.c | 5
-rw-r--r-- fs/xfs/xfs_mount.c | 46
-rw-r--r-- fs/xfs/xfs_mount.h | 60
-rw-r--r-- fs/xfs/xfs_qm.c | 31
-rw-r--r-- fs/xfs/xfs_qm_bhv.c | 2
-rw-r--r-- fs/xfs/xfs_quotaops.c | 1
-rw-r--r-- fs/xfs/xfs_reflink.c | 105
-rw-r--r-- fs/xfs/xfs_reflink.h | 8
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 8
-rw-r--r-- fs/xfs/xfs_rtalloc.h | 3
-rw-r--r-- fs/xfs/xfs_stats.c | 8
-rw-r--r-- fs/xfs/xfs_stats.h | 190
-rw-r--r-- fs/xfs/xfs_super.c | 29
-rw-r--r-- fs/xfs/xfs_symlink.c | 14
-rw-r--r-- fs/xfs/xfs_symlink.h | 1
-rw-r--r-- fs/xfs/xfs_sysctl.h | 1
-rw-r--r-- fs/xfs/xfs_sysfs.c | 81
-rw-r--r-- fs/xfs/xfs_trace.h | 40
-rw-r--r-- fs/xfs/xfs_trans.h | 8
-rw-r--r-- fs/xfs/xfs_trans_bmap.c | 11
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 21
-rw-r--r-- fs/xfs/xfs_trans_rmap.c | 2
431 files changed, 12740 insertions, 7098 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index c202930086ed..8fb89ddc6cc7 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -33,6 +33,7 @@
#include <linux/parser.h>
#include <linux/idr.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
@@ -82,6 +83,13 @@ static const match_table_t tokens = {
{Opt_err, NULL}
};
+static const char *const v9fs_cache_modes[nr__p9_cache_modes] = {
+ [CACHE_NONE] = "none",
+ [CACHE_MMAP] = "mmap",
+ [CACHE_LOOSE] = "loose",
+ [CACHE_FSCACHE] = "fscache",
+};
+
/* Interpret mount options for cache mode */
static int get_cache_mode(char *s)
{
@@ -104,6 +112,58 @@ static int get_cache_mode(char *s)
return version;
}
+/*
+ * Display the mount options in /proc/mounts.
+ */
+int v9fs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct v9fs_session_info *v9ses = root->d_sb->s_fs_info;
+
+ if (v9ses->debug)
+ seq_printf(m, ",debug=%x", v9ses->debug);
+ if (!uid_eq(v9ses->dfltuid, V9FS_DEFUID))
+ seq_printf(m, ",dfltuid=%u",
+ from_kuid_munged(&init_user_ns, v9ses->dfltuid));
+ if (!gid_eq(v9ses->dfltgid, V9FS_DEFGID))
+ seq_printf(m, ",dfltgid=%u",
+ from_kgid_munged(&init_user_ns, v9ses->dfltgid));
+ if (v9ses->afid != ~0)
+ seq_printf(m, ",afid=%u", v9ses->afid);
+ if (strcmp(v9ses->uname, V9FS_DEFUSER) != 0)
+ seq_printf(m, ",uname=%s", v9ses->uname);
+ if (strcmp(v9ses->aname, V9FS_DEFANAME) != 0)
+ seq_printf(m, ",aname=%s", v9ses->aname);
+ if (v9ses->nodev)
+ seq_puts(m, ",nodevmap");
+ if (v9ses->cache)
+ seq_printf(m, ",%s", v9fs_cache_modes[v9ses->cache]);
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cachetag && v9ses->cache == CACHE_FSCACHE)
+ seq_printf(m, ",cachetag=%s", v9ses->cachetag);
+#endif
+
+ switch (v9ses->flags & V9FS_ACCESS_MASK) {
+ case V9FS_ACCESS_USER:
+ seq_puts(m, ",access=user");
+ break;
+ case V9FS_ACCESS_ANY:
+ seq_puts(m, ",access=any");
+ break;
+ case V9FS_ACCESS_CLIENT:
+ seq_puts(m, ",access=client");
+ break;
+ case V9FS_ACCESS_SINGLE:
+ seq_printf(m, ",access=%u",
+ from_kuid_munged(&init_user_ns, v9ses->uid));
+ break;
+ }
+
+ if (v9ses->flags & V9FS_POSIX_ACL)
+ seq_puts(m, ",posixacl");
+
+ return p9_show_client_options(m, v9ses->clnt);
+}
+
/**
* v9fs_parse_options - parse mount options into session structure
* @v9ses: existing v9fs session information
@@ -230,6 +290,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
break;
case Opt_cachetag:
#ifdef CONFIG_9P_FSCACHE
+ kfree(v9ses->cachetag);
v9ses->cachetag = match_strdup(&args[0]);
#endif
break;
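
[The kfree() added above fixes a leak when the cachetag option appears more than once. A minimal userspace analogue of the pattern, names hypothetical:

    #include <stdlib.h>
    #include <string.h>

    static char *cachetag;

    /* Without the free(), "cachetag=a,cachetag=b" would leak the
     * first strdup(); freeing any previous value keeps only the
     * last occurrence, as the patch now does for v9ses->cachetag. */
    static void set_cachetag(const char *val)
    {
        free(cachetag);
        cachetag = strdup(val);
    }
]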
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 76eaf49abd3a..982e017acadb 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -67,6 +67,7 @@ enum p9_cache_modes {
CACHE_MMAP,
CACHE_LOOSE,
CACHE_FSCACHE,
+ nr__p9_cache_modes
};
/**
@@ -137,6 +138,8 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
return container_of(inode, struct v9fs_inode, vfs_inode);
}
+extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
+
struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
char *);
extern void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index a0965fb587a5..8b75463cb211 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -33,7 +33,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/pagemap.h>
-#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/idr.h>
#include <linux/sched.h>
@@ -104,7 +103,6 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_flags |= MS_POSIXACL;
#endif
- save_mount_options(sb, data);
return 0;
}
@@ -349,7 +347,7 @@ static const struct super_operations v9fs_super_ops = {
.destroy_inode = v9fs_destroy_inode,
.statfs = simple_statfs,
.evict_inode = v9fs_evict_inode,
- .show_options = generic_show_options,
+ .show_options = v9fs_show_options,
.umount_begin = v9fs_umount_begin,
.write_inode = v9fs_write_inode,
};
@@ -360,7 +358,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
.statfs = v9fs_statfs,
.drop_inode = v9fs_drop_inode,
.evict_inode = v9fs_evict_inode,
- .show_options = generic_show_options,
+ .show_options = v9fs_show_options,
.umount_begin = v9fs_umount_begin,
.write_inode = v9fs_write_inode_dotl,
};
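
[The generic_show_options -> custom .show_options conversion above recurs throughout this series (affs, befs and afs below follow the same shape). A minimal sketch of that shape, with a hypothetical "myfs" and its options:

    #include <linux/fs.h>
    #include <linux/seq_file.h>

    struct myfs_sb_info {
        unsigned int value;
        bool flag;
    };

    /* Rebuild the visible options from in-core state instead of
     * echoing the raw string saved by save_mount_options(). */
    static int myfs_show_options(struct seq_file *m, struct dentry *root)
    {
        struct myfs_sb_info *sbi = root->d_sb->s_fs_info;

        if (sbi->flag)
            seq_puts(m, ",flag");
        seq_printf(m, ",value=%u", sbi->value);
        return 0;
    }
]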
diff --git a/fs/Kconfig b/fs/Kconfig
index b0e42b6a96b9..7aee6d699fd6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -80,7 +80,6 @@ config EXPORTFS_BLOCK_OPS
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
- select PERCPU_RWSEM
help
This option enables standard file locking support, required
for filesystems like NFS and for the flock() system
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c2c27a8f128e..7bf47a41cb4f 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -20,9 +20,11 @@
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/seq_file.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int affs_show_options(struct seq_file *m, struct dentry *root);
static int affs_remount (struct super_block *sb, int *flags, char *data);
static void
@@ -159,7 +161,7 @@ static const struct super_operations affs_sops = {
.sync_fs = affs_sync_fs,
.statfs = affs_statfs,
.remount_fs = affs_remount,
- .show_options = generic_show_options,
+ .show_options = affs_show_options,
};
enum {
@@ -293,6 +295,40 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
return 1;
}
+static int affs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+
+ if (sb->s_blocksize)
+ seq_printf(m, ",bs=%lu", sb->s_blocksize);
+ if (affs_test_opt(sbi->s_flags, SF_SETMODE))
+ seq_printf(m, ",mode=%o", sbi->s_mode);
+ if (affs_test_opt(sbi->s_flags, SF_MUFS))
+ seq_puts(m, ",mufs");
+ if (affs_test_opt(sbi->s_flags, SF_NO_TRUNCATE))
+ seq_puts(m, ",nofilenametruncate");
+ if (affs_test_opt(sbi->s_flags, SF_PREFIX))
+ seq_printf(m, ",prefix=%s", sbi->s_prefix);
+ if (affs_test_opt(sbi->s_flags, SF_IMMUTABLE))
+ seq_puts(m, ",protect");
+ if (sbi->s_reserved != 2)
+ seq_printf(m, ",reserved=%u", sbi->s_reserved);
+ if (sbi->s_root_block != (sbi->s_reserved + sbi->s_partition_size - 1) / 2)
+ seq_printf(m, ",root=%u", sbi->s_root_block);
+ if (affs_test_opt(sbi->s_flags, SF_SETGID))
+ seq_printf(m, ",setgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_gid));
+ if (affs_test_opt(sbi->s_flags, SF_SETUID))
+ seq_printf(m, ",setuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_uid));
+ if (affs_test_opt(sbi->s_flags, SF_VERBOSE))
+ seq_puts(m, ",verbose");
+ if (sbi->s_volume[0])
+ seq_printf(m, ",volume=%s", sbi->s_volume);
+ return 0;
+}
+
/* This function definitely needs to be split up. Some fine day I'll
* hopefully have the guts to do so. Until then: sorry for the mess.
*/
@@ -316,8 +352,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
u8 sig[4];
int ret;
- save_mount_options(sb, data);
-
pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
sb->s_magic = AFFS_SUPER_MAGIC;
@@ -548,8 +582,6 @@ affs_remount(struct super_block *sb, int *flags, char *data)
}
flush_delayed_work(&sbi->sb_work);
- if (new_opts)
- replace_mount_options(sb, new_opts);
sbi->s_flags = mount_flags;
sbi->s_mode = mode;
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4f64b95d57bd..095c54165dfd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -27,6 +27,7 @@ kafs-objs := \
vlocation.o \
vnode.o \
volume.o \
- write.o
+ write.o \
+ xattr.o
obj-$(CONFIG_AFS_FS) := kafs.o
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3062cceb5c2a..782d4d05a53b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -350,7 +350,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
{
struct sockaddr_rxrpc srx;
struct afs_server *server;
- struct uuid_v1 *r;
+ struct afs_uuid *r;
unsigned loop;
__be32 *b;
int ret;
@@ -380,7 +380,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
}
_debug("unmarshall UUID");
- call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
+ call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
if (!call->request)
return -ENOMEM;
@@ -453,7 +453,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
static void SRXAFSCB_ProbeUuid(struct work_struct *work)
{
struct afs_call *call = container_of(work, struct afs_call, work);
- struct uuid_v1 *r = call->request;
+ struct afs_uuid *r = call->request;
struct {
__be32 match;
@@ -476,7 +476,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
*/
static int afs_deliver_cb_probe_uuid(struct afs_call *call)
{
- struct uuid_v1 *r;
+ struct afs_uuid *r;
unsigned loop;
__be32 *b;
int ret;
@@ -502,15 +502,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
}
_debug("unmarshall UUID");
- call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
+ call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
if (!call->request)
return -ENOMEM;
b = call->buffer;
r = call->request;
- r->time_low = b[0];
- r->time_mid = htons(ntohl(b[1]));
- r->time_hi_and_version = htons(ntohl(b[2]));
+ r->time_low = ntohl(b[0]);
+ r->time_mid = ntohl(b[1]);
+ r->time_hi_and_version = ntohl(b[2]);
r->clock_seq_hi_and_reserved = ntohl(b[3]);
r->clock_seq_low = ntohl(b[4]);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 949f960337f5..613a77058263 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -61,6 +61,7 @@ const struct inode_operations afs_dir_inode_operations = {
.permission = afs_permission,
.getattr = afs_getattr,
.setattr = afs_setattr,
+ .listxattr = afs_listxattr,
};
const struct dentry_operations afs_fs_dentry_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0d5b8508869b..510cba15fa56 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -46,6 +46,7 @@ const struct inode_operations afs_file_inode_operations = {
.getattr = afs_getattr,
.setattr = afs_setattr,
.permission = afs_permission,
+ .listxattr = afs_listxattr,
};
const struct address_space_operations afs_fs_aops = {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index aae55dd15108..342316a9e3e0 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -28,6 +28,11 @@ struct afs_iget_data {
struct afs_volume *volume; /* volume on which resides */
};
+static const struct inode_operations afs_symlink_inode_operations = {
+ .get_link = page_get_link,
+ .listxattr = afs_listxattr,
+};
+
/*
* map the AFS file status to the inode member variables
*/
@@ -67,7 +72,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
inode->i_fop = &afs_mntpt_file_operations;
} else {
inode->i_mode = S_IFLNK | vnode->status.mode;
- inode->i_op = &page_symlink_inode_operations;
+ inode->i_op = &afs_symlink_inode_operations;
}
inode_nohighmem(inode);
break;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 393672997cc2..82e16556afea 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -410,6 +410,15 @@ struct afs_interface {
unsigned mtu; /* MTU of interface */
};
+struct afs_uuid {
+ __be32 time_low; /* low part of timestamp */
+ __be16 time_mid; /* mid part of timestamp */
+ __be16 time_hi_and_version; /* high part of timestamp and version */
+ __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */
+ __u8 clock_seq_low; /* clock seq low */
+ __u8 node[6]; /* spatially unique node ID (MAC addr) */
+};
+
/*****************************************************************************/
/*
* cache.c
@@ -544,7 +553,7 @@ extern int afs_drop_inode(struct inode *);
* main.c
*/
extern struct workqueue_struct *afs_wq;
-extern struct uuid_v1 afs_uuid;
+extern struct afs_uuid afs_uuid;
/*
* misc.c
@@ -722,6 +731,11 @@ extern int afs_writeback_all(struct afs_vnode *);
extern int afs_flush(struct file *, fl_owner_t);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
+/*
+ * xattr.c
+ */
+extern const struct xattr_handler *afs_xattr_handlers[];
+extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
/*****************************************************************************/
/*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 51d7d17bca57..9944770849da 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -31,7 +31,7 @@ static char *rootcell;
module_param(rootcell, charp, 0);
MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
-struct uuid_v1 afs_uuid;
+struct afs_uuid afs_uuid;
struct workqueue_struct *afs_wq;
/*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index bd3b65cde282..690fea9d84c3 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -35,6 +35,7 @@ const struct inode_operations afs_mntpt_inode_operations = {
.lookup = afs_mntpt_lookup,
.readlink = page_readlink,
.getattr = afs_getattr,
+ .listxattr = afs_listxattr,
};
const struct inode_operations afs_autocell_inode_operations = {
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d5990eb160bd..02781e78ffb6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -341,6 +341,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
struct msghdr msg;
struct kvec iov[1];
size_t offset;
+ s64 tx_total_len;
u32 abort_code;
int ret;
@@ -364,9 +365,20 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
srx.transport.sin.sin_port = call->port;
memcpy(&srx.transport.sin.sin_addr, addr, 4);
+ /* Work out the length we're going to transmit. This is awkward for
+ * calls such as FS.StoreData where there's an extra injection of data
+ * after the initial fixed part.
+ */
+ tx_total_len = call->request_size;
+ if (call->send_pages) {
+ tx_total_len += call->last_to - call->first_offset;
+ tx_total_len += (call->last - call->first) * PAGE_SIZE;
+ }
+
/* create a call */
rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
- (unsigned long) call, gfp,
+ (unsigned long)call,
+ tx_total_len, gfp,
(async ?
afs_wake_up_async_call :
afs_wake_up_call_waiter));
@@ -738,6 +750,8 @@ void afs_send_empty_reply(struct afs_call *call)
_enter("");
+ rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0);
+
msg.msg_name = NULL;
msg.msg_namelen = 0;
iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
@@ -772,6 +786,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
_enter("");
+ rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len);
+
iov[0].iov_base = (void *) buf;
iov[0].iov_len = len;
msg.msg_name = NULL;
diff --git a/fs/afs/security.c b/fs/afs/security.c
index ecb86a670180..faca66227ecf 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -327,12 +327,11 @@ int afs_permission(struct inode *inode, int mask)
if (!(access & AFS_ACE_LOOKUP))
goto permission_denied;
} else if (mask & MAY_READ) {
- if (!(access & AFS_ACE_READ))
+ if (!(access & AFS_ACE_LOOKUP))
goto permission_denied;
} else if (mask & MAY_WRITE) {
if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
- AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
- AFS_ACE_WRITE))) /* chmod */
+ AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */
goto permission_denied;
} else {
BUG();
diff --git a/fs/afs/super.c b/fs/afs/super.c
index c79633e5cfd8..689173c0a682 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -37,6 +37,8 @@ static void afs_kill_super(struct super_block *sb);
static struct inode *afs_alloc_inode(struct super_block *sb);
static void afs_destroy_inode(struct inode *inode);
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int afs_show_devname(struct seq_file *m, struct dentry *root);
+static int afs_show_options(struct seq_file *m, struct dentry *root);
struct file_system_type afs_fs_type = {
.owner = THIS_MODULE,
@@ -53,7 +55,8 @@ static const struct super_operations afs_super_ops = {
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.evict_inode = afs_evict_inode,
- .show_options = generic_show_options,
+ .show_devname = afs_show_devname,
+ .show_options = afs_show_options,
};
static struct kmem_cache *afs_inode_cachep;
@@ -136,6 +139,45 @@ void __exit afs_fs_exit(void)
}
/*
+ * Display the mount device name in /proc/mounts.
+ */
+static int afs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ struct afs_super_info *as = root->d_sb->s_fs_info;
+ struct afs_volume *volume = as->volume;
+ struct afs_cell *cell = volume->cell;
+ const char *suf = "";
+ char pref = '%';
+
+ switch (volume->type) {
+ case AFSVL_RWVOL:
+ break;
+ case AFSVL_ROVOL:
+ pref = '#';
+ if (volume->type_force)
+ suf = ".readonly";
+ break;
+ case AFSVL_BACKVOL:
+ pref = '#';
+ suf = ".backup";
+ break;
+ }
+
+ seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->vlocation->vldb.name, suf);
+ return 0;
+}
+
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int afs_show_options(struct seq_file *m, struct dentry *root)
+{
+ if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags))
+ seq_puts(m, "autocell");
+ return 0;
+}
+
+/*
* parse the mount options
* - this function has been shamelessly adapted from the ext3 fs which
* shamelessly adapted it from the msdos fs
@@ -319,6 +361,7 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
+ sb->s_xattr = afs_xattr_handlers;
ret = super_setup_bdi(sb);
if (ret)
return ret;
@@ -426,7 +469,6 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
deactivate_locked_super(sb);
goto error;
}
- save_mount_options(sb, new_opts);
sb->s_flags |= MS_ACTIVE;
} else {
_debug("reuse");
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
new file mode 100644
index 000000000000..2830e4f48d85
--- /dev/null
+++ b/fs/afs/xattr.c
@@ -0,0 +1,121 @@
+/* Extended attribute handling for AFS. We use xattrs to get and set metadata
+ * instead of providing pioctl().
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include "internal.h"
+
+static const char afs_xattr_list[] =
+ "afs.cell\0"
+ "afs.fid\0"
+ "afs.volume";
+
+/*
+ * Retrieve a list of the supported xattrs.
+ */
+ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ if (size == 0)
+ return sizeof(afs_xattr_list);
+ if (size < sizeof(afs_xattr_list))
+ return -ERANGE;
+ memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list));
+ return sizeof(afs_xattr_list);
+}
+
+/*
+ * Get the name of the cell on which a file resides.
+ */
+static int afs_xattr_get_cell(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct afs_cell *cell = vnode->volume->cell;
+ size_t namelen;
+
+ namelen = strlen(cell->name);
+ if (size == 0)
+ return namelen;
+ if (namelen > size)
+ return -ERANGE;
+ memcpy(buffer, cell->name, size);
+ return namelen;
+}
+
+static const struct xattr_handler afs_xattr_afs_cell_handler = {
+ .name = "afs.cell",
+ .get = afs_xattr_get_cell,
+};
+
+/*
+ * Get the volume ID, vnode ID and vnode uniquifier of a file as a sequence of
+ * hex numbers separated by colons.
+ */
+static int afs_xattr_get_fid(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ char text[8 + 1 + 8 + 1 + 8 + 1];
+ size_t len;
+
+ len = sprintf(text, "%x:%x:%x",
+ vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+ if (size == 0)
+ return len;
+ if (len > size)
+ return -ERANGE;
+ memcpy(buffer, text, len);
+ return len;
+}
+
+static const struct xattr_handler afs_xattr_afs_fid_handler = {
+ .name = "afs.fid",
+ .get = afs_xattr_get_fid,
+};
+
+/*
+ * Get the name of the volume on which a file resides.
+ */
+static int afs_xattr_get_volume(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ const char *volname = vnode->volume->vlocation->vldb.name;
+ size_t namelen;
+
+ namelen = strlen(volname);
+ if (size == 0)
+ return namelen;
+ if (namelen > size)
+ return -ERANGE;
+ memcpy(buffer, volname, size);
+ return namelen;
+}
+
+static const struct xattr_handler afs_xattr_afs_volume_handler = {
+ .name = "afs.volume",
+ .get = afs_xattr_get_volume,
+};
+
+const struct xattr_handler *afs_xattr_handlers[] = {
+ &afs_xattr_afs_cell_handler,
+ &afs_xattr_afs_fid_handler,
+ &afs_xattr_afs_volume_handler,
+ NULL
+};
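
[For illustration, a userspace consumer of the new AFS xattrs; the path is hypothetical, and getxattr(2) returns the strings built by the handlers above:

    #include <stdio.h>
    #include <sys/xattr.h>

    int main(void)
    {
        char buf[64];
        ssize_t n = getxattr("/afs/example.org/file", "afs.fid",
                             buf, sizeof(buf) - 1);
        if (n < 0) {
            perror("getxattr");
            return 1;
        }
        buf[n] = '\0';        /* the value is not NUL-terminated */
        printf("afs.fid = %s\n", buf);
        return 0;
    }
]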
diff --git a/fs/aio.c b/fs/aio.c
index f52d925ee259..dcad3a66748c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
ssize_t ret;
/* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
+ if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->common.ki_pos = iocb->aio_offset;
req->common.ki_complete = aio_complete;
req->common.ki_flags = iocb_flags(req->common.ki_filp);
+ req->common.ki_hint = file_write_hint(file);
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
@@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->common.ki_flags |= IOCB_EVENTFD;
}
+ ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
+ if (unlikely(ret)) {
+ pr_debug("EINVAL: aio_rw_flags\n");
+ goto out_put_req;
+ }
+
+ if ((req->common.ki_flags & IOCB_NOWAIT) &&
+ !(req->common.ki_flags & IOCB_DIRECT)) {
+ ret = -EOPNOTSUPP;
+ goto out_put_req;
+ }
+
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 734cbf8d9676..dd9f1bebb5a3 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp,
int status;
token = (autofs_wqt_t) param->fail.token;
- status = param->fail.status ? param->fail.status : -ENOENT;
+ status = param->fail.status < 0 ? param->fail.status : -ENOENT;
return autofs4_wait_release(sbi, token, status);
}
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index d509887c580c..1b7e0f7128d6 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -120,18 +120,15 @@ static int befs_compare_strings(const void *key1, int keylen1,
const void *key2, int keylen2);
/**
- * befs_bt_read_super - read in btree superblock convert to cpu byteorder
- * @sb: Filesystem superblock
- * @ds: Datastream to read from
- * @sup: Buffer in which to place the btree superblock
+ * befs_bt_read_super() - read in btree superblock convert to cpu byteorder
+ * @sb: Filesystem superblock
+ * @ds: Datastream to read from
+ * @sup: Buffer in which to place the btree superblock
*
* Calls befs_read_datastream to read in the btree superblock and
* makes sure it is in cpu byteorder, byteswapping if necessary.
- *
- * On success, returns BEFS_OK and *@sup contains the btree superblock,
- * in cpu byte order.
- *
- * On failure, BEFS_ERR is returned.
+ * Return: BEFS_OK on success and if *@sup contains the btree superblock in cpu
+ * byte order. Otherwise return BEFS_ERR on error.
*/
static int
befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 63e7c4760bfb..4a4a5a366158 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -20,6 +20,7 @@
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/exportfs.h>
+#include <linux/seq_file.h>
#include "befs.h"
#include "btree.h"
@@ -53,6 +54,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
static void befs_put_super(struct super_block *);
static int befs_remount(struct super_block *, int *, char *);
static int befs_statfs(struct dentry *, struct kstatfs *);
+static int befs_show_options(struct seq_file *, struct dentry *);
static int parse_options(char *, struct befs_mount_options *);
static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
@@ -66,7 +68,7 @@ static const struct super_operations befs_sops = {
.put_super = befs_put_super, /* uninit super */
.statfs = befs_statfs, /* statfs */
.remount_fs = befs_remount,
- .show_options = generic_show_options,
+ .show_options = befs_show_options,
};
/* slab cache for befs_inode_info objects */
@@ -771,6 +773,24 @@ parse_options(char *options, struct befs_mount_options *opts)
return 1;
}
+static int befs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct befs_sb_info *befs_sb = BEFS_SB(root->d_sb);
+ struct befs_mount_options *opts = &befs_sb->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ if (opts->iocharset)
+ seq_printf(m, ",charset=%s", opts->iocharset);
+ if (opts->debug)
+ seq_puts(m, ",debug");
+ return 0;
+}
+
/* This function has the responsibility of getting the
* filesystem ready for unmounting.
* Basically, we free everything that we allocated in
@@ -804,8 +824,6 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
const off_t x86_sb_off = 512;
int blocksize;
- save_mount_options(sb, data);
-
sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
if (sb->s_fs_info == NULL)
goto unacquire_none;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 25e312cb6071..9a69392f1fb3 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -419,7 +419,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
if (i_sblock > info->si_blocks ||
i_eblock > info->si_blocks ||
i_sblock > i_eblock ||
- i_eoff > s_size ||
+ (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) ||
i_sblock * BFS_BSIZE > i_eoff) {
printf("Inode 0x%08x corrupted\n", i);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 5075fd5c62c8..879ff9c7ffd0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -163,8 +163,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long p = bprm->p;
int argc = bprm->argc;
int envc = bprm->envc;
- elf_addr_t __user *argv;
- elf_addr_t __user *envp;
elf_addr_t __user *sp;
elf_addr_t __user *u_platform;
elf_addr_t __user *u_base_platform;
@@ -304,38 +302,38 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
/* Now, let's put argc (and argv, envp if appropriate) on the stack */
if (__put_user(argc, sp++))
return -EFAULT;
- argv = sp;
- envp = argv + argc + 1;
- /* Populate argv and envp */
+ /* Populate list of argv pointers back to argv strings. */
p = current->mm->arg_end = current->mm->arg_start;
while (argc-- > 0) {
size_t len;
- if (__put_user((elf_addr_t)p, argv++))
+ if (__put_user((elf_addr_t)p, sp++))
return -EFAULT;
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- if (__put_user(0, argv))
+ if (__put_user(0, sp++))
return -EFAULT;
- current->mm->arg_end = current->mm->env_start = p;
+ current->mm->arg_end = p;
+
+ /* Populate list of envp pointers back to envp strings. */
+ current->mm->env_end = current->mm->env_start = p;
while (envc-- > 0) {
size_t len;
- if (__put_user((elf_addr_t)p, envp++))
+ if (__put_user((elf_addr_t)p, sp++))
return -EFAULT;
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
return -EINVAL;
p += len;
}
- if (__put_user(0, envp))
+ if (__put_user(0, sp++))
return -EFAULT;
current->mm->env_end = p;
/* Put the elf_info on the stack in the right place. */
- sp = (elf_addr_t __user *)envp + 1;
if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
return -EFAULT;
return 0;
@@ -927,17 +925,60 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
vaddr = elf_ppnt->p_vaddr;
+ /*
+ * If we are loading ET_EXEC or we have already performed
+ * the ET_DYN load_addr calculations, proceed normally.
+ */
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
elf_flags |= MAP_FIXED;
} else if (loc->elf_ex.e_type == ET_DYN) {
- /* Try and get dynamic programs out of the way of the
- * default mmap base, as well as whatever program they
- * might try to exec. This is because the brk will
- * follow the loader, and is not movable. */
- load_bias = ELF_ET_DYN_BASE - vaddr;
- if (current->flags & PF_RANDOMIZE)
- load_bias += arch_mmap_rnd();
- load_bias = ELF_PAGESTART(load_bias);
+ /*
+ * This logic is run once for the first LOAD Program
+ * Header for ET_DYN binaries to calculate the
+ * randomization (load_bias) for all the LOAD
+ * Program Headers, and to calculate the entire
+ * size of the ELF mapping (total_size). (Note that
+ * load_addr_set is set to true later once the
+ * initial mapping is performed.)
+ *
+ * There are effectively two types of ET_DYN
+ * binaries: programs (i.e. PIE: ET_DYN with INTERP)
+ * and loaders (ET_DYN without INTERP, since they
+ * _are_ the ELF interpreter). The loaders must
+ * be loaded away from programs since the program
+ * may otherwise collide with the loader (especially
+ * for ET_EXEC which does not have a randomized
+ * position). For example to handle invocations of
+ * "./ld.so someprog" to test out a new version of
+ * the loader, the subsequent program that the
+ * loader loads must avoid the loader itself, so
+ * they cannot share the same load range. Sufficient
+ * room for the brk must be allocated with the
+ * loader as well, since brk must be available with
+ * the loader.
+ *
+ * Therefore, programs are loaded offset from
+ * ELF_ET_DYN_BASE and loaders are loaded into the
+ * independently randomized mmap region (0 load_bias
+ * without MAP_FIXED).
+ */
+ if (elf_interpreter) {
+ load_bias = ELF_ET_DYN_BASE;
+ if (current->flags & PF_RANDOMIZE)
+ load_bias += arch_mmap_rnd();
+ elf_flags |= MAP_FIXED;
+ } else
+ load_bias = 0;
+
+ /*
+ * Since load_bias is used for all subsequent loading
+ * calculations, we must lower it by the first vaddr
+ * so that the remaining calculations based on the
+ * ELF vaddrs will be correctly offset. The result
+ * is then page aligned.
+ */
+ load_bias = ELF_PAGESTART(load_bias - vaddr);
+
total_size = total_mapping_size(elf_phdata,
loc->elf_ex.e_phnum);
if (!total_size) {
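
[The ELF_PAGESTART(load_bias - vaddr) step above is easy to get backwards. A compilable illustration with made-up numbers, assuming 4K pages:

    #include <stdio.h>

    #define ELF_PAGESTART(a) ((a) & ~0xfffUL)   /* simplified, 4K pages */

    int main(void)
    {
        unsigned long base  = 0x555555554000UL; /* stand-in for ELF_ET_DYN_BASE + rnd */
        unsigned long vaddr = 0x400UL;          /* first LOAD segment p_vaddr */

        /* The bias is lowered by the first vaddr before page alignment
         * so every later "vaddr + load_bias" stays page-consistent. */
        unsigned long load_bias = ELF_PAGESTART(base - vaddr);

        printf("load_bias %#lx, segment at %#lx\n",
               load_bias, load_bias + vaddr);
        return 0;
    }
]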
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 2edcefc0a294..a1e6860b6f46 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -422,9 +422,9 @@ static int load_flat_file(struct linux_binprm *bprm,
{
struct flat_hdr *hdr;
unsigned long textpos, datapos, realdatastart;
- unsigned long text_len, data_len, bss_len, stack_len, full_data, flags;
+ u32 text_len, data_len, bss_len, stack_len, full_data, flags;
unsigned long len, memp, memp_size, extra, rlim;
- unsigned long __user *reloc, *rp;
+ u32 __user *reloc, *rp;
struct inode *inode;
int i, rev, relocs;
loff_t fpos;
@@ -574,7 +574,7 @@ static int load_flat_file(struct linux_binprm *bprm,
MAX_SHARED_LIBS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
- pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n",
+ pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
data_len + bss_len + stack_len, datapos);
fpos = ntohl(hdr->data_start);
@@ -596,13 +596,13 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- reloc = (unsigned long __user *)
+ reloc = (u32 __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
memp = realdatastart;
memp_size = len;
} else {
- len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+ len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -618,10 +618,10 @@ static int load_flat_file(struct linux_binprm *bprm,
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
- MAX_SHARED_LIBS * sizeof(unsigned long),
+ MAX_SHARED_LIBS * sizeof(u32),
FLAT_DATA_ALIGN);
- reloc = (unsigned long __user *)
+ reloc = (u32 __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
memp = textpos;
memp_size = len;
@@ -694,7 +694,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
- MAX_SHARED_LIBS * sizeof(unsigned long));
+ MAX_SHARED_LIBS * sizeof(u32));
goto err;
}
}
@@ -754,8 +754,8 @@ static int load_flat_file(struct linux_binprm *bprm,
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (unsigned long __user *)datapos; ; rp++) {
- unsigned long addr, rp_val;
+ for (rp = (u32 __user *)datapos; ; rp++) {
+ u32 addr, rp_val;
if (get_user(rp_val, rp))
return -EFAULT;
if (rp_val == 0xffffffff)
@@ -784,9 +784,9 @@ static int load_flat_file(struct linux_binprm *bprm,
* __start to address 4 so that is okay).
*/
if (rev > OLD_FLAT_VERSION) {
- unsigned long __maybe_unused persistent = 0;
+ u32 __maybe_unused persistent = 0;
for (i = 0; i < relocs; i++) {
- unsigned long addr, relval;
+ u32 addr, relval;
/*
* Get the address of the pointer to be
@@ -799,15 +799,18 @@ static int load_flat_file(struct linux_binprm *bprm,
if (flat_set_persistent(relval, &persistent))
continue;
addr = flat_get_relocate_addr(relval);
- rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1);
- if (rp == (unsigned long __user *)RELOC_FAILED) {
+ rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1);
+ if (rp == (u32 __user *)RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
}
/* Get the pointer's value. */
- addr = flat_get_addr_from_rp(rp, relval, flags,
- &persistent);
+ ret = flat_get_addr_from_rp(rp, relval, flags,
+ &addr, &persistent);
+ if (unlikely(ret))
+ goto err;
+
if (addr != 0) {
/*
* Do the relocation. PIC relocs in the data section are
@@ -822,12 +825,14 @@ static int load_flat_file(struct linux_binprm *bprm,
}
/* Write back the relocated pointer. */
- flat_put_addr_at_rp(rp, addr, relval);
+ ret = flat_put_addr_at_rp(rp, addr, relval);
+ if (unlikely(ret))
+ goto err;
}
}
} else {
for (i = 0; i < relocs; i++) {
- unsigned long relval;
+ u32 relval;
if (get_user(relval, reloc + i))
return -EFAULT;
relval = ntohl(relval);
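
[The switch from unsigned long to u32 matters because bFLT relocation records are fixed 32-bit values on disk, and get_user()/put_user() derive their access width from the pointer's type. A trivial demonstration of the mismatch the patch removes:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* On LP64 kernels, unsigned long is 8 bytes, so stepping a
         * table of 32-bit on-disk records with it reads the wrong
         * width and walks off the table. */
        printf("u32: %zu bytes, unsigned long: %zu bytes\n",
               sizeof(uint32_t), sizeof(unsigned long));
        return 0;
    }
]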
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 519599dddd36..9941dc8342df 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio_init(&bio, vecs, nr_pages);
bio.bi_bdev = bdev;
bio.bi_iter.bi_sector = pos >> 9;
+ bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple;
@@ -262,8 +263,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
if (vecs != inline_vecs)
kfree(vecs);
- if (unlikely(bio.bi_error))
- return bio.bi_error;
+ if (unlikely(bio.bi_status))
+ ret = blk_status_to_errno(bio.bi_status);
+
+ bio_uninit(&bio);
+
return ret;
}
@@ -288,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio)
bool should_dirty = dio->should_dirty;
if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
- if (bio->bi_error && !dio->bio.bi_error)
- dio->bio.bi_error = bio->bi_error;
+ if (bio->bi_status && !dio->bio.bi_status)
+ dio->bio.bi_status = bio->bi_status;
} else {
if (!dio->is_sync) {
struct kiocb *iocb = dio->iocb;
- ssize_t ret = dio->bio.bi_error;
+ ssize_t ret;
- if (likely(!ret)) {
+ if (likely(!dio->bio.bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
+ } else {
+ ret = blk_status_to_errno(dio->bio.bi_status);
}
dio->iocb->ki_complete(iocb, ret, 0);
@@ -334,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
- int ret;
+ int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -358,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
for (;;) {
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = pos >> 9;
+ bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
- bio->bi_error = ret;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
break;
}
@@ -412,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
}
__set_current_state(TASK_RUNNING);
- ret = dio->bio.bi_error;
+ if (!ret)
+ ret = blk_status_to_errno(dio->bio.bi_status);
if (likely(!ret))
ret = dio->size;
@@ -436,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static __init int blkdev_init(void)
{
- blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+ blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
if (!blkdev_dio_pool)
return -ENOMEM;
return 0;
@@ -624,7 +632,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
struct block_device *bdev = I_BDEV(bd_inode);
int error;
- error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+ error = file_write_and_wait_range(filp, start, end);
if (error)
return error;
@@ -1743,6 +1751,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return -ENOMEM;
filp->f_mapping = bdev->bd_inode->i_mapping;
+ filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return blkdev_get(bdev, filp->f_mode, filp);
}
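
[Throughout this series bio->bi_error (an errno) becomes bio->bi_status (a blk_status_t), translated back only at the edges. A kernel-style sketch of the convention, not tied to this file:

    #include <linux/bio.h>

    /* bi_status holds a blk_status_t; convert to -Exxx only where a
     * caller expects an errno, as __blkdev_direct_IO does above. */
    static void example_end_io(struct bio *bio)
    {
        int err = blk_status_to_errno(bio->bi_status);

        if (err)
            pr_warn("I/O failed: %d\n", err);
        bio_put(bio);
    }
]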
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 247b8dfaf6e5..8d8370ddb6b2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
- if (acl) {
- ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
- if (ret)
- return ret;
- }
- ret = 0;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
@@ -119,6 +113,13 @@ out:
int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
+ int ret;
+
+ if (type == ACL_TYPE_ACCESS && acl) {
+ ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (ret)
+ return ret;
+ }
return __btrfs_set_acl(NULL, inode, acl, type);
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 24865da63d8f..f723c11bb763 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,7 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
@@ -2305,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = vmalloc(alloc_bytes);
+ data = kvmalloc(alloc_bytes, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -2339,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
if (IS_ERR(fspath))
return (void *)fspath;
- ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+ ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
if (!ifp) {
- vfree(fspath);
+ kvfree(fspath);
return ERR_PTR(-ENOMEM);
}
@@ -2356,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath)
{
if (!ipath)
return;
- vfree(ipath->fspath);
+ kvfree(ipath->fspath);
kfree(ipath);
}
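
[init_data_container() now relies on kvmalloc()/kvfree() instead of open-coding a kmalloc-then-vmalloc fallback. The pairing in isolation, as a sketch rather than btrfs code:

    #include <linux/err.h>
    #include <linux/mm.h>

    /* kvmalloc() tries kmalloc() first and transparently falls back
     * to vmalloc() for large sizes; kvfree() frees either kind, so
     * callers need not track which allocator succeeded. */
    static void *alloc_container(size_t bytes)
    {
        void *p = kvmalloc(bytes, GFP_KERNEL);

        return p ? p : ERR_PTR(-ENOMEM);
    }
]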
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b8622e4d1744..d87ac27a5f2b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -310,7 +310,8 @@ struct btrfs_dio_private {
* The original bio may be split to several sub-bios, this is
* done during endio of sub-bios
*/
- int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
+ blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
+ blk_status_t);
};
/*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ab14c2e635ca..11d37c94ce05 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,7 +94,7 @@
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/string.h>
#include "ctree.h"
#include "disk-io.h"
@@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
- if (!bio) {
- pr_info("btrfsic: bio_alloc() for %u pages failed!\n",
- num_pages - i);
- return -1;
- }
+ bio = btrfs_io_bio_alloc(num_pages - i);
bio->bi_bdev = block_ctx->dev->bdev;
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
@@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
dev_bytenr += (j - i) * PAGE_SIZE;
i = j;
}
- for (i = 0; i < num_pages; i++) {
+ for (i = 0; i < num_pages; i++)
block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
- if (!block_ctx->datav[i]) {
- pr_info("btrfsic: kmap() failed (dev %s)!\n",
- block_ctx->dev->name);
- return -1;
- }
- }
return block_ctx->len;
}
@@ -2129,7 +2118,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
/* mutex is not held! This is not safe if IO is not yet completed
* on umount */
iodone_w_error = 0;
- if (bp->bi_error)
+ if (bp->bi_status)
iodone_w_error = 1;
BUG_ON(NULL == block);
@@ -2143,7 +2132,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
- bp->bi_error,
+ bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
block->logical_bytenr, dev_state->name,
block->dev_bytenr, block->mirror_num);
@@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio)
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- unsigned int i;
+ unsigned int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
int bio_is_patched;
char **mapped_datav;
+ unsigned int segs = bio_segments(bio);
dev_bytenr = 512 * bio->bi_iter.bi_sector;
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_vcnt,
+ bio_op(bio), bio->bi_opf, segs,
(unsigned long long)bio->bi_iter.bi_sector,
dev_bytenr, bio->bi_bdev);
- mapped_datav = kmalloc_array(bio->bi_vcnt,
+ mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
if (!mapped_datav)
goto leave;
cur_bytenr = dev_bytenr;
- bio_for_each_segment_all(bvec, bio, i) {
- BUG_ON(bvec->bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bvec->bv_page);
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = kmap(bvec.bv_page);
+ i++;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec->bv_len, bvec->bv_offset);
- cur_bytenr += bvec->bv_len;
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
}
btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, bio->bi_vcnt,
+ mapped_datav, segs,
bio, &bio_is_patched,
NULL, bio->bi_opf);
- bio_for_each_segment_all(bvec, bio, i)
- kunmap(bvec->bv_page);
+ bio_for_each_segment(bvec, bio, iter)
+ kunmap(bvec.bv_page);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
@@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
fs_info->sectorsize, PAGE_SIZE);
return -1;
}
- state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ state = kvzalloc(sizeof(*state), GFP_KERNEL);
if (!state) {
- state = vzalloc(sizeof(*state));
- if (!state) {
- pr_info("btrfs check-integrity: vzalloc() failed!\n");
- return -1;
- }
+ pr_info("btrfs check-integrity: allocation failed!\n");
+ return -1;
}
if (!btrfsic_is_initialized) {
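Note: two simplifications in this file's hunks: the kmap() NULL checks go away because kmap() cannot fail, and the open-coded "try kzalloc, fall back to vzalloc" dance is replaced by kvzalloc(), which bundles exactly that policy. A compilable userspace model of the policy, with calloc standing in for both kernel allocators:

	#include <stdio.h>
	#include <stdlib.h>

	/* Sketch of the fallback kvzalloc() encapsulates: a cheap contiguous
	 * attempt (kzalloc in the kernel), then a page-granular virtually
	 * mapped fallback (vzalloc). Both arms are calloc here so the model
	 * compiles outside the kernel. */
	static void *kvzalloc_model(size_t size)
	{
		void *p = calloc(1, size);      /* contiguous attempt */
		if (!p)
			p = calloc(1, size);    /* virtually-mapped fallback */
		return p;
	}

	int main(void)
	{
		void *state = kvzalloc_model(4096);
		if (!state) {
			fprintf(stderr, "allocation failed\n");
			return 1;
		}
		free(state);
		return 0;
	}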
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 10e6b282d09d..d2ef9ac2a630 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,6 +32,7 @@
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -42,48 +43,7 @@
#include "extent_io.h"
#include "extent_map.h"
-struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
-
- /* the pages with the compressed data on them */
- struct page **compressed_pages;
-
- /* inode that owns this data */
- struct inode *inode;
-
- /* starting offset in the inode for our pages */
- u64 start;
-
- /* number of bytes in the inode we're working on */
- unsigned long len;
-
- /* number of bytes on disk */
- unsigned long compressed_len;
-
- /* the compression algorithm for this bio */
- int compress_type;
-
- /* number of compressed pages in the array */
- unsigned long nr_pages;
-
- /* IO errors */
- int errors;
- int mirror_num;
-
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
-
- /*
- * the start of a variable length array of checksums only
- * used by reads
- */
- u32 sums;
-};
-
-static int btrfs_decompress_bio(int type, struct page **pages_in,
- u64 disk_start, struct bio *orig_bio,
- size_t srclen);
+static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
unsigned long disk_size)
@@ -94,12 +54,6 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
-static struct bio *compressed_bio_alloc(struct block_device *bdev,
- u64 first_byte, gfp_t gfp_flags)
-{
- return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags);
-}
-
static int check_compressed_csum(struct btrfs_inode *inode,
struct compressed_bio *cb,
u64 disk_start)
@@ -155,7 +109,7 @@ static void end_compressed_bio_read(struct bio *bio)
unsigned long index;
int ret;
- if (bio->bi_error)
+ if (bio->bi_status)
cb->errors = 1;
/* if there are more bios still pending for this compressed
@@ -173,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio)
/* ok, we're the last bio for this extent, let's start
* the decompression.
*/
- ret = btrfs_decompress_bio(cb->compress_type,
- cb->compressed_pages,
- cb->start,
- cb->orig_bio,
- cb->compressed_len);
+ ret = btrfs_decompress_bio(cb);
+
csum_failed:
if (ret)
cb->errors = 1;
@@ -201,6 +152,7 @@ csum_failed:
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, cb->orig_bio, i)
SetPageChecked(bvec->bv_page);
@@ -268,7 +220,7 @@ static void end_compressed_bio_write(struct bio *bio)
struct page *page;
unsigned long index;
- if (bio->bi_error)
+ if (bio->bi_status)
cb->errors = 1;
/* if there are more bios still pending for this compressed
@@ -287,7 +239,7 @@ static void end_compressed_bio_write(struct bio *bio)
cb->start,
cb->start + cb->len - 1,
NULL,
- bio->bi_error ? 0 : 1);
+ bio->bi_status ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -320,7 +272,7 @@ out:
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
@@ -335,13 +287,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct page *page;
u64 first_byte = disk_start;
struct block_device *bdev;
- int ret;
+ blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
WARN_ON(start & ((u64)PAGE_SIZE - 1));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
@@ -355,11 +307,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = fs_info->fs_devices->latest_bdev;
- bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
- if (!bio) {
- kfree(cb);
- return -ENOMEM;
- }
+ bio = btrfs_bio_alloc(bdev, first_byte);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
@@ -368,17 +316,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+ int submit = 0;
+
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_iter.bi_size)
- ret = io_tree->ops->merge_bio_hook(page, 0,
+ submit = io_tree->ops->merge_bio_hook(page, 0,
PAGE_SIZE,
bio, 0);
- else
- ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) <
+ if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
bio_get(bio);
@@ -400,14 +348,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_map_bio(fs_info, bio, 0, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
bio_put(bio);
- bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
- BUG_ON(!bio);
+ bio = btrfs_bio_alloc(bdev, first_byte);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
@@ -434,7 +381,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_map_bio(fs_info, bio, 0, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
@@ -569,7 +516,7 @@ next:
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio's end_io handlers
*/
-int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -586,7 +533,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
- int ret = -ENOMEM;
+ blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
u32 *sums;
@@ -600,7 +547,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em)
- return -EIO;
+ return BLK_STS_IOERR;
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -638,7 +585,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
__GFP_HIGHMEM);
if (!cb->compressed_pages[pg_index]) {
faili = pg_index - 1;
- ret = -ENOMEM;
+ ret = BLK_STS_RESOURCE;
goto fail2;
}
}
@@ -650,28 +597,26 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra_bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
- if (!comp_bio)
- goto fail2;
+ comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
refcount_set(&cb->pending_bios, 1);
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ int submit = 0;
+
page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
- ret = tree->ops->merge_bio_hook(page, 0,
+ submit = tree->ops->merge_bio_hook(page, 0,
PAGE_SIZE,
comp_bio, 0);
- else
- ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
+ if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
bio_get(comp_bio);
@@ -697,15 +642,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
- comp_bio->bi_error = ret;
+ comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
bio_put(comp_bio);
- comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
- GFP_NOFS);
- BUG_ON(!comp_bio);
+ comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -726,7 +669,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
- comp_bio->bi_error = ret;
+ comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
@@ -801,6 +744,7 @@ static struct list_head *find_workspace(int type)
struct list_head *workspace;
int cpus = num_online_cpus();
int idx = type - 1;
+ unsigned nofs_flag;
struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
@@ -830,7 +774,15 @@ again:
atomic_inc(total_ws);
spin_unlock(ws_lock);
+ /*
+ * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
+ * to turn it off here because we might get called from the restricted
+ * context of btrfs_compress_bio/btrfs_compress_pages
+ */
+ nofs_flag = memalloc_nofs_save();
workspace = btrfs_compress_op[idx]->alloc_workspace();
+ memalloc_nofs_restore(nofs_flag);
+
if (IS_ERR(workspace)) {
atomic_dec(total_ws);
wake_up(ws_wait);
@@ -961,19 +913,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
-static int btrfs_decompress_bio(int type, struct page **pages_in,
- u64 disk_start, struct bio *orig_bio,
- size_t srclen)
+static int btrfs_decompress_bio(struct compressed_bio *cb)
{
struct list_head *workspace;
int ret;
+ int type = cb->compress_type;
workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in,
- disk_start, orig_bio,
- srclen);
+ ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
free_workspace(type, workspace);
+
return ret;
}
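Note: the find_workspace() hunk above adopts the scoped-NOFS pattern: instead of threading GFP_NOFS through allocation helpers (which vmalloc-backed workspace allocation cannot honor), memalloc_nofs_save()/memalloc_nofs_restore() mark a whole region of the task as NOFS, and allocations inside it degrade implicitly. A standalone model of the idea, assuming C11; the names mirror the kernel helpers for clarity only, and the flag bit is invented:

	#include <stdbool.h>
	#include <stdio.h>

	static _Thread_local unsigned int task_flags_model;
	#define PF_MEMALLOC_NOFS_MODEL 0x1 /* hypothetical bit */

	/* Set the scoped flag, returning the previous state so nesting works. */
	static unsigned int nofs_save_model(void)
	{
		unsigned int old = task_flags_model;
		task_flags_model |= PF_MEMALLOC_NOFS_MODEL;
		return old;
	}

	static void nofs_restore_model(unsigned int old)
	{
		task_flags_model = old;
	}

	/* The allocator consults the task flag instead of per-call gfp bits. */
	static bool allocation_may_enter_fs(void)
	{
		return !(task_flags_model & PF_MEMALLOC_NOFS_MODEL);
	}

	int main(void)
	{
		unsigned int flags = nofs_save_model();
		printf("may enter fs: %d\n", allocation_may_enter_fs()); /* 0 */
		nofs_restore_model(flags);
		printf("may enter fs: %d\n", allocation_may_enter_fs()); /* 1 */
		return 0;
	}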
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 39ec43ab8df1..87f6d3332163 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -34,6 +34,45 @@
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ refcount_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* the compression algorithm for this bio */
+ int compress_type;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+ int mirror_num;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+
+ /*
+ * the start of a variable length array of checksums only
+ * used by reads
+ */
+ u32 sums;
+};
+
void btrfs_init_compress(void);
void btrfs_exit_compress(void);
@@ -48,12 +87,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio *bio);
-int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages);
-int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
enum btrfs_compression_type {
@@ -78,10 +117,7 @@ struct btrfs_compress_op {
unsigned long *total_out);
int (*decompress_bio)(struct list_head *workspace,
- struct page **pages_in,
- u64 disk_start,
- struct bio *orig_bio,
- size_t srclen);
+ struct compressed_bio *cb);
int (*decompress)(struct list_head *workspace,
unsigned char *data_in,
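Note: with struct compressed_bio moved into this header, each backend's decompress_bio() op receives the whole descriptor instead of four unpacked fields, so growing the struct no longer means touching every backend signature. A minimal sketch of the narrowed op table; the *_model types and the zlib_model backend are hypothetical simplifications, not the btrfs definitions:

	#include <stddef.h>

	struct compressed_bio_model {
		void **compressed_pages;
		unsigned long compressed_len;
		unsigned long long start;
		int compress_type;
	};

	struct compress_op_model {
		/* old: int (*decompress_bio)(ws, pages_in, disk_start, orig_bio, srclen); */
		int (*decompress_bio)(void *workspace, struct compressed_bio_model *cb);
	};

	static int zlib_model_decompress_bio(void *workspace,
					     struct compressed_bio_model *cb)
	{
		(void)workspace;
		return cb->compressed_len ? 0 : -1; /* placeholder body */
	}

	static const struct compress_op_model zlib_model_op = {
		.decompress_bio = zlib_model_decompress_bio,
	};

	int main(void)
	{
		struct compressed_bio_model cb = { .compressed_len = 4096 };
		return zlib_model_op.decompress_bio(NULL, &cb);
	}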
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a3a75f1de002..3f4daa9d6e2c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -19,7 +19,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -3667,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
/* make room in the right data area */
data_end = leaf_data_end(fs_info, right);
memmove_extent_buffer(right,
- btrfs_leaf_data(right) + data_end - push_space,
- btrfs_leaf_data(right) + data_end,
+ BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
- copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+ copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- btrfs_leaf_data(left) + leaf_data_end(fs_info, left),
+ BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
@@ -3888,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset_nr(right, push_items - 1);
- copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+ copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(fs_info, left) - push_space,
- btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_OFFSET +
btrfs_item_offset_nr(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
@@ -3917,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
leaf_data_end(fs_info, right);
- memmove_extent_buffer(right, btrfs_leaf_data(right) +
+ memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(fs_info, right), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(0),
@@ -4069,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems * sizeof(struct btrfs_item));
copy_extent_buffer(right, l,
- btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) -
- data_copy_size, btrfs_leaf_data(l) +
+ BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
+ data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(fs_info, l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
@@ -4607,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
/* shift the data */
if (from_end) {
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end + size_diff, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start + new_size - data_end);
} else {
struct btrfs_disk_key disk_key;
@@ -4634,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
}
}
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end + size_diff, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start - data_end);
offset = btrfs_disk_key_offset(&disk_key);
@@ -4707,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
}
/* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - data_size, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
data_end = old_data;
@@ -4790,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - total_data, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
data_end = old_data;
}
@@ -4983,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (slot + nr != nritems) {
int data_end = leaf_data_end(fs_info, leaf);
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
- btrfs_leaf_data(leaf) + data_end,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
last_off - data_end);
for (i = slot + nr; i < nritems; i++) {
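Note: every hunk in this file swaps btrfs_leaf_data(leaf), a helper that ignored its argument and returned a constant, for the BTRFS_LEAF_DATA_OFFSET macro defined in ctree.h below. A standalone model of the addressing scheme it expresses: item data lives at a fixed offset inside the leaf (the start of the items array) plus the per-item offset. Types are simplified stand-ins:

	#include <stddef.h>
	#include <stdio.h>

	struct leaf_header_model { unsigned long long bytenr; /* ... */ };
	struct leaf_item_model { unsigned int offset; unsigned int size; };
	struct leaf_model {
		struct leaf_header_model header;
		struct leaf_item_model items[]; /* data area begins here */
	};

	#define LEAF_DATA_OFFSET_MODEL offsetof(struct leaf_model, items)

	/* Leaf-relative byte position of one item's data, as the
	 * btrfs_item_ptr()-style macros compute it. */
	static size_t item_ptr_offset_model(const struct leaf_item_model *item)
	{
		return LEAF_DATA_OFFSET_MODEL + item->offset;
	}

	int main(void)
	{
		struct leaf_item_model it = { .offset = 100, .size = 32 };
		printf("data at byte %zu within the leaf\n",
		       item_ptr_offset_model(&it));
		return 0;
	}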
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4f8f75d9e839..3f3eb7b17cac 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,6 @@ struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
@@ -716,6 +715,10 @@ struct btrfs_delayed_root;
#define BTRFS_FS_BTREE_ERR 11
#define BTRFS_FS_LOG1_ERR 12
#define BTRFS_FS_LOG2_ERR 13
+#define BTRFS_FS_QUOTA_OVERRIDE 14
+/* Used to record internally whether fs has been frozen */
+#define BTRFS_FS_FROZEN 15
+
/*
* Indicate that a whole-filesystem exclusive operation is running
* (device replace, resize, device add/delete, balance)
@@ -748,8 +751,7 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
/* keep track of unallocated space */
- spinlock_t free_chunk_lock;
- u64 free_chunk_space;
+ atomic64_t free_chunk_space;
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
@@ -797,17 +799,7 @@ struct btrfs_fs_info {
* so it is also safe.
*/
u64 max_inline;
- /*
- * Protected by ->chunk_mutex and sb->s_umount.
- *
- * The reason that we use two lock to protect it is because only
- * remount and mount operations can change it and these two operations
- * are under sb->s_umount, but the read side (chunk allocation) can not
- * acquire sb->s_umount or the deadlock would happen. So we use two
- * locks to protect it. On the write side, we must acquire two locks,
- * and on the read side, we just need acquire one of them.
- */
- u64 alloc_start;
+
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
wait_queue_head_t transaction_wait;
@@ -1107,9 +1099,6 @@ struct btrfs_fs_info {
*/
struct list_head pinned_chunks;
- /* Used to record internally whether fs has been frozen */
- int fs_frozen;
-
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
@@ -1277,21 +1266,20 @@ struct btrfs_root {
/* For qgroup metadata space reserve */
atomic64_t qgroup_meta_rsv;
};
+
static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
{
return btrfs_sb(inode->i_sb)->sectorsize;
}
-static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
-{
- return blocksize - sizeof(struct btrfs_header);
-}
-
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
- return __BTRFS_LEAF_DATA_SIZE(info->nodesize);
+
+ return info->nodesize - sizeof(struct btrfs_header);
}
+#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items)
+
static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
@@ -1553,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
s->member = cpu_to_le##bits(val); \
}
+
+static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb,
+ struct btrfs_dev_item *s)
+{
+ BUILD_BUG_ON(sizeof(u64) !=
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+ total_bytes));
+}
+static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb,
+ struct btrfs_dev_item *s,
+ u64 val)
+{
+ BUILD_BUG_ON(sizeof(u64) !=
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+ btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+}
+
+
BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
@@ -2324,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
return btrfs_csum_sizes[t];
}
-static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
-{
- return offsetof(struct btrfs_leaf, items);
-}
/*
* The leaf data grows from end-to-front in the node.
@@ -2538,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
- ((type *)(btrfs_leaf_data(leaf) + \
+ ((type *)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
#define btrfs_item_ptr_offset(leaf, slot) \
- ((unsigned long)(btrfs_leaf_data(leaf) + \
+ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
@@ -2680,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
+u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
+u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
+u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
enum btrfs_reserve_flush_enum {
@@ -2703,9 +2708,13 @@ enum btrfs_flush_state {
COMMIT_TRANS = 6,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+int btrfs_check_data_free_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2722,8 +2731,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
@@ -3031,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod);
int verify_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
+ struct extent_buffer *leaf, int slot,
struct btrfs_dir_item *dir_item);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name,
int name_len);
+bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
+ unsigned long start, u16 name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3078,8 +3089,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
-int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
-int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
+blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -3094,7 +3105,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
@@ -3171,6 +3182,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
int btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
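Note: the reservation prototypes above grow a struct extent_changeset out-parameter so qgroup accounting can record exactly which ranges a reservation took and release only those. The caller passes a NULL-initialized pointer, lets reserve fill it, and frees it once with extent_changeset_free(). A userspace model of that lifecycle, assuming a single range where the kernel keeps a set:

	#include <stdio.h>
	#include <stdlib.h>

	struct changeset_model {
		unsigned long long start, len; /* kernel: a set of ranges */
	};

	/* Allocate the changeset lazily and record what was reserved. */
	static int reserve_model(struct changeset_model **reserved,
				 unsigned long long start, unsigned long long len)
	{
		if (!*reserved) {
			*reserved = calloc(1, sizeof(**reserved));
			if (!*reserved)
				return -1;
		}
		(*reserved)->start = start;
		(*reserved)->len = len;
		return 0;
	}

	/* Release undoes only what the changeset recorded. */
	static void release_model(struct changeset_model *reserved)
	{
		if (reserved)
			printf("releasing %llu+%llu\n",
			       reserved->start, reserved->len);
	}

	int main(void)
	{
		struct changeset_model *reserved = NULL;
		if (reserve_model(&reserved, 0, 4096))
			return 1;
		release_model(reserved); /* error path or completion */
		free(reserved);          /* extent_changeset_free() analogue */
		return 0;
	}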
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index be70d90dfee5..93ffa898df6d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -470,7 +470,8 @@ add_tail:
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update)
+ struct btrfs_delayed_ref_node *update,
+ int *old_ref_mod_ret)
{
struct btrfs_delayed_ref_head *existing_ref;
struct btrfs_delayed_ref_head *ref;
@@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing_ref->total_ref_mod;
+ if (old_ref_mod_ret)
+ *old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
existing_ref->total_ref_mod += update->ref_mod;
@@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
- int action, int is_data, int *qrecord_inserted_ret)
+ int action, int is_data, int *qrecord_inserted_ret,
+ int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
if (existing) {
WARN_ON(ref_root && reserved && existing->qgroup_ref_root
&& existing->qgroup_reserved);
- update_existing_head_ref(delayed_refs, &existing->node, ref);
+ update_existing_head_ref(delayed_refs, &existing->node, ref,
+ old_ref_mod);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ if (old_ref_mod)
+ *old_ref_mod = 0;
if (is_data && count_mod < 0)
delayed_refs->pending_csums += num_bytes;
delayed_refs->num_heads++;
@@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
+ if (new_ref_mod)
+ *new_ref_mod = head_ref->total_ref_mod;
return head_ref;
}
@@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
@@ -813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, 0, 0, action, 0,
- &qrecord_inserted);
+ &qrecord_inserted, old_ref_mod,
+ new_ref_mod);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action);
@@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, u64 reserved, int action)
+ u64 owner, u64 offset, u64 reserved, int action,
+ int *old_ref_mod, int *new_ref_mod)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
@@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, ref_root, reserved,
- action, 1, &qrecord_inserted);
+ action, 1, &qrecord_inserted,
+ old_ref_mod, new_ref_mod);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
@@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data, NULL);
+ extent_op->is_data, NULL, NULL, NULL);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c0264ff01b53..ce88e4ac5276 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -247,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op);
+ struct btrfs_delayed_extent_op *extent_op,
+ int *old_ref_mod, int *new_ref_mod);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, u64 reserved, int action);
+ u64 owner, u64 offset, u64 reserved, int action,
+ int *old_ref_mod, int *new_ref_mod);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5fe1ca8abc70..bee3edeea7a3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c24d615e3d7f..41cb9196eaa8 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- if (verify_dir_item(fs_info, leaf, dir_item))
- return NULL;
total_len = btrfs_item_size_nr(leaf, path->slots[0]);
while (cur < total_len) {
@@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
btrfs_dir_data_len(leaf, dir_item);
name_ptr = (unsigned long)(dir_item + 1);
+ if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item))
+ return NULL;
if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
return dir_item;
@@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
int verify_dir_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
+ int slot,
struct btrfs_dir_item *dir_item)
{
u16 namelen = BTRFS_NAME_LEN;
+ int ret;
u8 type = btrfs_dir_type(leaf, dir_item);
if (type >= BTRFS_FT_MAX) {
@@ -472,6 +474,12 @@ int verify_dir_item(struct btrfs_fs_info *fs_info,
return 1;
}
+ namelen = btrfs_dir_name_len(leaf, dir_item);
+ ret = btrfs_is_name_len_valid(leaf, slot,
+ (unsigned long)(dir_item + 1), namelen);
+ if (!ret)
+ return 1;
+
/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
if ((btrfs_dir_data_len(leaf, dir_item) +
btrfs_dir_name_len(leaf, dir_item)) >
@@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info,
return 0;
}
+
+bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
+ unsigned long start, u16 name_len)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_key key;
+ u32 read_start;
+ u32 read_end;
+ u32 item_start;
+ u32 item_end;
+ u32 size;
+ bool ret = true;
+
+ ASSERT(start > BTRFS_LEAF_DATA_OFFSET);
+
+ read_start = start - BTRFS_LEAF_DATA_OFFSET;
+ read_end = read_start + name_len;
+ item_start = btrfs_item_offset_nr(leaf, slot);
+ item_end = btrfs_item_end_nr(leaf, slot);
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ switch (key.type) {
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ size = sizeof(struct btrfs_dir_item);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ size = sizeof(struct btrfs_inode_ref);
+ break;
+ case BTRFS_INODE_EXTREF_KEY:
+ size = sizeof(struct btrfs_inode_extref);
+ break;
+ case BTRFS_ROOT_REF_KEY:
+ case BTRFS_ROOT_BACKREF_KEY:
+ size = sizeof(struct btrfs_root_ref);
+ break;
+ default:
+ ret = false;
+ goto out;
+ }
+
+ if (read_start < item_start) {
+ ret = false;
+ goto out;
+ }
+ if (read_end > item_end) {
+ ret = false;
+ goto out;
+ }
+
+ /* there shall be item(s) before name */
+ if (read_start - item_start < size) {
+ ret = false;
+ goto out;
+ }
+
+out:
+ if (!ret)
+ btrfs_crit(fs_info, "invalid dir item name len: %u",
+ (unsigned int)name_len);
+ return ret;
+}
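Note: the validation above boils down to two leaf-relative bounds checks: the name must end within its item, and it must start after the item's fixed-size header. A compilable model of exactly that arithmetic, with all values as leaf-relative byte offsets:

	#include <stdbool.h>
	#include <stdio.h>

	static bool name_len_valid_model(unsigned int item_start,
					 unsigned int item_end,
					 unsigned int fixed_size,
					 unsigned int read_start,
					 unsigned int name_len)
	{
		unsigned int read_end = read_start + name_len;

		if (read_start < item_start || read_end > item_end)
			return false; /* name reaches outside the item */
		if (read_start - item_start < fixed_size)
			return false; /* name overlaps the item header */
		return true;
	}

	int main(void)
	{
		/* item spans [100,150), 30-byte header: a 15-byte name at
		 * offset 130 fits, a 25-byte one runs past the end */
		printf("%d\n", name_len_valid_model(100, 150, 30, 130, 15));
		printf("%d\n", name_len_valid_model(100, 150, 30, 130, 25));
		return 0;
	}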
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5f678dcb20e6..080e2ebb8aa0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -87,9 +87,8 @@ struct btrfs_end_io_wq {
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
- int error;
+ blk_status_t status;
enum btrfs_wq_endio_type metadata;
- struct list_head list;
struct btrfs_work work;
};
@@ -118,9 +117,9 @@ void btrfs_end_io_wq_exit(void)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- struct inode *inode;
+ void *private_data;
+ struct btrfs_fs_info *fs_info;
struct bio *bio;
- struct list_head list;
extent_submit_bio_hook_t *submit_bio_start;
extent_submit_bio_hook_t *submit_bio_done;
int mirror_num;
@@ -131,7 +130,7 @@ struct async_submit_bio {
*/
u64 bio_offset;
struct btrfs_work work;
- int error;
+ blk_status_t status;
};
/*
@@ -799,7 +798,7 @@ static void end_workqueue_bio(struct bio *bio)
btrfs_work_func_t func;
fs_info = end_io_wq->info;
- end_io_wq->error = bio->bi_error;
+ end_io_wq->status = bio->bi_status;
if (bio_op(bio) == REQ_OP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
@@ -836,19 +835,19 @@ static void end_workqueue_bio(struct bio *bio)
btrfs_queue_work(wq, &end_io_wq->work);
}
-int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata)
{
struct btrfs_end_io_wq *end_io_wq;
end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
end_io_wq->private = bio->bi_private;
end_io_wq->end_io = bio->bi_end_io;
end_io_wq->info = info;
- end_io_wq->error = 0;
+ end_io_wq->status = 0;
end_io_wq->bio = bio;
end_io_wq->metadata = metadata;
@@ -868,14 +867,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
- int ret;
+ blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->inode, async->bio,
+ ret = async->submit_bio_start(async->private_data, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
- async->error = ret;
+ async->status = ret;
}
static void run_one_async_done(struct btrfs_work *work)
@@ -885,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work)
int limit;
async = container_of(work, struct async_submit_bio, work);
- fs_info = BTRFS_I(async->inode)->root->fs_info;
+ fs_info = async->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
@@ -898,13 +897,13 @@ static void run_one_async_done(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
/* If an error occurred we just want to clean up the bio and move on */
- if (async->error) {
- async->bio->bi_error = async->error;
+ if (async->status) {
+ async->bio->bi_status = async->status;
bio_endio(async->bio);
return;
}
- async->submit_bio_done(async->inode, async->bio, async->mirror_num,
+ async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
async->bio_flags, async->bio_offset);
}
@@ -916,20 +915,20 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags,
- u64 bio_offset,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done)
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset, void *private_data,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done)
{
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
- async->inode = inode;
+ async->private_data = private_data;
+ async->fs_info = fs_info;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
- async->error = 0;
+ async->status = 0;
atomic_inc(&fs_info->nr_async_submits);
@@ -959,12 +958,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
return 0;
}
-static int btree_csum_one_bio(struct bio *bio)
+static blk_status_t btree_csum_one_bio(struct bio *bio)
{
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
@@ -972,12 +972,12 @@ static int btree_csum_one_bio(struct bio *bio)
break;
}
- return ret;
+ return errno_to_blk_status(ret);
}
-static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -986,11 +986,12 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
return btree_csum_one_bio(bio);
}
-static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
- int ret;
+ struct inode *inode = private_data;
+ blk_status_t ret;
/*
* when we're called for a write, we're already in the async
@@ -998,7 +999,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
*/
ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
return ret;
@@ -1015,13 +1016,14 @@ static int check_async_write(unsigned long bio_flags)
return 1;
}
-static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(bio_flags);
- int ret;
+ blk_status_t ret;
if (bio_op(bio) != REQ_OP_WRITE) {
/*
@@ -1043,8 +1045,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
- ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0,
- bio_offset,
+ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
+ bio_offset, private_data,
__btree_submit_bio_start,
__btree_submit_bio_done);
}
@@ -1054,7 +1056,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
return 0;
out_w_error:
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
return ret;
}
@@ -1222,10 +1224,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
buf->start + buf->len - 1);
}
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return filemap_fdatawait_range(buf->pages[0]->mapping,
- buf->start, buf->start + buf->len - 1);
+ filemap_fdatawait_range(buf->pages[0]->mapping,
+ buf->start, buf->start + buf->len - 1);
}
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -1255,9 +1257,9 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
btrfs_assert_tree_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- -buf->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(buf);
@@ -1347,8 +1349,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->log_transid_committed = -1;
root->last_log_commit = 0;
if (!dummy)
- extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&root->dirty_log_pages, NULL);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1820,7 +1821,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
- bio->bi_error = end_io_wq->error;
+ bio->bi_status = end_io_wq->status;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
@@ -2309,7 +2310,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
inode->i_mapping->a_ops = &btree_aops;
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
+ extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
BTRFS_I(inode)->io_tree.track_uptodate = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
@@ -2626,7 +2627,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
- spin_lock_init(&fs_info->free_chunk_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
@@ -2662,12 +2662,11 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
- fs_info->fs_frozen = 0;
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
- fs_info->free_chunk_space = 0;
+ atomic64_set(&fs_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
@@ -2704,10 +2703,8 @@ int open_ctree(struct super_block *sb,
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
- extent_io_tree_init(&fs_info->freed_extents[0],
- fs_info->btree_inode->i_mapping);
- extent_io_tree_init(&fs_info->freed_extents[1],
- fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+ extent_io_tree_init(&fs_info->freed_extents[1], NULL);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
@@ -3485,65 +3482,61 @@ static int write_dev_supers(struct btrfs_device *device,
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
- if (bio->bi_private)
- complete(bio->bi_private);
- bio_put(bio);
+ complete(bio->bi_private);
}
/*
- * trigger flushes for one the devices. If you pass wait == 0, the flushes are
- * sent down. With wait == 1, it waits for the previous flush.
- *
- * any device where the flush fails with eopnotsupp are flagged as not-barrier
- * capable
+ * Submit a flush request to the device if it supports it. Error handling is
+ * done in the waiting counterpart.
*/
-static int write_dev_flush(struct btrfs_device *device, int wait)
+static void write_dev_flush(struct btrfs_device *device)
{
struct request_queue *q = bdev_get_queue(device->bdev);
- struct bio *bio;
- int ret = 0;
+ struct bio *bio = device->flush_bio;
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- return 0;
+ return;
- if (wait) {
- bio = device->flush_bio;
- if (!bio)
- return 0;
+ bio_reset(bio);
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
- wait_for_completion(&device->flush_wait);
+ submit_bio(bio);
+ device->flush_bio_sent = 1;
+}
- if (bio->bi_error) {
- ret = bio->bi_error;
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
+/*
+ * If the flush bio has been submitted by write_dev_flush, wait for it.
+ */
+static blk_status_t wait_dev_flush(struct btrfs_device *device)
+{
+ struct bio *bio = device->flush_bio;
- /* drop the reference from the wait == 0 run */
- bio_put(bio);
- device->flush_bio = NULL;
+ if (!device->flush_bio_sent)
+ return 0;
- return ret;
- }
+ device->flush_bio_sent = 0;
+ wait_for_completion_io(&device->flush_wait);
- /*
- * one reference for us, and we leave it for the
- * caller
- */
- device->flush_bio = NULL;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
- if (!bio)
- return -ENOMEM;
+ return bio->bi_status;
+}
- bio->bi_end_io = btrfs_end_empty_barrier;
- bio->bi_bdev = device->bdev;
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
- init_completion(&device->flush_wait);
- bio->bi_private = &device->flush_wait;
- device->flush_bio = bio;
+static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
+{
+ int dev_flush_error = 0;
+ struct btrfs_device *dev;
+
+ list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) {
+ if (!dev->bdev || dev->last_flush_error)
+ dev_flush_error++;
+ }
- bio_get(bio);
- btrfsic_submit_bio(bio);
+ if (dev_flush_error >
+ fsdevs->fs_info->num_tolerated_disk_barrier_failures)
+ return -EIO;
return 0;
}
@@ -3556,25 +3549,21 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
- int errors_send = 0;
int errors_wait = 0;
- int ret;
+ blk_status_t ret;
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
if (dev->missing)
continue;
- if (!dev->bdev) {
- errors_send++;
+ if (!dev->bdev)
continue;
- }
if (!dev->in_fs_metadata || !dev->writeable)
continue;
- ret = write_dev_flush(dev, 0);
- if (ret)
- errors_send++;
+ write_dev_flush(dev);
+ dev->last_flush_error = 0;
}
/* wait for all the barriers */
@@ -3588,13 +3577,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
if (!dev->in_fs_metadata || !dev->writeable)
continue;
- ret = write_dev_flush(dev, 1);
- if (ret)
+ ret = wait_dev_flush(dev);
+ if (ret) {
+ dev->last_flush_error = ret;
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
errors_wait++;
+ }
+ }
+
+ if (errors_wait) {
+ /*
+ * At some point we need the status of all disks
+ * to arrive at the volume status. So error checking
+ * is being pushed to a separate loop.
+ */
+ return check_barrier_error(info->fs_devices);
}
- if (errors_send > info->num_tolerated_disk_barrier_failures ||
- errors_wait > info->num_tolerated_disk_barrier_failures)
- return -EIO;
return 0;
}
@@ -4049,9 +4048,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->start, transid, fs_info->generation);
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty)
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- buf->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ buf->len,
+ fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
btrfs_print_leaf(fs_info, buf);
@@ -4578,11 +4577,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
-
- /*
- memset(cur_trans, 0, sizeof(*cur_trans));
- kmem_cache_free(btrfs_transaction_cachep, cur_trans);
- */
}
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
@@ -4638,6 +4632,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
+static struct btrfs_fs_info *btree_fs_info(void *private_data)
+{
+ struct inode *inode = private_data;
+ return btrfs_sb(inode->i_sb);
+}
+
static const struct extent_io_ops btree_extent_io_ops = {
/* mandatory callbacks */
.submit_bio_hook = btree_submit_bio_hook,
@@ -4645,6 +4645,8 @@ static const struct extent_io_ops btree_extent_io_ops = {
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
+ .set_range_writeback = btrfs_set_range_writeback,
+ .tree_fs_info = btree_fs_info,
/* optional callbacks */
};
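Note: the barrier rework above splits the old dual-mode write_dev_flush(dev, wait) into a submit phase (write_dev_flush, reusing a preallocated per-device bio via bio_reset) and a wait phase (wait_dev_flush), paired through a flush_bio_sent flag; errors are counted only on the wait side and evaluated in check_barrier_error(). A single-threaded model of the pairing, with a plain status field standing in for the completion the IRQ path would signal:

	#include <stdio.h>

	struct device_model {
		int flush_bio_sent; /* submit/wait handshake */
		int flush_status;   /* filled in by "completion" */
	};

	/* Phase 1: fire the flush and mark it in flight; no error handling. */
	static void write_dev_flush_model(struct device_model *dev)
	{
		dev->flush_status = 0; /* pretend the flush completed OK */
		dev->flush_bio_sent = 1;
	}

	/* Phase 2: consume the flag and report status for this device. */
	static int wait_dev_flush_model(struct device_model *dev)
	{
		if (!dev->flush_bio_sent)
			return 0; /* nothing was submitted */
		dev->flush_bio_sent = 0;
		return dev->flush_status;
	}

	int main(void)
	{
		struct device_model dev = { 0 };
		write_dev_flush_model(&dev);            /* loop 1: all devices */
		printf("flush status: %d\n",
		       wait_dev_flush_model(&dev));     /* loop 2: all devices */
		return 0;
	}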
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 21f1ceb85b76..0a634d3ffc16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -118,16 +118,16 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, u8 *result);
-int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags, u64 bio_offset,
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset, void *private_data,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 87144c9f9593..fa66980726c9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -282,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name,
name_len = btrfs_inode_ref_name_len(leaf, iref);
}
+ ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len);
+ if (!ret) {
+ btrfs_free_path(path);
+ return -EIO;
+ }
read_extent_buffer(leaf, name, name_ptr, name_len);
btrfs_free_path(path);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33d979e9ea2a..e3b0b4196d3d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-static int __reserve_metadata_bytes(struct btrfs_root *root,
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush);
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes);
@@ -766,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
return NULL;
}
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ }
+
+ space_info = __find_space_info(fs_info, flags);
+ ASSERT(space_info);
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
/*
* after adding space to the filesystem, we need to clear the full flags
* on all the space infos.
@@ -2092,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset)
{
+ int old_ref_mod, new_ref_mod;
int ret;
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
@@ -2099,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL);
+ num_bytes, parent,
+ root_objectid, (int)owner,
+ BTRFS_ADD_DELAYED_REF, NULL,
+ &old_ref_mod, &new_ref_mod);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes, parent, root_objectid,
- owner, offset, 0,
- BTRFS_ADD_DELAYED_REF);
+ num_bytes, parent,
+ root_objectid, owner, offset,
+ 0, BTRFS_ADD_DELAYED_REF,
+ &old_ref_mod, &new_ref_mod);
}
+
+ if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+ add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+
return ret;
}
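Note: old_ref_mod/new_ref_mod report the head's total_ref_mod before and after the update; the check above fires when an add flips it from negative to non-negative, i.e. a pending free was cancelled, so the bytes previously counted toward total_bytes_pinned are subtracted again. A minimal model of that sign-transition bookkeeping (the symmetric case is included for illustration; only the caller-relevant direction appears in the hunk above):

	#include <stdio.h>

	static long long total_bytes_pinned_model;

	/* total_ref_mod < 0 means the extent is headed for freeing and its
	 * bytes are counted as about-to-be-pinned; crossing zero in either
	 * direction adjusts the counter by the extent size. */
	static void apply_ref_update_model(int *total_ref_mod, int delta,
					   long long num_bytes)
	{
		int old = *total_ref_mod;

		*total_ref_mod += delta;
		if (old < 0 && *total_ref_mod >= 0)
			total_bytes_pinned_model -= num_bytes; /* free cancelled */
		else if (old >= 0 && *total_ref_mod < 0)
			total_bytes_pinned_model += num_bytes; /* free now pending */
	}

	int main(void)
	{
		int ref_mod = -1; /* one pending drop */
		total_bytes_pinned_model = 4096;
		apply_ref_update_model(&ref_mod, 1, 4096); /* an add arrives */
		printf("pinned: %lld\n", total_bytes_pinned_model); /* 0 */
		return 0;
	}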
@@ -2411,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
head = btrfs_delayed_node_to_head(node);
trace_run_delayed_ref_head(fs_info, node, head, node->action);
+ if (head->total_ref_mod < 0) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, node->bytenr);
+ ASSERT(cache);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ -node->num_bytes);
+ btrfs_put_block_group(cache);
+ }
+
if (insert_reserved) {
btrfs_pin_extent(fs_info, node->bytenr,
node->num_bytes, 1);
@@ -3364,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
+ struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
u64 num_pages = 0;
@@ -3483,7 +3522,7 @@ again:
num_pages *= 16;
num_pages *= PAGE_SIZE;
- ret = btrfs_check_data_free_space(inode, 0, num_pages);
+ ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
if (ret)
goto out_put;
@@ -3514,6 +3553,7 @@ out:
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -3924,88 +3964,83 @@ static const char *alloc_name(u64 flags)
};
}
-static int update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
- struct btrfs_space_info **space_info)
+static int create_space_info(struct btrfs_fs_info *info, u64 flags,
+ struct btrfs_space_info **new)
{
- struct btrfs_space_info *found;
+
+ struct btrfs_space_info *space_info;
int i;
- int factor;
int ret;
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- factor = 2;
- else
- factor = 1;
-
- found = __find_space_info(info, flags);
- if (found) {
- spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- if (total_bytes > 0)
- found->full = 0;
- space_info_add_new_bytes(info, found, total_bytes -
- bytes_used - bytes_readonly);
- spin_unlock(&found->lock);
- *space_info = found;
- return 0;
- }
- found = kzalloc(sizeof(*found), GFP_NOFS);
- if (!found)
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
+ GFP_KERNEL);
if (ret) {
- kfree(found);
+ kfree(space_info);
return ret;
}
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- INIT_LIST_HEAD(&found->block_groups[i]);
- init_rwsem(&found->groups_sem);
- spin_lock_init(&found->lock);
- found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
- found->total_bytes = total_bytes;
- found->disk_total = total_bytes * factor;
- found->bytes_used = bytes_used;
- found->disk_used = bytes_used * factor;
- found->bytes_pinned = 0;
- found->bytes_reserved = 0;
- found->bytes_readonly = bytes_readonly;
- found->bytes_may_use = 0;
- found->full = 0;
- found->max_extent_size = 0;
- found->force_alloc = CHUNK_ALLOC_NO_FORCE;
- found->chunk_alloc = 0;
- found->flush = 0;
- init_waitqueue_head(&found->wait);
- INIT_LIST_HEAD(&found->ro_bgs);
- INIT_LIST_HEAD(&found->tickets);
- INIT_LIST_HEAD(&found->priority_tickets);
-
- ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+ INIT_LIST_HEAD(&space_info->block_groups[i]);
+ init_rwsem(&space_info->groups_sem);
+ spin_lock_init(&space_info->lock);
+ space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ init_waitqueue_head(&space_info->wait);
+ INIT_LIST_HEAD(&space_info->ro_bgs);
+ INIT_LIST_HEAD(&space_info->tickets);
+ INIT_LIST_HEAD(&space_info->priority_tickets);
+
+ ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
- alloc_name(found->flags));
+ alloc_name(space_info->flags));
if (ret) {
- percpu_counter_destroy(&found->total_bytes_pinned);
- kfree(found);
+ percpu_counter_destroy(&space_info->total_bytes_pinned);
+ kfree(space_info);
return ret;
}
- *space_info = found;
- list_add_rcu(&found->list, &info->space_info);
+ *new = space_info;
+ list_add_rcu(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
- info->data_sinfo = found;
+ info->data_sinfo = space_info;
return ret;
}
+static void update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
+ struct btrfs_space_info **space_info)
+{
+ struct btrfs_space_info *found;
+ int factor;
+
+ if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+
+ found = __find_space_info(info, flags);
+ ASSERT(found);
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
+ found->disk_total += total_bytes * factor;
+ found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
+ found->bytes_readonly += bytes_readonly;
+ if (total_bytes > 0)
+ found->full = 0;
+ space_info_add_new_bytes(info, found, total_bytes -
+ bytes_used - bytes_readonly);
+ spin_unlock(&found->lock);
+ *space_info = found;
+}
+
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 extra_flags = chunk_to_extended(flags) &
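/*
 * The split above separates a fallible create phase (mount time, may
 * return -ENOMEM) from an infallible update phase that asserts the
 * object already exists. A sketch of the same two-phase shape with
 * hypothetical names; this is not the btrfs API:
 */
#include <assert.h>
#include <errno.h>
#include <stdlib.h>

struct sinfo { unsigned long long total, used; };

static struct sinfo *table[8];

static int create_sinfo(unsigned int idx)   /* fallible, callers check */
{
    if (table[idx])
        return 0;
    table[idx] = calloc(1, sizeof(*table[idx]));
    return table[idx] ? 0 : -ENOMEM;
}

static void update_sinfo(unsigned int idx, unsigned long long total,
                         unsigned long long used)  /* void by design */
{
    struct sinfo *s = table[idx];

    assert(s);      /* the create phase must already have run */
    s->total += total;
    s->used += used;
}

int main(void)
{
    if (create_sinfo(0))
        return 1;
    update_sinfo(0, 1ULL << 20, 4096);
    return 0;
}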
@@ -4121,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
return btrfs_reduce_alloc_profile(fs_info, flags);
}
-u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 flags;
@@ -4138,6 +4173,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
+u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
+}
+
+u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+}
+
+u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+}
+
static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
@@ -4187,7 +4237,7 @@ again:
data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&data_sinfo->lock);
alloc:
- alloc_target = btrfs_get_alloc_profile(root, 1);
+ alloc_target = btrfs_data_alloc_profile(fs_info);
/*
* It is ugly that we don't use a nolock join
* transaction for the free space inode case here.
@@ -4238,7 +4288,7 @@ commit_trans:
if (need_commit > 0) {
btrfs_start_delalloc_roots(fs_info, 0, -1);
- btrfs_wait_ordered_roots(fs_info, -1, 0,
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
(u64)-1);
}
@@ -4278,12 +4328,8 @@ commit_trans:
return ret;
}
-/*
- * New check_data_free_space() with ability for precious data reservation
- * Will replace old btrfs_check_data_free_space(), but for patch split,
- * add a new function first and then replace it.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
@@ -4298,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precise data space. */
- ret = btrfs_qgroup_reserve_data(inode, start, len);
- if (ret)
+ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+ if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
+ else
+ ret = 0;
return ret;
}
@@ -4341,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+void btrfs_free_reserved_data_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4351,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
start = round_down(start, root->fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(inode, start, len);
- btrfs_qgroup_free_data(inode, start, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len);
}
static void force_metadata_allocation(struct btrfs_fs_info *info)
@@ -4463,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
}
if (left < thresh) {
- u64 flags;
+ u64 flags = btrfs_system_alloc_profile(fs_info);
- flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leaves from
@@ -4506,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
space_info = __find_space_info(fs_info, flags);
if (!space_info) {
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
- BUG_ON(ret); /* -ENOMEM */
+ ret = create_space_info(fs_info, flags, &space_info);
+ if (ret)
+ return ret;
}
- BUG_ON(!space_info); /* Logic error */
again:
spin_lock(&space_info->lock);
@@ -4614,11 +4662,11 @@ out:
return ret;
}
-static int can_overcommit(struct btrfs_root *root,
+static int can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 profile;
u64 space_size;
@@ -4629,7 +4677,11 @@ static int can_overcommit(struct btrfs_root *root,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
- profile = btrfs_get_alloc_profile(root, 0);
+ if (system_chunk)
+ profile = btrfs_system_alloc_profile(fs_info);
+ else
+ profile = btrfs_metadata_alloc_profile(fs_info);
+
used = btrfs_space_info_used(space_info, false);
/*
@@ -4646,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root,
used += space_info->bytes_may_use;
- spin_lock(&fs_info->free_chunk_lock);
- avail = fs_info->free_chunk_space;
- spin_unlock(&fs_info->free_chunk_lock);
+ avail = atomic64_read(&fs_info->free_chunk_space);
/*
* If we have dup, raid1 or raid10 then only half of the free
@@ -4698,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
}
}
-static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
u64 bytes;
- int nr;
+ u64 nr;
bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- nr = (int)div64_u64(to_reclaim, bytes);
+ nr = div64_u64(to_reclaim, bytes);
if (!nr)
nr = 1;
return nr;
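/*
 * Why nr is widened to u64: casting the 64-bit quotient to int silently
 * truncates once to_reclaim / bytes exceeds 31 bits. A quick userspace
 * demonstration of the old and new behaviour (the constants are
 * illustrative, not real reservation sizes):
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t to_reclaim = 1ULL << 40;   /* a very large backlog */
    uint64_t bytes = 16;                /* a tiny per-item cost */

    int old_nr = (int)(to_reclaim / bytes);     /* truncates (0 here) */
    uint64_t new_nr = to_reclaim / bytes;       /* exact */

    printf("old=%d new=%llu\n", old_nr, (unsigned long long)new_nr);
    return 0;
}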
@@ -4716,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
- bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
+ u64 orig, bool wait_ordered)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 max_reclaim;
+ u64 items;
long time_left;
unsigned long nr_pages;
int loops;
- int items;
enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &fs_info->delalloc_block_rsv;
@@ -4776,10 +4825,6 @@ skip_async:
else
flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (can_overcommit(root, space_info, orig, flush)) {
- spin_unlock(&space_info->lock);
- break;
- }
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
spin_unlock(&space_info->lock);
@@ -4838,7 +4883,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_rsv->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes - delayed_rsv->size) >= 0) {
+ bytes - delayed_rsv->size) < 0) {
spin_unlock(&delayed_rsv->lock);
return -ENOSPC;
}
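/*
 * The flipped comparison: committing a transaction helps only if the
 * pinned total covers what the ticket still needs beyond the delayed-refs
 * rsv. percpu_counter_compare() returns <0, 0 or >0 like memcmp, so the
 * bail-out condition is "pinned < needed". A reduced model:
 */
#include <errno.h>
#include <stdio.h>

static int counter_compare(long counter, long rhs)
{
    return (counter > rhs) - (counter < rhs);
}

static int may_commit(long pinned, long bytes, long delayed_rsv_size)
{
    if (counter_compare(pinned, bytes - delayed_rsv_size) < 0)
        return -ENOSPC;     /* a commit would not free enough */
    return 0;
}

int main(void)
{
    printf("%d\n", may_commit(1L << 20, 1L << 21, 1L << 19)); /* -ENOSPC */
    printf("%d\n", may_commit(1L << 21, 1L << 20, 0));        /* 0 */
    return 0;
}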
@@ -4886,7 +4931,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes * 2, orig_bytes,
+ shrink_delalloc(fs_info, num_bytes * 2, orig_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case ALLOC_CHUNK:
@@ -4896,7 +4941,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = do_chunk_alloc(trans, fs_info,
- btrfs_get_alloc_profile(root, 0),
+ btrfs_metadata_alloc_profile(fs_info),
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
@@ -4917,8 +4962,9 @@ static int flush_space(struct btrfs_fs_info *fs_info,
}
static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
- struct btrfs_space_info *space_info)
+btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool system_chunk)
{
struct reserve_ticket *ticket;
u64 used;
@@ -4933,14 +4979,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
return to_reclaim;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(fs_info, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL, system_chunk))
return 0;
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
- if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
+ used = btrfs_space_info_used(space_info, true);
+
+ if (can_overcommit(fs_info, space_info, SZ_1M,
+ BTRFS_RESERVE_FLUSH_ALL, system_chunk))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4954,17 +5000,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
return to_reclaim;
}
-static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_root *root, u64 used)
+static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 used, bool system_chunk)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- if (!btrfs_calc_reclaim_metadata_size(root, space_info))
+ if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ system_chunk))
return 0;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -5001,8 +5048,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ false);
if (!to_reclaim) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
@@ -5024,8 +5071,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
+ space_info,
+ false);
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
if (last_tickets_id == space_info->tickets_id) {
@@ -5063,8 +5111,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int flush_state = FLUSH_DELAYED_ITEMS_NR;
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->extent_root,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ false);
if (!to_reclaim) {
spin_unlock(&space_info->lock);
return;
@@ -5143,12 +5191,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_root *root,
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -5170,7 +5218,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
ret = 0;
- } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
+ } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
+ system_chunk)) {
space_info->bytes_may_use += orig_bytes;
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
@@ -5197,7 +5246,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
orig_bytes, flush,
"enospc");
queue_work(system_unbound_wq,
- &root->fs_info->async_reclaim_work);
+ &fs_info->async_reclaim_work);
}
} else {
list_add_tail(&ticket.list,
@@ -5211,7 +5260,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(space_info, root, used) &&
+ need_do_async_reclaim(fs_info, space_info,
+ used, system_chunk) &&
!work_busy(&fs_info->async_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
@@ -5269,9 +5319,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
+ bool system_chunk = (root == fs_info->chunk_root);
- ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
- flush);
+ ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ orig_bytes, flush, system_chunk);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -5380,9 +5431,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
* overcommit, and if we can't then we just need to free up our space
* and not satisfy any requests.
*/
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ used = btrfs_space_info_used(space_info, true);
if (used - num_bytes >= space_info->total_bytes)
check_overcommit = true;
again:
@@ -5394,8 +5443,7 @@ again:
* adding the ticket space would be a double count.
*/
if (check_overcommit &&
- !can_overcommit(fs_info->extent_root, space_info, 0,
- flush))
+ !can_overcommit(fs_info, space_info, 0, flush, false))
break;
if (num_bytes >= ticket->bytes) {
list_del_init(&ticket->list);
@@ -6124,6 +6172,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* @inode: inode we're writing to
* @start: start range we are writing to
* @len: how long the range we are writing to
+ * @reserved: mandatory parameter, records the qgroup ranges actually
+ * reserved by the current reservation.
*
* This will do the following things
*
@@ -6141,16 +6191,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* Return 0 for success
* Return <0 for error (-ENOSPC or -EDQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+int btrfs_delalloc_reserve_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
if (ret < 0)
- btrfs_free_reserved_data_space(inode, start, len);
+ btrfs_free_reserved_data_space(inode, *reserved, start, len);
return ret;
}
@@ -6169,10 +6220,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
+void btrfs_delalloc_release_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
{
btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
- btrfs_free_reserved_data_space(inode, start, len);
+ btrfs_free_reserved_data_space(inode, reserved, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6248,6 +6300,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(info, "pinned",
cache->space_info->flags,
num_bytes, 1);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ num_bytes);
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -6324,6 +6378,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
trace_btrfs_space_reservation(fs_info, "pinned",
cache->space_info->flags, num_bytes, 1);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
set_extent_dirty(fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -6794,27 +6849,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
return 0;
}
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
- u64 owner, u64 root_objectid)
-{
- struct btrfs_space_info *space_info;
- u64 flags;
-
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- } else {
- flags = BTRFS_BLOCK_GROUP_DATA;
- }
-
- space_info = __find_space_info(fs_info, flags);
- BUG_ON(!space_info); /* Logic bug */
- percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
-}
-
-
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info,
struct btrfs_delayed_ref_node *node, u64 parent,
@@ -7037,8 +7071,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
- add_pinned_bytes(info, -num_bytes, owner_objectid,
- root_objectid);
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
@@ -7170,19 +7202,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(fs_info, trans,
- buf->start, buf->len,
- parent,
+ int old_ref_mod, new_ref_mod;
+
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
+ buf->len, parent,
root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
+ BTRFS_DROP_DELAYED_REF, NULL,
+ &old_ref_mod, &new_ref_mod);
BUG_ON(ret); /* -ENOMEM */
+ pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
- if (!last_ref)
- return;
-
- if (btrfs_header_generation(buf) == trans->transid) {
+ if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group_cache *cache;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -7191,6 +7223,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
+ pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -7206,18 +7239,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_reserved_bytes(cache, buf->len, 0);
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
- pin = 0;
}
out:
if (pin)
add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
root->root_key.objectid);
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't matter
- * anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ if (last_ref) {
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ }
}
/* Can return -ENOMEM */
@@ -7226,12 +7260,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset)
{
+ int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
return 0;
- add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
/*
* tree log blocks never actually go into the extent allocation
@@ -7241,19 +7275,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
+ old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL);
+ num_bytes, parent,
+ root_objectid, (int)owner,
+ BTRFS_DROP_DELAYED_REF, NULL,
+ &old_ref_mod, &new_ref_mod);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner,
- offset, 0,
- BTRFS_DROP_DELAYED_REF);
+ num_bytes, parent,
+ root_objectid, owner, offset,
+ 0, BTRFS_DROP_DELAYED_REF,
+ &old_ref_mod, &new_ref_mod);
}
+
+ if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
+ add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+
return ret;
}
@@ -7545,6 +7585,10 @@ search:
u64 offset;
int cached;
+ /* If the block group is read-only, we can skip it entirely. */
+ if (unlikely(block_group->ro))
+ continue;
+
btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
@@ -7580,8 +7624,6 @@ have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
- if (unlikely(block_group->ro))
- goto loop;
/*
* Ok we want to try and use the cluster allocator, so
@@ -7795,6 +7837,7 @@ loop:
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
btrfs_release_block_group(block_group, delalloc);
+ cond_resched();
}
up_read(&space_info->groups_sem);
@@ -7956,7 +7999,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 flags;
int ret;
- flags = btrfs_get_alloc_profile(root, is_data);
+ flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
@@ -8200,9 +8243,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
- ins->offset, 0,
- root_objectid, owner, offset,
- ram_bytes, BTRFS_ADD_DELAYED_EXTENT);
+ ins->offset, 0, root_objectid, owner,
+ offset, ram_bytes,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
return ret;
}
@@ -8422,11 +8465,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->is_data = false;
extent_op->level = level;
- ret = btrfs_add_delayed_tree_ref(fs_info, trans,
- ins.objectid, ins.offset,
- parent, root_objectid, level,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
+ ins.offset, parent,
+ root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
+ extent_op, NULL, NULL);
if (ret)
goto out_free_delayed;
}
@@ -10059,19 +10102,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
}
trace_btrfs_add_block_group(info, cache, 0);
- ret = update_space_info(info, cache->flags, found_key.offset,
- btrfs_block_group_used(&cache->item),
- cache->bytes_super, &space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- spin_lock(&info->block_group_cache_lock);
- rb_erase(&cache->cache_node,
- &info->block_group_cache_tree);
- RB_CLEAR_NODE(&cache->cache_node);
- spin_unlock(&info->block_group_cache_lock);
- btrfs_put_block_group(cache);
- goto error;
- }
+ update_space_info(info, cache->flags, found_key.offset,
+ btrfs_block_group_used(&cache->item),
+ cache->bytes_super, &space_info);
cache->space_info = space_info;
@@ -10203,16 +10236,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
}
#endif
/*
- * Call to ensure the corresponding space_info object is created and
- * assigned to our block group, but don't update its counters just yet.
- * We want our bg to be added to the rbtree with its ->space_info set.
+ * Ensure the corresponding space_info object is created and
+ * assigned to our block group. We want our bg to be added to the rbtree
+ * with its ->space_info set.
*/
- ret = update_space_info(fs_info, cache->flags, 0, 0, 0,
- &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
+ cache->space_info = __find_space_info(fs_info, cache->flags);
+ if (!cache->space_info) {
+ ret = create_space_info(fs_info, cache->flags,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
}
ret = btrfs_add_block_group_cache(fs_info, cache);
@@ -10227,18 +10263,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- ret = update_space_info(fs_info, cache->flags, size, bytes_used,
+ update_space_info(fs_info, cache->flags, size, bytes_used,
cache->bytes_super, &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&cache->cache_node,
- &fs_info->block_group_cache_tree);
- RB_CLEAR_NODE(&cache->cache_node);
- spin_unlock(&fs_info->block_group_cache_lock);
- btrfs_put_block_group(cache);
- return ret;
- }
update_global_block_rsv(fs_info);
__link_block_group(cache->space_info, cache);
@@ -10786,21 +10812,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
+ ret = create_space_info(fs_info, flags, &space_info);
}
out:
return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d3619e010005..0aff9b278c19 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
{
- struct inode *inode;
- u64 isize;
-
- if (!tree->mapping)
- return;
-
- inode = tree->mapping->host;
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
+ if (tree->ops && tree->ops->check_extent_io_range)
+ tree->ops->check_extent_io_range(tree->private_data, caller,
+ start, end);
}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
- if (!tree->mapping)
- return NULL;
- return btrfs_sb(tree->mapping->host->i_sb);
+ if (tree->ops)
+ return tree->ops->tree_fs_info(tree->private_data);
+ return NULL;
}
int __init extent_io_init(void)
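/*
 * The mapping pointer is replaced by an opaque private_data plus ops
 * callbacks, so an extent_io_tree no longer has to be backed by an
 * address_space. The underlying C pattern, with hypothetical names
 * (tree_ops, owner_name) that are not kernel interfaces:
 */
#include <stdio.h>

struct tree_ops {
    const char *(*owner_name)(void *private_data);
};

struct tree {
    const struct tree_ops *ops;
    void *private_data;
};

struct my_inode { const char *name; };

static const char *inode_owner_name(void *private_data)
{
    return ((struct my_inode *)private_data)->name;
}

static const struct tree_ops inode_tree_ops = {
    .owner_name = inode_owner_name,
};

static void describe(const struct tree *t)
{
    /* optional hook: call through only when the backend provides it */
    if (t->ops && t->ops->owner_name)
        printf("tree owned by %s\n", t->ops->owner_name(t->private_data));
    else
        printf("anonymous tree\n");
}

int main(void)
{
    struct my_inode ino = { "inode 257" };
    struct tree with_backend = { &inode_tree_ops, &ino };
    struct tree anonymous = { NULL, NULL };

    describe(&with_backend);
    describe(&anonymous);
    return 0;
}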
@@ -174,7 +164,8 @@ int __init extent_io_init(void)
goto free_state_cache;
btrfs_bioset = bioset_create(BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio));
+ offsetof(struct btrfs_io_bio, bio),
+ BIOSET_NEED_BVECS);
if (!btrfs_bioset)
goto free_buffer_cache;
@@ -213,13 +204,13 @@ void extent_io_exit(void)
}
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping)
+ void *private_data)
{
tree->state = RB_ROOT;
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
- tree->mapping = mapping;
+ tree->private_data = private_data;
}
static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -369,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
struct extent_state *other)
{
if (tree->ops && tree->ops->merge_extent_hook)
- tree->ops->merge_extent_hook(tree->mapping->host, new,
- other);
+ tree->ops->merge_extent_hook(tree->private_data, new, other);
}
/*
@@ -421,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
- tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+ tree->ops->set_bit_hook(tree->private_data, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
- tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host),
- state, bits);
+ tree->ops->clear_bit_hook(tree->private_data, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -478,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
u64 split)
{
if (tree->ops && tree->ops->split_extent_hook)
- tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+ tree->ops->split_extent_hook(tree->private_data, orig, split);
}
/*
@@ -1402,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
*/
static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
- put_page(page);
- index++;
- }
+ tree->ops->set_range_writeback(tree->private_data, start, end);
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1961,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
SetPageUptodate(page);
}
-int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec)
{
int ret;
int err = 0;
- struct extent_io_tree *failure_tree = &inode->io_failure_tree;
set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
@@ -1974,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
if (ret)
err = ret;
- ret = clear_extent_bits(&inode->io_tree, rec->start,
+ ret = clear_extent_bits(io_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_DAMAGED);
if (ret && !err)
@@ -1994,11 +1974,10 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
- u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
@@ -2009,9 +1988,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
BUG_ON(!mirror_num);
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return -EIO;
+ bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
@@ -2070,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- btrfs_ino(inode), start,
+ ino, start,
rcu_str_deref(dev->name), sector);
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
@@ -2090,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
- ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start,
- PAGE_SIZE, start, p,
+ ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
break;
@@ -2105,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
- unsigned int pg_offset)
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset)
{
u64 private;
struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *state;
int num_copies;
int ret;
private = 0;
- ret = count_range_bits(&inode->io_failure_tree, &private,
- (u64)-1, 1, EXTENT_DIRTY, 0);
+ ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
+ EXTENT_DIRTY, 0);
if (!ret)
return 0;
- ret = get_state_failrec(&inode->io_failure_tree, start,
- &failrec);
+ ret = get_state_failrec(failure_tree, start, &failrec);
if (ret)
return 0;
@@ -2138,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
if (fs_info->sb->s_flags & MS_RDONLY)
goto out;
- spin_lock(&inode->io_tree.lock);
- state = find_first_extent_bit_state(&inode->io_tree,
+ spin_lock(&io_tree->lock);
+ state = find_first_extent_bit_state(io_tree,
failrec->start,
EXTENT_LOCKED);
- spin_unlock(&inode->io_tree.lock);
+ spin_unlock(&io_tree->lock);
if (state && state->start <= failrec->start &&
state->end >= failrec->start + failrec->len - 1) {
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
- repair_io_failure(inode, start, failrec->len,
- failrec->logical, page,
- pg_offset, failrec->failed_mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset,
+ failrec->failed_mirror);
}
}
out:
- free_io_failure(inode, failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
return 0;
}
@@ -2282,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
return 0;
}
-int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec, int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2298,7 +2274,7 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
btrfs_debug(fs_info,
"Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
num_copies, failrec->this_mirror, failed_mirror);
- return 0;
+ return false;
}
/*
@@ -2339,10 +2315,10 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
btrfs_debug(fs_info,
"Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
num_copies, failrec->this_mirror, failed_mirror);
- return 0;
+ return false;
}
- return 1;
+ return true;
}
@@ -2356,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct btrfs_io_bio *btrfs_failed_bio;
struct btrfs_io_bio *btrfs_bio;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return NULL;
-
+ bio = btrfs_io_bio_alloc(1);
bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = fs_info->fs_devices->latest_bdev;
@@ -2397,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct io_failure_record *failrec;
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct bio *bio;
int read_mode = 0;
+ blk_status_t status;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2407,9 +2382,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
if (ret)
return ret;
- ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
- if (!ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ if (!btrfs_check_repairable(inode, failed_bio, failrec,
+ failed_mirror)) {
+ free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
@@ -2421,21 +2396,18 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
start - page_offset(page),
(int)phy_offset, failed_bio->bi_end_io,
NULL);
- if (!bio) {
- free_io_failure(BTRFS_I(inode), failrec);
- return -EIO;
- }
bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
btrfs_debug(btrfs_sb(inode->i_sb),
"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
read_mode, failrec->this_mirror, failrec->in_validation);
- ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+ status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
failrec->bio_flags, 0);
- if (ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ if (status) {
+ free_io_failure(failure_tree, tree, failrec);
bio_put(bio);
+ ret = blk_status_to_errno(status);
}
return ret;
@@ -2474,11 +2446,13 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
*/
static void end_bio_extent_writepage(struct bio *bio)
{
+ int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
u64 end;
int i;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
@@ -2503,7 +2477,7 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- end_extent_writepage(page, bio->bi_error, start, end);
+ end_extent_writepage(page, error, start, end);
end_page_writeback(page);
}
@@ -2536,9 +2510,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = !bio->bi_error;
+ int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *tree;
+ struct extent_io_tree *tree, *failure_tree;
u64 offset = 0;
u64 start;
u64 end;
@@ -2549,6 +2523,7 @@ static void end_bio_extent_readpage(struct bio *bio)
int ret;
int i;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
@@ -2556,9 +2531,10 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_error,
+ (u64)bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2588,8 +2564,10 @@ static void end_bio_extent_readpage(struct bio *bio)
if (ret)
uptodate = 0;
else
- clean_io_failure(BTRFS_I(inode), start,
- page, 0);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, tree, start,
+ page,
+ btrfs_ino(BTRFS_I(inode)), 0);
}
if (likely(uptodate))
@@ -2615,7 +2593,7 @@ static void end_bio_extent_readpage(struct bio *bio)
ret = bio_readpage_error(bio, offset, page,
start, end, mirror);
if (ret == 0) {
- uptodate = !bio->bi_error;
+ uptodate = !bio->bi_status;
offset += len;
continue;
}
@@ -2673,77 +2651,80 @@ readpage_ok:
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
if (io_bio->end_io)
- io_bio->end_io(io_bio, bio->bi_error);
+ io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
bio_put(bio);
}
/*
- * this allocates from the btrfs_bioset. We're returning a bio right now
- * but you can call btrfs_io_bio for the appropriate container_of magic
+ * Initialize the members up to but not including 'bio'. Use after allocating a
+ * new bio with bio_alloc_bioset(), which does not zero the bytes outside of
+ * 'bio' since __GFP_ZERO is not supported.
*/
-struct bio *
-btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
- gfp_t gfp_flags)
+static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
{
- struct btrfs_io_bio *btrfs_bio;
- struct bio *bio;
-
- bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
+ memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+}
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2)) {
- bio = bio_alloc_bioset(gfp_flags,
- nr_vecs, btrfs_bioset);
- }
- }
+/*
+ * The following helpers allocate a bio. As they are backed by a bioset,
+ * allocation will never fail. A plain bio is returned, but you can call
+ * btrfs_io_bio() for the appropriate container_of magic.
+ */
+struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
+{
+ struct bio *bio;
- if (bio) {
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = first_sector;
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = first_byte >> 9;
+ btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
-struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *btrfs_bio_clone(struct bio *bio)
{
struct btrfs_io_bio *btrfs_bio;
struct bio *new;
- new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
- if (new) {
- btrfs_bio = btrfs_io_bio(new);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ /* Bio allocation backed by a bioset does not fail */
+ new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_io_bio_init(btrfs_bio);
+ btrfs_bio->iter = bio->bi_iter;
return new;
}
-/* this also allocates from the btrfs_bioset */
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
{
- struct btrfs_io_bio *btrfs_bio;
struct bio *bio;
- bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
- if (bio) {
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ /* Bio allocation backed by a bioset does not fail */
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+ btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
+struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_bio;
+
+ /* this will never fail when it's backed by a bioset */
+ bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+ ASSERT(bio);
+
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_io_bio_init(btrfs_bio);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ btrfs_bio->iter = bio->bi_iter;
+ return bio;
+}
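/*
 * btrfs_bio_clone_partial() trims a fast clone down to a byte range, and
 * bio_trim() counts in 512-byte sectors, hence the >> 9 shifts. A tiny
 * sketch of that unit conversion; the alignment asserts are an assumption
 * spelled out here, not kernel code:
 */
#include <assert.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

static void trim_range(unsigned long long offset, unsigned long long size)
{
    /* byte ranges must be sector aligned or the shift drops data */
    assert((offset & ((1ULL << SECTOR_SHIFT) - 1)) == 0);
    assert((size & ((1ULL << SECTOR_SHIFT) - 1)) == 0);

    printf("trim to sector %llu, %llu sectors\n",
           offset >> SECTOR_SHIFT, size >> SECTOR_SHIFT);
}

int main(void)
{
    trim_range(4096, 8192);     /* sector 8, 16 sectors */
    return 0;
}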
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
- int ret = 0;
+ blk_status_t ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
@@ -2755,13 +2736,13 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
bio_get(bio);
if (tree->ops)
- ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
mirror_num, bio_flags, start);
else
btrfsic_submit_bio(bio);
bio_put(bio);
- return ret;
+ return blk_status_to_errno(ret);
}
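/*
 * The submit path now speaks blk_status_t internally and converts to a
 * negative errno only at the boundary, instead of mixing the two error
 * domains. A sketch of that separation; the status values and mapping
 * below are illustrative, not the kernel's full table:
 */
#include <errno.h>
#include <stdio.h>

typedef unsigned char status_t;     /* 0 means success */

#define STS_OK      0
#define STS_IOERR   10

static int status_to_errno(status_t status)
{
    switch (status) {
    case STS_OK:
        return 0;
    default:
        return -EIO;
    }
}

static status_t submit(int fail)
{
    return fail ? STS_IOERR : STS_OK;
}

int main(void)
{
    /* callers above this boundary only ever see errnos */
    printf("%d\n", status_to_errno(submit(0)));     /* 0 */
    printf("%d\n", status_to_errno(submit(1)));     /* -EIO */
    return 0;
}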
static int merge_bio(struct extent_io_tree *tree, struct page *page,
@@ -2818,14 +2799,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
}
}
- bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
- GFP_NOFS | __GFP_HIGH);
- if (!bio)
- return -ENOMEM;
-
+ bio = btrfs_bio_alloc(bdev, sector << 9);
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ bio->bi_write_hint = page->mapping->host->i_write_hint;
bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
@@ -3597,9 +3575,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- -eb->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
ret = 1;
} else {
spin_unlock(&eb->refs_lock);
@@ -3700,6 +3678,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct extent_buffer *eb;
int i, done;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
@@ -3707,7 +3686,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (bio->bi_error ||
+ if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page);
@@ -3757,7 +3736,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems);
- end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
memzero_extent_buffer(eb, start, end - start);
}
@@ -4463,29 +4442,25 @@ try_submit_last:
}
/*
- * Sanity check for fiemap cache
+ * Emit last fiemap cache
+ *
+ * The last fiemap extent may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
*
- * All fiemap cache should be submitted by emit_fiemap_extent()
- * Iteration should be terminated either by last fiemap extent or
- * fieinfo->fi_extents_max.
- * So no cached fiemap should exist.
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
*/
-static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
- struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
+static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
{
int ret;
if (!cache->cached)
return 0;
- /* Small and recoverbale problem, only to info developer */
-#ifdef CONFIG_BTRFS_DEBUG
- WARN_ON(1);
-#endif
- btrfs_warn(fs_info,
- "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
- cache->offset, cache->phys, cache->len, cache->flags);
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
cache->cached = false;
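/*
 * The renamed helper flushes a deliberately cached entry rather than
 * warning about it: adjacent extents are merged in the cache, so the last
 * one must be emitted on the way out. The buffer-and-flush shape, reduced
 * to integers:
 */
#include <stdio.h>

struct cache { int cached; long start, len; };

static void emit(long start, long len)
{
    printf("extent %ld+%ld\n", start, len);
}

static void add_extent(struct cache *c, long start, long len)
{
    if (c->cached && c->start + c->len == start) {
        c->len += len;      /* contiguous: merge, emit later */
        return;
    }
    if (c->cached)
        emit(c->start, c->len);
    c->cached = 1;
    c->start = start;
    c->len = len;
}

static void emit_last(struct cache *c)
{
    if (!c->cached)
        return;
    emit(c->start, c->len);     /* the tail would be lost otherwise */
    c->cached = 0;
}

int main(void)
{
    struct cache c = { 0 };

    add_extent(&c, 0, 4096);
    add_extent(&c, 4096, 4096);     /* merges with the first */
    add_extent(&c, 16384, 4096);
    emit_last(&c);      /* without this, 16384+4096 vanishes */
    return 0;
}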
@@ -4701,7 +4676,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
out_free:
if (!ret)
- ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
+ ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
free_extent_map(em);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1eafa2f0ede3..4f030912f3ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -92,7 +92,7 @@ struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;
-typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset);
struct extent_io_ops {
@@ -108,32 +108,36 @@ struct extent_io_ops {
size_t size, struct bio *bio,
unsigned long bio_flags);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+ struct btrfs_fs_info *(*tree_fs_info)(void *private_data);
+ void (*set_range_writeback)(void *private_data, u64 start, u64 end);
/*
* Optional hooks, called if the pointer is not NULL
*/
- int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+ int (*fill_delalloc)(void *private_data, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written);
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ void (*set_bit_hook)(void *private_data, struct extent_state *state,
unsigned *bits);
- void (*clear_bit_hook)(struct btrfs_inode *inode,
+ void (*clear_bit_hook)(void *private_data,
struct extent_state *state,
unsigned *bits);
- void (*merge_extent_hook)(struct inode *inode,
+ void (*merge_extent_hook)(void *private_data,
struct extent_state *new,
struct extent_state *other);
- void (*split_extent_hook)(struct inode *inode,
+ void (*split_extent_hook)(void *private_data,
struct extent_state *orig, u64 split);
+ void (*check_extent_io_range)(void *private_data, const char *caller,
+ u64 start, u64 end);
};
struct extent_io_tree {
struct rb_root state;
- struct address_space *mapping;
+ void *private_data;
u64 dirty_bytes;
int track_uptodate;
spinlock_t lock;
@@ -205,12 +209,46 @@ struct extent_buffer {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- u64 bytes_changed;
+ unsigned int bytes_changed;
/* Changed ranges */
struct ulist range_changed;
};
+static inline void extent_changeset_init(struct extent_changeset *changeset)
+{
+ changeset->bytes_changed = 0;
+ ulist_init(&changeset->range_changed);
+}
+
+static inline struct extent_changeset *extent_changeset_alloc(void)
+{
+ struct extent_changeset *ret;
+
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+
+ extent_changeset_init(ret);
+ return ret;
+}
+
+static inline void extent_changeset_release(struct extent_changeset *changeset)
+{
+ if (!changeset)
+ return;
+ changeset->bytes_changed = 0;
+ ulist_release(&changeset->range_changed);
+}
+
+static inline void extent_changeset_free(struct extent_changeset *changeset)
+{
+ if (!changeset)
+ return;
+ extent_changeset_release(changeset);
+ kfree(changeset);
+}
+
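/*
 * The changeset lifecycle the inline helpers above define, mirrored in
 * userspace with the ulist replaced by a plain counter: release drops the
 * recorded ranges but keeps the object reusable, free tears it down, and
 * both accept NULL like kfree().
 */
#include <stdio.h>
#include <stdlib.h>

struct changeset {
    unsigned int bytes_changed;
    unsigned int nr_ranges;     /* stand-in for the ulist */
};

static struct changeset *changeset_alloc(void)
{
    return calloc(1, sizeof(struct changeset));
}

static void changeset_release(struct changeset *c)
{
    if (!c)
        return;
    c->bytes_changed = 0;
    c->nr_ranges = 0;
}

static void changeset_free(struct changeset *c)
{
    if (!c)
        return;
    changeset_release(c);
    free(c);
}

int main(void)
{
    struct changeset *c = changeset_alloc();

    if (!c)
        return 1;
    c->bytes_changed = 4096;
    c->nr_ranges = 1;
    changeset_release(c);   /* reusable, now empty */
    printf("%u\n", c->bytes_changed);
    changeset_free(c);
    changeset_free(NULL);   /* harmless, like kfree(NULL) */
    return 0;
}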
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
@@ -230,8 +268,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
u64 start, u64 len,
int create);
-void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping);
+void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
@@ -459,20 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
u64 delalloc_end, struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
-struct bio *
-btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
- gfp_t gfp_flags);
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
+struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
struct btrfs_fs_info;
struct btrfs_inode;
-int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
- u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
-int clean_io_failure(struct btrfs_inode *inode, u64 start,
- struct page *page, unsigned int pg_offset);
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num);
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int mirror_num);
@@ -501,13 +539,15 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
struct io_failure_record **failrec_ret);
-int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
- struct io_failure_record *failrec, int fail_mirror);
+bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int fail_mirror);
struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec,
struct page *page, int pg_offset, int icsum,
bio_end_io_t *endio_func, void *data);
-int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec);
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 64fcb31d7163..fdcb41002623 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -160,11 +160,12 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
kfree(bio->csum_allocated);
}
-static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -177,12 +178,12 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 page_bytes_left;
u32 diff;
int nblocks;
- int count = 0, i;
+ int count = 0;
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
@@ -191,7 +192,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
csum_size, GFP_NOFS);
if (!btrfs_bio->csum_allocated) {
btrfs_free_path(path);
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
}
btrfs_bio->csum = btrfs_bio->csum_allocated;
btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
@@ -206,8 +207,6 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
path->reada = READA_FORWARD;
- WARN_ON(bio->bi_vcnt <= 0);
-
/*
* the free space stuff is only read when it hasn't been
* updated in the current transaction. So, we can safely
@@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (dio)
offset = logical_offset;
- bio_for_each_segment_all(bvec, bio, i) {
- page_bytes_left = bvec->bv_len;
+ bio_for_each_segment(bvec, bio, iter) {
+ page_bytes_left = bvec.bv_len;
if (count)
goto next;
if (!dio)
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ offset = page_offset(bvec.bv_page) + bvec.bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
(u32 *)csum, nblocks);
if (count)
@@ -303,12 +302,12 @@ next:
return 0;
}
-int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
{
return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
}
-int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
+blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
{
return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
}
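The move from bio_for_each_segment_all() to bio_for_each_segment() is what makes these checksum helpers safe on cloned bios: the _all variant walks the raw bi_io_vec array and is only valid for a bio that owns its vectors, while the iterator form advances a private on-stack bvec_iter and hands out each segment by value. The bare pattern, for reference:

        struct bio_vec bvec;    /* per-segment copy, not a pointer */
        struct bvec_iter iter;  /* private cursor; bio state untouched */

        bio_for_each_segment(bvec, bio, iter) {
                /* bvec.bv_page, bvec.bv_offset and bvec.bv_len describe
                 * one segment of the (possibly cloned) bio */
        }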
@@ -433,26 +432,26 @@ fail:
return ret;
}
-int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
char *data;
- struct bio_vec *bvec;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
int index;
int nr_sectors;
- int i, j;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
+ int i;
u64 offset;
- WARN_ON(bio->bi_vcnt <= 0);
sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
GFP_NOFS);
if (!sums)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
@@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
index = 0;
- bio_for_each_segment_all(bvec, bio, j) {
+ bio_for_each_segment(bvec, bio, iter) {
if (!contig)
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ offset = page_offset(bvec.bv_page) + bvec.bv_offset;
if (!ordered) {
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
}
- data = kmap_atomic(bvec->bv_page);
+ data = kmap_atomic(bvec.bv_page);
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
- bvec->bv_len + fs_info->sectorsize
+ bvec.bv_len + fs_info->sectorsize
- 1);
for (i = 0; i < nr_sectors; i++) {
@@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+ total_bytes;
index = 0;
- data = kmap_atomic(bvec->bv_page);
+ data = kmap_atomic(bvec.bv_page);
}
sums->sums[index] = ~(u32)0;
sums->sums[index]
- = btrfs_csum_data(data + bvec->bv_offset
+ = btrfs_csum_data(data + bvec.bv_offset
+ (i * fs_info->sectorsize),
sums->sums[index],
fs_info->sectorsize);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index da1096eb1a40..9e75d8a39aac 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
@@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
- ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ extent_changeset_release(data_reserved);
+ ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+ write_bytes);
if (ret < 0) {
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
@@ -1657,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode, pos,
- write_bytes);
+ btrfs_free_reserved_data_space(inode,
+ data_reserved, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1740,8 +1744,9 @@ again:
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode, __pos,
- release_bytes);
+ btrfs_delalloc_release_space(inode,
+ data_reserved, __pos,
+ release_bytes);
}
}
@@ -1796,12 +1801,13 @@ again:
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes);
} else {
- btrfs_delalloc_release_space(inode,
- round_down(pos, fs_info->sectorsize),
- release_bytes);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ round_down(pos, fs_info->sectorsize),
+ release_bytes);
}
}
+ extent_changeset_free(data_reserved);
return num_written ? num_written : ret;
}
@@ -1876,17 +1882,36 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
ssize_t err;
loff_t pos;
- size_t count;
+ size_t count = iov_iter_count(from);
loff_t oldsize;
int clean_page = 0;
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0) {
inode_unlock(inode);
return err;
}
+ pos = iocb->ki_pos;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /*
+ * We will have to allocate space if nodatacow is not set,
+ * so bail out early with -EAGAIN
+ */
+ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) ||
+ check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+ }
+
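Together with FMODE_AIO_NOWAIT set in btrfs_file_open() further down, this wires btrfs into the non-blocking write path. From user space the behavior is reached through pwritev2(2) with RWF_NOWAIT; a hypothetical caller, not part of this patch:

        #define _GNU_SOURCE
        #include <sys/uio.h>

        /* Returns bytes written, or -1 with errno == EAGAIN when the write
         * would block (inode lock contention, or the range needs a COW
         * allocation that btrfs refuses to attempt in NOWAIT mode). */
        static ssize_t write_nowait(int fd, const void *buf, size_t len,
                                    off_t off)
        {
                struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

                return pwritev2(fd, &iov, 1, off, RWF_NOWAIT);
        }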
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err) {
@@ -1914,8 +1939,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
*/
update_time_for_write(inode);
- pos = iocb->ki_pos;
- count = iov_iter_count(from);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
if (start_pos > oldsize) {
@@ -2011,7 +2034,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
- int ret = 0;
+ int ret = 0, err;
bool full_sync = 0;
u64 len;
@@ -2030,7 +2053,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret)
- return ret;
+ goto out;
inode_lock(inode);
atomic_inc(&root->log_batch);
@@ -2135,10 +2158,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
* updated and we end up here. So check the inode's mapping
- * flags for any errors that might have happened while doing
- * writeback of file data.
+ * for any errors that might have happened since the last
+ * time we called fsync.
*/
- ret = filemap_check_errors(inode->i_mapping);
+ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
inode_unlock(inode);
goto out;
}
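filemap_check_wb_err() compares the mapping's errseq_t against the cursor stored in file->f_wb_err, so each open file observes a given writeback error exactly once instead of racing over the shared AS_EIO/AS_ENOSPC mapping flags. The two forms, in sketch:

        /* peek: report errors newer than this file's cursor, don't move it */
        err = filemap_check_wb_err(file->f_mapping, file->f_wb_err);

        /* check and advance: report once, then move the cursor forward so
         * a second fsync on the same struct file returns 0 (the "out:"
         * label below uses this form) */
        err = file_check_and_advance_wb_err(file);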
@@ -2227,6 +2250,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_end_transaction(trans);
}
out:
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
return ret > 0 ? -EIO : ret;
}
@@ -2390,10 +2416,13 @@ out:
*/
static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ round_down(*start, fs_info->sectorsize),
+ round_up(*len, fs_info->sectorsize), 0);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2769,6 +2798,7 @@ static long btrfs_fallocate(struct file *file, int mode,
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
struct falloc_range *range;
struct falloc_range *tmp;
struct list_head reserve_list;
@@ -2898,8 +2928,8 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, cur_offset,
- last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ cur_offset, last_byte - cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
@@ -2910,8 +2940,8 @@ static long btrfs_fallocate(struct file *file, int mode,
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
- btrfs_free_reserved_data_space(inode, cur_offset,
- last_byte - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ cur_offset, last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -2930,8 +2960,9 @@ static long btrfs_fallocate(struct file *file, int mode,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
- btrfs_free_reserved_data_space(inode, range->start,
- range->len);
+ btrfs_free_reserved_data_space(inode,
+ data_reserved, range->start,
+ range->len);
list_del(&range->list);
kfree(range);
}
@@ -2969,8 +3000,9 @@ out:
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0)
- btrfs_free_reserved_data_space(inode, alloc_start,
- alloc_end - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ alloc_start, alloc_end - cur_offset);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -3071,13 +3103,19 @@ out:
return offset;
}
+static int btrfs_file_open(struct inode *inode, struct file *filp)
+{
+ filp->f_mode |= FMODE_AIO_NOWAIT;
+ return generic_file_open(inode, filp);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
- .open = generic_file_open,
+ .open = btrfs_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index fc0bd8406758..a5e34de06c2f 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -17,7 +17,7 @@
*/
#include <linux/kernel.h>
-#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
@@ -153,21 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
static u8 *alloc_bitmap(u32 bitmap_size)
{
- void *mem;
+ u8 *ret;
+ unsigned int nofs_flag;
/*
- * The allocation size varies, observed numbers were < 4K up to 16K.
- * Using vmalloc unconditionally would be too heavy, we'll try
- * contiguous allocations first.
+ * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
+ * into the filesystem as the free space bitmap can be modified in the
+ * critical section of a transaction commit.
+ *
+ * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
+ * know that recursion is unsafe.
*/
- if (bitmap_size <= PAGE_SIZE)
- return kzalloc(bitmap_size, GFP_NOFS);
-
- mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
- if (mem)
- return mem;
-
- return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
+ nofs_flag = memalloc_nofs_save();
+ ret = kvzalloc(bitmap_size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ return ret;
}
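kvzalloc() rejects GFP_NOFS because its vmalloc fallback cannot honour it, so the NOFS requirement is expressed as a task scope instead: while the scope is active, every GFP_KERNEL allocation on this task implicitly masks __GFP_FS. The pattern in isolation, as provided by <linux/sched/mm.h>:

        unsigned int nofs_flag;

        nofs_flag = memalloc_nofs_save();  /* enter NOFS scope (nestable) */
        ptr = kvzalloc(size, GFP_KERNEL);  /* behaves as if GFP_NOFS */
        memalloc_nofs_restore(nofs_flag);  /* restore the previous state */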
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
@@ -1188,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- return 0;
+ return btrfs_commit_transaction(trans);
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
@@ -1277,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
free_extent_buffer(free_space_root->commit_root);
kfree(free_space_root);
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- return 0;
+ return btrfs_commit_transaction(trans);
abort:
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index a97fdc156a03..baacc1866861 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
{
SHASH_DESC_ON_STACK(shash, tfm);
u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 retval;
int err;
shash->tfm = tfm;
@@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ retval = *ctx;
+ barrier_data(ctx);
+ return retval;
}
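The copy-then-barrier dance is subtle: barrier_data() is a compiler-only barrier that treats the descriptor's bytes as still consumed, so the *ctx read has to complete before the SHASH_DESC_ON_STACK storage is considered dead and its stack slots reused. A hedged restatement of the shape, with the reasoning spelled out:

        u32 retval;

        retval = *ctx;      /* copy the digest out of the on-stack desc */
        barrier_data(ctx);  /* compiler barrier: the read above cannot be
                             * elided, reordered past this point, or served
                             * from a stale register copy */
        return retval;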
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 5c6c20ec64d8..d02019747d00 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_path *path;
struct inode *inode;
struct btrfs_block_rsv *rsv;
+ struct extent_changeset *data_reserved = NULL;
u64 num_bytes;
u64 alloc_hint = 0;
int ret;
@@ -492,7 +493,7 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
if (ret)
goto out_put;
@@ -516,6 +517,7 @@ out:
trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
+ extent_changeset_free(data_reserved);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ef3c98c527c1..95c212037095 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
-struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
@@ -178,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
- int err = 0;
int ret;
size_t cur_size = size;
unsigned long offset;
@@ -200,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
- if (ret) {
- err = ret;
+ if (ret)
goto fail;
- }
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -258,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->disk_i_size = inode->i_size;
ret = btrfs_update_inode(trans, root, inode);
- return ret;
fail:
- return err;
+ return ret;
}
@@ -350,7 +345,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -608,12 +603,11 @@ cont:
/*
* one last check to make sure the compression is really a
- * win, compare the page count read with the blocks on disk
+ * win, compare the page count read with the blocks on disk;
+ * compression must free at least one sector's worth of space
*/
total_in = ALIGN(total_in, PAGE_SIZE);
- if (total_compressed >= total_in) {
- will_compress = 0;
- } else {
+ if (total_compressed + blocksize <= total_in) {
num_bytes = total_in;
*num_added += 1;
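Demanding at least one sector of savings matters because extents are allocated in sectorsize units; shaving off a few bytes changes nothing on disk. A worked example with 4 KiB sectors:

        u64 total_in = 16384, total_compressed = 15000, blocksize = 4096;

        /* old test: 15000 < 16384 counted as a win, yet both sizes still
         * round up to 4 sectors on disk: zero real savings, plus a
         * decompress cost on every read.
         * new test: 15000 + 4096 <= 16384 is false, so the extent is
         * stored uncompressed. */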
@@ -842,13 +836,12 @@ retry:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK);
- ret = btrfs_submit_compressed_write(inode,
+ if (btrfs_submit_compressed_write(inode,
async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
- async_extent->nr_pages);
- if (ret) {
+ async_extent->nr_pages)) {
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
@@ -1569,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
/*
* extent_io.c call back to do delayed allocation processing
*/
-static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+static int run_delalloc_range(void *private_data, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
+ struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
@@ -1596,9 +1590,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
return ret;
}
-static void btrfs_split_extent_hook(struct inode *inode,
+static void btrfs_split_extent_hook(void *private_data,
struct extent_state *orig, u64 split)
{
+ struct inode *inode = private_data;
u64 size;
/* not delalloc, ignore it */
@@ -1633,10 +1628,11 @@ static void btrfs_split_extent_hook(struct inode *inode,
* extents, such as when we are doing sequential writes, so we can properly
* account for the metadata space we'll need.
*/
-static void btrfs_merge_extent_hook(struct inode *inode,
+static void btrfs_merge_extent_hook(void *private_data,
struct extent_state *new,
struct extent_state *other)
{
+ struct inode *inode = private_data;
u64 new_size, old_size;
u32 num_extents;
@@ -1736,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static void btrfs_set_bit_hook(struct inode *inode,
+static void btrfs_set_bit_hook(void *private_data,
struct extent_state *state, unsigned *bits)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1766,8 +1763,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
if (btrfs_is_testing(fs_info))
return;
- __percpu_counter_add(&fs_info->delalloc_bytes, len,
- fs_info->delalloc_batch);
+ percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
+ fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (*bits & EXTENT_DEFRAG)
@@ -1790,10 +1787,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
+static void btrfs_clear_bit_hook(void *private_data,
struct extent_state *state,
unsigned *bits)
{
+ struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
@@ -1840,8 +1838,8 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
&inode->vfs_inode,
state->start, len);
- __percpu_counter_add(&fs_info->delalloc_bytes, -len,
- fs_info->delalloc_batch);
+ percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
+ fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
if (do_list && inode->delalloc_bytes == 0 &&
@@ -1901,11 +1899,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
- int ret = 0;
+ struct inode *inode = private_data;
+ blk_status_t ret = 0;
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
@@ -1920,16 +1919,17 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret;
+ blk_status_t ret;
ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
if (ret) {
- bio->bi_error = ret;
+ bio->bi_status = ret;
bio_endio(bio);
}
return ret;
@@ -1939,14 +1939,15 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read
*/
-static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
+ struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- int ret = 0;
+ blk_status_t ret = 0;
int skip_sum;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
@@ -1976,8 +1977,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
- bio_flags, bio_offset,
+ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
+ bio_offset, inode,
__btrfs_submit_bio_start,
__btrfs_submit_bio_done);
goto out;
@@ -1991,8 +1992,8 @@ mapit:
ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
out:
- if (ret < 0) {
- bio->bi_error = ret;
+ if (ret) {
+ bio->bi_status = ret;
bio_endio(bio);
}
return ret;
@@ -2035,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_writepage_fixup *fixup;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
struct page *page;
struct inode *inode;
u64 page_start;
@@ -2072,7 +2074,7 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, page_start,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
PAGE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
@@ -2092,6 +2094,7 @@ out_page:
unlock_page(page);
put_page(page);
kfree(fixup);
+ extent_changeset_free(data_reserved);
}
/*
@@ -2143,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
+ u64 qg_released;
int extent_inserted = 0;
int ret;
@@ -2198,13 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins);
+
/*
* Release the reserved range from inode dirty range map, as it is
* already moved into delayed_ref_head
*/
- btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+ ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+ if (ret < 0)
+ goto out;
+ qg_released = ret;
+ ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins);
out:
btrfs_free_path(path);
@@ -2926,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
* space for NOCOW range.
* As NOCOW won't cause a new delayed ref, just free the space
*/
- btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
@@ -4762,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
char *kaddr;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
@@ -4776,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
round_down(from, blocksize), blocksize);
if (ret)
goto out;
@@ -4784,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, data_reserved,
round_down(from, blocksize),
blocksize);
ret = -ENOMEM;
@@ -4856,11 +4865,12 @@ again:
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, block_start,
+ btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize);
unlock_page(page);
put_page(page);
out:
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -5255,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state->state & EXTENT_DELALLOC)
- btrfs_qgroup_free_data(inode, start, end - start + 1);
+ btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -5868,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_item *item;
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5919,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
continue;
}
- item = btrfs_item_nr(slot);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid)
@@ -5934,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = found_key.offset;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- if (verify_dir_item(fs_info, leaf, di))
+ if (verify_dir_item(fs_info, leaf, slot, di))
goto next;
name_len = btrfs_dir_name_len(leaf, di);
@@ -7480,7 +7488,7 @@ out:
bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
{
struct radix_tree_root *root = &inode->i_mapping->page_tree;
- int found = false;
+ bool found = false;
void **pagep = NULL;
struct page *page = NULL;
unsigned long start_idx;
@@ -7978,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
bio_end_io_t *repair_endio, void *repair_arg)
{
struct io_failure_record *failrec;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct bio *bio;
int isector;
int read_mode = 0;
+ int segs;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -7992,23 +8003,19 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
failed_mirror);
if (!ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
return -EIO;
}
- if ((failed_bio->bi_vcnt > 1)
- || (failed_bio->bi_io_vec->bv_len
- > btrfs_inode_sectorsize(inode)))
+ segs = bio_segments(failed_bio);
+ if (segs > 1 ||
+ (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
read_mode |= REQ_FAILFAST_DEV;
isector = start - btrfs_io_bio(failed_bio)->logical;
isector >>= inode->i_sb->s_blocksize_bits;
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
pgoff, isector, repair_endio, repair_arg);
- if (!bio) {
- free_io_failure(BTRFS_I(inode), failrec);
- return -EIO;
- }
bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
btrfs_debug(BTRFS_I(inode)->root->fs_info,
@@ -8017,7 +8024,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
if (ret) {
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
bio_put(bio);
}
@@ -8034,19 +8041,25 @@ struct btrfs_retry_complete {
static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
+ struct inode *inode = done->inode;
struct bio_vec *bvec;
+ struct extent_io_tree *io_tree, *failure_tree;
int i;
- if (bio->bi_error)
+ if (bio->bi_status)
goto end;
ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
+ io_tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
done->uptodate = 1;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i)
- clean_io_failure(BTRFS_I(done->inode), done->start,
- bvec->bv_page, 0);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
+ io_tree, done->start, bvec->bv_page,
+ btrfs_ino(BTRFS_I(inode)), 0);
end:
complete(&done->done);
bio_put(bio);
@@ -8056,36 +8069,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode,
struct btrfs_io_bio *io_bio)
{
struct btrfs_fs_info *fs_info;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct btrfs_retry_complete done;
u64 start;
unsigned int pgoff;
u32 sectorsize;
int nr_sectors;
- int i;
int ret;
+ int err = 0;
fs_info = BTRFS_I(inode)->root->fs_info;
sectorsize = fs_info->sectorsize;
start = io_bio->logical;
done.inode = inode;
+ io_bio->bio.bi_iter = io_bio->iter;
- bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
- pgoff = bvec->bv_offset;
+ bio_for_each_segment(bvec, &io_bio->bio, iter) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
+ pgoff = bvec.bv_offset;
next_block_or_try_again:
done.uptodate = 0;
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
pgoff, start, start + sectorsize - 1,
io_bio->mirror_num,
btrfs_retry_endio_nocsum, &done);
- if (ret)
- return ret;
+ if (ret) {
+ err = ret;
+ goto next;
+ }
wait_for_completion(&done.done);
@@ -8094,6 +8111,7 @@ next_block_or_try_again:
goto next_block_or_try_again;
}
+next:
start += sectorsize;
nr_sectors--;
@@ -8104,19 +8122,21 @@ next_block_or_try_again:
}
}
- return 0;
+ return err;
}
static void btrfs_retry_endio(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct extent_io_tree *io_tree, *failure_tree;
+ struct inode *inode = done->inode;
struct bio_vec *bvec;
int uptodate;
int ret;
int i;
- if (bio->bi_error)
+ if (bio->bi_status)
goto end;
uptodate = 1;
@@ -8124,13 +8144,20 @@ static void btrfs_retry_endio(struct bio *bio)
ASSERT(bio->bi_vcnt == 1);
ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
+ io_tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
- ret = __readpage_endio_check(done->inode, io_bio, i,
- bvec->bv_page, bvec->bv_offset,
- done->start, bvec->bv_len);
+ ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+ bvec->bv_offset, done->start,
+ bvec->bv_len);
if (!ret)
- clean_io_failure(BTRFS_I(done->inode), done->start,
- bvec->bv_page, bvec->bv_offset);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, io_tree, done->start,
+ bvec->bv_page,
+ btrfs_ino(BTRFS_I(inode)),
+ bvec->bv_offset);
else
uptodate = 0;
}
@@ -8141,11 +8168,12 @@ end:
bio_put(bio);
}
-static int __btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, int err)
+static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, blk_status_t err)
{
struct btrfs_fs_info *fs_info;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct btrfs_retry_complete done;
u64 start;
u64 offset = 0;
@@ -8153,7 +8181,7 @@ static int __btrfs_subio_endio_read(struct inode *inode,
int nr_sectors;
unsigned int pgoff;
int csum_pos;
- int i;
+ bool uptodate = (err == 0);
int ret;
fs_info = BTRFS_I(inode)->root->fs_info;
@@ -8162,29 +8190,31 @@ static int __btrfs_subio_endio_read(struct inode *inode,
err = 0;
start = io_bio->logical;
done.inode = inode;
+ io_bio->bio.bi_iter = io_bio->iter;
- bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ bio_for_each_segment(bvec, &io_bio->bio, iter) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
- pgoff = bvec->bv_offset;
+ pgoff = bvec.bv_offset;
next_block:
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
- bvec->bv_page, pgoff, start,
- sectorsize);
- if (likely(!ret))
- goto next;
+ if (uptodate) {
+ csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
+ ret = __readpage_endio_check(inode, io_bio, csum_pos,
+ bvec.bv_page, pgoff, start, sectorsize);
+ if (likely(!ret))
+ goto next;
+ }
try_again:
done.uptodate = 0;
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
pgoff, start, start + sectorsize - 1,
io_bio->mirror_num,
btrfs_retry_endio, &done);
if (ret) {
- err = ret;
+ err = errno_to_blk_status(ret);
goto next;
}
@@ -8211,8 +8241,8 @@ next:
return err;
}
-static int btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, int err)
+static blk_status_t btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, blk_status_t err)
{
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -8232,10 +8262,13 @@ static void btrfs_endio_direct_read(struct bio *bio)
struct inode *inode = dip->inode;
struct bio *dio_bio;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- int err = bio->bi_error;
+ blk_status_t err = bio->bi_status;
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) {
err = btrfs_subio_endio_read(inode, io_bio, err);
+ if (!err)
+ bio->bi_status = 0;
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
@@ -8243,11 +8276,11 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
- dio_bio->bi_error = bio->bi_error;
- dio_end_io(dio_bio, bio->bi_error);
+ dio_bio->bi_status = bio->bi_status;
+ dio_end_io(dio_bio);
if (io_bio->end_io)
- io_bio->end_io(io_bio, err);
+ io_bio->end_io(io_bio, blk_status_to_errno(err));
bio_put(bio);
}
@@ -8299,20 +8332,21 @@ static void btrfs_endio_direct_write(struct bio *bio)
struct bio *dio_bio = dip->dio_bio;
__endio_write_update_ordered(dip->inode, dip->logical_offset,
- dip->bytes, !bio->bi_error);
+ dip->bytes, !bio->bi_status);
kfree(dip);
- dio_bio->bi_error = bio->bi_error;
- dio_end_io(dio_bio, bio->bi_error);
+ dio_bio->bi_status = bio->bi_status;
+ dio_end_io(dio_bio);
bio_put(bio);
}
-static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
+static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 offset)
{
- int ret;
+ struct inode *inode = private_data;
+ blk_status_t ret;
ret = btrfs_csum_one_bio(inode, bio, offset, 1);
BUG_ON(ret); /* -ENOMEM */
return 0;
@@ -8321,7 +8355,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
- int err = bio->bi_error;
+ blk_status_t err = bio->bi_status;
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -8351,31 +8385,21 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (dip->errors) {
bio_io_error(dip->orig_bio);
} else {
- dip->dio_bio->bi_error = 0;
+ dip->dio_bio->bi_status = 0;
bio_endio(dip->orig_bio);
}
out:
bio_put(bio);
}
-static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
- u64 first_sector, gfp_t gfp_flags)
-{
- struct bio *bio;
- bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
- if (bio)
- bio_associate_current(bio);
- return bio;
-}
-
-static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode,
+static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
struct btrfs_dio_private *dip,
struct bio *bio,
u64 file_offset)
{
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
- int ret;
+ blk_status_t ret;
/*
* We load all the csum data we need when we submit
@@ -8406,7 +8430,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
bool write = bio_op(bio) == REQ_OP_WRITE;
- int ret;
+ blk_status_t ret;
if (async_submit)
async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
@@ -8423,8 +8447,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0,
- file_offset,
+ ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
+ file_offset, inode,
__btrfs_submit_bio_start_direct_io,
__btrfs_submit_bio_done);
goto err;
@@ -8454,103 +8478,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
- struct bio_vec *bvec;
u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
- u64 submit_len = 0;
u64 map_length;
- u32 blocksize = fs_info->sectorsize;
int async_submit = 0;
- int nr_sectors;
+ u64 submit_len;
+ int clone_offset = 0;
+ int clone_len;
int ret;
- int i, j;
map_length = orig_bio->bi_iter.bi_size;
+ submit_len = map_length;
ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
&map_length, NULL, 0);
if (ret)
return -EIO;
- if (map_length >= orig_bio->bi_iter.bi_size) {
+ if (map_length >= submit_len) {
bio = orig_bio;
dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
}
/* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
async_submit = 0;
else
async_submit = 1;
- bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
- if (!bio)
- return -ENOMEM;
-
- bio->bi_opf = orig_bio->bi_opf;
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
+ /* bio split */
+ ASSERT(map_length <= INT_MAX);
atomic_inc(&dip->pending_bios);
+ do {
+ clone_len = min_t(int, submit_len, map_length);
- bio_for_each_segment_all(bvec, orig_bio, j) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
- i = 0;
-next_block:
- if (unlikely(map_length < submit_len + blocksize ||
- bio_add_page(bio, bvec->bv_page, blocksize,
- bvec->bv_offset + (i * blocksize)) < blocksize)) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the dip might get freed
- * before we're done setting it up
- */
- atomic_inc(&dip->pending_bios);
- ret = __btrfs_submit_dio_bio(bio, inode,
- file_offset, skip_sum,
- async_submit);
- if (ret) {
- bio_put(bio);
- atomic_dec(&dip->pending_bios);
- goto out_err;
- }
-
- start_sector += submit_len >> 9;
- file_offset += submit_len;
+ /*
+ * This will never fail as it's passing GFP_NOFS and
+ * the allocation is backed by btrfs_bioset.
+ */
+ bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
+ clone_len);
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
+
+ ASSERT(submit_len >= clone_len);
+ submit_len -= clone_len;
+ if (submit_len == 0)
+ break;
- submit_len = 0;
+ /*
+ * Increase the count before we submit the bio so we know
+ * the end IO handler won't happen before we increase the
+ * count. Otherwise, the dip might get freed before we're
+ * done setting it up.
+ */
+ atomic_inc(&dip->pending_bios);
- bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
- start_sector, GFP_NOFS);
- if (!bio)
- goto out_err;
- bio->bi_opf = orig_bio->bi_opf;
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
+ ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
+ async_submit);
+ if (ret) {
+ bio_put(bio);
+ atomic_dec(&dip->pending_bios);
+ goto out_err;
+ }
- map_length = orig_bio->bi_iter.bi_size;
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
- start_sector << 9,
- &map_length, NULL, 0);
- if (ret) {
- bio_put(bio);
- goto out_err;
- }
+ clone_offset += clone_len;
+ start_sector += clone_len >> 9;
+ file_offset += clone_len;
- goto next_block;
- } else {
- submit_len += blocksize;
- if (--nr_sectors) {
- i++;
- goto next_block;
- }
- }
- }
+ map_length = submit_len;
+ ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
+ start_sector << 9, &map_length, NULL, 0);
+ if (ret)
+ goto out_err;
+ } while (submit_len > 0);
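The old page-by-page bio_add_page() rebuild is gone; each mapped stripe is now carved out of orig_bio with a partial clone. A simplified skeleton of the loop's bookkeeping (in the real code the final clone falls through to the common submit path below):

        do {
                clone_len = min_t(int, submit_len, map_length);
                bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
                                              clone_len);
                /* ... submit bio unless it is the last piece ... */
                clone_offset += clone_len;
                start_sector += clone_len >> 9; /* bytes to 512 B sectors */
                file_offset += clone_len;
                submit_len -= clone_len;
                /* re-run btrfs_map_block() from start_sector to learn how
                 * far the next stripe extends (a fresh map_length) */
        } while (submit_len > 0);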
submit:
ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
@@ -8577,19 +8581,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
loff_t file_offset)
{
struct btrfs_dio_private *dip = NULL;
- struct bio *io_bio = NULL;
- struct btrfs_io_bio *btrfs_bio;
+ struct bio *bio = NULL;
+ struct btrfs_io_bio *io_bio;
int skip_sum;
bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
int ret = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
- if (!io_bio) {
- ret = -ENOMEM;
- goto free_ordered;
- }
+ bio = btrfs_bio_clone(dio_bio);
dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
@@ -8602,17 +8602,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
dip->logical_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
- io_bio->bi_private = dip;
- dip->orig_bio = io_bio;
+ bio->bi_private = dip;
+ dip->orig_bio = bio;
dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
- btrfs_bio = btrfs_io_bio(io_bio);
- btrfs_bio->logical = file_offset;
+ io_bio = btrfs_io_bio(bio);
+ io_bio->logical = file_offset;
if (write) {
- io_bio->bi_end_io = btrfs_endio_direct_write;
+ bio->bi_end_io = btrfs_endio_direct_write;
} else {
- io_bio->bi_end_io = btrfs_endio_direct_read;
+ bio->bi_end_io = btrfs_endio_direct_read;
dip->subio_endio = btrfs_subio_endio_read;
}
@@ -8635,8 +8635,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
if (!ret)
return;
- if (btrfs_bio->end_io)
- btrfs_bio->end_io(btrfs_bio, ret);
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, ret);
free_ordered:
/*
@@ -8648,16 +8648,15 @@ free_ordered:
* same as btrfs_endio_direct_[write|read] because we can't call these
* callbacks - they require an allocated dip and a clone of dio_bio.
*/
- if (io_bio && dip) {
- io_bio->bi_error = -EIO;
- bio_endio(io_bio);
+ if (bio && dip) {
+ bio_io_error(bio);
/*
- * The end io callbacks free our dip, do the final put on io_bio
+ * The end io callbacks free our dip, do the final put on bio
* and all the cleanup and final put for dio_bio (through
* dio_end_io()).
*/
dip = NULL;
- io_bio = NULL;
+ bio = NULL;
} else {
if (write)
__endio_write_update_ordered(inode,
@@ -8668,15 +8667,15 @@ free_ordered:
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- dio_bio->bi_error = -EIO;
+ dio_bio->bi_status = BLK_STS_IOERR;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
* nor bio_endio()/bio_io_error() against dio_bio.
*/
- dio_end_io(dio_bio, ret);
+ dio_end_io(dio_bio);
}
- if (io_bio)
- bio_put(io_bio);
+ if (bio)
+ bio_put(bio);
kfree(dip);
}
@@ -8720,6 +8719,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_data dio_data = { 0 };
+ struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
int flags = 0;
@@ -8755,8 +8755,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
}
- ret = btrfs_delalloc_reserve_space(inode, offset, count);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ offset, count);
if (ret)
goto out;
dio_data.outstanding_extents = count_max_extents(count);
@@ -8788,8 +8792,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, offset,
- dio_data.reserve);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ offset, dio_data.reserve);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
@@ -8804,8 +8808,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.unsubmitted_oe_range_start,
false);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, offset,
- count - (size_t)ret);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ offset, count - (size_t)ret);
}
out:
if (wakeup)
@@ -8813,6 +8817,7 @@ out:
if (relock)
inode_lock(inode);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -9003,7 +9008,7 @@ again:
* free the entire extent.
*/
if (PageDirty(page))
- btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -9045,6 +9050,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
char *kaddr;
unsigned long zero_start;
loff_t size;
@@ -9070,7 +9076,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret = btrfs_delalloc_reserve_space(inode, page_start,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
reserved_space);
if (!ret) {
ret = file_update_time(vmf->vma->vm_file);
@@ -9124,8 +9130,8 @@ again:
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode, page_start,
- PAGE_SIZE - reserved_space);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ page_start, PAGE_SIZE - reserved_space);
}
}
@@ -9176,13 +9182,16 @@ again:
out_unlock:
if (!ret) {
sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, page_start, reserved_space);
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ reserved_space);
out_noreserve:
sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -9404,8 +9413,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
- extent_io_tree_init(&ei->io_tree, &inode->i_data);
- extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+ extent_io_tree_init(&ei->io_tree, inode);
+ extent_io_tree_init(&ei->io_failure_tree, inode);
ei->io_tree.track_uptodate = 1;
ei->io_failure_tree.track_uptodate = 1;
atomic_set(&ei->sync_writers, 0);
@@ -9514,7 +9523,6 @@ void btrfs_destroy_cachep(void)
rcu_barrier();
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
- kmem_cache_destroy(btrfs_transaction_cachep);
kmem_cache_destroy(btrfs_path_cachep);
kmem_cache_destroy(btrfs_free_space_cachep);
}
@@ -9534,12 +9542,6 @@ int btrfs_init_cachep(void)
if (!btrfs_trans_handle_cachep)
goto fail;
- btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
- sizeof(struct btrfs_transaction), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
- if (!btrfs_transaction_cachep)
- goto fail;
-
btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_MEM_SPREAD, NULL);
@@ -9564,6 +9566,24 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
u64 delalloc_bytes;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
+ u32 bi_flags = BTRFS_I(inode)->flags;
+
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ if (bi_flags & BTRFS_INODE_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (bi_flags & BTRFS_INODE_COMPRESS)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (bi_flags & BTRFS_INODE_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (bi_flags & BTRFS_INODE_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
@@ -10538,7 +10558,7 @@ next:
btrfs_end_transaction(trans);
}
if (cur_offset < end)
- btrfs_free_reserved_data_space(inode, cur_offset,
+ btrfs_free_reserved_data_space(inode, NULL, cur_offset,
end - cur_offset + 1);
return ret;
}
@@ -10659,6 +10679,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
return -EAGAIN;
}
+static struct btrfs_fs_info *iotree_fs_info(void *private_data)
+{
+ struct inode *inode = private_data;
+ return btrfs_sb(inode->i_sb);
+}
+
+static void btrfs_check_extent_io_range(void *private_data, const char *caller,
+ u64 start, u64 end)
+{
+ struct inode *inode = private_data;
+ u64 isize;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+
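The heuristic leans on extent_io_tree ranges being inclusive: a well-formed end is "boundary - 1", which is odd for any sector-aligned boundary, so an even end (other than isize - 1 on a file whose size is unaligned) usually means an off-by-one in a caller. A hypothetical helper spelling out the convention, not part of the patch:

        /* Exclusive boundary to inclusive range end; for the 4 KiB block
         * at 8192 this yields 8192 + 4096 - 1 = 12287 (odd). Passing
         * 12288 as 'end' would trip the check above. */
        static inline u64 range_end_inclusive(u64 start, u64 len)
        {
                return start + len - 1;
        }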
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
+{
+ struct inode *inode = private_data;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ ASSERT(page); /* Pages should be in the extent_io_tree */
+ set_page_writeback(page);
+ put_page(page);
+ index++;
+ }
+}
+
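The same inclusive convention drives the page walk above: shifting both bounds by PAGE_SHIFT lands exactly on the pages overlapping the range. With 4 KiB pages, for instance:

        /* start = 8192, end = 12287 (one block, PAGE_SHIFT == 12) */
        unsigned long index = 8192UL >> PAGE_SHIFT;      /* == 2 */
        unsigned long end_index = 12287UL >> PAGE_SHIFT; /* == 2, so the
                                                          * loop visits
                                                          * exactly page 2 */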
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10702,6 +10758,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
+ .tree_fs_info = iotree_fs_info,
+ .set_range_writeback = btrfs_set_range_writeback,
/* optional callbacks */
.fill_delalloc = run_delalloc_range,
@@ -10711,6 +10769,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
.split_extent_hook = btrfs_split_extent_hook,
+ .check_extent_io_range = btrfs_check_extent_io_range,
};
/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e176375f374f..fa1b78cf25f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -37,7 +37,7 @@
#include <linux/bit_spinlock.h>
#include <linux/security.h>
#include <linux/xattr.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
@@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
- btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_io_tree *tree;
+ struct extent_changeset *data_reserved = NULL;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
file_end = (isize - 1) >> PAGE_SHIFT;
@@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
- ret = btrfs_delalloc_reserve_space(inode,
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
if (ret)
@@ -1226,7 +1227,7 @@ again:
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
(page_cnt - i_done) << PAGE_SHIFT);
}
@@ -1247,15 +1248,17 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
+ extent_changeset_free(data_reserved);
return i_done;
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -4588,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
out:
btrfs_free_path(path);
- vfree(inodes);
+ kvfree(inodes);
kfree(loi);
return ret;
@@ -4897,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
goto out;
}
- /* FIXME: check if the IDs really exist */
if (sa->assign) {
ret = btrfs_add_qgroup_relation(trans, fs_info,
sa->src, sa->dst);
@@ -4956,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
- /* FIXME: check if the IDs really exist */
if (sa->create) {
ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
} else {
@@ -5010,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
qgroupid = root->root_key.objectid;
}
- /* FIXME: check if the IDs really exist */
ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
err = btrfs_end_transaction(trans);
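The defrag hunks above thread one extent_changeset through reserve and release so the accounting balances on every exit path: reserve page_cnt pages up front, give back the un-dirtied tail on partial success, and give back everything on error. A runnable sketch of just that arithmetic (hypothetical helper, assuming 4 KiB pages):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption: 4 KiB pages */

/* How many reserved bytes to give back, depending on the exit path. */
static uint64_t bytes_to_release(uint64_t page_cnt, uint64_t i_done, int err)
{
	if (err)
		return page_cnt << PAGE_SHIFT;		/* error: release all */
	return (page_cnt - i_done) << PAGE_SHIFT;	/* release unused tail */
}

int main(void)
{
	printf("partial success: release %llu bytes\n",
	       (unsigned long long)bytes_to_release(16, 11, 0));
	printf("error path:      release %llu bytes\n",
	       (unsigned long long)bytes_to_release(16, 11, 1));
	return 0;
}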
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index f48c8c14dc14..d433e75d489a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -18,13 +18,14 @@
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/lzo.h>
+#include <linux/refcount.h>
#include "compression.h"
#define LZO_LEN 4
@@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->buf);
- vfree(workspace->cbuf);
- vfree(workspace->mem);
+ kvfree(workspace->buf);
+ kvfree(workspace->cbuf);
+ kvfree(workspace->mem);
kfree(workspace);
}
@@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void)
{
struct workspace *workspace;
- workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
- workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
- workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
+ workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+ workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+ workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws,
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
if (ret != LZO_E_OK) {
- pr_debug("BTRFS: deflate in loop returned %d\n",
+ pr_debug("BTRFS: lzo in loop returned %d\n",
ret);
ret = -EIO;
goto out;
@@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws,
in_len = min(bytes_left, PAGE_SIZE);
}
- if (tot_out > tot_in)
+ if (tot_out >= tot_in) {
+ ret = -E2BIG;
goto out;
+ }
/* store the size of all chunks of compressed data */
cpage_out = kmap(pages[0]);
@@ -254,16 +257,13 @@ out:
return ret;
}
-static int lzo_decompress_bio(struct list_head *ws,
- struct page **pages_in,
- u64 disk_start,
- struct bio *orig_bio,
- size_t srclen)
+static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
char *data_in;
unsigned long page_in_index = 0;
+ size_t srclen = cb->compressed_len;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
@@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws,
unsigned long tot_len;
char *buf;
bool may_late_unmap, need_unmap;
+ struct page **pages_in = cb->compressed_pages;
+ u64 disk_start = cb->start;
+ struct bio *orig_bio = cb->orig_bio;
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
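The allocation changes above lean on the kvmalloc() family: kvmalloc() attempts a physically contiguous kmalloc() first and falls back to vmalloc(), and kvfree() releases the buffer whichever allocator satisfied it, so the old try-kmalloc-then-vmalloc boilerplate disappears. A minimal kernel-context sketch of the pattern:

#include <linux/mm.h>
#include <linux/slab.h>

/*
 * One call replaces the open-coded two-step allocation; note that
 * kvmalloc() may sleep, so it is only usable from process context
 * with GFP_KERNEL-compatible flags.
 */
static void *alloc_big_buf(size_t size)
{
	return kvmalloc(size, GFP_KERNEL);
}

static void free_big_buf(void *buf)
{
	kvfree(buf);	/* correct for both kmalloc and vmalloc results */
}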
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7b40e2e7292a..a3aca495e33e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
LIST_HEAD(skipped);
LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
- int count = 0;
+ u64 count = 0;
const u64 range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
@@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
cond_resched();
spin_lock(&root->ordered_extent_lock);
- if (nr != -1)
+ if (nr != U64_MAX)
nr--;
count++;
}
@@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
return count;
}
-int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
- const u64 range_start, const u64 range_len)
+u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
- int done;
- int total_done = 0;
+ u64 total_done = 0;
+ u64 done;
INIT_LIST_HEAD(&splice);
@@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
total_done += done;
spin_lock(&fs_info->ordered_root_lock);
- if (nr != -1) {
+ if (nr != U64_MAX) {
nr -= done;
- WARN_ON(nr < 0);
}
}
list_splice_tail(&splice, &fs_info->ordered_roots);
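With nr now unsigned, "wait for everything" is the explicit sentinel U64_MAX rather than -1, and the old WARN_ON(nr < 0), which could never fire on an unsigned type, is dropped. A runnable sketch of the counting rule (hypothetical helper):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Process up to nr queued items; UINT64_MAX means "no limit". */
static uint64_t drain(uint64_t nr, uint64_t queued)
{
	uint64_t done = 0;

	while (nr > 0 && queued > 0) {
		if (nr != UINT64_MAX)	/* the sentinel never counts down */
			nr--;
		queued--;
		done++;
	}
	return done;
}

int main(void)
{
	printf("bounded:   %" PRIu64 "\n", drain(3, 10));		/* 3 */
	printf("unbounded: %" PRIu64 "\n", drain(UINT64_MAX, 10));	/* 10 */
	return 0;
}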
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e0c1d5b8d859..56c4c0ee6381 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
+u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
-int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_get_logged_extents(struct btrfs_inode *inode,
struct list_head *logged_list,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index cdafbf92ef0c..fcae61e175f3 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l)
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
struct btrfs_block_group_item);
- pr_info("\t\tblock group used %llu\n",
- btrfs_disk_block_group_used(l, bi));
+ pr_info(
+ "\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
+ btrfs_disk_block_group_used(l, bi),
+ btrfs_disk_block_group_chunk_objectid(l, bi),
+ btrfs_disk_block_group_flags(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
print_chunk(l, btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index d6cb155ef7a1..4b23ae5d0e5c 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root,
size_t),
void *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *name_buf = NULL;
char *value_buf = NULL;
@@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root,
name_ptr = (unsigned long)(di + 1);
data_ptr = name_ptr + name_len;
+ if (verify_dir_item(fs_info, leaf,
+ path->slots[0], di)) {
+ ret = -EIO;
+ goto out;
+ }
+
if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
name_ptr,
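The verify_dir_item() call added above rejects a dir item whose name_len would run past the end of the on-disk item, closing an out-of-bounds read on crafted or corrupted images. A hypothetical userspace version of the core bounds check:

#include <stdbool.h>
#include <stddef.h>

/*
 * Hypothetical check in the spirit of verify_dir_item(): the name must
 * lie entirely within the item. Written to avoid overflow: compare
 * against item_size - name_len only after name_len is known to fit.
 */
static bool name_fits_item(size_t name_off, size_t name_len, size_t item_size)
{
	if (name_len > item_size)
		return false;
	return name_off <= item_size - name_len;
}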
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index deffbeb74a0b..4ce351efe281 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1406,38 +1406,6 @@ out:
return ret;
}
-int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_qgroup_extent_record *record;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
- u64 qgroup_to_skip;
- int ret = 0;
-
- delayed_refs = &trans->transaction->delayed_refs;
- qgroup_to_skip = delayed_refs->qgroup_to_skip;
-
- /*
- * No need to do lock, since this function will only be called in
- * btrfs_commit_transaction().
- */
- node = rb_first(&delayed_refs->dirty_extent_root);
- while (node) {
- record = rb_entry(node, struct btrfs_qgroup_extent_record,
- node);
- if (WARN_ON(!record->old_roots))
- ret = btrfs_find_all_roots(NULL, fs_info,
- record->bytenr, 0, &record->old_roots);
- if (ret < 0)
- break;
- if (qgroup_to_skip)
- ulist_del(record->old_roots, qgroup_to_skip, 0);
- node = rb_next(node);
- }
- return ret;
-}
-
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
@@ -1559,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
+ cond_resched();
return 0;
}
@@ -1918,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Check whether @roots could be a list of fs tree roots
+ *
+ * Return 0 if the ulist definitely contains no fs/subvol tree roots
+ * Return 1 if the ulist may contain fs/subvol tree roots (an empty
+ * ulist counts as possible too)
+ */
+static int maybe_fs_roots(struct ulist *roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ /* Empty one, still possible for fs roots */
+ if (!roots || roots->nnodes == 0)
+ return 1;
+
+ ULIST_ITER_INIT(&uiter);
+ unode = ulist_next(roots, &uiter);
+ if (!unode)
+ return 1;
+
+ /*
+ * If the first entry is an fs tree root, the whole ulist must belong
+ * to fs/subvol trees; a non-fs tree is never shared with fs/subvol
+ * trees, so checking a single entry is enough.
+ */
+ return is_fstree(unode->val);
+}
+
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -1934,10 +1932,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
- if (new_roots)
+ if (new_roots) {
+ if (!maybe_fs_roots(new_roots))
+ goto out_free;
nr_new_roots = new_roots->nnodes;
- if (old_roots)
+ }
+ if (old_roots) {
+ if (!maybe_fs_roots(old_roots))
+ goto out_free;
nr_old_roots = old_roots->nnodes;
+ }
+
+ /* Quick exit, either not fs tree roots, or won't affect any qgroup */
+ if (nr_old_roots == 0 && nr_new_roots == 0)
+ goto out_free;
BUG_ON(!fs_info->quota_root);
@@ -2017,6 +2025,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
if (!ret) {
/*
+ * Old roots should have been populated when the qgroup
+ * extent record was inserted
+ */
+ if (WARN_ON(!record->old_roots)) {
+ /* Search commit root to find old_roots */
+ ret = btrfs_find_all_roots(NULL, fs_info,
+ record->bytenr, 0,
+ &record->old_roots);
+ if (ret < 0)
+ goto cleanup;
+ }
+
+ /*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
* root. It's safe inside commit_transaction().
@@ -2025,8 +2046,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record->bytenr, SEQ_LAST, &new_roots);
if (ret < 0)
goto cleanup;
- if (qgroup_to_skip)
+ if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
+ ulist_del(record->old_roots, qgroup_to_skip,
+ 0);
+ }
ret = btrfs_qgroup_account_extent(trans, fs_info,
record->bytenr, record->num_bytes,
record->old_roots, new_roots);
@@ -2338,6 +2362,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
if (num_bytes == 0)
return 0;
+
+ if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
+ capable(CAP_SYS_RESOURCE))
+ enforce = false;
+
retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
@@ -2376,7 +2405,7 @@ retry:
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -2806,55 +2835,130 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
* Return <0 for error (including -EDQUOT)
*
* NOTE: this function may sleep for memory allocation.
+ * If btrfs_qgroup_reserve_data() is called multiple times with the
+ * same @reserved, the caller must ensure that on error it is OK to
+ * free *ALL* reserved space.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_reserve_data(struct inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_changeset changeset;
struct ulist_node *unode;
struct ulist_iterator uiter;
+ struct extent_changeset *reserved;
+ u64 orig_reserved;
+ u64 to_reserve;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
!is_fstree(root->objectid) || len == 0)
return 0;
- changeset.bytes_changed = 0;
- ulist_init(&changeset.range_changed);
+ /* @reserved parameter is mandatory for qgroup */
+ if (WARN_ON(!reserved_ret))
+ return -EINVAL;
+ if (!*reserved_ret) {
+ *reserved_ret = extent_changeset_alloc();
+ if (!*reserved_ret)
+ return -ENOMEM;
+ }
+ reserved = *reserved_ret;
+ /* Record already reserved space */
+ orig_reserved = reserved->bytes_changed;
ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+ start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+
+ /* Newly reserved space */
+ to_reserve = reserved->bytes_changed - orig_reserved;
trace_btrfs_qgroup_reserve_data(inode, start, len,
- changeset.bytes_changed,
- QGROUP_RESERVE);
+ to_reserve, QGROUP_RESERVE);
if (ret < 0)
goto cleanup;
- ret = qgroup_reserve(root, changeset.bytes_changed, true);
+ ret = qgroup_reserve(root, to_reserve, true);
if (ret < 0)
goto cleanup;
- ulist_release(&changeset.range_changed);
return ret;
cleanup:
- /* cleanup already reserved ranges */
+ /* cleanup *ALL* already reserved ranges */
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&changeset.range_changed, &uiter)))
+ while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
GFP_NOFS);
- ulist_release(&changeset.range_changed);
+ extent_changeset_release(reserved);
+ return ret;
+}
+
+/* Free ranges specified by @reserved, normally in error path */
+static int qgroup_free_reserved_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct extent_changeset changeset;
+ int freed = 0;
+ int ret;
+
+ extent_changeset_init(&changeset);
+ len = round_up(start + len, root->fs_info->sectorsize);
+ start = round_down(start, root->fs_info->sectorsize);
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
+ u64 range_start = unode->val;
+ /* unode->aux is the inclusive end */
+ u64 range_len = unode->aux - range_start + 1;
+ u64 free_start;
+ u64 free_len;
+
+ extent_changeset_release(&changeset);
+
+ /* Only free range in range [start, start + len) */
+ if (range_start >= start + len ||
+ range_start + range_len <= start)
+ continue;
+ free_start = max(range_start, start);
+ free_len = min(start + len, range_start + range_len) -
+ free_start;
+ /*
+ * TODO: also modify reserved->ranges_reserved to reflect
+ * the modification.
+ *
+ * However, as long as we free qgroup reserved space according to
+ * EXTENT_QGROUP_RESERVED, we won't double free, so there is no
+ * need to rush.
+ */
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
+ free_start, free_start + free_len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ if (ret < 0)
+ goto out;
+ freed += changeset.bytes_changed;
+ }
+ btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
+ ret = freed;
+out:
+ extent_changeset_release(&changeset);
return ret;
}
-static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
- int free)
+static int __btrfs_qgroup_release_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len,
+ int free)
{
struct extent_changeset changeset;
int trace_op = QGROUP_RELEASE;
int ret;
- changeset.bytes_changed = 0;
- ulist_init(&changeset.range_changed);
+ /* In release case, we shouldn't have @reserved */
+ WARN_ON(!free && reserved);
+ if (free && reserved)
+ return qgroup_free_reserved_data(inode, reserved, start, len);
+ extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
@@ -2868,8 +2972,9 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->objectid,
changeset.bytes_changed);
+ ret = changeset.bytes_changed;
out:
- ulist_release(&changeset.range_changed);
+ extent_changeset_release(&changeset);
return ret;
}
@@ -2878,14 +2983,17 @@ out:
*
* Should be called when a range of pages get invalidated before reaching disk.
* Or for error cleanup case.
+ * If @reserved is given, only the reserved range in [@start, @start + @len)
+ * will be freed.
*
* For data written to disk, use btrfs_qgroup_release_data().
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_free_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
{
- return __btrfs_qgroup_release_data(inode, start, len, 1);
+ return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
}
/*
@@ -2905,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
*/
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
{
- return __btrfs_qgroup_release_data(inode, start, len, 0);
+ return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -2969,8 +3077,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
struct ulist_iterator iter;
int ret;
- changeset.bytes_changed = 0;
- ulist_init(&changeset.range_changed);
+ extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
@@ -2987,5 +3094,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
changeset.bytes_changed);
}
- ulist_release(&changeset.range_changed);
+ extent_changeset_release(&changeset);
}
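btrfs_qgroup_reserve_data() now takes a struct extent_changeset ** and allocates the changeset on first use, so repeated reservations against the same file accumulate in one record that a single extent_changeset_free() undoes. A stripped-down sketch of that calling convention (hypothetical types, not the btrfs structures):

#include <stdlib.h>

struct changeset {
	unsigned long long bytes_changed;
};

/* Lazily allocate *csp on first use; later calls reuse the record. */
static int reserve(struct changeset **csp, unsigned long long bytes)
{
	if (!*csp) {
		*csp = calloc(1, sizeof(**csp));
		if (!*csp)
			return -1;	/* -ENOMEM in the kernel */
	}
	(*csp)->bytes_changed += bytes;
	return 0;
}

int main(void)
{
	struct changeset *cs = NULL;

	if (reserve(&cs, 4096) || reserve(&cs, 8192))
		return 1;
	/* cs->bytes_changed == 12288: one record covers both calls */
	free(cs);	/* free(NULL) is a no-op, like extent_changeset_free() */
	return 0;
}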
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index fe04d3f295c6..d9984e87cddf 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -134,8 +134,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+
/*
* Inform qgroup to trace one dirty extent, its info is recorded in @record.
* So qgroup can account it at transaction committing time.
@@ -243,9 +242,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_reserve_data(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
-int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
bool enforce);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d8ea0eb76325..208638384cd2 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -31,7 +31,7 @@
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
* of a failing mount.
*/
table_size = sizeof(*table) + sizeof(*h) * num_entries;
- table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
- if (!table) {
- table = vzalloc(table_size);
- if (!table)
- return -ENOMEM;
- }
+ table = kvzalloc(table_size, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
spin_lock_init(&table->cache_lock);
INIT_LIST_HEAD(&table->stripe_cache);
@@ -871,7 +868,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *next;
@@ -884,7 +881,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- cur->bi_error = err;
+ cur->bi_status = err;
bio_endio(cur);
cur = next;
}
@@ -897,7 +894,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- int err = bio->bi_error;
+ blk_status_t err = bio->bi_status;
int max_errors;
if (err)
@@ -914,7 +911,7 @@ static void raid_write_end_io(struct bio *bio)
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
0 : rbio->bbio->max_errors;
if (atomic_read(&rbio->error) > max_errors)
- err = -EIO;
+ err = BLK_STS_IOERR;
rbio_orig_end_io(rbio, err);
}
@@ -1092,7 +1089,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_error &&
+ !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
@@ -1101,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
- if (!bio)
- return -ENOMEM;
-
+ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
bio->bi_iter.bi_size = 0;
bio->bi_bdev = stripe->dev->bdev;
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1142,20 +1136,27 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
- struct bio_vec *bvec;
u64 start;
unsigned long stripe_offset;
unsigned long page_index;
- int i;
spin_lock_irq(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list) {
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ int i = 0;
+
start = (u64)bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
- bio_for_each_segment_all(bvec, bio, i)
- rbio->bio_pages[page_index + i] = bvec->bv_page;
+ if (bio_flagged(bio, BIO_CLONED))
+ bio->bi_iter = btrfs_io_bio(bio)->iter;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ rbio->bio_pages[page_index + i] = bvec.bv_page;
+ i++;
+ }
}
spin_unlock_irq(&rbio->bio_list_lock);
}
@@ -1429,11 +1430,14 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
*/
static void set_bio_pages_uptodate(struct bio *bio)
{
- struct bio_vec *bvec;
- int i;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ if (bio_flagged(bio, BIO_CLONED))
+ bio->bi_iter = btrfs_io_bio(bio)->iter;
- bio_for_each_segment_all(bvec, bio, i)
- SetPageUptodate(bvec->bv_page);
+ bio_for_each_segment(bvec, bio, iter)
+ SetPageUptodate(bvec.bv_page);
}
/*
@@ -1448,7 +1452,7 @@ static void raid_rmw_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (bio->bi_error)
+ if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -1991,7 +1995,7 @@ static void raid_recover_end_io(struct bio *bio)
* we only read stripe pages off the disk, set them
* up to date if there were no errors
*/
- if (bio->bi_error)
+ if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2530,7 +2534,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (bio->bi_error)
+ if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
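The raid56 loops above stop using bio_for_each_segment_all() because the bios can now be clones: a clone shares its parent's bvec array, and only the iterator (restored here from the saved btrfs_io_bio(bio)->iter) bounds the clone's own window into that array. A minimal kernel-context sketch of the safe walk:

#include <linux/bio.h>

/*
 * bio_for_each_segment_all() walks the entire bvec array and is only
 * valid for a bio that owns it; the bvec_iter walk below visits just
 * the segments described by bio->bi_iter, which is what a clone needs.
 */
static unsigned int count_segments(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	unsigned int nr = 0;

	bio_for_each_segment(bvec, bio, iter)
		nr++;
	return nr;
}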
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a17e775a4a89..ab852b8e3e37 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
struct btrfs_key top;
- int err;
struct list_head extctl;
int refcnt;
spinlock_t lock;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d60df51959f7..65661d1aae4e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset;
+ struct extent_changeset *data_reserved = NULL;
BUG_ON(cluster->start != cluster->boundary[0]);
inode_lock(inode);
- ret = btrfs_check_data_free_space(inode, prealloc_start,
+ ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
prealloc_end + 1 - prealloc_start);
if (ret)
goto out;
@@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
if (cur_offset < start)
- btrfs_free_reserved_data_space(inode, cur_offset,
- start - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ cur_offset, start - cur_offset);
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
@@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode,
nr++;
}
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space(inode, cur_offset,
- prealloc_end + 1 - cur_offset);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ cur_offset, prealloc_end + 1 - cur_offset);
out:
inode_unlock(inode);
+ extent_changeset_free(data_reserved);
return ret;
}
@@ -4269,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks,
- fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&rc->processed_blocks, NULL);
return rc;
}
@@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
- btrfs_wait_ordered_roots(fs_info, -1,
+ btrfs_wait_ordered_roots(fs_info, U64_MAX,
rc->block_group->key.objectid,
rc->block_group->key.offset);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7d6bc308bf43..460db0cb2d07 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -390,6 +390,13 @@ again:
WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
+ ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr,
+ name_len);
+ if (!ret) {
+ err = -EIO;
+ goto out;
+ }
+
WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
*sequence = btrfs_root_ref_sequence(leaf, ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c7b45eb2403d..6f1e4c984b94 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -18,6 +18,7 @@
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
@@ -95,7 +96,7 @@ struct scrub_bio {
struct scrub_ctx *sctx;
struct btrfs_device *dev;
struct bio *bio;
- int err;
+ blk_status_t status;
u64 logical;
u64 physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
@@ -161,14 +162,6 @@ struct scrub_parity {
unsigned long bitmap[0];
};
-struct scrub_wr_ctx {
- struct scrub_bio *wr_curr_bio;
- struct btrfs_device *tgtdev;
- int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
- atomic_t flush_all_writes;
- struct mutex wr_lock;
-};
-
struct scrub_ctx {
struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
struct btrfs_fs_info *fs_info;
@@ -183,11 +176,14 @@ struct scrub_ctx {
atomic_t cancel_req;
int readonly;
int pages_per_rd_bio;
- u32 sectorsize;
- u32 nodesize;
int is_dev_replace;
- struct scrub_wr_ctx wr_ctx;
+
+ struct scrub_bio *wr_curr_bio;
+ struct mutex wr_lock;
+ int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+ atomic_t flush_all_writes;
+ struct btrfs_device *wr_tgtdev;
/*
* statistics
@@ -289,10 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num);
-static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
- struct btrfs_device *dev,
- int is_dev_replace);
-static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
@@ -643,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;
- scrub_free_wr_ctx(&sctx->wr_ctx);
-
/* this can happen when scrub is cancelled */
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
@@ -664,6 +654,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
kfree(sbio);
}
+ kfree(sctx->wr_curr_bio);
scrub_free_csums(sctx);
kfree(sctx);
}
@@ -680,7 +671,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
struct scrub_ctx *sctx;
int i;
struct btrfs_fs_info *fs_info = dev->fs_info;
- int ret;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
@@ -710,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx->bios[i]->next_free = -1;
}
sctx->first_free = 0;
- sctx->nodesize = fs_info->nodesize;
- sctx->sectorsize = fs_info->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
@@ -722,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
- ret = scrub_setup_wr_ctx(&sctx->wr_ctx,
- fs_info->dev_replace.tgtdev, is_dev_replace);
- if (ret) {
- scrub_free_ctx(sctx);
- return ERR_PTR(ret);
+ WARN_ON(sctx->wr_curr_bio != NULL);
+ mutex_init(&sctx->wr_lock);
+ sctx->wr_curr_bio = NULL;
+ if (is_dev_replace) {
+ WARN_ON(!fs_info->dev_replace.tgtdev);
+ sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
+ sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
+ atomic_set(&sctx->flush_all_writes, 0);
}
+
return sctx;
nomem:
@@ -742,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
u32 nlink;
int ret;
int i;
+ unsigned nofs_flag;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
@@ -780,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
+ /*
+ * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
+ * uses GFP_NOFS in this context, so we keep it consistent but it does
+ * not seem to be strictly necessary.
+ */
+ nofs_flag = memalloc_nofs_save();
ipath = init_ipath(4096, local_root, swarn->path);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
ret = PTR_ERR(ipath);
ipath = NULL;
@@ -954,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
ret = -EIO;
goto out;
}
- ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE,
+ ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
fixup->logical, page,
offset - page_offset(page),
fixup->mirror_num);
@@ -1668,14 +1668,14 @@ leave_nomem:
struct scrub_bio_ret {
struct completion event;
- int error;
+ blk_status_t status;
};
static void scrub_bio_wait_endio(struct bio *bio)
{
struct scrub_bio_ret *ret = bio->bi_private;
- ret->error = bio->bi_error;
+ ret->status = bio->bi_status;
complete(&ret->event);
}
@@ -1693,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
int ret;
init_completion(&done.event);
- done.error = 0;
+ done.status = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
@@ -1705,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
return ret;
wait_for_completion(&done.event);
- if (done.error)
+ if (done.status)
return -EIO;
return 0;
@@ -1737,12 +1737,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!page->page);
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- page->io_error = 1;
- sblock->no_io_error_seen = 0;
- continue;
- }
+ bio = btrfs_io_bio_alloc(1);
bio->bi_bdev = page->dev->bdev;
bio_add_page(bio, page->page, PAGE_SIZE, 0);
@@ -1830,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return -EIO;
+ bio = btrfs_io_bio_alloc(1);
bio->bi_bdev = page_bad->dev->bdev;
bio->bi_iter.bi_sector = page_bad->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -1898,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
- struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
struct scrub_bio *sbio;
int ret;
- mutex_lock(&wr_ctx->wr_lock);
+ mutex_lock(&sctx->wr_lock);
again:
- if (!wr_ctx->wr_curr_bio) {
- wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+ if (!sctx->wr_curr_bio) {
+ sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
- if (!wr_ctx->wr_curr_bio) {
- mutex_unlock(&wr_ctx->wr_lock);
+ if (!sctx->wr_curr_bio) {
+ mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
}
- wr_ctx->wr_curr_bio->sctx = sctx;
- wr_ctx->wr_curr_bio->page_count = 0;
+ sctx->wr_curr_bio->sctx = sctx;
+ sctx->wr_curr_bio->page_count = 0;
}
- sbio = wr_ctx->wr_curr_bio;
+ sbio = sctx->wr_curr_bio;
if (sbio->page_count == 0) {
struct bio *bio;
sbio->physical = spage->physical_for_dev_replace;
sbio->logical = spage->logical;
- sbio->dev = wr_ctx->tgtdev;
+ sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(GFP_KERNEL,
- wr_ctx->pages_per_wr_bio);
- if (!bio) {
- mutex_unlock(&wr_ctx->wr_lock);
- return -ENOMEM;
- }
+ bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
@@ -1937,7 +1924,7 @@ again:
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- sbio->err = 0;
+ sbio->status = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical_for_dev_replace ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
@@ -1951,7 +1938,7 @@ again:
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
- mutex_unlock(&wr_ctx->wr_lock);
+ mutex_unlock(&sctx->wr_lock);
return -EIO;
}
scrub_wr_submit(sctx);
@@ -1961,23 +1948,22 @@ again:
sbio->pagev[sbio->page_count] = spage;
scrub_page_get(spage);
sbio->page_count++;
- if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+ if (sbio->page_count == sctx->pages_per_wr_bio)
scrub_wr_submit(sctx);
- mutex_unlock(&wr_ctx->wr_lock);
+ mutex_unlock(&sctx->wr_lock);
return 0;
}
static void scrub_wr_submit(struct scrub_ctx *sctx)
{
- struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
struct scrub_bio *sbio;
- if (!wr_ctx->wr_curr_bio)
+ if (!sctx->wr_curr_bio)
return;
- sbio = wr_ctx->wr_curr_bio;
- wr_ctx->wr_curr_bio = NULL;
+ sbio = sctx->wr_curr_bio;
+ sctx->wr_curr_bio = NULL;
WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
@@ -1992,7 +1978,7 @@ static void scrub_wr_bio_end_io(struct bio *bio)
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
- sbio->err = bio->bi_error;
+ sbio->status = bio->bi_status;
sbio->bio = bio;
btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
@@ -2007,7 +1993,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
int i;
WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
- if (sbio->err) {
+ if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
@@ -2081,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
page = sblock->pagev[0]->page;
buffer = kmap_atomic(page);
- len = sctx->sectorsize;
+ len = sctx->fs_info->sectorsize;
index = 0;
for (;;) {
u64 l = min_t(u64, len, PAGE_SIZE);
@@ -2146,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
BTRFS_UUID_SIZE))
sblock->header_error = 1;
- len = sctx->nodesize - BTRFS_CSUM_SIZE;
+ len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
index = 0;
@@ -2329,10 +2315,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(GFP_KERNEL,
- sctx->pages_per_rd_bio);
- if (!bio)
- return -ENOMEM;
+ bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
@@ -2341,7 +2324,7 @@ again:
bio->bi_bdev = sbio->dev->bdev;
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- sbio->err = 0;
+ sbio->status = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
@@ -2377,7 +2360,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- if (bio->bi_error)
+ if (bio->bi_status)
sblock->no_io_error_seen = 0;
bio_put(bio);
@@ -2420,10 +2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
scrub_block_put(sblock);
if (sctx->is_dev_replace &&
- atomic_read(&sctx->wr_ctx.flush_all_writes)) {
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ atomic_read(&sctx->flush_all_writes)) {
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
}
scrub_pending_bio_dec(sctx);
@@ -2458,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
goto bbio_out;
}
- bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
- if (!bio)
- goto bbio_out;
-
+ bio = btrfs_io_bio_alloc(0);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -2588,7 +2568,7 @@ static void scrub_bio_end_io(struct bio *bio)
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
- sbio->err = bio->bi_error;
+ sbio->status = bio->bi_status;
sbio->bio = bio;
btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
@@ -2601,7 +2581,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
int i;
BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
- if (sbio->err) {
+ if (sbio->status) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
@@ -2628,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
spin_unlock(&sctx->list_lock);
if (sctx->is_dev_replace &&
- atomic_read(&sctx->wr_ctx.flush_all_writes)) {
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ atomic_read(&sctx->flush_all_writes)) {
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
}
scrub_pending_bio_dec(sctx);
@@ -2726,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
if (!sum)
return 0;
- index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
- num_sectors = sum->len / sctx->sectorsize;
+ index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize;
+ num_sectors = sum->len / sctx->fs_info->sectorsize;
memcpy(csum, sum->sums + index, sctx->csum_size);
if (index == num_sectors - 1) {
list_del(&sum->list);
@@ -2746,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->nodesize;
+ blocksize = sctx->fs_info->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
sctx->stat.tree_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
}
@@ -2892,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->nodesize;
+ blocksize = sctx->fs_info->nodesize;
} else {
- blocksize = sctx->sectorsize;
+ blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
}
@@ -3004,7 +2984,7 @@ static void scrub_parity_bio_endio(struct bio *bio)
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
- if (bio->bi_error)
+ if (bio->bi_status)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
@@ -3037,10 +3017,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
- if (!bio)
- goto bbio_out;
-
+ bio = btrfs_io_bio_alloc(0);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
@@ -3305,9 +3282,9 @@ out:
logic_end - logic_start);
scrub_parity_put(sparity);
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
btrfs_release_path(path);
return ret < 0 ? ret : 0;
@@ -3463,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
if (atomic_read(&fs_info->scrub_pause_req)) {
/* push queued extents */
- atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ atomic_set(&sctx->flush_all_writes, 1);
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_set(&sctx->flush_all_writes, 0);
scrub_blocked_if_needed(fs_info);
}
@@ -3677,9 +3654,9 @@ skip:
out:
/* push queued extents */
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
blk_finish_plug(&plug);
btrfs_free_path(path);
@@ -3859,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
btrfs_wait_block_group_reservations(cache);
btrfs_wait_nocow_writers(cache);
- ret = btrfs_wait_ordered_roots(fs_info, -1,
+ ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
cache->key.objectid,
cache->key.offset);
if (ret > 0) {
@@ -3916,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* write requests are really completed when bios_in_flight
* changes to 0.
*/
- atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ atomic_set(&sctx->flush_all_writes, 1);
scrub_submit(sctx);
- mutex_lock(&sctx->wr_ctx.wr_lock);
+ mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_ctx.wr_lock);
+ mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
@@ -3934,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_set(&sctx->flush_all_writes, 0);
scrub_pause_off(fs_info);
@@ -4337,32 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
btrfs_put_bbio(bbio);
}
-static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
- struct btrfs_device *dev,
- int is_dev_replace)
-{
- WARN_ON(wr_ctx->wr_curr_bio != NULL);
-
- mutex_init(&wr_ctx->wr_lock);
- wr_ctx->wr_curr_bio = NULL;
- if (!is_dev_replace)
- return 0;
-
- WARN_ON(!dev->bdev);
- wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
- wr_ctx->tgtdev = dev;
- atomic_set(&wr_ctx->flush_all_writes, 0);
- return 0;
-}
-
-static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
-{
- mutex_lock(&wr_ctx->wr_lock);
- kfree(wr_ctx->wr_curr_bio);
- wr_ctx->wr_curr_bio = NULL;
- mutex_unlock(&wr_ctx->wr_lock);
-}
-
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
int mirror_num, u64 physical_for_dev_replace)
{
@@ -4665,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
struct btrfs_device *dev;
int ret;
- dev = sctx->wr_ctx.tgtdev;
+ dev = sctx->wr_tgtdev;
if (!dev)
return -EIO;
if (!dev->bdev) {
@@ -4673,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
"scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- return -ENOMEM;
- }
+ bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
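The init_ipath() call is wrapped in memalloc_nofs_save()/memalloc_nofs_restore(): inside that window every allocation implicitly behaves as GFP_NOFS, so helpers that allocate internally cannot recurse into the filesystem even though no GFP flags are plumbed through them. A sketch of the idiom:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/*
 * Scoped NOFS: GFP_KERNEL allocations between save and restore are
 * treated as GFP_NOFS, preventing reclaim from re-entering the
 * filesystem while we hold fs locks.
 */
static void *alloc_under_fs_locks(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);	/* implicitly NOFS here */
	memalloc_nofs_restore(nofs_flag);
	return p;
}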
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fc496a6f842a..b082210df9c8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
}
+ ret = btrfs_is_name_len_valid(eb, path->slots[0],
+ (unsigned long)(di + 1), name_len + data_len);
+ if (!ret) {
+ ret = -EIO;
+ goto out;
+ }
if (name_len + data_len > buf_len) {
buf_len = name_len + data_len;
if (is_vmalloc_addr(buf)) {
@@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
buf = tmp;
}
if (!buf) {
- buf = vmalloc(buf_len);
+ buf = kvmalloc(buf_len, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
goto out;
@@ -1850,7 +1856,7 @@ out:
*/
static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
const char *name, int name_len,
- u64 *who_ino, u64 *who_gen)
+ u64 *who_ino, u64 *who_gen, u64 *who_mode)
{
int ret = 0;
u64 gen;
@@ -1899,7 +1905,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, NULL, NULL, NULL, NULL);
+ who_gen, who_mode, NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -2769,15 +2775,20 @@ out:
struct recorded_ref {
struct list_head list;
- char *dir_path;
char *name;
struct fs_path *full_path;
u64 dir;
u64 dir_gen;
- int dir_path_len;
int name_len;
};
+static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+{
+ ref->full_path = path;
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
+}
+
/*
* We need to process new refs before deleted refs, but compare_tree gives us
* everything mixed. So we first record all refs and later process them.
@@ -2794,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir,
ref->dir = dir;
ref->dir_gen = dir_gen;
- ref->full_path = path;
-
- ref->name = (char *)kbasename(ref->full_path->start);
- ref->name_len = ref->full_path->end - ref->name;
- ref->dir_path = ref->full_path->start;
- if (ref->name == ref->full_path->start)
- ref->dir_path_len = 0;
- else
- ref->dir_path_len = ref->full_path->end -
- ref->full_path->start - 1 - ref->name_len;
-
+ set_ref_path(ref, path);
list_add_tail(&ref->list, head);
return 0;
}
@@ -3546,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root,
struct fs_path *fs_path)
{
u64 ino = ino2;
+ bool free_path = false;
+ int ret = 0;
+
+ if (!fs_path) {
+ fs_path = fs_path_alloc();
+ if (!fs_path)
+ return -ENOMEM;
+ free_path = true;
+ }
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
- int ret;
u64 parent;
u64 parent_gen;
@@ -3557,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root,
if (ret < 0) {
if (ret == -ENOENT && ino == ino2)
ret = 0;
- return ret;
+ goto out;
+ }
+ if (parent == ino1) {
+ ret = parent_gen == ino1_gen ? 1 : 0;
+ goto out;
}
- if (parent == ino1)
- return parent_gen == ino1_gen ? 1 : 0;
ino = parent;
}
- return 0;
+ out:
+ if (free_path)
+ fs_path_free(fs_path);
+ return ret;
}
static int wait_for_parent_move(struct send_ctx *sctx,
@@ -3669,6 +3683,36 @@ out:
return ret;
}
+static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ int ret;
+ struct fs_path *new_path;
+
+ /*
+ * Our reference's name member points into its full_path member string,
+ * so we allocate a new path here.
+ */
+ new_path = fs_path_alloc();
+ if (!new_path)
+ return -ENOMEM;
+
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
+ if (ret < 0) {
+ fs_path_free(new_path);
+ return ret;
+ }
+ ret = fs_path_add(new_path, ref->name, ref->name_len);
+ if (ret < 0) {
+ fs_path_free(new_path);
+ return ret;
+ }
+
+ fs_path_free(ref->full_path);
+ set_ref_path(ref, new_path);
+
+ return 0;
+}
+
/*
* This does all the move/link/unlink/rmdir magic.
*/
@@ -3682,10 +3726,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct fs_path *valid_path = NULL;
u64 ow_inode = 0;
u64 ow_gen;
+ u64 ow_mode;
int did_overwrite = 0;
int is_orphan = 0;
u64 last_dir_ino_rm = 0;
bool can_rename = true;
+ bool orphanized_dir = false;
+ bool orphanized_ancestor = false;
btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
@@ -3783,7 +3830,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
- &ow_inode, &ow_gen);
+ &ow_inode, &ow_gen, &ow_mode);
if (ret < 0)
goto out;
if (ret) {
@@ -3800,6 +3847,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
cur->full_path);
if (ret < 0)
goto out;
+ if (S_ISDIR(ow_mode))
+ orphanized_dir = true;
/*
* If ow_inode has its rename operation delayed
@@ -3837,9 +3886,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* might contain the pre-orphanization name of
* ow_inode, which is no longer valid.
*/
- fs_path_reset(valid_path);
- ret = get_cur_path(sctx, sctx->cur_ino,
- sctx->cur_inode_gen, valid_path);
+ ret = is_ancestor(sctx->parent_root,
+ ow_inode, ow_gen,
+ sctx->cur_ino, NULL);
+ if (ret > 0) {
+ orphanized_ancestor = true;
+ fs_path_reset(valid_path);
+ ret = get_cur_path(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen,
+ valid_path);
+ }
if (ret < 0)
goto out;
} else {
@@ -3898,6 +3954,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
} else {
+ /*
+ * We might have previously orphanized an inode
+ * which is an ancestor of our current inode,
+ * so our reference's full path, which was
+ * computed before any such orphanizations, must
+ * be updated.
+ */
+ if (orphanized_dir) {
+ ret = update_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_link(sctx, cur->full_path,
valid_path);
if (ret < 0)
@@ -3960,6 +4028,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
if (!ret) {
+ /*
+ * If we orphanized any ancestor before, we need
+ * to recompute the full path for deleted names,
+ * since any such path was computed before we
+ * processed any references and orphanized any
+ * ancestor inode.
+ */
+ if (orphanized_ancestor) {
+ ret = update_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
@@ -5190,15 +5270,12 @@ static int is_extent_unchanged(struct send_ctx *sctx,
goto out;
}
- right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
if (right_type == BTRFS_FILE_EXTENT_INLINE) {
right_len = btrfs_file_extent_inline_len(eb, slot, ei);
right_len = PAGE_ALIGN(right_len);
} else {
right_len = btrfs_file_extent_num_bytes(eb, ei);
}
- right_offset = btrfs_file_extent_offset(eb, ei);
- right_gen = btrfs_file_extent_generation(eb, ei);
/*
* Are we at extent 8? If yes, we know the extent is changed.
@@ -5223,6 +5300,10 @@ static int is_extent_unchanged(struct send_ctx *sctx,
goto out;
}
+ right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ right_offset = btrfs_file_extent_offset(eb, ei);
+ right_gen = btrfs_file_extent_generation(eb, ei);
+
left_offset_fixed = left_offset;
if (key.offset < ekey->offset) {
/* Fix the right offset for 2a and 7. */
@@ -6397,13 +6478,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
+ sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
if (!sctx->clone_roots) {
- sctx->clone_roots = vzalloc(alloc_size);
- if (!sctx->clone_roots) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
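set_ref_path() above rederives a ref's name with kbasename(), which returns a pointer to the path component after the last '/'; since the name always points into full_path, the cached dir_path/dir_path_len fields become redundant. A small sketch of the split (hypothetical wrapper around the real kbasename()):

#include <linux/string.h>

/* Split "a/b/c" into the basename "c" and its length. */
static void split_name(const char *full_path, const char **name,
		       size_t *name_len)
{
	*name = kbasename(full_path);	/* points into full_path itself */
	*name_len = strlen(*name);
}

Because *name aliases full_path, any code that replaces the path must recompute the name, which is exactly why update_ref_path() frees the old path and runs set_ref_path() again.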
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4f1cdd5058f1..12540b6104b5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -601,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_alloc_start:
- num = match_strdup(&args[0]);
- if (num) {
- mutex_lock(&info->chunk_mutex);
- info->alloc_start = memparse(num, NULL);
- mutex_unlock(&info->chunk_mutex);
- kfree(num);
- btrfs_info(info, "allocations start at %llu",
- info->alloc_start);
- } else {
- ret = -ENOMEM;
- goto out;
- }
+ btrfs_info(info,
+ "option alloc_start is obsolete, ignored");
break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -1164,7 +1154,6 @@ static int btrfs_fill_super(struct super_block *sb,
goto fail_close;
}
- save_mount_options(sb, data);
cleancache_init_fs(sb);
sb->s_flags |= MS_ACTIVE;
return 0;
@@ -1187,7 +1176,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1232,8 +1221,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nobarrier");
if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
- if (info->alloc_start != 0)
- seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -1716,7 +1703,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
u64 old_max_inline = fs_info->max_inline;
- u64 old_alloc_start = fs_info->alloc_start;
int old_thread_pool_size = fs_info->thread_pool_size;
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
@@ -1855,9 +1841,6 @@ restore:
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
- mutex_lock(&fs_info->chunk_mutex);
- fs_info->alloc_start = old_alloc_start;
- mutex_unlock(&fs_info->chunk_mutex);
btrfs_resize_thread_pool(fs_info,
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
@@ -1898,18 +1881,15 @@ static inline void btrfs_descending_sort_devices(
static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 *free_bytes)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 skip_space;
u64 type;
u64 avail_space;
- u64 used_space;
u64 min_stripe_size;
int min_stripes = 1, num_stripes = 1;
int i = 0, nr_devices;
- int ret;
/*
* We aren't under the device list lock, so this is racy-ish, but good
@@ -1927,12 +1907,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
}
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
- GFP_NOFS);
+ GFP_KERNEL);
if (!devices_info)
return -ENOMEM;
/* calc min stripe number for data space allocation */
- type = btrfs_get_alloc_profile(root, 1);
+ type = btrfs_data_alloc_profile(fs_info);
if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
num_stripes = nr_devices;
@@ -1949,8 +1929,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
else
min_stripe_size = BTRFS_STRIPE_LEN;
- if (fs_info->alloc_start)
- mutex_lock(&fs_devices->device_list_mutex);
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!device->in_fs_metadata || !device->bdev ||
@@ -1973,34 +1951,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
*/
skip_space = SZ_1M;
- /* user can set the offset in fs_info->alloc_start. */
- if (fs_info->alloc_start &&
- fs_info->alloc_start + BTRFS_STRIPE_LEN <=
- device->total_bytes) {
- rcu_read_unlock();
- skip_space = max(fs_info->alloc_start, skip_space);
-
- /*
- * btrfs can not use the free space in
- * [0, skip_space - 1], we must subtract it from the
- * total. In order to implement it, we account the used
- * space in this range first.
- */
- ret = btrfs_account_dev_extents_size(device, 0,
- skip_space - 1,
- &used_space);
- if (ret) {
- kfree(devices_info);
- mutex_unlock(&fs_devices->device_list_mutex);
- return ret;
- }
-
- rcu_read_lock();
-
- /* calc the free space in [0, skip_space - 1] */
- skip_space -= used_space;
- }
-
/*
* we can use the free space in [0, skip_space - 1], subtract
* it from the total.
@@ -2019,8 +1969,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
i++;
}
rcu_read_unlock();
- if (fs_info->alloc_start)
- mutex_unlock(&fs_devices->device_list_mutex);
nr_devices = i;
@@ -2057,10 +2005,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
* multiplier to scale the sizes.
*
* Unused device space usage is based on simulating the chunk allocator
- * algorithm that respects the device sizes, order of allocations and the
- * 'alloc_start' value, this is a close approximation of the actual use but
- * there are other factors that may change the result (like a new metadata
- * chunk).
+ * algorithm that respects the device sizes and order of allocations. This is
+ * a close approximation of the actual use but there are other factors that may
+ * change the result (like a new metadata chunk).
*
* If metadata is exhausted, f_bavail will be 0.
*/
@@ -2243,7 +2190,7 @@ static int btrfs_freeze(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- fs_info->fs_frozen = 1;
+ set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
* We don't need a barrier here, we'll wait for any transaction that
* could be in progress on other threads (and do delayed iputs that
@@ -2262,7 +2209,9 @@ static int btrfs_freeze(struct super_block *sb)
static int btrfs_unfreeze(struct super_block *sb)
{
- btrfs_sb(sb)->fs_frozen = 0;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
return 0;
}
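The freeze state moves from a plain int into the fs_info->flags bit field, so
readers and writers on different CPUs use atomic bit operations instead of
unsynchronized loads and stores. A minimal sketch of the pattern (names
illustrative):

	#include <linux/bitops.h>
	#include <linux/types.h>

	#define MY_FS_FROZEN	0		/* bit number, not a mask */

	struct my_fs_info {
		unsigned long flags;
	};

	static void my_freeze(struct my_fs_info *fi)
	{
		set_bit(MY_FS_FROZEN, &fi->flags);	/* atomic read-modify-write */
	}

	static bool my_is_frozen(const struct my_fs_info *fi)
	{
		return test_bit(MY_FS_FROZEN, &fi->flags);
	}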
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fba8940..c2d5f3580b4c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
+static ssize_t quota_override_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int quota_override;
+
+ quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+ return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+}
+
+static ssize_t quota_override_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long knob;
+ int err;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ err = kstrtoul(buf, 10, &knob);
+ if (err)
+ return err;
+ if (knob > 1)
+ return -EINVAL;
+
+ if (knob)
+ set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+ else
+ clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
BTRFS_ATTR_PTR(clone_alignment),
+ BTRFS_ATTR_PTR(quota_override),
NULL,
};
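Once mounted, the new knob appears alongside the other per-filesystem
attributes, presumably at /sys/fs/btrfs/<UUID>/quota_override; writing 1 sets
the flag (which requires CAP_SYS_RESOURCE, per the store handler above) and
writing 0 clears it.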
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 133753232a94..d06b1c931d05 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize)
return -ENOMEM;
}
- extent_io_tree_init(&tmp, &inode->i_data);
+ extent_io_tree_init(&tmp, inode);
/*
* First go through and create and mark all of our pages dirty, we pin
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2168654c90a1..f615d59b0489 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
btrfs_put_block_group_trimming(cache);
btrfs_put_block_group(cache);
}
- kmem_cache_free(btrfs_transaction_cachep, transaction);
+ kfree(transaction);
}
}
@@ -228,7 +228,7 @@ loop:
*/
BUG_ON(type == TRANS_JOIN_NOLOCK);
- cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+ cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
@@ -238,11 +238,11 @@ loop:
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
- kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ kfree(cur_trans);
goto loop;
} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
- kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ kfree(cur_trans);
return -EROFS;
}
@@ -294,7 +294,7 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
- fs_info->btree_inode->i_mapping);
+ fs_info->btree_inode);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -1374,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
ret = commit_fs_roots(trans, fs_info);
if (ret)
goto out;
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret < 0)
- goto out;
ret = btrfs_qgroup_account_extents(trans, fs_info);
if (ret < 0)
goto out;
@@ -1926,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
static inline void
@@ -2180,13 +2177,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
-
/*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
@@ -2314,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* it'll result in deadlock about SB_FREEZE_FS.
*/
if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread && !fs_info->fs_frozen)
+ current != fs_info->cleaner_kthread &&
+ !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
btrfs_run_delayed_iputs(fs_info);
return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ccfe9fe7754a..3a11ae63676e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1175,15 +1175,19 @@ next:
return 0;
}
-static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index,
- u64 *parent_objectid)
+static int extref_get_fields(struct extent_buffer *eb, int slot,
+ unsigned long ref_ptr, u32 *namelen, char **name,
+ u64 *index, u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ref_ptr;
*namelen = btrfs_inode_extref_name_len(eb, extref);
+ if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
+ *namelen))
+ return -EIO;
+
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
@@ -1198,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
return 0;
}
-static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index)
+static int ref_get_fields(struct extent_buffer *eb, int slot,
+ unsigned long ref_ptr, u32 *namelen, char **name,
+ u64 *index)
{
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ref_ptr;
*namelen = btrfs_inode_ref_name_len(eb, ref);
+ if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
+ *namelen))
+ return -EIO;
+
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
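btrfs_is_name_len_valid() itself is not shown in this patch; conceptually it
rejects a name_len that would run past the item's data area before the
kmalloc()-and-copy happens. A hedged sketch of that bounds check (names
illustrative, not the actual btrfs implementation):

	static bool name_fits_in_item(unsigned long name_start, u32 name_len,
				      unsigned long item_start, u32 item_size)
	{
		/* the name bytes must lie entirely inside the item */
		return name_start >= item_start &&
		       name_len <= item_size &&
		       name_start + name_len <= item_start + item_size;
	}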
@@ -1280,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
while (ref_ptr < ref_end) {
if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index, &parent_objectid);
+ ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
+ &name, &ref_index, &parent_objectid);
/*
* parent object can change from one array
* item to another.
@@ -1293,8 +1302,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index);
+ ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
+ &name, &ref_index);
}
if (ret)
goto out;
@@ -1841,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, di))
+ if (verify_dir_item(fs_info, eb, slot, di))
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
@@ -2017,7 +2026,7 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, di)) {
+ if (verify_dir_item(fs_info, eb, slot, di)) {
ret = -EIO;
goto out;
}
@@ -2102,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const u64 ino)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key search_key;
struct btrfs_path *log_path;
int i;
@@ -2143,6 +2153,11 @@ process_leaf:
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
+ ret = verify_dir_item(fs_info, path->nodes[0], i, di);
+ if (ret) {
+ ret = -EIO;
+ goto out;
+ }
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
@@ -4546,6 +4561,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
this_len = sizeof(*extref) + this_name_len;
}
+ ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
+ this_name_len);
+ if (!ret) {
+ ret = -EIO;
+ goto out;
+ }
if (this_name_len > name_len) {
char *new_name;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 017b67daa3bb..e8b9a269fdde 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -242,6 +242,17 @@ static struct btrfs_device *__alloc_device(void)
if (!dev)
return ERR_PTR(-ENOMEM);
+ /*
+	 * Preallocate a bio that is always used for flushing device
+	 * barriers and whose lifetime matches the device's.
+ */
+ dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+ if (!dev->flush_bio) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+ bio_get(dev->flush_bio);
+
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->resized_list);
@@ -838,6 +849,7 @@ static void __free_device(struct work_struct *work)
device = container_of(work, struct btrfs_device, rcu_work);
rcu_string_free(device->name);
+ bio_put(device->flush_bio);
kfree(device);
}
@@ -1353,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
int ret;
int slot;
struct extent_buffer *l;
- u64 min_search_start;
/*
* We don't want to overwrite the superblock on the drive nor any area
* used by the boot loader (grub for example), so we make sure to start
* at an offset of at least 1MB.
*/
- min_search_start = max(fs_info->alloc_start, 1024ull * 1024);
- search_start = max(search_start, min_search_start);
+ search_start = max_t(u64, search_start, SZ_1M);
path = btrfs_alloc_path();
if (!path)
@@ -2387,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = i_size_read(bdev->bd_inode);
+ device->total_bytes = round_down(i_size_read(bdev->bd_inode),
+ fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
device->fs_info = fs_info;
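round_down() here clamps the raw block-device size to a sectorsize multiple so
total_bytes never carries a partial sector. A sketch of the helper's effect
(round_down() requires a power-of-two divisor, which sectorsize is):

	#include <linux/kernel.h>	/* round_down() */

	static u64 usable_device_bytes(u64 raw_size, u32 sectorsize)
	{
		/* e.g. raw_size = 10 GiB + 100 bytes, sectorsize = 4096:
		 * the trailing 100 bytes are dropped */
		return round_down(raw_size, sectorsize);
	}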
@@ -2417,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->fs_devices->total_devices++;
fs_info->fs_devices->total_rw_bytes += device->total_bytes;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += device->total_bytes;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
tmp = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
- tmp + device->total_bytes);
+ round_down(tmp + device->total_bytes, fs_info->sectorsize));
tmp = btrfs_super_num_devices(fs_info->super_copy);
btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
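Throughout this file, free_chunk_space changes from a u64 guarded by
free_chunk_lock to an atomic64_t, trading the spinlock for lock-free
arithmetic on a simple counter. The resulting pattern, sketched:

	#include <linux/atomic.h>

	static atomic64_t free_chunk_space = ATOMIC64_INIT(0);

	static void chunk_space_add(u64 bytes)
	{
		atomic64_add(bytes, &free_chunk_space);
	}

	static void chunk_space_sub(u64 bytes)
	{
		atomic64_sub(bytes, &free_chunk_space);
	}

	static u64 chunk_space_read(void)
	{
		return atomic64_read(&free_chunk_space);
	}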
@@ -2574,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
- name = rcu_string_strdup(device_path, GFP_NOFS);
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
kfree(device);
ret = -ENOMEM;
@@ -2689,9 +2698,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
if (!device->writeable)
return -EACCES;
+ new_size = round_down(new_size, fs_info->sectorsize);
+
mutex_lock(&fs_info->chunk_mutex);
old_total = btrfs_super_total_bytes(super_copy);
- diff = new_size - device->total_bytes;
+ diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
if (new_size <= device->total_bytes ||
device->is_tgtdev_for_dev_replace) {
@@ -2701,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
fs_devices = fs_info->fs_devices;
- btrfs_set_super_total_bytes(super_copy, old_total + diff);
+ btrfs_set_super_total_bytes(super_copy,
+ round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
btrfs_device_set_total_bytes(device, new_size);
@@ -2874,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += dev_extent_len;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
@@ -4393,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
- u64 diff = old_size - new_size;
+ u64 diff;
+
+ new_size = round_down(new_size, fs_info->sectorsize);
+ diff = round_down(old_size - new_size, fs_info->sectorsize);
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
@@ -4409,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space -= diff;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_sub(diff, &fs_info->free_chunk_space);
}
mutex_unlock(&fs_info->chunk_mutex);
@@ -4522,7 +4533,8 @@ again:
&fs_info->fs_devices->resized_devices);
WARN_ON(diff > old_total);
- btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ btrfs_set_super_total_bytes(super_copy,
+ round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
/* Now btrfs_update_device() will change the on-disk size. */
@@ -4535,9 +4547,7 @@ done:
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += diff;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -4882,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
}
- spin_lock(&info->free_chunk_lock);
- info->free_chunk_space -= (stripe_size * map->num_stripes);
- spin_unlock(&info->free_chunk_lock);
+ atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
@@ -5029,20 +5037,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
u64 chunk_offset;
u64 sys_chunk_offset;
u64 alloc_profile;
int ret;
chunk_offset = find_next_chunk(fs_info);
- alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+ alloc_profile = btrfs_metadata_alloc_profile(fs_info);
ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
if (ret)
return ret;
sys_chunk_offset = find_next_chunk(fs_info);
- alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+ alloc_profile = btrfs_system_alloc_profile(fs_info);
ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
return ret;
}
@@ -6042,9 +6049,10 @@ static void btrfs_end_bio(struct bio *bio)
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
- if (bio->bi_error) {
+ if (bio->bi_status) {
atomic_inc(&bbio->error);
- if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
+ if (bio->bi_status == BLK_STS_IOERR ||
+ bio->bi_status == BLK_STS_TARGET) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
@@ -6082,13 +6090,13 @@ static void btrfs_end_bio(struct bio *bio)
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
- bio->bi_error = 0;
+ bio->bi_status = 0;
}
btrfs_end_bbio(bbio, bio);
@@ -6199,7 +6207,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
btrfs_end_bbio(bbio, bio);
}
}
@@ -6266,10 +6274,9 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
continue;
}
- if (dev_nr < total_devs - 1) {
- bio = btrfs_bio_clone(first_bio, GFP_NOFS);
- BUG_ON(!bio); /* -ENOMEM */
- } else
+ if (dev_nr < total_devs - 1)
+ bio = btrfs_bio_clone(first_bio);
+ else
bio = first_bio;
submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
@@ -6684,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
- spin_lock(&fs_info->free_chunk_lock);
- fs_info->free_chunk_space += device->total_bytes -
- device->bytes_used;
- spin_unlock(&fs_info->free_chunk_lock);
+ atomic64_add(device->total_bytes - device->bytes_used,
+ &fs_info->free_chunk_space);
}
ret = 0;
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c7d0fbc915ca..6f45fd60d15a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -74,6 +74,8 @@ struct btrfs_device {
int missing;
int can_discard;
int is_tgtdev_for_dev_replace;
+ int last_flush_error;
+ int flush_bio_sent;
#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
seqcount_t data_seqcount;
@@ -279,6 +281,11 @@ struct btrfs_io_bio {
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
u8 *csum_allocated;
btrfs_io_bio_end_io_t *end_io;
+ struct bvec_iter iter;
+ /*
+	 * This member must come last; bio_alloc_bioset will allocate enough
+	 * bytes for the entire btrfs_io_bio but relies on bio being last.
+ */
struct bio bio;
};
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b3cbf80c5acf..2c7e53f9ff1b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
u32 this_len = sizeof(*di) + name_len + data_len;
unsigned long name_ptr = (unsigned long)(di + 1);
- if (verify_dir_item(fs_info, leaf, di)) {
+ if (verify_dir_item(fs_info, leaf, slot, di)) {
ret = -EIO;
goto err;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 135b10823c6d..c248f9286366 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -24,12 +24,13 @@
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
+#include <linux/refcount.h>
#include "compression.h"
struct workspace {
@@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->strm.workspace);
+ kvfree(workspace->strm.workspace);
kfree(workspace->buf);
kfree(workspace);
}
@@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void)
struct workspace *workspace;
int workspacesize;
- workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
- workspace->strm.workspace = vmalloc(workspacesize);
- workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
+ workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -211,10 +212,7 @@ out:
return ret;
}
-static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in,
- u64 disk_start,
- struct bio *orig_bio,
- size_t srclen)
+static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -222,8 +220,12 @@ static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in,
char *data_in;
size_t total_out = 0;
unsigned long page_in_index = 0;
+ size_t srclen = cb->compressed_len;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
+ struct page **pages_in = cb->compressed_pages;
+ u64 disk_start = cb->start;
+ struct bio *orig_bio = cb->orig_bio;
data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
diff --git a/fs/buffer.c b/fs/buffer.c
index 161be58c5cb0..5715dac7821f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,7 +49,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc);
+ enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -178,7 +178,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost sync page write");
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -352,8 +352,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
- mapping_set_error(page->mapping, -EIO);
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
}
@@ -481,8 +480,6 @@ static void __remove_assoc_queue(struct buffer_head *bh)
{
list_del_init(&bh->b_assoc_buffers);
WARN_ON(!bh->b_assoc_map);
- if (buffer_write_io_error(bh))
- set_bit(AS_EIO, &bh->b_assoc_map->flags);
bh->b_assoc_map = NULL;
}
@@ -1181,6 +1178,17 @@ void mark_buffer_dirty(struct buffer_head *bh)
}
EXPORT_SYMBOL(mark_buffer_dirty);
+void mark_buffer_write_io_error(struct buffer_head *bh)
+{
+ set_buffer_write_io_error(bh);
+ /* FIXME: do we need to set this in both places? */
+ if (bh->b_page && bh->b_page->mapping)
+ mapping_set_error(bh->b_page->mapping, -EIO);
+ if (bh->b_assoc_map)
+ mapping_set_error(bh->b_assoc_map, -EIO);
+}
+EXPORT_SYMBOL(mark_buffer_write_io_error);
+
/*
* Decrement a buffer_head's reference count. If all buffers against a page
* have zero reference count, are clean and unlocked, and if the page is clean
@@ -1273,44 +1281,31 @@ static inline void check_irqs_on(void)
}
/*
- * The LRU management algorithm is dopey-but-simple. Sorry.
+ * Install a buffer_head into this CPU's LRU. If it is not already in the
+ * LRU, it is inserted at the front and the buffer_head at the back, if
+ * any, is evicted. If it is already in the LRU, it is moved to the front.
*/
static void bh_lru_install(struct buffer_head *bh)
{
- struct buffer_head *evictee = NULL;
+ struct buffer_head *evictee = bh;
+ struct bh_lru *b;
+ int i;
check_irqs_on();
bh_lru_lock();
- if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
- struct buffer_head *bhs[BH_LRU_SIZE];
- int in;
- int out = 0;
- get_bh(bh);
- bhs[out++] = bh;
- for (in = 0; in < BH_LRU_SIZE; in++) {
- struct buffer_head *bh2 =
- __this_cpu_read(bh_lrus.bhs[in]);
-
- if (bh2 == bh) {
- __brelse(bh2);
- } else {
- if (out >= BH_LRU_SIZE) {
- BUG_ON(evictee != NULL);
- evictee = bh2;
- } else {
- bhs[out++] = bh2;
- }
- }
+ b = this_cpu_ptr(&bh_lrus);
+ for (i = 0; i < BH_LRU_SIZE; i++) {
+ swap(evictee, b->bhs[i]);
+ if (evictee == bh) {
+ bh_lru_unlock();
+ return;
}
- while (out < BH_LRU_SIZE)
- bhs[out++] = NULL;
- memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
}
- bh_lru_unlock();
- if (evictee)
- __brelse(evictee);
+ get_bh(bh);
+ bh_lru_unlock();
+ brelse(evictee);
}
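The rewrite replaces the copy-out/copy-back loop with a single ripple pass:
the new entry is swapped toward the front, and whatever falls off the far end
is the evictee. A simplified sketch of the algorithm (reference counting
reduced to a comment):

	#define LRU_SIZE 16

	static void lru_install(void *slots[LRU_SIZE], void *new)
	{
		void *evictee = new;
		int i;

		for (i = 0; i < LRU_SIZE; i++) {
			void *tmp = slots[i];

			slots[i] = evictee;
			evictee = tmp;
			if (evictee == new)
				return;	/* already cached: now moved to the front */
		}
		/* new was not present: take a reference on it and release
		 * the evictee (get_bh()/brelse() in the buffer_head case) */
	}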
/*
@@ -1829,7 +1824,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -1883,7 +1879,8 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -3021,11 +3018,11 @@ EXPORT_SYMBOL(block_write_full_page);
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
{
- struct buffer_head tmp;
struct inode *inode = mapping->host;
- tmp.b_state = 0;
- tmp.b_blocknr = 0;
- tmp.b_size = i_blocksize(inode);
+ struct buffer_head tmp = {
+ .b_size = i_blocksize(inode),
+ };
+
get_block(inode, block, &tmp, 0);
return tmp.b_blocknr;
}
@@ -3038,7 +3035,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
- bh->b_end_io(bh, !bio->bi_error);
+ bh->b_end_io(bh, !bio->bi_status);
bio_put(bio);
}
@@ -3091,7 +3088,7 @@ void guard_bio_eod(int op, struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc)
+ enum rw_hint write_hint, struct writeback_control *wbc)
{
struct bio *bio;
@@ -3120,6 +3117,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
+ bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
@@ -3142,7 +3140,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, NULL);
+ return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3279,8 +3277,6 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = head;
do {
- if (buffer_write_io_error(bh) && page->mapping)
- mapping_set_error(page->mapping, -EIO);
if (buffer_busy(bh))
goto failed;
bh = bh->b_this_page;
@@ -3492,6 +3488,130 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ *
+ * Returns the offset within the file on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+{
+ loff_t offset = page_offset(page);
+ struct buffer_head *bh, *head;
+ bool seek_data = whence == SEEK_DATA;
+
+ if (lastoff < offset)
+ lastoff = offset;
+
+ bh = head = page_buffers(page);
+ do {
+ offset += bh->b_size;
+ if (lastoff >= offset)
+ continue;
+
+ /*
+ * Unwritten extents that have data in the page cache covering
+ * them can be identified by the BH_Unwritten state flag.
+ * Pages with multiple buffers might have a mix of holes, data
+		 * and unwritten extents - any buffer with valid data in it
+		 * should have the BH_Uptodate flag set on it.
+ */
+
+ if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+ return lastoff;
+
+ lastoff = offset;
+ } while ((bh = bh->b_this_page) != head);
+ return -ENOENT;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: unwritten and uptodate buffer heads count as data;
+ * everything else counts as a hole.
+ *
+ * Returns the resulting offset on success, and -ENOENT otherwise.
+ */
+loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+ int whence)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ loff_t lastoff = offset;
+ struct pagevec pvec;
+
+ if (length <= 0)
+ return -ENOENT;
+
+ pagevec_init(&pvec, 0);
+
+ do {
+ unsigned want, nr_pages, i;
+
+ want = min_t(unsigned, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ *
+ * If current page offset is beyond where we've ended,
+ * we've found a hole.
+ */
+ if (whence == SEEK_HOLE &&
+ lastoff < page_offset(page))
+ goto check_range;
+
+ /* Searching done if the page index is out of range. */
+ if (page->index >= end)
+ goto not_found;
+
+ lock_page(page);
+ if (likely(page->mapping == inode->i_mapping) &&
+ page_has_buffers(page)) {
+ lastoff = page_seek_hole_data(page, lastoff, whence);
+ if (lastoff >= 0) {
+ unlock_page(page);
+ goto check_range;
+ }
+ }
+ unlock_page(page);
+ lastoff = page_offset(page) + PAGE_SIZE;
+ }
+
+ /* Searching done if fewer pages returned than wanted. */
+ if (nr_pages < want)
+ break;
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index < end);
+
+	/* If no page was found at lastoff and we are not done, we found a hole. */
+ if (whence != SEEK_HOLE)
+ goto not_found;
+
+check_range:
+ if (lastoff < offset + length)
+ goto out;
+not_found:
+ lastoff = -ENOENT;
+out:
+ pagevec_release(&pvec);
+ return lastoff;
+}
+
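A hedged usage sketch of the new helper from a filesystem's ->llseek path; the
fs-specific names are illustrative, and the EOF handling shown is one
plausible policy, not mandated by the helper:

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	static loff_t myfs_llseek_hole_data(struct file *file, loff_t offset,
					    int whence)
	{
		struct inode *inode = file_inode(file);
		loff_t size = i_size_read(inode);
		loff_t found;

		if (offset >= size)
			return -ENXIO;

		found = page_cache_seek_hole_data(inode, offset,
						  size - offset, whence);
		if (found == -ENOENT)
			/* treat EOF as an implicit hole */
			found = (whence == SEEK_HOLE) ? size : -ENXIO;
		if (found < 0)
			return found;
		return vfs_setpos(file, found, inode->i_sb->s_maxbytes);
	}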
void __init buffer_init(void)
{
unsigned long nrpages;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9bf90bcc56ac..bb3a02ca9da4 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -18,7 +18,7 @@
#include <linux/fscache-cache.h>
#include <linux/timer.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
#include <linux/cred.h>
#include <linux/workqueue.h>
#include <linux/security.h>
@@ -97,7 +97,7 @@ struct cachefiles_cache {
* backing file read tracking
*/
struct cachefiles_one_read {
- wait_queue_t monitor; /* link into monitored waitqueue */
+ wait_queue_entry_t monitor; /* link into monitored waitqueue */
struct page *back_page; /* backing file page we're waiting for */
struct page *netfs_page; /* netfs page we're going to fill */
struct fscache_retrieval *op; /* retrieval op covering this */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 41df8a27d7eb..3978b324cbca 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -204,7 +204,7 @@ wait_for_old_object:
wait_queue_head_t *wq;
signed long timeout = 60 * HZ;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
bool requeue;
/* if the object we're waiting for is queued for processing,
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index afbdc418966d..18d7aa61ef0f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -21,7 +21,7 @@
* - we use this to detect read completion of backing pages
* - the caller holds the waitqueue lock
*/
-static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
int sync, void *_key)
{
struct cachefiles_one_read *monitor =
@@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
}
/* remove from the waitqueue */
- list_del(&wait->task_list);
+ list_del(&wait->entry);
/* move onto the action list and queue for FS-Cache thread pool */
ASSERT(monitor->op);
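The wait_queue_t -> wait_queue_entry_t rename (and task_list -> entry) is
mechanical; a minimal sketch of the custom-wakeup pattern cachefiles uses,
under the new names:

	#include <linux/wait.h>

	static int my_read_waiter(wait_queue_entry_t *wait, unsigned mode,
				  int sync, void *key)
	{
		/* called with the waitqueue lock held */
		list_del_init(&wait->entry);	/* 'entry' was 'task_list' */
		/* ... hand the waiter off to a worker thread ... */
		return 1;
	}

	static void arm_read_waiter(wait_queue_head_t *wq,
				    wait_queue_entry_t *wait)
	{
		init_waitqueue_func_entry(wait, my_read_waiter);
		add_wait_queue(wq, wait);
	}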
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 987044bca1c2..59cb307b15fb 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -131,6 +131,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
if (new_mode != old_mode) {
+ newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE;
ret = __ceph_setattr(inode, &newattrs);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1e71e6ca5ddf..50836280a6f8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -530,14 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
long writeback_stat;
u64 truncate_size;
u32 truncate_seq;
- int err = 0, len = PAGE_SIZE;
+ int err, len = PAGE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
- if (!page->mapping || !page->mapping->host) {
- dout("writepage %p - no mapping\n", page);
- return -EFAULT;
- }
inode = page->mapping->host;
ci = ceph_inode(inode);
fsc = ceph_inode_to_client(inode);
@@ -547,7 +543,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
snapc = page_snap_context(page);
if (snapc == NULL) {
dout("writepage %p page %p not dirty?\n", inode, page);
- goto out;
+ return 0;
}
oldest = get_oldest_context(inode, &snap_size,
&truncate_size, &truncate_seq);
@@ -555,9 +551,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p page %p snapc %p not writeable - noop\n",
inode, page, snapc);
/* we should only noop if called by kswapd */
- WARN_ON((current->flags & PF_MEMALLOC) == 0);
+ WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
- goto out;
+ redirty_page_for_writepage(wbc, page);
+ return 0;
}
ceph_put_snap_context(oldest);
@@ -567,8 +564,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= snap_size) {
dout("%p page eof %llu\n", page, snap_size);
- goto out;
+ return 0;
}
+
if (snap_size < page_off + len)
len = snap_size - page_off;
@@ -595,7 +593,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage interrupted page %p\n", page);
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
- goto out;
+ return err;
}
dout("writepage setting page/mapping error %d %p\n",
err, page);
@@ -611,7 +609,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
-out:
return err;
}
@@ -1318,7 +1315,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- int check_cap = 0;
+ bool check_cap = false;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 4e7421caf380..fd1172823f86 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -35,18 +35,34 @@ struct fscache_netfs ceph_cache_netfs = {
.version = 0,
};
+static DEFINE_MUTEX(ceph_fscache_lock);
+static LIST_HEAD(ceph_fscache_list);
+
+struct ceph_fscache_entry {
+ struct list_head list;
+ struct fscache_cookie *fscache;
+ struct ceph_fsid fsid;
+ size_t uniq_len;
+ char uniquifier[0];
+};
+
static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
void *buffer, uint16_t maxbuf)
{
const struct ceph_fs_client* fsc = cookie_netfs_data;
- uint16_t klen;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ uint16_t fsid_len, uniq_len;
- klen = sizeof(fsc->client->fsid);
- if (klen > maxbuf)
+ fsid_len = sizeof(fsc->client->fsid);
+ uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ if (fsid_len + uniq_len > maxbuf)
return 0;
- memcpy(buffer, &fsc->client->fsid, klen);
- return klen;
+ memcpy(buffer, &fsc->client->fsid, fsid_len);
+ if (uniq_len)
+ memcpy(buffer + fsid_len, fscache_uniq, uniq_len);
+
+ return fsid_len + uniq_len;
}
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
@@ -67,13 +83,54 @@ void ceph_fscache_unregister(void)
int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
+ const struct ceph_fsid *fsid = &fsc->client->fsid;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ struct ceph_fscache_entry *ent;
+ int err = 0;
+
+ mutex_lock(&ceph_fscache_lock);
+ list_for_each_entry(ent, &ceph_fscache_list, list) {
+ if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
+ continue;
+ if (ent->uniq_len != uniq_len)
+ continue;
+ if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
+ continue;
+
+ pr_err("fscache cookie already registered for fsid %pU\n", fsid);
+ pr_err(" use fsc=%%s mount option to specify a uniquifier\n");
+ err = -EBUSY;
+ goto out_unlock;
+ }
+
+ ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
+ if (!ent) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
fsc, true);
- if (!fsc->fscache)
- pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
- return 0;
+ if (fsc->fscache) {
+ memcpy(&ent->fsid, fsid, sizeof(*fsid));
+ if (uniq_len > 0) {
+ memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
+ ent->uniq_len = uniq_len;
+ }
+ ent->fscache = fsc->fscache;
+ list_add_tail(&ent->list, &ceph_fscache_list);
+ } else {
+ kfree(ent);
+ pr_err("unable to register fscache cookie for fsid %pU\n",
+ fsid);
+ /* all other fs ignore this error */
+ }
+out_unlock:
+ mutex_unlock(&ceph_fscache_lock);
+ return err;
}
static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
@@ -349,7 +406,24 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
- fscache_relinquish_cookie(fsc->fscache, 0);
+ if (fscache_cookie_valid(fsc->fscache)) {
+ struct ceph_fscache_entry *ent;
+ bool found = false;
+
+ mutex_lock(&ceph_fscache_lock);
+ list_for_each_entry(ent, &ceph_fscache_list, list) {
+ if (ent->fscache == fsc->fscache) {
+ list_del(&ent->list);
+ kfree(ent);
+ found = true;
+ break;
+ }
+ }
+ WARN_ON_ONCE(!found);
+ mutex_unlock(&ceph_fscache_lock);
+
+ __fscache_relinquish_cookie(fsc->fscache, 0);
+ }
fsc->fscache = NULL;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a3ebb632294e..7007ae2a5ad2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1653,6 +1653,21 @@ static int try_nonblocking_invalidate(struct inode *inode)
return -1;
}
+bool __ceph_should_report_size(struct ceph_inode_info *ci)
+{
+ loff_t size = ci->vfs_inode.i_size;
+ /* mds will adjust max size according to the reported size */
+ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
+ return false;
+ if (size >= ci->i_max_size)
+ return true;
+ /* half of previous max_size increment has been used */
+ if (ci->i_max_size > ci->i_reported_size &&
+ (size << 1) >= ci->i_max_size + ci->i_reported_size)
+ return true;
+ return false;
+}
+
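Concretely: with i_reported_size = 4 MiB and i_max_size = 8 MiB, the second
test fires once (size << 1) >= 12 MiB, i.e. once the file passes 6 MiB, which
is exactly half of the 4 MiB increment the MDS last granted.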
/*
* Swiss army knife function to examine currently used and wanted
* versus held caps. Release, flush, ack revoked caps to mds as
@@ -1806,8 +1821,7 @@ retry_locked:
}
/* approaching file_max? */
- if ((inode->i_size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size) {
+ if (__ceph_should_report_size(ci)) {
dout("i_size approaching max_size\n");
goto ack;
}
@@ -3027,8 +3041,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
le32_to_cpu(grant->truncate_seq),
le64_to_cpu(grant->truncate_size),
size);
- /* max size increase? */
- if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+ }
+
+ if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
+ if (max_size != ci->i_max_size) {
dout("max_size %lld -> %llu\n",
ci->i_max_size, max_size);
ci->i_max_size = max_size;
@@ -3037,6 +3053,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
ci->i_requested_max_size = 0;
}
wake = true;
+ } else if (ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
+ /* CEPH_CAP_OP_IMPORT */
+ wake = true;
}
}
@@ -3554,7 +3574,6 @@ retry:
}
/* make sure we re-request max_size, if necessary */
- ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
*old_issued = issued;
@@ -3790,6 +3809,7 @@ bad:
*/
void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
+ struct inode *inode;
struct ceph_inode_info *ci;
int flags = CHECK_CAPS_NODELAY;
@@ -3805,9 +3825,15 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
time_before(jiffies, ci->i_hold_caps_max))
break;
list_del_init(&ci->i_cap_delay_list);
+
+ inode = igrab(&ci->vfs_inode);
spin_unlock(&mdsc->cap_delay_lock);
- dout("check_delayed_caps on %p\n", &ci->vfs_inode);
- ceph_check_caps(ci, flags, NULL);
+
+ if (inode) {
+ dout("check_delayed_caps on %p\n", inode);
+ ceph_check_caps(ci, flags, NULL);
+ iput(inode);
+ }
}
spin_unlock(&mdsc->cap_delay_lock);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e071d23f6148..ef7240ace576 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -271,6 +271,11 @@ out:
if (ret < 0)
err = ret;
dput(last);
+		/* last_name no longer matches the cache index */
+ if (fi->readdir_cache_idx >= 0) {
+ fi->readdir_cache_idx = -1;
+ fi->dir_release_count = 0;
+ }
}
return err;
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e8f11fa565c5..7df550c13d7f 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -91,6 +91,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
ceph_mdsc_put_request(req);
if (!inode)
return ERR_PTR(-ESTALE);
+ if (inode->i_nlink == 0) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
}
return d_obtain_alias(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 29308a80d66f..3d48c415f3cb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1040,8 +1040,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
int num_pages;
int written = 0;
int flags;
- int check_caps = 0;
int ret;
+ bool check_caps = false;
struct timespec mtime = current_time(inode);
size_t count = iov_iter_count(from);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index dcce79b84406..220dfd87cbfa 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1016,6 +1016,7 @@ static void update_dentry_lease(struct dentry *dentry,
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
struct inode *dir;
+ struct ceph_mds_session *old_lease_session = NULL;
/*
* Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
@@ -1051,8 +1052,10 @@ static void update_dentry_lease(struct dentry *dentry,
time_before(ttl, di->time))
goto out_unlock; /* we already have a newer lease. */
- if (di->lease_session && di->lease_session != session)
- goto out_unlock;
+ if (di->lease_session && di->lease_session != session) {
+ old_lease_session = di->lease_session;
+ di->lease_session = NULL;
+ }
ceph_dentry_lru_touch(dentry);
@@ -1065,6 +1068,8 @@ static void update_dentry_lease(struct dentry *dentry,
di->time = ttl;
out_unlock:
spin_unlock(&dentry->d_lock);
+ if (old_lease_session)
+ ceph_put_mds_session(old_lease_session);
return;
}
@@ -1653,20 +1658,17 @@ out:
return err;
}
-int ceph_inode_set_size(struct inode *inode, loff_t size)
+bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int ret = 0;
+ bool ret;
spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size);
- /* tell the MDS if we are approaching max_size */
- if ((size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size)
- ret = 1;
+ ret = __ceph_should_report_size(ci);
spin_unlock(&ci->i_ceph_lock);
return ret;
@@ -2022,7 +2024,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
- inode->i_ctime = attr->ia_ctime;
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
@@ -2044,7 +2045,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
only ? "ctime only" : "ignored");
- inode->i_ctime = attr->ia_ctime;
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -2067,7 +2067,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = attr->ia_ctime;
}
release &= issued;
@@ -2085,6 +2085,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
+ req->r_stamp = attr->ia_ctime;
err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6806dbeaee19..64ae74472046 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -127,6 +127,29 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
req->r_tid);
+ mutex_lock(&mdsc->mutex);
+ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ err = 0;
+ } else {
+ /*
+ * ensure we aren't running concurrently with
+ * ceph_fill_trace or ceph_readdir_prepopulate, which
+ * rely on locks (dir mutex) held by our caller.
+ */
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = err;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+
+ if (!req->r_session) {
+ // haven't sent the request
+ err = 0;
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (!err)
+ return 0;
+
intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
USE_AUTH_MDS);
if (IS_ERR(intr_req))
@@ -146,7 +169,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
if (err && err != -ERESTARTSYS)
return err;
- wait_for_completion(&req->r_completion);
+ wait_for_completion_killable(&req->r_safe_completion);
return 0;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f38e56fa9712..666a9f274832 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1687,7 +1687,6 @@ struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
- struct timespec ts;
if (!req)
return ERR_PTR(-ENOMEM);
@@ -1706,8 +1705,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- ktime_get_real_ts(&ts);
- req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
+ req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran);
req->r_op = op;
req->r_direct_mode = mode;
@@ -3771,13 +3769,13 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc = fsc->mdsc;
-
dout("mdsc_destroy %p\n", mdsc);
- ceph_mdsc_stop(mdsc);
/* flush out any connection work with references to us */
ceph_msgr_flush();
+ ceph_mdsc_stop(mdsc);
+
fsc->mdsc = NULL;
kfree(mdsc);
dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8d7918ce694a..aa06a8c24792 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -121,6 +121,7 @@ enum {
/* int args above */
Opt_snapdirname,
Opt_mds_namespace,
+ Opt_fscache_uniq,
Opt_last_string,
/* string args above */
Opt_dirstat,
@@ -158,6 +159,7 @@ static match_table_t fsopt_tokens = {
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
{Opt_mds_namespace, "mds_namespace=%s"},
+ {Opt_fscache_uniq, "fsc=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
@@ -223,6 +225,14 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->mds_namespace)
return -ENOMEM;
break;
+ case Opt_fscache_uniq:
+ fsopt->fscache_uniq = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->fscache_uniq)
+ return -ENOMEM;
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ break;
/* misc */
case Opt_wsize:
fsopt->wsize = intval;
@@ -317,6 +327,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
kfree(args->snapdir_name);
kfree(args->mds_namespace);
kfree(args->server_path);
+ kfree(args->fscache_uniq);
kfree(args);
}
@@ -350,10 +361,12 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
if (ret)
return ret;
-
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
+ if (ret)
+ return ret;
return ceph_compare_options(new_opt, fsc->client);
}
@@ -475,8 +488,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noasyncreaddir");
if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
seq_puts(m, ",nodcache");
- if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
- seq_puts(m, ",fsc");
+ if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
+ if (fsopt->fscache_uniq)
+ seq_printf(m, ",fsc=%s", fsopt->fscache_uniq);
+ else
+ seq_puts(m, ",fsc");
+ }
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
seq_puts(m, ",nopoolperm");
@@ -597,18 +614,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
- /* setup fscache */
- if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
- (ceph_fscache_register_fs(fsc) != 0))
- goto fail_fscache;
-
/* caps */
fsc->min_caps = fsopt->max_readdir;
return fsc;
-fail_fscache:
- ceph_fscache_unregister_fs(fsc);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
@@ -626,8 +636,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
- ceph_fscache_unregister_fs(fsc);
-
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
@@ -636,8 +644,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_mount_options(fsc->mount_options);
- ceph_fs_debugfs_cleanup(fsc);
-
ceph_destroy_client(fsc->client);
kfree(fsc);
@@ -822,6 +828,13 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
if (err < 0)
goto out;
+ /* setup fscache */
+ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
+ err = ceph_fscache_register_fs(fsc);
+ if (err < 0)
+ goto out;
+ }
+
if (!fsc->mount_options->server_path) {
path = "";
dout("mount opening path \\t\n");
@@ -1040,6 +1053,12 @@ static void ceph_kill_sb(struct super_block *s)
ceph_mdsc_pre_umount(fsc->mdsc);
generic_shutdown_super(s);
+
+ fsc->client->extra_mon_dispatch = NULL;
+ ceph_fs_debugfs_cleanup(fsc);
+
+ ceph_fscache_unregister_fs(fsc);
+
ceph_mdsc_destroy(fsc);
destroy_fs_client(fsc);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a973acd8beaf..f02a2225fe42 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -73,6 +73,7 @@ struct ceph_mount_options {
char *snapdir_name; /* default ".snap" */
char *mds_namespace; /* default NULL */
char *server_path; /* default "/" */
+ char *fscache_uniq; /* default NULL */
};
struct ceph_fs_client {
@@ -793,7 +794,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
extern void ceph_queue_vmtruncate(struct inode *inode);
@@ -918,6 +919,7 @@ extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession);
+extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 75267cdd5dfd..11263f102e4c 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -756,6 +756,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
+ err = ceph_do_getattr(inode, 0, true);
+ if (err)
+ return err;
err = -ENODATA;
if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
err = vxattr->getxattr_cb(ci, value, size);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 034f00f21390..f7243617316c 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -1,5 +1,5 @@
config CIFS
- tristate "CIFS support (advanced network filesystem, SMBFS successor)"
+ tristate "SMB3 and CIFS support (advanced network filesystem)"
depends on INET
select NLS
select CRYPTO
@@ -10,28 +10,35 @@ config CIFS
select CRYPTO_ECB
select CRYPTO_DES
help
- This is the client VFS module for the Common Internet File System
- (CIFS) protocol which is the successor to the Server Message Block
- (SMB) protocol, the native file sharing mechanism for most early
- PC operating systems. The CIFS protocol is fully supported by
- file servers such as Windows 2000 (including Windows 2003, Windows 2008,
- NT 4 and Windows XP) as well by Samba (which provides excellent CIFS
+ This is the client VFS module for the SMB3 family of NAS protocols,
+ as well as for earlier dialects such as SMB2.1, SMB2 and the
+ Common Internet File System (CIFS) protocol. CIFS was the successor
+ to the original dialect, the Server Message Block (SMB) protocol, the
+ native file sharing mechanism for most early PC operating systems.
+
+ The SMB3 protocol is supported by most modern operating systems and
+ NAS appliances (e.g. Samba, Windows 8, Windows 2012, MacOS).
+ The older CIFS protocol was included in Windows NT4, 2000 and XP (and
+	  later) as well as by Samba (which provides excellent CIFS and SMB3
server support for Linux and many other operating systems). Limited
- support for OS/2 and Windows ME and similar servers is provided as
- well.
+ support for OS/2 and Windows ME and similar very old servers is
+ provided as well.
- The module also provides optional support for the followon
- protocols for CIFS including SMB3, which enables
- useful performance and security features (see the description
- of CONFIG_CIFS_SMB2).
-
- The cifs module provides an advanced network file system
- client for mounting to CIFS compliant servers. It includes
+ The cifs module provides an advanced network file system client
+ for mounting to SMB3 (and CIFS) compliant servers. It includes
support for DFS (hierarchical name space), secure per-user
session establishment via Kerberos or NTLM or NTLMv2,
safe distributed caching (oplock), optional packet
signing, Unicode and other internationalization improvements.
- If you need to mount to Samba or Windows from this machine, say Y.
+
+ In general, the default dialects, SMB3 and later, enable better
+	  performance, security and features than would be possible with CIFS.
+ Note that when mounting to Samba, due to the CIFS POSIX extensions,
+ CIFS mounts can provide slightly better POSIX compatibility
+ than SMB3 mounts. SMB2/SMB3 mount options are also
+ slightly simpler (compared to CIFS) due to protocol improvements.
+
+ If you need to mount to Samba, Macs or Windows from this machine, say Y.
config CIFS_STATS
bool "CIFS statistics"
@@ -89,7 +96,7 @@ config CIFS_UPCALL
Enables an upcall mechanism for CIFS which accesses userspace helper
utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
which are needed to mount to certain secure servers (for which more
- secure Kerberos authentication is required). If unsure, say N.
+ secure Kerberos authentication is required). If unsure, say Y.
config CIFS_XATTR
bool "CIFS extended attributes"
@@ -105,7 +112,7 @@ config CIFS_XATTR
(used by some filesystems to store ACLs) is not supported at
this time.
- If unsure, say N.
+ If unsure, say Y.
config CIFS_POSIX
bool "CIFS POSIX Extensions"
@@ -125,7 +132,7 @@ config CIFS_ACL
help
Allows fetching CIFS/NTFS ACL from the server. The DACL blob
is handed over to the application/caller. See the man
- page for getcifsacl for more information.
+ page for getcifsacl for more information. If unsure, say Y.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
@@ -146,6 +153,16 @@ config CIFS_DEBUG2
option can be turned off unless you are debugging
cifs problems. If unsure, say N.
+config CIFS_DEBUG_DUMP_KEYS
+ bool "Dump encryption keys for offline decryption (Unsafe)"
+ depends on CIFS_DEBUG
+ help
+	   Enabling this will dump the encryption and decryption keys
+	   used to communicate on an encrypted share connection to the
+	   console. This allows Wireshark to decrypt and dissect
+	   encrypted network captures. Enable this carefully.
+ If unsure, say N.
+
config CIFS_DFS_UPCALL
bool "DFS feature support"
depends on CIFS && KEYS
@@ -157,7 +174,7 @@ config CIFS_DFS_UPCALL
an upcall mechanism for CIFS which contacts userspace helper
utilities to provide server name resolution (host names to
IP addresses) which is needed for implicit mounts of DFS junction
- points. If unsure, say N.
+ points. If unsure, say Y.
config CIFS_NFSD_EXPORT
bool "Allow nfsd to export CIFS file system"
@@ -165,38 +182,9 @@ config CIFS_NFSD_EXPORT
help
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
-config CIFS_SMB2
- bool "SMB2 and SMB3 network file system support"
- depends on CIFS
- select KEYS
- select FSCACHE
- select DNS_RESOLVER
- select CRYPTO_AES
- select CRYPTO_SHA256
- select CRYPTO_CMAC
- select CRYPTO_AEAD2
- select CRYPTO_CCM
-
- help
- This enables support for the Server Message Block version 2
- family of protocols, including SMB3. SMB3 support is
- enabled on mount by specifying "vers=3.0" in the mount
- options. These protocols are the successors to the popular
- CIFS and SMB network file sharing protocols. SMB3 is the
- native file sharing mechanism for the more recent
- versions of Windows (Windows 8 and Windows 2012 and
- later) and Samba server and many others support SMB3 well.
- In general SMB3 enables better performance, security
- and features, than would be possible with CIFS (Note that
- when mounting to Samba, due to the CIFS POSIX extensions,
- CIFS mounts can provide slightly better POSIX compatibility
- than SMB3 mounts do though). Note that SMB2/SMB3 mount
- options are also slightly simpler (compared to CIFS) due
- to protocol improvements.
-
config CIFS_SMB311
bool "SMB3.1.1 network file system support (Experimental)"
- depends on CIFS_SMB2
+ depends on CIFS
help
This enables experimental support for the newest, SMB3.1.1, dialect.
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index eed7eb09f46f..5e853a395b92 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
cifs_unicode.o nterr.o cifsencrypt.o \
- readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o
+ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
+ smb2ops.o smb2maperror.o smb2transport.o \
+ smb2misc.o smb2pdu.o smb2inode.o smb2file.o
cifs-$(CONFIG_CIFS_XATTR) += xattr.o
cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
@@ -16,6 +18,3 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
-
-cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \
- smb2misc.o smb2pdu.o smb2inode.o smb2file.o
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index a0b3e7d1be48..b380e0871372 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -79,6 +79,10 @@ convert_sfu_char(const __u16 src_char, char *target)
static bool
convert_sfm_char(const __u16 src_char, char *target)
{
+ if (src_char >= 0xF001 && src_char <= 0xF01F) {
+ *target = src_char - 0xF000;
+ return true;
+ }
switch (src_char) {
case SFM_COLON:
*target = ':';
@@ -417,6 +421,10 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
{
__le16 dest_char;
+ if (src_char >= 0x01 && src_char <= 0x1F) {
+ dest_char = cpu_to_le16(src_char + 0xF000);
+ return dest_char;
+ }
switch (src_char) {
case ':':
dest_char = cpu_to_le16(SFM_COLON);
@@ -580,7 +588,6 @@ ctoUTF16_out:
return j;
}
-#ifdef CONFIG_CIFS_SMB2
/*
* cifs_local_to_utf16_bytes - how long will a string be after conversion?
* @from - pointer to input string
@@ -639,4 +646,3 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
*utf16_len = len;
return dst;
}
-#endif /* CONFIG_CIFS_SMB2 */
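
The two cifs_unicode.c hunks above replace long per-character switch statements with a single range test: the SFM ("Services for Mac") convention remaps the ASCII control range 0x01-0x1F into the Unicode private-use range 0xF001-0xF01F so that names containing control characters survive servers that reject them. A minimal userspace sketch of the round trip; the constants come from the hunks, the helper names are ours:

    #include <assert.h>
    #include <stdint.h>

    /* Forward map, as convert_to_sfm_char() now does first. */
    static uint16_t ascii_to_sfm(char c)
    {
        if (c >= 0x01 && c <= 0x1F)
            return (uint16_t)(c + 0xF000);
        return (uint16_t)c;   /* ':', '*', '?' etc. keep their switch cases */
    }

    /* Reverse map, as convert_sfm_char() now does first. */
    static char sfm_to_ascii(uint16_t wc)
    {
        if (wc >= 0xF001 && wc <= 0xF01F)
            return (char)(wc - 0xF000);
        return (char)wc;
    }

    int main(void)
    {
        for (char c = 0x01; c <= 0x1F; c++)
            assert(sfm_to_ascii(ascii_to_sfm(c)) == c);
        return 0;
    }
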
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 8a79a34e66b8..8360b74530a9 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -116,11 +116,9 @@ char *cifs_strndup_from_utf16(const char *src, const int maxlen,
extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
const struct nls_table *cp, int mapChars);
extern int cifs_remap(struct cifs_sb_info *cifs_sb);
-#ifdef CONFIG_CIFS_SMB2
extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
int *utf16_len, const struct nls_table *cp,
int remap);
-#endif /* CONFIG_CIFS_SMB2 */
#endif
wchar_t cifs_toupper(wchar_t in);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9a1667e0e8d6..180b3356ff86 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -51,9 +51,7 @@
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
-#ifdef CONFIG_CIFS_SMB2
#include "smb2pdu.h"
-#endif
int cifsFYI = 0;
bool traceSMB;
@@ -277,9 +275,8 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
-#ifdef CONFIG_CIFS_SMB2
generate_random_uuid(cifs_inode->lease_key);
-#endif
+
/*
* Can not set i_flags here - they get immediately overwritten to zero
* by the VFS.
@@ -1213,14 +1210,12 @@ cifs_destroy_inodecache(void)
static int
cifs_init_request_bufs(void)
{
- size_t max_hdr_size = MAX_CIFS_HDR_SIZE;
-#ifdef CONFIG_CIFS_SMB2
/*
	 * SMB2 maximum header size is bigger than the CIFS one - no problem
	 * allocating some more bytes for CIFS.
*/
- max_hdr_size = MAX_SMB2_HDR_SIZE;
-#endif
+ size_t max_hdr_size = MAX_SMB2_HDR_SIZE;
+
if (CIFSMaxBufSize < 8192) {
/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
Unicode path name has to fit in any SMB/CIFS path based frames */
@@ -1359,7 +1354,7 @@ init_cifs(void)
spin_lock_init(&cifs_tcp_ses_lock);
spin_lock_init(&GlobalMid_Lock);
- get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret));
+ cifs_lock_secret = get_random_u32();
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
@@ -1476,12 +1471,10 @@ MODULE_SOFTDEP("pre: hmac");
MODULE_SOFTDEP("pre: md4");
MODULE_SOFTDEP("pre: md5");
MODULE_SOFTDEP("pre: nls");
-#ifdef CONFIG_CIFS_SMB2
MODULE_SOFTDEP("pre: aes");
MODULE_SOFTDEP("pre: cmac");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
-#endif /* CONFIG_CIFS_SMB2 */
module_init(init_cifs)
module_exit(exit_cifs)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index bcc7d9acad64..221693fe49ec 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -29,9 +29,7 @@
#include <crypto/internal/hash.h>
#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
-#ifdef CONFIG_CIFS_SMB2
#include "smb2pdu.h"
-#endif
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
@@ -367,6 +365,8 @@ struct smb_version_operations {
unsigned int (*calc_smb_size)(void *);
/* check for STATUS_PENDING and process it in a positive case */
bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+ /* check for STATUS_NETWORK_SESSION_EXPIRED */
+ bool (*is_session_expired)(char *);
/* send oplock break response */
int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *,
struct cifsInodeInfo *);
@@ -610,12 +610,10 @@ struct TCP_Server_Info {
__u16 sec_mode;
bool sign; /* is signing enabled on this connection? */
bool session_estab; /* mark when very first sess is established */
-#ifdef CONFIG_CIFS_SMB2
int echo_credits; /* echo reserved slots */
int oplock_credits; /* oplock break reserved slots */
bool echoes:1; /* enable echoes */
__u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */
-#endif
u16 dialect; /* dialect index that server chose */
bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
@@ -659,13 +657,11 @@ struct TCP_Server_Info {
atomic_t in_send; /* requests trying to send */
atomic_t num_waiters; /* blocked waiting to get in sendrecv */
#endif
-#ifdef CONFIG_CIFS_SMB2
unsigned int max_read;
unsigned int max_write;
__u8 preauth_hash[512];
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
-#endif /* CONFIG_CIFS_SMB2 */
unsigned long echo_interval;
};
@@ -847,13 +843,11 @@ struct cifs_ses {
bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
bool domainAuto:1;
-#ifdef CONFIG_CIFS_SMB2
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 preauth_hash[512];
-#endif /* CONFIG_CIFS_SMB2 */
};
static inline bool
@@ -905,12 +899,10 @@ struct cifs_tcon {
atomic_t num_acl_get;
atomic_t num_acl_set;
} cifs_stats;
-#ifdef CONFIG_CIFS_SMB2
struct {
atomic_t smb2_com_sent[NUMBER_OF_SMB2_COMMANDS];
atomic_t smb2_com_failed[NUMBER_OF_SMB2_COMMANDS];
} smb2_stats;
-#endif /* CONFIG_CIFS_SMB2 */
} stats;
#ifdef CONFIG_CIFS_STATS2
unsigned long long time_writes;
@@ -946,7 +938,6 @@ struct cifs_tcon {
bool need_reopen_files:1; /* need to reopen tcon file handles */
bool use_resilient:1; /* use resilient instead of durable handles */
bool use_persistent:1; /* use persistent instead of durable handles */
-#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
__le32 capabilities;
__u32 share_flags;
@@ -959,7 +950,6 @@ struct cifs_tcon {
__u32 max_chunks;
__u32 max_bytes_chunk;
__u32 max_bytes_copy;
-#endif /* CONFIG_CIFS_SMB2 */
#ifdef CONFIG_CIFS_FSCACHE
u64 resource_id; /* server resource id */
struct fscache_cookie *fscache; /* cookie for share */
@@ -1062,12 +1052,10 @@ struct cifs_open_parms {
struct cifs_fid {
__u16 netfid;
-#ifdef CONFIG_CIFS_SMB2
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
__u8 create_guid[16];
-#endif
struct cifs_pending_open *pending_open;
unsigned int epoch;
bool purge_cache;
@@ -1105,10 +1093,8 @@ struct cifsFileInfo {
struct cifs_io_parms {
__u16 netfid;
-#ifdef CONFIG_CIFS_SMB2
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
-#endif
__u32 pid;
__u64 offset;
unsigned int length;
@@ -1234,9 +1220,7 @@ struct cifsInodeInfo {
u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
-#ifdef CONFIG_CIFS_SMB2
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
-#endif
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache;
#endif
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index fbb0d4cbda41..72a53bd19865 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1460,6 +1460,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
+ if (server->ops->is_session_expired &&
+ server->ops->is_session_expired(buf)) {
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return -1;
+ }
+
if (server->ops->is_status_pending &&
server->ops->is_status_pending(buf, server, 0)) {
cifs_discard_remaining_data(server);
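
Both receive paths (this one and cifs_handle_standard() below) gain the same expiry check, dispatched through the per-dialect ops table; SMB1 simply leaves the hook NULL. A sketch of that optional-hook pattern, with the kernel types reduced to stubs:

    #include <stdio.h>

    struct server_ops {
        int (*is_session_expired)(const char *buf);   /* NULL for SMB1 */
    };

    /* Stand-in: the real smb2_is_session_expired() compares the sync
     * header's Status field with STATUS_NETWORK_SESSION_EXPIRED. */
    static int fake_is_session_expired(const char *buf)
    {
        return buf[0] == 'X';
    }

    static int handle_response(const struct server_ops *ops, const char *buf)
    {
        if (ops->is_session_expired && ops->is_session_expired(buf)) {
            /* the real code calls cifs_reconnect() and wakes waiters */
            return -1;
        }
        return 0;   /* fall through to the is_status_pending check */
    }

    int main(void)
    {
        struct server_ops smb1 = { NULL };
        struct server_ops smb2 = { fake_is_session_expired };
        printf("%d %d\n", handle_response(&smb1, "X"),
                          handle_response(&smb2, "X"));
        return 0;
    }
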
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9365c0cf77ad..59647eb72c5f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -55,9 +55,7 @@
#include "nterr.h"
#include "rfc1002pdu.h"
#include "fscache.h"
-#ifdef CONFIG_CIFS_SMB2
#include "smb2proto.h"
-#endif
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -341,9 +339,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->tcpStatus = CifsNeedReconnect;
spin_unlock(&GlobalMid_Lock);
server->maxBuf = 0;
-#ifdef CONFIG_CIFS_SMB2
server->max_read = 0;
-#endif
cifs_dbg(FYI, "Reconnecting tcp session\n");
@@ -812,6 +808,13 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
+ if (server->ops->is_session_expired &&
+ server->ops->is_session_expired(buf)) {
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return -1;
+ }
+
if (server->ops->is_status_pending &&
server->ops->is_status_pending(buf, server, length))
return -1;
@@ -1122,7 +1125,6 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
break;
-#ifdef CONFIG_CIFS_SMB2
case Smb_20:
vol->ops = &smb20_operations;
vol->vals = &smb20_values;
@@ -1145,7 +1147,6 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
vol->vals = &smb311_values;
break;
#endif /* SMB311 */
-#endif
default:
cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
return 1;
@@ -1271,9 +1272,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->actimeo = CIFS_DEF_ACTIMEO;
- /* FIXME: add autonegotiation -- for now, SMB1 is default */
- vol->ops = &smb1_operations;
- vol->vals = &smb1_values;
+	/* FIXME: autonegotiate among SMB3-or-later dialects rather than defaulting to SMB3.0 only */
+	vol->ops = &smb30_operations; /* both secure and widely accepted */
+ vol->vals = &smb30_values;
vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT;
@@ -2170,7 +2171,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
cancel_delayed_work_sync(&server->echo);
-#ifdef CONFIG_CIFS_SMB2
if (from_reconnect)
/*
* Avoid deadlock here: reconnect work calls
@@ -2181,7 +2181,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
cancel_delayed_work(&server->reconnect);
else
cancel_delayed_work_sync(&server->reconnect);
-#endif
spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsExiting;
@@ -2247,17 +2246,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
-#ifdef CONFIG_CIFS_SMB2
INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server);
mutex_init(&tcp_ses->reconnect_mutex);
-#endif
memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
sizeof(tcp_ses->srcaddr));
memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
sizeof(tcp_ses->dstaddr));
-#ifdef CONFIG_CIFS_SMB2
generate_random_uuid(tcp_ses->client_guid);
-#endif
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -2655,10 +2650,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info)
return 0;
if (tcon->seal != volume_info->seal)
return 0;
-#ifdef CONFIG_CIFS_SMB2
if (tcon->snapshot_time != volume_info->snapshot_time)
return 0;
-#endif /* CONFIG_CIFS_SMB2 */
return 1;
}
@@ -2733,7 +2726,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
}
if (volume_info->snapshot_time) {
-#ifdef CONFIG_CIFS_SMB2
if (ses->server->vals->protocol_id == 0) {
cifs_dbg(VFS,
"Use SMB2 or later for snapshot mount option\n");
@@ -2741,11 +2733,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
goto out_fail;
} else
tcon->snapshot_time = volume_info->snapshot_time;
-#else
- cifs_dbg(VFS, "Snapshot mount option requires SMB2 support\n");
- rc = -EOPNOTSUPP;
- goto out_fail;
-#endif /* CONFIG_CIFS_SMB2 */
}
tcon->ses = ses;
@@ -2781,7 +2768,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
"SMB3 or later required for persistent handles\n");
rc = -EOPNOTSUPP;
goto out_fail;
-#ifdef CONFIG_CIFS_SMB2
} else if (ses->server->capabilities &
SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
tcon->use_persistent = true;
@@ -2790,15 +2776,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
"Persistent handles not supported on share\n");
rc = -EOPNOTSUPP;
goto out_fail;
-#endif /* CONFIG_CIFS_SMB2 */
}
-#ifdef CONFIG_CIFS_SMB2
} else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
&& (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
&& (volume_info->nopersistent == false)) {
cifs_dbg(FYI, "enabling persistent handles\n");
tcon->use_persistent = true;
-#endif /* CONFIG_CIFS_SMB2 */
} else if (volume_info->resilient) {
if (ses->server->vals->protocol_id == 0) {
cifs_dbg(VFS,
@@ -2815,7 +2798,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
"SMB3 or later required for encryption\n");
rc = -EOPNOTSUPP;
goto out_fail;
-#ifdef CONFIG_CIFS_SMB2
} else if (tcon->ses->server->capabilities &
SMB2_GLOBAL_CAP_ENCRYPTION)
tcon->seal = true;
@@ -2823,7 +2805,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
cifs_dbg(VFS, "Encryption is not supported on share\n");
rc = -EOPNOTSUPP;
goto out_fail;
-#endif /* CONFIG_CIFS_SMB2 */
}
}
@@ -3738,14 +3719,12 @@ try_mount_again:
goto mount_fail_check;
}
-#ifdef CONFIG_CIFS_SMB2
if ((volume_info->persistent == true) && ((ses->server->capabilities &
SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
cifs_dbg(VFS, "persistent handles not supported by server\n");
rc = -EOPNOTSUPP;
goto mount_fail_check;
}
-#endif /* CONFIG_CIFS_SMB2*/
/* search for existing tcon to this server share */
tcon = cifs_get_tcon(ses, volume_info);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0fd081bd2a2f..bc09df6b473a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2234,14 +2234,16 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
set_page_writeback(page);
retry_write:
rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
- if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
- goto retry_write;
- else if (rc == -EAGAIN)
+ if (rc == -EAGAIN) {
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ goto retry_write;
redirty_page_for_writepage(wbc, page);
- else if (rc != 0)
+ } else if (rc != 0) {
SetPageError(page);
- else
+ mapping_set_error(page->mapping, rc);
+ } else {
SetPageUptodate(page);
+ }
end_page_writeback(page);
put_page(page);
free_xid(xid);
@@ -2810,12 +2812,12 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
ssize_t rc;
+ inode_lock(inode);
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
*/
down_read(&cinode->lock_sem);
- inode_lock(inode);
rc = generic_write_checks(iocb, from);
if (rc <= 0)
@@ -2828,11 +2830,11 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
else
rc = -EACCES;
out:
+ up_read(&cinode->lock_sem);
inode_unlock(inode);
if (rc > 0)
rc = generic_write_sync(iocb, rc);
- up_read(&cinode->lock_sem);
return rc;
}
@@ -3271,7 +3273,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (to->type & ITER_IOVEC)
+ if (to->type == ITER_IOVEC)
ctx->should_dirty = true;
rc = setup_aio_ctx_iter(ctx, to, READ);
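
The cifs_writev() hunks above are a lock-ordering fix: the inode lock is now always taken before lock_sem and released after it, and generic_write_sync() runs with both dropped. A pthread sketch of the discipline, with the two kernel locks modeled as userspace ones:

    #include <pthread.h>

    static pthread_mutex_t inode_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_rwlock_t lock_sem = PTHREAD_RWLOCK_INITIALIZER;

    /* Every path takes the inode lock first, lock_sem second, and unlocks
     * in reverse; two paths taking them in opposite orders is the classic
     * ABBA deadlock the reordering removes. */
    static void writev_path(void)
    {
        pthread_mutex_lock(&inode_mutex);     /* inode_lock(inode) */
        pthread_rwlock_rdlock(&lock_sem);     /* down_read(&cinode->lock_sem) */
        /* ... generic_write_checks() and the write itself ... */
        pthread_rwlock_unlock(&lock_sem);     /* up_read() */
        pthread_mutex_unlock(&inode_mutex);   /* inode_unlock() */
        /* generic_write_sync() then runs outside both locks */
    }

    int main(void)
    {
        writev_path();
        return 0;
    }
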
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4d1fcd76d022..a8693632235f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/wait_bit.h>
#include <asm/div64.h>
#include "cifsfs.h"
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 76fb0917dc8c..54f32f9143a9 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -101,7 +101,6 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes);
fsinf->max_path_component =
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength);
-#ifdef CONFIG_CIFS_SMB2
fsinf->vol_serial_number = tcon->vol_serial_number;
fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time);
fsinf->share_flags = tcon->share_flags;
@@ -110,7 +109,6 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
fsinf->optimal_sector_size = tcon->perf_sector_size;
fsinf->max_bytes_chunk = tcon->max_bytes_chunk;
fsinf->maximal_access = tcon->maximal_access;
-#endif /* SMB2 */
fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info)))
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index c4d996f78e1c..60b5a11ee11b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,9 +29,7 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
-#ifdef CONFIG_CIFS_SMB2
#include "smb2proto.h"
-#endif
/*
* M-F Symlink Functions - Begin
@@ -402,7 +400,6 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
/*
* SMB 2.1/SMB3 Protocol specific functions
*/
-#ifdef CONFIG_CIFS_SMB2
int
smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const unsigned char *path,
@@ -525,7 +522,6 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
kfree(utf16_path);
return rc;
}
-#endif /* CONFIG_CIFS_SMB2 */
/*
* M-F Symlink Functions - End
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index b08531977daa..eea93ac15ef0 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -30,9 +30,7 @@
#include "smberr.h"
#include "nterr.h"
#include "cifs_unicode.h"
-#ifdef CONFIG_CIFS_SMB2
#include "smb2pdu.h"
-#endif
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -149,15 +147,12 @@ struct smb_hdr *
cifs_buf_get(void)
{
struct smb_hdr *ret_buf = NULL;
- size_t buf_size = sizeof(struct smb_hdr);
-
-#ifdef CONFIG_CIFS_SMB2
/*
	 * SMB2 header is bigger than the CIFS one - no problem cleaning some
	 * more bytes for CIFS.
*/
- buf_size = sizeof(struct smb2_hdr);
-#endif
+ size_t buf_size = sizeof(struct smb2_hdr);
+
/*
* We could use negotiated size instead of max_msgsize -
* but it may be more efficient to always alloc same size
@@ -620,9 +615,7 @@ void
cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink,
struct cifs_pending_open *open)
{
-#ifdef CONFIG_CIFS_SMB2
memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
-#endif
open->oplock = CIFS_OPLOCK_NO_CHANGE;
open->tlink = tlink;
fid->pending_open = open;
@@ -810,7 +803,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
if (!pages) {
pages = vmalloc(max_pages * sizeof(struct page *));
- if (!bv) {
+ if (!pages) {
kvfree(bv);
return -ENOMEM;
}
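
The one-liner in setup_aio_ctx_iter() fixes a copy-paste bug: after the vmalloc() fallback, the NULL check tested bv (an earlier allocation) instead of the pointer just assigned. A userspace model of the corrected try-small-then-fall-back pattern:

    #include <stdlib.h>

    static void *alloc_page_array(size_t bytes)
    {
        void *pages = malloc(bytes);      /* kmalloc(..., GFP_KERNEL) */
        if (!pages)
            pages = calloc(1, bytes);     /* vmalloc() fallback */
        if (!pages)                       /* was "if (!bv)": wrong pointer */
            return NULL;
        return pages;
    }

    int main(void)
    {
        void *p = alloc_page_array(4096);
        free(p);
        return 0;
    }
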
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 27bc360c7ffd..a723df3e0197 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *fid, __u16 search_flags,
struct cifs_search_info *srch_inf)
{
- return CIFSFindFirst(xid, tcon, path, cifs_sb,
- &fid->netfid, search_flags, srch_inf, true);
+ int rc;
+
+ rc = CIFSFindFirst(xid, tcon, path, cifs_sb,
+ &fid->netfid, search_flags, srch_inf, true);
+ if (rc)
+ cifs_dbg(FYI, "find first failed=%d\n", rc);
+ return rc;
}
static int
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 3030a9dfb0dd..7ca9808a0daa 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -2475,8 +2475,8 @@ map_smb2_to_linux_error(char *buf, bool log_err)
/* on error mapping not found - return EIO */
- cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n",
- smb2err, rc);
+ cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
+ __le32_to_cpu(smb2err), rc);
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index c58691834eb2..cfacf2c97e94 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
kfree(utf16_path);
if (rc) {
- cifs_dbg(VFS, "open dir failed\n");
+ cifs_dbg(FYI, "open dir failed rc=%d\n", rc);
return rc;
}
@@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
fid->volatile_fid, 0, srch_inf);
if (rc) {
- cifs_dbg(VFS, "query directory failed\n");
+ cifs_dbg(FYI, "query directory failed rc=%d\n", rc);
SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
}
return rc;
@@ -1036,6 +1036,18 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
return true;
}
+static bool
+smb2_is_session_expired(char *buf)
+{
+ struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+
+ if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED)
+ return false;
+
+ cifs_dbg(FYI, "Session expired\n");
+ return true;
+}
+
static int
smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
struct cifsInodeInfo *cinode)
@@ -1288,6 +1300,165 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+#ifdef CONFIG_CIFS_ACL
+static struct cifs_ntsd *
+get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
+ const struct cifs_fid *cifsfid, u32 *pacllen)
+{
+ struct cifs_ntsd *pntsd = NULL;
+ unsigned int xid;
+ int rc = -EOPNOTSUPP;
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+
+ xid = get_xid();
+ cifs_dbg(FYI, "trying to get acl\n");
+
+ rc = SMB2_query_acl(xid, tlink_tcon(tlink), cifsfid->persistent_fid,
+ cifsfid->volatile_fid, (void **)&pntsd, pacllen);
+ free_xid(xid);
+
+ cifs_put_tlink(tlink);
+
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
+ if (rc)
+ return ERR_PTR(rc);
+ return pntsd;
+
+}
+
+static struct cifs_ntsd *
+get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
+ const char *path, u32 *pacllen)
+{
+ struct cifs_ntsd *pntsd = NULL;
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ unsigned int xid;
+ int rc;
+ struct cifs_tcon *tcon;
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ __le16 *utf16_path;
+
+ cifs_dbg(FYI, "get smb3 acl for path %s\n", path);
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+
+ tcon = tlink_tcon(tlink);
+ xid = get_xid();
+
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return ERR_PTR(-ENOMEM);
+
+ oparms.tcon = tcon;
+ oparms.desired_access = READ_CONTROL;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ kfree(utf16_path);
+ if (!rc) {
+ rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
+ fid.volatile_fid, (void **)&pntsd, pacllen);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ }
+
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
+ if (rc)
+ return ERR_PTR(rc);
+ return pntsd;
+}
+
+#ifdef CONFIG_CIFS_ACL
+static int
+set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+ struct inode *inode, const char *path, int aclflag)
+{
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ unsigned int xid;
+ int rc, access_flags = 0;
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ __le16 *utf16_path;
+
+ cifs_dbg(FYI, "set smb3 acl for path %s\n", path);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+
+ tcon = tlink_tcon(tlink);
+ xid = get_xid();
+
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
+
+ if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
+ access_flags = WRITE_OWNER;
+ else
+ access_flags = WRITE_DAC;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = access_flags;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ kfree(utf16_path);
+ if (!rc) {
+ rc = SMB2_set_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
+ fid.volatile_fid, pnntsd, acllen, aclflag);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ }
+
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+ return rc;
+}
+#endif /* CIFS_ACL */
+
+/* Retrieve an ACL from the server */
+static struct cifs_ntsd *
+get_smb2_acl(struct cifs_sb_info *cifs_sb,
+ struct inode *inode, const char *path,
+ u32 *pacllen)
+{
+ struct cifs_ntsd *pntsd = NULL;
+ struct cifsFileInfo *open_file = NULL;
+
+ if (inode)
+ open_file = find_readable_file(CIFS_I(inode), true);
+ if (!open_file)
+ return get_smb2_acl_by_path(cifs_sb, path, pacllen);
+
+ pntsd = get_smb2_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
+ cifsFileInfo_put(open_file);
+ return pntsd;
+}
+#endif
+
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
@@ -1809,7 +1980,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
sg = init_sg(rqst, sign);
if (!sg) {
- cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc);
+ cifs_dbg(VFS, "%s: Failed to init sg", __func__);
+ rc = -ENOMEM;
goto free_req;
}
@@ -1817,6 +1989,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
iv = kzalloc(iv_len, GFP_KERNEL);
if (!iv) {
cifs_dbg(VFS, "%s: Failed to alloc IV", __func__);
+ rc = -ENOMEM;
goto free_sg;
}
iv[0] = 3;
@@ -2056,6 +2229,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return -ENOTSUPP;
}
+ if (server->ops->is_session_expired &&
+ server->ops->is_session_expired(buf)) {
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return -1;
+ }
+
if (server->ops->is_status_pending &&
server->ops->is_status_pending(buf, server, 0))
return -1;
@@ -2373,6 +2553,7 @@ struct smb_version_operations smb20_operations = {
.close_dir = smb2_close_dir,
.calc_smb_size = smb2_calc_size,
.is_status_pending = smb2_is_status_pending,
+ .is_session_expired = smb2_is_session_expired,
.oplock_response = smb2_oplock_response,
.queryfs = smb2_queryfs,
.mand_lock = smb2_mand_lock,
@@ -2391,6 +2572,11 @@ struct smb_version_operations smb20_operations = {
.dir_needs_close = smb2_dir_needs_close,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_smb2_acl,
+ .get_acl_by_fid = get_smb2_acl_by_fid,
+ .set_acl = set_smb2_acl,
+#endif /* CIFS_ACL */
};
struct smb_version_operations smb21_operations = {
@@ -2456,6 +2642,7 @@ struct smb_version_operations smb21_operations = {
.close_dir = smb2_close_dir,
.calc_smb_size = smb2_calc_size,
.is_status_pending = smb2_is_status_pending,
+ .is_session_expired = smb2_is_session_expired,
.oplock_response = smb2_oplock_response,
.queryfs = smb2_queryfs,
.mand_lock = smb2_mand_lock,
@@ -2475,6 +2662,11 @@ struct smb_version_operations smb21_operations = {
.enum_snapshots = smb3_enum_snapshots,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_smb2_acl,
+ .get_acl_by_fid = get_smb2_acl_by_fid,
+ .set_acl = set_smb2_acl,
+#endif /* CIFS_ACL */
};
struct smb_version_operations smb30_operations = {
@@ -2541,6 +2733,7 @@ struct smb_version_operations smb30_operations = {
.close_dir = smb2_close_dir,
.calc_smb_size = smb2_calc_size,
.is_status_pending = smb2_is_status_pending,
+ .is_session_expired = smb2_is_session_expired,
.oplock_response = smb2_oplock_response,
.queryfs = smb2_queryfs,
.mand_lock = smb2_mand_lock,
@@ -2569,6 +2762,11 @@ struct smb_version_operations smb30_operations = {
.receive_transform = smb3_receive_transform,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_smb2_acl,
+ .get_acl_by_fid = get_smb2_acl_by_fid,
+ .set_acl = set_smb2_acl,
+#endif /* CIFS_ACL */
};
#ifdef CONFIG_CIFS_SMB311
@@ -2636,6 +2834,7 @@ struct smb_version_operations smb311_operations = {
.close_dir = smb2_close_dir,
.calc_smb_size = smb2_calc_size,
.is_status_pending = smb2_is_status_pending,
+ .is_session_expired = smb2_is_session_expired,
.oplock_response = smb2_oplock_response,
.queryfs = smb2_queryfs,
.mand_lock = smb2_mand_lock,
@@ -2751,7 +2950,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
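
get_smb2_acl() above encodes a cheap-path-first policy: if the inode already has a readable open handle, query the security descriptor by fid and skip an extra open/close round trip; only fall back to the open-by-path variant otherwise. A self-contained sketch of that shape, with the SMB2 calls replaced by stubs:

    #include <stdio.h>
    #include <stdlib.h>

    struct handle { int fid; };   /* stand-in for cifsFileInfo/cifs_fid */

    static char *get_acl_by_fid(const struct handle *h, unsigned int *len)
    {
        printf("query by fid %d\n", h->fid);   /* one request */
        *len = 4;
        return malloc(*len);
    }

    static char *get_acl_by_path(const char *path, unsigned int *len)
    {
        printf("open/query/close %s\n", path); /* three requests */
        *len = 4;
        return malloc(*len);
    }

    static char *get_acl(const struct handle *open_file, const char *path,
                         unsigned int *len)
    {
        if (!open_file)
            return get_acl_by_path(path, len);
        return get_acl_by_fid(open_file, len);
    }

    int main(void)
    {
        unsigned int len;
        struct handle h = { 7 };
        free(get_acl(&h, "/share/file", &len));
        free(get_acl(NULL, "/share/file", &len));
        return 0;
    }
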
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index e4afdaae743f..5fb2fc2d0080 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1167,15 +1167,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
int rc = 0;
int resp_buftype;
int unc_path_len;
- struct TCP_Server_Info *server;
__le16 *unc_path = NULL;
int flags = 0;
cifs_dbg(FYI, "TCON\n");
- if ((ses->server) && tree)
- server = ses->server;
- else
+ if (!(ses->server) || !tree)
return -EIO;
unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
@@ -1294,15 +1291,12 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
{
struct smb2_tree_disconnect_req *req; /* response is trivial */
int rc = 0;
- struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
cifs_dbg(FYI, "Tree Disconnect\n");
- if (ses && (ses->server))
- server = ses->server;
- else
+ if (!ses || !(ses->server))
return -EIO;
if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
@@ -1794,7 +1788,6 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp;
struct smb2_sync_hdr *shdr;
- struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct kvec iov[2];
struct kvec rsp_iov;
@@ -1817,9 +1810,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
else
return -EIO;
- if (ses && (ses->server))
- server = ses->server;
- else
+ if (!ses || !(ses->server))
return -EIO;
rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
@@ -1977,7 +1968,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_close_req *req;
struct smb2_close_rsp *rsp;
- struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
struct kvec iov[1];
struct kvec rsp_iov;
@@ -1987,9 +1977,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
cifs_dbg(FYI, "Close\n");
- if (ses && (ses->server))
- server = ses->server;
- else
+ if (!ses || !(ses->server))
return -EIO;
rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req);
@@ -2081,8 +2069,9 @@ validate_and_copy_buf(unsigned int offset, unsigned int buffer_length,
static int
query_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, u8 info_class,
- size_t output_len, size_t min_len, void *data)
+ u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type,
+ u32 additional_info, size_t output_len, size_t min_len, void **data,
+ u32 *dlen)
{
struct smb2_query_info_req *req;
struct smb2_query_info_rsp *rsp = NULL;
@@ -2090,15 +2079,12 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec rsp_iov;
int rc = 0;
int resp_buftype;
- struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
cifs_dbg(FYI, "Query Info\n");
- if (ses && (ses->server))
- server = ses->server;
- else
+ if (!ses || !(ses->server))
return -EIO;
rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
@@ -2108,10 +2094,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->InfoType = SMB2_O_INFO_FILE;
+ req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ req->AdditionalInformation = cpu_to_le32(additional_info);
/* 4 for rfc1002 length field and 1 for Buffer */
req->InputBufferOffset =
cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
@@ -2130,24 +2117,51 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
goto qinf_exit;
}
+ if (dlen) {
+ *dlen = le32_to_cpu(rsp->OutputBufferLength);
+ if (!*data) {
+ *data = kmalloc(*dlen, GFP_KERNEL);
+ if (!*data) {
+			cifs_dbg(VFS,
+				"Error allocating memory for acl\n");
+			*dlen = 0;
+			rc = -ENOMEM;
+			goto qinf_exit;
+ }
+ }
+ }
+
rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
- &rsp->hdr, min_len, data);
+ &rsp->hdr, min_len, *data);
qinf_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
}
+int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data)
+{
+ return query_info(xid, tcon, persistent_fid, volatile_fid,
+ FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+ sizeof(struct smb2_file_all_info), (void **)&data,
+ NULL);
+}
+
int
-SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
- struct smb2_file_all_info *data)
+ void **data, u32 *plen)
{
+ __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
+ *plen = 0;
+
return query_info(xid, tcon, persistent_fid, volatile_fid,
- FILE_ALL_INFORMATION,
- sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- sizeof(struct smb2_file_all_info), data);
+ 0, SMB2_O_INFO_SECURITY, additional_info,
+ SMB2_MAX_BUFFER_SIZE,
+ sizeof(struct smb2_file_all_info), data, plen);
}
int
@@ -2155,9 +2169,10 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid)
{
return query_info(xid, tcon, persistent_fid, volatile_fid,
- FILE_INTERNAL_INFORMATION,
+ FILE_INTERNAL_INFORMATION, SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_internal_info),
sizeof(struct smb2_file_internal_info),
- sizeof(struct smb2_file_internal_info), uniqueid);
+ (void **)&uniqueid, NULL);
}
/*
@@ -2281,7 +2296,6 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
u64 volatile_fid)
{
struct smb2_flush_req *req;
- struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
struct kvec iov[1];
struct kvec rsp_iov;
@@ -2291,9 +2305,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
cifs_dbg(FYI, "Flush\n");
- if (ses && (ses->server))
- server = ses->server;
- else
+ if (!ses || !(ses->server))
return -EIO;
rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req);
@@ -2970,8 +2982,9 @@ qdir_exit:
static int
send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, u32 pid, int info_class,
- unsigned int num, void **data, unsigned int *size)
+ u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
+ u8 info_type, u32 additional_info, unsigned int num,
+ void **data, unsigned int *size)
{
struct smb2_set_info_req *req;
struct smb2_set_info_rsp *rsp = NULL;
@@ -2980,13 +2993,10 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
unsigned int i;
- struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
- if (ses && (ses->server))
- server = ses->server;
- else
+ if (!ses || !(ses->server))
return -EIO;
if (!num)
@@ -3007,10 +3017,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid);
- req->InfoType = SMB2_O_INFO_FILE;
+ req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ req->AdditionalInformation = cpu_to_le32(additional_info);
/* 4 for RFC1001 length and 1 for Buffer */
req->BufferOffset =
@@ -3070,8 +3081,8 @@ SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
size[1] = len + 2 /* null */;
rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_RENAME_INFORMATION, 2, data,
- size);
+ current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 2, data, size);
kfree(data);
return rc;
}
@@ -3088,8 +3099,8 @@ SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
size = 1; /* sizeof __u8 */
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_DISPOSITION_INFORMATION, 1, &data,
- &size);
+ current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 1, &data, &size);
}
int
@@ -3118,7 +3129,8 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
size[1] = len + 2 /* null */;
rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_LINK_INFORMATION, 2, data, size);
+ current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 2, data, size);
kfree(data);
return rc;
}
@@ -3138,10 +3150,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (is_falloc)
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size);
+ pid, FILE_ALLOCATION_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 1, &data, &size);
else
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
+ pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 1, &data, &size);
}
int
@@ -3151,8 +3165,18 @@ SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int size;
size = sizeof(FILE_BASIC_INFO);
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_BASIC_INFORMATION, 1,
- (void **)&buf, &size);
+ current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 1, (void **)&buf, &size);
+}
+
+int
+SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct cifs_ntsd *pnntsd, int pacllen, int aclflag)
+{
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, 0, SMB2_O_INFO_SECURITY, aclflag,
+ 1, (void **)&pnntsd, &pacllen);
}
int
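
The reworked query_info() now serves two kinds of caller: fixed-size queries pass dlen == NULL and their own buffer, while the new ACL path passes dlen != NULL with *data == NULL and gets back a buffer sized from the response. A userspace sketch of that dual-mode buffer contract (the "response" is simulated):

    #include <stdlib.h>
    #include <string.h>

    static int query_info(void **data, unsigned int *dlen,
                          const void *resp, unsigned int resp_len)
    {
        if (dlen) {
            *dlen = resp_len;
            if (!*data) {
                *data = malloc(resp_len);   /* kmalloc(*dlen, GFP_KERNEL) */
                if (!*data) {
                    *dlen = 0;
                    return -1;              /* -ENOMEM in the kernel */
                }
            }
        }
        /* with dlen == NULL the caller must have supplied *data */
        memcpy(*data, resp, resp_len);
        return 0;
    }

    int main(void)
    {
        void *buf = NULL;
        unsigned int len;
        int rc = query_info(&buf, &len, "blob", 4);
        free(buf);
        return rc;
    }
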
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 6853454fc871..1cadaf9f3c58 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -135,6 +135,9 @@ extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct smb2_file_all_info *data);
+extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id,
+ void **data, unsigned int *plen);
extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le64 *uniqueid);
@@ -163,6 +166,9 @@ extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
FILE_BASIC_INFO *buf);
+extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct cifs_ntsd *pnntsd, int pacllen, int aclflag);
extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid);
extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index c69ec96e92ac..67367cf1f8cd 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -335,9 +335,31 @@ generate_smb3signingkey(struct cifs_ses *ses,
if (rc)
return rc;
- return generate_key(ses, ptriplet->decryption.label,
- ptriplet->decryption.context,
- ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+ rc = generate_key(ses, ptriplet->decryption.label,
+ ptriplet->decryption.context,
+ ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+
+ if (rc)
+ return rc;
+
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+ cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
+ /*
+ * The session id is opaque in terms of endianness, so we can't
+	 * print it as a long long; we dump it as we got it on the wire.
+ */
+ cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid),
+ &ses->Suid);
+ cifs_dbg(VFS, "Session Key %*ph\n",
+ SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
+ cifs_dbg(VFS, "Signing Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->smb3signingkey);
+ cifs_dbg(VFS, "ServerIn Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey);
+ cifs_dbg(VFS, "ServerOut Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey);
+#endif
+ return rc;
}
int
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 47a125ece11e..7efbab013957 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -536,11 +536,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
list_add_tail(&mid->qhead, &server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
-
+ /*
+ * Need to store the time in mid before calling I/O. For call_async,
+ * I/O response may come back and free the mid entry on another thread.
+ */
+ cifs_save_when_sent(mid);
cifs_in_send_inc(server);
rc = smb_send_rqst(server, rqst, flags);
cifs_in_send_dec(server);
- cifs_save_when_sent(mid);
if (rc < 0) {
server->sequence_number -= 2;
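
The transport.c hunk is an ordering fix against a use-after-free: once smb_send_rqst() puts the request on the wire, the demultiplex thread may process the response and free the mid, so the sent timestamp has to be recorded first. The general rule, in miniature: finish all writes to a shared object before the action that publishes it to another thread.

    #include <stdlib.h>
    #include <time.h>

    struct mid_entry { time_t when_sent; };

    static void send_on_wire(struct mid_entry *mid)
    {
        /* stand-in for smb_send_rqst(); in the kernel, after this returns
         * the receiving thread may already have freed the mid */
        (void)mid;
    }

    static void submit(struct mid_entry *mid)
    {
        mid->when_sent = time(NULL);   /* cifs_save_when_sent(): write first */
        send_on_wire(mid);             /* then publish */
        /* mid must not be touched past this point */
    }

    int main(void)
    {
        struct mid_entry *mid = malloc(sizeof(*mid));
        if (!mid)
            return 1;
        submit(mid);
        free(mid);   /* done by the receive side in the kernel */
        return 0;
    }
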
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3cb5c9e2d4e7..de50e749ff05 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode,
pcreatetime = (__u64 *)value;
*pcreatetime = CIFS_I(inode)->createtime;
return sizeof(__u64);
-
- return rc;
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 9d956cd6d46f..363402fcb3ed 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -34,7 +34,7 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
- return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos);
+ return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0);
}
static ssize_t
@@ -51,7 +51,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
host_file = cfi->cfi_container;
file_start_write(host_file);
inode_lock(coda_inode);
- ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
+ ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6116d5275a3e..2dd4a7af7dd7 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -739,23 +739,22 @@ static int do_i2c_smbus_ioctl(struct file *file,
unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
{
struct i2c_smbus_ioctl_data __user *tdata;
- compat_caddr_t datap;
+ union {
+		/* the beginnings of these two structures have identical layouts */
+ struct i2c_smbus_ioctl_data32 data32;
+ struct i2c_smbus_ioctl_data data;
+ } v;
tdata = compat_alloc_user_space(sizeof(*tdata));
if (tdata == NULL)
return -ENOMEM;
- if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
- return -EFAULT;
- if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
+ memset(&v, 0, sizeof(v));
+ if (copy_from_user(&v.data32, udata, sizeof(v.data32)))
return -EFAULT;
+ v.data.data = compat_ptr(v.data32.data);
- if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8)))
- return -EFAULT;
- if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32)))
- return -EFAULT;
- if (__get_user(datap, &udata->data) ||
- __put_user(compat_ptr(datap), &tdata->data))
+ if (copy_to_user(tdata, &v.data, sizeof(v.data)))
return -EFAULT;
return do_ioctl(file, cmd, (unsigned long)tdata);
@@ -866,8 +865,6 @@ COMPATIBLE_IOCTL(TIOCGDEV)
COMPATIBLE_IOCTL(TIOCCBRK)
COMPATIBLE_IOCTL(TIOCGSID)
COMPATIBLE_IOCTL(TIOCGICOUNT)
-COMPATIBLE_IOCTL(TIOCGPKT)
-COMPATIBLE_IOCTL(TIOCGPTLCK)
COMPATIBLE_IOCTL(TIOCGEXCL)
/* Little t */
COMPATIBLE_IOCTL(TIOCGETD)
@@ -883,16 +880,12 @@ COMPATIBLE_IOCTL(TIOCMGET)
COMPATIBLE_IOCTL(TIOCMBIC)
COMPATIBLE_IOCTL(TIOCMBIS)
COMPATIBLE_IOCTL(TIOCMSET)
-COMPATIBLE_IOCTL(TIOCPKT)
COMPATIBLE_IOCTL(TIOCNOTTY)
COMPATIBLE_IOCTL(TIOCSTI)
COMPATIBLE_IOCTL(TIOCOUTQ)
COMPATIBLE_IOCTL(TIOCSPGRP)
COMPATIBLE_IOCTL(TIOCGPGRP)
-COMPATIBLE_IOCTL(TIOCGPTN)
-COMPATIBLE_IOCTL(TIOCSPTLCK)
COMPATIBLE_IOCTL(TIOCSERGETLSR)
-COMPATIBLE_IOCTL(TIOCSIG)
#ifdef TIOCSRS485
COMPATIBLE_IOCTL(TIOCSRS485)
#endif
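
The i2c rewrite leans on the two ioctl structs sharing a layout up to their final member, so one copy of the 32-bit form plus a single pointer fix-up replaces four field-by-field __copy_in_user() calls. A userspace model of the same prefix-sharing trick (field names simplified):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct data32 { uint8_t rw; uint8_t cmd; uint32_t size; uint32_t data; };
    struct data   { uint8_t rw; uint8_t cmd; uint32_t size; void *data; };

    static void convert(const struct data32 *in, struct data *out)
    {
        memset(out, 0, sizeof(*out));
        /* everything before the pointer has an identical layout */
        memcpy(out, in, offsetof(struct data32, data));
        out->data = (void *)(uintptr_t)in->data;   /* compat_ptr() */
    }

    int main(void)
    {
        struct data32 in = { 1, 2, 8, 0x1000 };
        struct data out;
        convert(&in, &out);
        return out.size == 8 ? 0 : 1;
    }
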
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 8b2a994042dd..a66f6624d899 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -138,6 +138,14 @@ struct config_item *config_item_get(struct config_item *item)
}
EXPORT_SYMBOL(config_item_get);
+struct config_item *config_item_get_unless_zero(struct config_item *item)
+{
+ if (item && kref_get_unless_zero(&item->ci_kref))
+ return item;
+ return NULL;
+}
+EXPORT_SYMBOL(config_item_get_unless_zero);
+
static void config_item_cleanup(struct config_item *item)
{
struct config_item_type *t = item->ci_type;
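
config_item_get_unless_zero() is the standard lookup-side refcount guard: an object reached through a lookup structure may already be losing its last reference, so the getter must fail rather than resurrect it. A minimal model of kref_get_unless_zero() using C11 atomics:

    #include <stdatomic.h>
    #include <stdbool.h>

    static bool get_unless_zero(atomic_int *refs)
    {
        int v = atomic_load(refs);
        while (v != 0) {
            /* increment only if the count is still what we read and nonzero */
            if (atomic_compare_exchange_weak(refs, &v, v + 1))
                return true;
        }
        return false;   /* object is dying; treat the lookup as a miss */
    }

    int main(void)
    {
        atomic_int live = 1, dying = 0;
        return (get_unless_zero(&live) && !get_unless_zero(&dying)) ? 0 : 1;
    }
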
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index a6ab012a2c6a..c8aabba502f6 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item,
ret = -ENOMEM;
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
if (sl) {
- sl->sl_target = config_item_get(item);
spin_lock(&configfs_dirent_lock);
if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
spin_unlock(&configfs_dirent_lock);
- config_item_put(item);
kfree(sl);
return -ENOENT;
}
+ sl->sl_target = config_item_get(item);
list_add(&sl->sl_list, &target_sd->s_links);
spin_unlock(&configfs_dirent_lock);
ret = configfs_create_link(sl, parent_item->ci_dentry,
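
The symlink fix moves the reference acquisition inside the locked region, after the DROPPING check: check-then-get under one lock means the error path no longer needs a config_item_put(), and the reference can never be taken on a target already being removed. A compact model:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t dirent_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool dropping;      /* CONFIGFS_USET_DROPPING */
    static int refcount = 1;

    static bool link_target(void)
    {
        pthread_mutex_lock(&dirent_lock);
        if (dropping) {
            pthread_mutex_unlock(&dirent_lock);
            return false;      /* -ENOENT, and nothing to put */
        }
        refcount++;            /* config_item_get(), now under the lock */
        pthread_mutex_unlock(&dirent_lock);
        return true;
    }

    int main(void)
    {
        return link_target() ? 0 : 1;
    }
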
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 08b46e6e3995..02b7d91c9231 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -7,6 +7,7 @@ config FS_ENCRYPTION
select CRYPTO_XTS
select CRYPTO_CTS
select CRYPTO_CTR
+ select CRYPTO_SHA256
select KEYS
help
Enable encryption of files and directories. This
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index a409a84f1bca..6181e9526860 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
goto errout;
}
err = submit_bio_wait(bio);
- if ((err == 0) && bio->bi_error)
+ if (err == 0 && bio->bi_status)
err = -EIO;
bio_put(bio);
if (err)
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6d6eca394d4d..c7835df7e7b8 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/dcache.h>
#include <linux/namei.h>
+#include <crypto/aes.h>
#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
@@ -147,8 +148,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
{
struct {
__le64 index;
- u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)];
- } xts_tweak;
+ u8 padding[FS_IV_SIZE - sizeof(__le64)];
+ } iv;
struct skcipher_request *req = NULL;
DECLARE_FS_COMPLETION_RESULT(ecr);
struct scatterlist dst, src;
@@ -158,6 +159,16 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
BUG_ON(len == 0);
+ BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE);
+ BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE);
+ iv.index = cpu_to_le64(lblk_num);
+ memset(iv.padding, 0, sizeof(iv.padding));
+
+ if (ci->ci_essiv_tfm != NULL) {
+ crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv,
+ (u8 *)&iv);
+ }
+
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req) {
printk_ratelimited(KERN_ERR
@@ -170,15 +181,11 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
page_crypt_complete, &ecr);
- BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
- xts_tweak.index = cpu_to_le64(lblk_num);
- memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
-
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, len, offs);
- skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak);
+ skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
res = crypto_skcipher_decrypt(req);
else
@@ -477,6 +484,8 @@ static void __exit fscrypt_exit(void)
destroy_workqueue(fscrypt_read_workqueue);
kmem_cache_destroy(fscrypt_ctx_cachep);
kmem_cache_destroy(fscrypt_info_cachep);
+
+ fscrypt_essiv_cleanup();
}
module_exit(fscrypt_exit);
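
The crypto.c hunks generalize the per-block tweak into an IV that can optionally pass through ci_essiv_tfm, which is the conventional ESSIV construction: encrypt the block number with a key derived by hashing the contents key. An illustrative userspace sketch with OpenSSL (not the kernel crypto API; the SHA-256/AES-256 pairing matches the CRYPTO_SHA256 selection added in the Kconfig hunk, but the rest is our assumption):

    #include <openssl/evp.h>
    #include <openssl/sha.h>
    #include <stdint.h>
    #include <string.h>

    /* IV = AES-256-ECB_{SHA256(key)}(le64(lblk_num) zero-padded to 16 bytes) */
    static int essiv_iv(const uint8_t *key, size_t keylen,
                        uint64_t lblk_num, uint8_t iv[16])
    {
        uint8_t salt[SHA256_DIGEST_LENGTH];
        uint8_t in[16] = { 0 };
        int outl = 0, ok = 0;
        EVP_CIPHER_CTX *ctx;

        SHA256(key, keylen, salt);                 /* salt = H(key) */
        memcpy(in, &lblk_num, sizeof(lblk_num));   /* assumes little endian */

        ctx = EVP_CIPHER_CTX_new();
        if (!ctx)
            return -1;
        if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ecb(), NULL, salt, NULL) &&
            EVP_CIPHER_CTX_set_padding(ctx, 0) &&
            EVP_EncryptUpdate(ctx, iv, &outl, in, sizeof(in)) && outl == 16)
            ok = 1;
        EVP_CIPHER_CTX_free(ctx);
        return ok ? 0 : -1;
    }

    int main(void)
    {
        uint8_t key[32] = { 0 }, iv[16];
        return essiv_iv(key, sizeof(key), 1, iv);
    }

Without ESSIV the IV is just the padded block number, as the old xts_tweak was; ESSIV makes the IV unpredictable to an attacker who knows block numbers, which matters for the CBC mode added here.
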
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index d1bb02b1ee58..ad9f814fdead 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -453,12 +453,3 @@ errout:
return ret;
}
EXPORT_SYMBOL(fscrypt_setup_filename);
-
-void fscrypt_free_filename(struct fscrypt_name *fname)
-{
- kfree(fname->crypto_buf.name);
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
-}
-EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 1e1f8a361b75..a1d5021c31ef 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,10 +12,13 @@
#define _FSCRYPT_PRIVATE_H
#include <linux/fscrypt_supp.h>
+#include <crypto/hash.h>
/* Encryption parameters */
-#define FS_XTS_TWEAK_SIZE 16
+#define FS_IV_SIZE 16
#define FS_AES_128_ECB_KEY_SIZE 16
+#define FS_AES_128_CBC_KEY_SIZE 16
+#define FS_AES_128_CTS_KEY_SIZE 16
#define FS_AES_256_GCM_KEY_SIZE 32
#define FS_AES_256_CBC_KEY_SIZE 32
#define FS_AES_256_CTS_KEY_SIZE 32
@@ -54,6 +57,7 @@ struct fscrypt_info {
u8 ci_filename_mode;
u8 ci_flags;
struct crypto_skcipher *ci_ctfm;
+ struct crypto_cipher *ci_essiv_tfm;
u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
};
@@ -87,4 +91,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode,
extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
gfp_t gfp_flags);
+/* keyinfo.c */
+extern void __exit fscrypt_essiv_cleanup(void);
+
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 179e578b875b..018c588c7ac3 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,8 +10,13 @@
#include <keys/user-type.h>
#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <crypto/aes.h>
+#include <crypto/sha.h>
#include "fscrypt_private.h"
+static struct crypto_shash *essiv_hash_tfm;
+
static void derive_crypt_complete(struct crypto_async_request *req, int rc)
{
struct fscrypt_completion_result *ecr = req->data;
@@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc)
* derive_key_aes() - Derive a key using AES-128-ECB
* @deriving_key: Encryption key used for derivation.
* @source_key: Source key to which to apply derivation.
- * @derived_key: Derived key.
+ * @derived_raw_key: Derived raw key.
*
* Return: Zero on success; non-zero otherwise.
*/
static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
- u8 source_key[FS_AES_256_XTS_KEY_SIZE],
- u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+ const struct fscrypt_key *source_key,
+ u8 derived_raw_key[FS_MAX_KEY_SIZE])
{
int res = 0;
struct skcipher_request *req = NULL;
@@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
if (res < 0)
goto out;
- sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
- sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg,
- FS_AES_256_XTS_KEY_SIZE, NULL);
+ sg_init_one(&src_sg, source_key->raw, source_key->size);
+ sg_init_one(&dst_sg, derived_raw_key, source_key->size);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size,
+ NULL);
res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
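[Editor's note: the derivation itself is unchanged in structure — the per-inode nonce acts as an AES-128-ECB key that encrypts the master key, now for any size that is a multiple of AES_BLOCK_SIZE (the keyring check below enforces this). A userspace rendering under the same OpenSSL assumption as above:

#include <stdint.h>
#include <openssl/aes.h>

/* derived_raw_key = AES-128-ECB-Enc(key = nonce, data = master_key) */
static void derive_key(const uint8_t nonce[16], const uint8_t *master_key,
                       size_t size /* multiple of AES_BLOCK_SIZE */,
                       uint8_t *derived)
{
        AES_KEY k;
        size_t off;

        AES_set_encrypt_key(nonce, 128, &k);
        for (off = 0; off < size; off += AES_BLOCK_SIZE)
                AES_encrypt(master_key + off, derived + off, &k);
}
]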
@@ -77,7 +82,7 @@ out:
static int validate_user_key(struct fscrypt_info *crypt_info,
struct fscrypt_context *ctx, u8 *raw_key,
- const char *prefix)
+ const char *prefix, int min_keysize)
{
char *description;
struct key *keyring_key;
@@ -111,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info,
master_key = (struct fscrypt_key *)ukp->data;
BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
- if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE
+ || master_key->size % AES_BLOCK_SIZE != 0) {
printk_once(KERN_WARNING
"%s: key size incorrect: %d\n",
__func__, master_key->size);
res = -ENOKEY;
goto out;
}
- res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
+ res = derive_key_aes(ctx->nonce, master_key, raw_key);
out:
up_read(&keyring_key->sem);
key_put(keyring_key);
return res;
}
+static const struct {
+ const char *cipher_str;
+ int keysize;
+} available_modes[] = {
+ [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)",
+ FS_AES_256_XTS_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))",
+ FS_AES_256_CTS_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)",
+ FS_AES_128_CBC_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))",
+ FS_AES_128_CTS_KEY_SIZE },
+};
+
static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
const char **cipher_str_ret, int *keysize_ret)
{
- if (S_ISREG(inode->i_mode)) {
- if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) {
- *cipher_str_ret = "xts(aes)";
- *keysize_ret = FS_AES_256_XTS_KEY_SIZE;
- return 0;
- }
- pr_warn_once("fscrypto: unsupported contents encryption mode "
- "%d for inode %lu\n",
- ci->ci_data_mode, inode->i_ino);
- return -ENOKEY;
+ u32 mode;
+
+ if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
+ pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n",
+ inode->i_ino,
+ ci->ci_data_mode, ci->ci_filename_mode);
+ return -EINVAL;
}
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
- if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) {
- *cipher_str_ret = "cts(cbc(aes))";
- *keysize_ret = FS_AES_256_CTS_KEY_SIZE;
- return 0;
- }
- pr_warn_once("fscrypto: unsupported filenames encryption mode "
- "%d for inode %lu\n",
- ci->ci_filename_mode, inode->i_ino);
- return -ENOKEY;
+ if (S_ISREG(inode->i_mode)) {
+ mode = ci->ci_data_mode;
+ } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ mode = ci->ci_filename_mode;
+ } else {
+ WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+ inode->i_ino, (inode->i_mode & S_IFMT));
+ return -EINVAL;
}
- pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n",
- (inode->i_mode & S_IFMT), inode->i_ino);
- return -ENOKEY;
+ *cipher_str_ret = available_modes[mode].cipher_str;
+ *keysize_ret = available_modes[mode].keysize;
+ return 0;
}
static void put_crypt_info(struct fscrypt_info *ci)
@@ -163,9 +178,76 @@ static void put_crypt_info(struct fscrypt_info *ci)
return;
crypto_free_skcipher(ci->ci_ctfm);
+ crypto_free_cipher(ci->ci_essiv_tfm);
kmem_cache_free(fscrypt_info_cachep, ci);
}
+static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
+{
+ struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
+
+ /* init hash transform on demand */
+ if (unlikely(!tfm)) {
+ struct crypto_shash *prev_tfm;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n",
+ PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+ prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
+ if (prev_tfm) {
+ crypto_free_shash(tfm);
+ tfm = prev_tfm;
+ }
+ }
+
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ return crypto_shash_digest(desc, key, keysize, salt);
+ }
+}
+
+static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
+ int keysize)
+{
+ int err;
+ struct crypto_cipher *essiv_tfm;
+ u8 salt[SHA256_DIGEST_SIZE];
+
+ essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(essiv_tfm))
+ return PTR_ERR(essiv_tfm);
+
+ ci->ci_essiv_tfm = essiv_tfm;
+
+ err = derive_essiv_salt(raw_key, keysize, salt);
+ if (err)
+ goto out;
+
+ /*
+ * Using SHA256 to derive the salt means AES-256 is used for IV
+ * generation, while file contents encryption nevertheless still
+ * uses the configured key size (AES-128).
+ */
+ err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
+
+out:
+ memzero_explicit(salt, sizeof(salt));
+ return err;
+}
+
+void __exit fscrypt_essiv_cleanup(void)
+{
+ crypto_free_shash(essiv_hash_tfm);
+}
+
int fscrypt_get_encryption_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
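[Editor's note: the allocate-then-cmpxchg sequence in derive_essiv_salt() above is the usual lock-free way to initialize a shared transform exactly once. A generic sketch of the pattern — struct foo and its alloc/free helpers are hypothetical, and allocation-failure handling is elided (see derive_essiv_salt() for the full version):

static struct foo *shared;

static struct foo *get_shared(void)
{
        struct foo *p = READ_ONCE(shared);

        if (unlikely(!p)) {
                struct foo *new = alloc_foo(), *old;

                old = cmpxchg(&shared, NULL, new);
                if (old) {
                        /* lost the race: adopt the winner's object */
                        free_foo(new);
                        p = old;
                } else {
                        p = new;
                }
        }
        return p;
}
]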
@@ -212,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
crypt_info->ci_data_mode = ctx.contents_encryption_mode;
crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
crypt_info->ci_ctfm = NULL;
+ crypt_info->ci_essiv_tfm = NULL;
memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
sizeof(crypt_info->ci_master_key));
@@ -228,10 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (!raw_key)
goto out;
- res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX);
+ res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX,
+ keysize);
if (res && inode->i_sb->s_cop->key_prefix) {
int res2 = validate_user_key(crypt_info, &ctx, raw_key,
- inode->i_sb->s_cop->key_prefix);
+ inode->i_sb->s_cop->key_prefix,
+ keysize);
if (res2) {
if (res2 == -ENOKEY)
res = -ENOKEY;
@@ -243,18 +328,30 @@ int fscrypt_get_encryption_info(struct inode *inode)
ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- printk(KERN_DEBUG
- "%s: error %d (inode %u) allocating crypto tfm\n",
- __func__, res, (unsigned) inode->i_ino);
+ pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n",
+ __func__, res, inode->i_ino);
goto out;
}
crypt_info->ci_ctfm = ctfm;
crypto_skcipher_clear_flags(ctfm, ~0);
crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ /*
+ * if the provided key is longer than keysize, we use the first
+ * keysize bytes of the derived key only
+ */
res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
if (res)
goto out;
+ if (S_ISREG(inode->i_mode) &&
+ crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
+ res = init_essiv_generator(crypt_info, raw_key, keysize);
+ if (res) {
+ pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n",
+ __func__, res, inode->i_ino);
+ goto out;
+ }
+ }
if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
crypt_info = NULL;
out:
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 210976e7a269..ce07a86200f3 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -38,12 +38,8 @@ static int create_encryption_context_from_policy(struct inode *inode,
memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
- if (!fscrypt_valid_contents_enc_mode(
- policy->contents_encryption_mode))
- return -EINVAL;
-
- if (!fscrypt_valid_filenames_enc_mode(
- policy->filenames_encryption_mode))
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode))
return -EINVAL;
if (policy->flags & ~FS_POLICY_FLAGS_VALID)
@@ -260,6 +256,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
memcpy(ctx.master_key_descriptor, ci->ci_master_key,
FS_KEY_DESCRIPTOR_SIZE);
get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
res = parent->i_sb->s_cop->set_context(child, &ctx,
sizeof(ctx), fs_data);
if (res)
diff --git a/fs/dax.c b/fs/dax.c
index 2a6889b3585f..306c2b603fb8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,7 +25,6 @@
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
-#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
@@ -84,7 +83,7 @@ struct exceptional_entry_key {
};
struct wait_exceptional_entry_queue {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct exceptional_entry_key key;
};
@@ -108,7 +107,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
@@ -784,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev,
}
dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
- wb_cache_pmem(kaddr, size);
+ dax_flush(dax_dev, pgoff, kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -856,9 +855,12 @@ int dax_writeback_mapping_range(struct address_space *mapping,
ret = dax_writeback_one(bdev, dax_dev, mapping,
indices[i], pvec.pages[i]);
- if (ret < 0)
+ if (ret < 0) {
+ mapping_set_error(mapping, ret);
goto out;
+ }
}
+ start_index = indices[pvec.nr - 1] + 1;
}
out:
put_dax(dax_dev);
@@ -975,7 +977,8 @@ int __dax_zero_page_range(struct block_device *bdev,
dax_read_unlock(id);
return rc;
}
- clear_pmem(kaddr + offset, size);
+ memset(kaddr + offset, 0, size);
+ dax_flush(dax_dev, pgoff, kaddr + offset, size);
dax_read_unlock(id);
}
return 0;
@@ -1054,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
map_len = end - pos;
if (iov_iter_rw(iter) == WRITE)
- map_len = copy_from_iter_pmem(kaddr, map_len, iter);
+ map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+ map_len, iter);
else
map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
@@ -1212,7 +1216,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
diff --git a/fs/dcache.c b/fs/dcache.c
index cddf39777835..f90141387f01 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -90,6 +90,11 @@ EXPORT_SYMBOL(rename_lock);
static struct kmem_cache *dentry_cache __read_mostly;
+const struct qstr empty_name = QSTR_INIT("", 0);
+EXPORT_SYMBOL(empty_name);
+const struct qstr slash_name = QSTR_INIT("/", 1);
+EXPORT_SYMBOL(slash_name);
+
/*
* This is the single most critical data structure when it comes
* to the dcache: the hashtable for lookups. Somebody should try
@@ -277,6 +282,33 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
+void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ if (unlikely(dname_external(dentry))) {
+ struct external_name *p = external_name(dentry);
+ atomic_inc(&p->u.count);
+ spin_unlock(&dentry->d_lock);
+ name->name = p->name;
+ } else {
+ memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
+ spin_unlock(&dentry->d_lock);
+ name->name = name->inline_name;
+ }
+}
+EXPORT_SYMBOL(take_dentry_name_snapshot);
+
+void release_dentry_name_snapshot(struct name_snapshot *name)
+{
+ if (unlikely(name->name != name->inline_name)) {
+ struct external_name *p;
+ p = container_of(name->name, struct external_name, name[0]);
+ if (unlikely(atomic_dec_and_test(&p->u.count)))
+ kfree_rcu(p, u.head);
+ }
+}
+EXPORT_SYMBOL(release_dentry_name_snapshot);
+
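[Editor's note: callers use the pair to keep a stable copy of d_name across operations that may rename the dentry; the debugfs_rename() conversion further down in this patch follows exactly this shape:

struct name_snapshot old_name;

take_dentry_name_snapshot(&old_name, old_dentry);
/* ... d_move() etc.; d_name may change or be freed under us ... */
fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name,
              d_is_dir(old_dentry), NULL, old_dentry);
release_dentry_name_snapshot(&old_name);
]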
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
@@ -1133,11 +1165,12 @@ void shrink_dcache_sb(struct super_block *sb)
LIST_HEAD(dispose);
freed = list_lru_walk(&sb->s_dentry_lru,
- dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+ dentry_lru_isolate_shrink, &dispose, 1024);
this_cpu_sub(nr_dentry_unused, freed);
shrink_dentry_list(&dispose);
- } while (freed > 0);
+ cond_resched();
+ } while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -1494,7 +1527,7 @@ static void check_and_drop(void *_data)
{
struct detach_data *data = _data;
- if (!data->mountpoint && !data->select.found)
+ if (!data->mountpoint && list_empty(&data->select.dispose))
__d_drop(data->select.start);
}
@@ -1536,17 +1569,15 @@ void d_invalidate(struct dentry *dentry)
d_walk(dentry, &data, detach_and_collect, check_and_drop);
- if (data.select.found)
+ if (!list_empty(&data.select.dispose))
shrink_dentry_list(&data.select.dispose);
+ else if (!data.mountpoint)
+ return;
if (data.mountpoint) {
detach_mounts(data.mountpoint);
dput(data.mountpoint);
}
-
- if (!data.mountpoint && !data.select.found)
- break;
-
cond_resched();
}
}
@@ -1580,8 +1611,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
*/
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (unlikely(!name)) {
- static const struct qstr anon = QSTR_INIT("/", 1);
- name = &anon;
+ name = &slash_name;
dname = dentry->d_iname;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
@@ -3548,8 +3578,6 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- unsigned int loop;
-
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
@@ -3561,24 +3589,19 @@ static void __init dcache_init_early(void)
sizeof(struct hlist_bl_head),
dhash_entries,
13,
- HASH_EARLY,
+ HASH_EARLY | HASH_ZERO,
&d_hash_shift,
&d_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << d_hash_shift); loop++)
- INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
static void __init dcache_init(void)
{
- unsigned int loop;
-
- /*
+ /*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
- * of the dcache.
+ * of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
@@ -3592,14 +3615,11 @@ static void __init dcache_init(void)
sizeof(struct hlist_bl_head),
dhash_entries,
13,
- 0,
+ HASH_ZERO,
&d_hash_shift,
&d_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << d_hash_shift); loop++)
- INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
/* SLAB cache for __getname() consumers */
@@ -3610,6 +3630,11 @@ EXPORT_SYMBOL(d_genocide);
void __init vfs_caches_init_early(void)
{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
+ INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
+
dcache_init_early();
inode_init_early();
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 354e2ab62031..6dabc4a10396 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -9,7 +9,7 @@
* 2 as published by the Free Software Foundation.
*
* debugfs is for people to use instead of /proc or /sys.
- * See Documentation/DocBook/filesystems for more details.
+ * See Documentation/filesystems/ for more details.
*
*/
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e892ae7d89f8..c59f015f386e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -9,7 +9,7 @@
* 2 as published by the Free Software Foundation.
*
* debugfs is for people to use instead of /proc or /sys.
- * See Documentation/DocBook/kernel-api for more details.
+ * See ./Documentation/core-api/kernel-api.rst for more details.
*
*/
@@ -203,8 +203,6 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
struct debugfs_fs_info *fsi;
int err;
- save_mount_options(sb, data);
-
fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL);
sb->s_fs_info = fsi;
if (!fsi) {
@@ -766,7 +764,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
{
int error;
struct dentry *dentry = NULL, *trap;
- const char *old_name;
+ struct name_snapshot old_name;
trap = lock_rename(new_dir, old_dir);
/* Source or destination directories don't exist? */
@@ -781,19 +779,19 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
goto exit;
- old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+ take_dentry_name_snapshot(&old_name, old_dentry);
error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
dentry, 0);
if (error) {
- fsnotify_oldname_free(old_name);
+ release_dentry_name_snapshot(&old_name);
goto exit;
}
d_move(old_dentry, dentry);
- fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name,
+ fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name,
d_is_dir(old_dentry),
NULL, old_dentry);
- fsnotify_oldname_free(old_name);
+ release_dentry_name_snapshot(&old_name);
unlock_rename(new_dir, old_dir);
dput(dentry);
return old_dentry;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a04ebea77de8..08cf27811e5a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work)
dio_complete(dio, 0, true);
}
-static int dio_bio_complete(struct dio *dio, struct bio *bio);
+static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
/*
* Asynchronous IO callback.
@@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio)
/**
* dio_end_io - handle the end io action for the given bio
* @bio: The direct io bio that's being completed
- * @error: Error if there was one
*
* This is meant to be called by any filesystem that uses its own dio_submit_t
* so that the DIO-specific endio actions are dealt with after the filesystem
* has done its completion work.
*/
-void dio_end_io(struct bio *bio, int error)
+void dio_end_io(struct bio *bio)
{
struct dio *dio = bio->bi_private;
@@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
else
bio->bi_end_io = dio_bio_end_io;
+ bio->bi_write_hint = dio->iocb->ki_hint;
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
@@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio)
/*
* Process one completed BIO. No locks are held.
*/
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
+static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
{
struct bio_vec *bvec;
unsigned i;
- int err;
+ blk_status_t err = bio->bi_status;
- if (bio->bi_error)
- dio->io_error = -EIO;
+ if (err) {
+ if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
+ dio->io_error = -EAGAIN;
+ else
+ dio->io_error = -EIO;
+ }
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
- err = bio->bi_error;
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
@@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
set_page_dirty_lock(page);
put_page(page);
}
- err = bio->bi_error;
bio_put(bio);
}
return err;
@@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
bio = dio->bio_list;
dio->bio_list = bio->bi_private;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- ret2 = dio_bio_complete(dio, bio);
+ ret2 = blk_status_to_errno(dio_bio_complete(dio, bio));
if (ret == 0)
ret = ret2;
}
@@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
dio->op_flags = REQ_SYNC | REQ_IDLE;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ dio->op_flags |= REQ_NOWAIT;
} else {
dio->op = REQ_OP_READ;
}
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index d7a7c53803c1..5b68e4294faa 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -29,7 +29,6 @@ static const struct super_operations efivarfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = efivarfs_evict_inode,
- .show_options = generic_show_options,
};
static struct super_block *efivarfs_sb;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 68b9fffcb2c8..2fb4eadaa118 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
* This is used to atomically remove a wait queue entry from the eventfd wait
* queue head, and read/reset the counter value.
*/
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt)
{
unsigned long flags;
@@ -215,8 +215,8 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
*
* Returns %0 if successful, or the following error codes:
*
- * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
- * -ERESTARTSYS : A signal interrupted the wait operation.
+ * - -EAGAIN : The operation would have blocked but @no_wait was non-zero.
+ * - -ERESTARTSYS : A signal interrupted the wait operation.
*
* If @no_wait is zero, the function might sleep until the eventfd internal
* counter becomes greater than zero.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5420767c9b68..e767e4389cb1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -244,7 +244,7 @@ struct eppoll_entry {
* Wait queue item that will be linked to the target file wait
* queue head.
*/
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;
@@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p)
return !list_empty(p);
}
-static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait);
}
/* Get the "struct epitem" from a wait queue pointer */
-static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
+static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait)->base;
}
@@ -960,10 +960,14 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f)
mutex_lock(&ep->mtx);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
+ struct inode *inode = file_inode(epi->ffd.file);
- seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
+ seq_printf(m, "tfd: %8d events: %8x data: %16llx "
+ " pos:%lli ino:%lx sdev:%x\n",
epi->ffd.fd, epi->event.events,
- (long long)epi->event.data);
+ (long long)epi->event.data,
+ (long long)epi->ffd.file->f_pos,
+ inode->i_ino, inode->i_sb->s_dev);
if (seq_has_overflowed(m))
break;
}
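[Editor's note: with the extra fields, each epoll target line in /proc/<pid>/fdinfo/<epfd> now also reports the file position, inode number and device, e.g. (values illustrative):

tfd:        5 events:       19 data:                5  pos:0 ino:23f3 sdev:24
]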
@@ -1073,12 +1077,56 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
return epir;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (epi->ffd.fd == tfd) {
+ if (toff == 0)
+ return epi;
+ else
+ toff--;
+ }
+ cond_resched();
+ }
+
+ return NULL;
+}
+
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+ unsigned long toff)
+{
+ struct file *file_raw;
+ struct eventpoll *ep;
+ struct epitem *epi;
+
+ if (!is_file_epoll(file))
+ return ERR_PTR(-EINVAL);
+
+ ep = file->private_data;
+
+ mutex_lock(&ep->mtx);
+ epi = ep_find_tfd(ep, tfd, toff);
+ if (epi)
+ file_raw = epi->ffd.file;
+ else
+ file_raw = ERR_PTR(-ENOENT);
+ mutex_unlock(&ep->mtx);
+
+ return file_raw;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*/
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
@@ -1094,7 +1142,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* can't use __remove_wait_queue(). whead->lock is held by
* the caller.
*/
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
}
spin_lock_irqsave(&ep->lock, flags);
@@ -1699,7 +1747,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int res = 0, eavail, timed_out = 0;
unsigned long flags;
u64 slack = 0;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
if (timeout > 0) {
@@ -1748,6 +1796,16 @@ fetch_events:
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ if (fatal_signal_pending(current)) {
+ res = -EINTR;
+ break;
+ }
if (ep_events_available(ep) || timed_out)
break;
if (signal_pending(current)) {
diff --git a/fs/exec.c b/fs/exec.c
index 72934df68471..62175cbcc801 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -220,7 +220,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
if (write) {
unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
- struct rlimit *rlim;
+ unsigned long ptr_size, limit;
+
+ /*
+ * Since the stack will hold pointers to the strings, we
+ * must account for them as well.
+ *
+ * The size calculation covers the entire vma while each arg page
+ * is built, so each time we get here it reflects the running total
+ * (rather than just the size newly added by this arg page). As a
+ * result, we must always add the entire size of the pointers, so
+ * that the last call to get_arg_page() ends up with the complete,
+ * correct size.
+ */
+ ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ if (ptr_size > ULONG_MAX - size)
+ goto fail;
+ size += ptr_size;
acct_arg_size(bprm, size / PAGE_SIZE);
@@ -232,20 +249,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
return page;
/*
- * Limit to 1/4-th the stack size for the argv+env strings.
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
+ * (whichever is smaller) for the argv+env strings.
* This ensures that:
* - the remaining binfmt code will not run out of stack space,
* - the program will have a reasonable amount of stack left
* to work from.
*/
- rlim = current->signal->rlim;
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
- put_page(page);
- return NULL;
- }
+ limit = _STK_LIM / 4 * 3;
+ limit = min(limit, rlimit(RLIMIT_STACK) / 4);
+ if (size > limit)
+ goto fail;
}
return page;
+
+fail:
+ put_page(page);
+ return NULL;
}
static void put_arg_page(struct page *page)
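[Editor's note: as a worked example of the new bound — _STK_LIM is 8 MiB, so with the default RLIMIT_STACK of 8 MiB the limit becomes min(3/4 × 8 MiB, 8 MiB / 4) = min(6 MiB, 2 MiB) = 2 MiB for the argv+env strings plus their pointers; a process that raises RLIMIT_STACK beyond 24 MiB is still capped by the 6 MiB _STK_LIM-derived term.]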
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 8eeb694332fe..98233a97b7b8 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -72,7 +72,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
set_page_dirty(page);
if (IS_DIRSYNC(dir))
- err = write_one_page(page, 1);
+ err = write_one_page(page);
else
unlock_page(page);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 79dafa71effd..51f0aea70cb4 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -175,11 +175,8 @@ ext2_get_acl(struct inode *inode, int type)
return acl;
}
-/*
- * inode->i_mutex: down
- */
-int
-ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static int
+__ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int name_index;
void *value = NULL;
@@ -189,13 +186,6 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
switch(type) {
case ACL_TYPE_ACCESS:
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
- if (error)
- return error;
- inode->i_ctime = current_time(inode);
- mark_inode_dirty(inode);
- }
break;
case ACL_TYPE_DEFAULT:
@@ -222,6 +212,31 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
/*
+ * inode->i_mutex: down
+ */
+int
+ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int error;
+ int update_mode = 0;
+ umode_t mode = inode->i_mode;
+
+ if (type == ACL_TYPE_ACCESS && acl) {
+ error = posix_acl_update_mode(inode, &mode, &acl);
+ if (error)
+ return error;
+ update_mode = 1;
+ }
+ error = __ext2_set_acl(inode, acl, type);
+ if (!error && update_mode) {
+ inode->i_mode = mode;
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+ }
+ return error;
+}
+
+/*
* Initialize the ACLs of a new inode. Called from ext2_new_inode.
*
* dir->i_mutex: down
@@ -238,12 +253,12 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
return error;
if (default_acl) {
- error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
}
if (acl) {
if (!error)
- error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
}
return error;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d9650c9508e4..e2709695b177 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -100,7 +100,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
}
if (IS_DIRSYNC(dir)) {
- err = write_one_page(page, 1);
+ err = write_one_page(page);
if (!err)
err = sync_inode_metadata(dir, 1);
} else {
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 03f5ce1d3dbe..23ebb92484c6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -113,7 +113,7 @@ struct ext2_sb_info {
* of the mount options.
*/
spinlock_t s_lock;
- struct mb_cache *s_mb_cache;
+ struct mb_cache *s_ea_block_cache;
};
static inline spinlock_t *
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index b21891a6bfca..d34d32bdc944 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -174,15 +174,12 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
struct super_block *sb = file->f_mapping->host->i_sb;
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
ret = generic_file_fsync(file, start, end, datasync);
- if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
+ if (ret == -EIO)
/* We don't really know where the IO error happened... */
ext2_error(sb, __func__,
"detected IO error when writing metadata buffers");
- ret = -EIO;
- }
return ret;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 2dcbd5698884..30163d007b2f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -659,6 +659,7 @@ static int ext2_get_blocks(struct inode *inode,
*/
err = -EAGAIN;
count = 0;
+ partial = chain + depth - 1;
break;
}
blk = le32_to_cpu(*(chain[depth-1].p + count));
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9c2028b50e5c..7b1bc9059863 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -147,9 +147,9 @@ static void ext2_put_super (struct super_block * sb)
ext2_quota_off_umount(sb);
- if (sbi->s_mb_cache) {
- ext2_xattr_destroy_cache(sbi->s_mb_cache);
- sbi->s_mb_cache = NULL;
+ if (sbi->s_ea_block_cache) {
+ ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
}
if (!(sb->s_flags & MS_RDONLY)) {
struct ext2_super_block *es = sbi->s_es;
@@ -1131,9 +1131,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
#ifdef CONFIG_EXT2_FS_XATTR
- sbi->s_mb_cache = ext2_xattr_create_cache();
- if (!sbi->s_mb_cache) {
- ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ sbi->s_ea_block_cache = ext2_xattr_create_cache();
+ if (!sbi->s_ea_block_cache) {
+ ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
goto failed_mount3;
}
#endif
@@ -1182,8 +1182,8 @@ cantfind_ext2:
sb->s_id);
goto failed_mount;
failed_mount3:
- if (sbi->s_mb_cache)
- ext2_xattr_destroy_cache(sbi->s_mb_cache);
+ if (sbi->s_ea_block_cache)
+ ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index fbdb8f171893..1b9b1268d418 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -121,6 +121,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = {
NULL
};
+#define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache)
+
static inline const struct xattr_handler *
ext2_xattr_handler(int name_index)
{
@@ -150,7 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
size_t name_len, size;
char *end;
int error;
- struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -195,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
goto found;
entry = next;
}
- if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
+ if (ext2_xattr_cache_insert(ea_block_cache, bh))
ea_idebug(inode, "cache insert failed");
error = -ENODATA;
goto cleanup;
@@ -208,7 +210,7 @@ found:
le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
goto bad_block;
- if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
+ if (ext2_xattr_cache_insert(ea_block_cache, bh))
ea_idebug(inode, "cache insert failed");
if (buffer) {
error = -ERANGE;
@@ -246,7 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
char *end;
size_t rest = buffer_size;
int error;
- struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -281,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
goto bad_block;
entry = next;
}
- if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
+ if (ext2_xattr_cache_insert(ea_block_cache, bh))
ea_idebug(inode, "cache insert failed");
/* list the attribute names */
@@ -493,8 +495,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
* This must happen under buffer lock for
* ext2_xattr_set2() to reliably detect modified block
*/
- mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
- hash, bh->b_blocknr);
+ mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
+ bh->b_blocknr);
/* keep the buffer locked while modifying it. */
} else {
@@ -627,7 +629,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh = NULL;
int error;
- struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
if (header) {
new_bh = ext2_xattr_cache_find(inode, header);
@@ -655,7 +657,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
don't need to change the reference count. */
new_bh = old_bh;
get_bh(new_bh);
- ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
+ ext2_xattr_cache_insert(ea_block_cache, new_bh);
} else {
/* We need to allocate a new block */
ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -676,7 +678,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
memcpy(new_bh->b_data, header, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
+ ext2_xattr_cache_insert(ea_block_cache, new_bh);
ext2_xattr_update_super_block(sb);
}
@@ -721,8 +723,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
* This must happen under buffer lock for
* ext2_xattr_set2() to reliably detect freed block
*/
- mb_cache_entry_delete_block(ext2_mb_cache,
- hash, old_bh->b_blocknr);
+ mb_cache_entry_delete(ea_block_cache, hash,
+ old_bh->b_blocknr);
/* Free the old block. */
ea_bdebug(old_bh, "freeing");
ext2_free_blocks(inode, old_bh->b_blocknr, 1);
@@ -795,8 +797,8 @@ ext2_xattr_delete_inode(struct inode *inode)
* This must happen under buffer lock for ext2_xattr_set2() to
* reliably detect freed block
*/
- mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
- hash, bh->b_blocknr);
+ mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
+ bh->b_blocknr);
ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
get_bh(bh);
bforget(bh);
@@ -897,21 +899,21 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
- struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
+ ce = mb_cache_entry_find_first(ea_block_cache, hash);
while (ce) {
struct buffer_head *bh;
- bh = sb_bread(inode->i_sb, ce->e_block);
+ bh = sb_bread(inode->i_sb, ce->e_value);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
"inode %ld: block %ld read error",
- inode->i_ino, (unsigned long) ce->e_block);
+ inode->i_ino, (unsigned long) ce->e_value);
} else {
lock_buffer(bh);
/*
@@ -924,27 +926,27 @@ again:
* entry is still hashed is reliable.
*/
if (hlist_bl_unhashed(&ce->e_hash_list)) {
- mb_cache_entry_put(ext2_mb_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
unlock_buffer(bh);
brelse(bh);
goto again;
} else if (le32_to_cpu(HDR(bh)->h_refcount) >
EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
- (unsigned long) ce->e_block,
+ (unsigned long) ce->e_value,
le32_to_cpu(HDR(bh)->h_refcount),
EXT2_XATTR_REFCOUNT_MAX);
} else if (!ext2_xattr_cmp(header, HDR(bh))) {
ea_bdebug(bh, "b_count=%d",
atomic_read(&(bh->b_count)));
- mb_cache_entry_touch(ext2_mb_cache, ce);
- mb_cache_entry_put(ext2_mb_cache, ce);
+ mb_cache_entry_touch(ea_block_cache, ce);
+ mb_cache_entry_put(ea_block_cache, ce);
return bh;
}
unlock_buffer(bh);
brelse(bh);
}
- ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
+ ce = mb_cache_entry_find_next(ea_block_cache, ce);
}
return NULL;
}
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 3ec0e46de95f..09441ae07a5b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type)
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
- struct posix_acl *acl)
+ struct posix_acl *acl, int xattr_flags)
{
int name_index;
void *value = NULL;
@@ -218,7 +218,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
error = ext4_xattr_set_handle(handle, inode, name_index, "",
- value, size, 0);
+ value, size, xattr_flags);
kfree(value);
if (!error)
@@ -231,18 +231,23 @@ int
ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
handle_t *handle;
- int error, retries = 0;
+ int error, credits, retries = 0;
+ size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
error = dquot_initialize(inode);
if (error)
return error;
retry:
- handle = ext4_journal_start(inode, EXT4_HT_XATTR,
- ext4_jbd2_credits_xattr(inode));
+ error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */,
+ &credits);
+ if (error)
+ return error;
+
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
- error = __ext4_set_acl(handle, inode, type, acl);
+ error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
ext4_journal_stop(handle);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -267,13 +272,13 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
if (default_acl) {
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
- default_acl);
+ default_acl, XATTR_CREATE);
posix_acl_release(default_acl);
}
if (acl) {
if (!error)
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
- acl);
+ acl, XATTR_CREATE);
posix_acl_release(acl);
}
return error;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32191548abed..9ebde0cd632e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1114,6 +1114,7 @@ struct ext4_inode_info {
/*
* Mount flags set via mount options or defaults
*/
+#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -1444,6 +1445,8 @@ struct ext4_sb_info {
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
unsigned int s_mb_free_pending;
+ struct list_head s_freed_data_list; /* List of blocks to be freed
after the commit completes */
/* tunables */
unsigned long s_stripe;
@@ -1516,7 +1519,8 @@ struct ext4_sb_info {
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
- struct mb_cache *s_mb_cache;
+ struct mb_cache *s_ea_block_cache;
+ struct mb_cache *s_ea_inode_cache;
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
/* Ratelimit ext4 messages. */
@@ -1797,10 +1801,12 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_EA_INODE| \
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
- EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2098,6 +2104,12 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
}
+static inline bool ext4_is_quota_file(struct inode *inode)
+{
+ return IS_NOQUOTA(inode) &&
+ !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
+}
+
/*
* This structure is stuffed into the struct file's private_data field
* for directories. It is where we put information so that we can do
@@ -2126,6 +2138,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
+/* htree levels for ext4 */
+#define EXT4_HTREE_LEVEL_COMPAT 2
+#define EXT4_HTREE_LEVEL 3
+
+static inline int ext4_dir_htree_level(struct super_block *sb)
+{
+ return ext4_has_feature_largedir(sb) ?
+ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+}
+
/*
* Timeout and state flag for lazy initialization inode thread.
*/
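[Editor's note: rough capacity arithmetic for the extra level — a dx_entry is 8 bytes, so a 4 KiB index block holds on the order of 500 entries after headers. Two levels therefore index roughly 500 × 500 ≈ 250,000 directory leaf blocks, and the third level allowed by largedir multiplies that by another factor of ~500, which together with the ext4_isize() change below lifts the old practical limits on directory size and entry count.]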
@@ -2389,16 +2411,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct
/* ialloc.c */
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
- uid_t *owner, int handle_type,
- unsigned int line_no, int nblocks);
+ uid_t *owner, __u32 i_flags,
+ int handle_type, unsigned int line_no,
+ int nblocks);
-#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
__ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
- 0, 0, 0)
+ i_flags, 0, 0, 0)
#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
type, nblocks) \
__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
- (type), __LINE__, (nblocks))
+ 0, (type), __LINE__, (nblocks))
extern void ext4_free_inode(handle_t *, struct inode *);
@@ -2433,6 +2456,7 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
/* inode.c */
int ext4_inode_is_fast_symlink(struct inode *inode);
@@ -2704,19 +2728,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
-static inline int ext4_has_group_desc_csum(struct super_block *sb)
-{
- return ext4_has_feature_gdt_csum(sb) ||
- EXT4_SB(sb)->s_chksum_driver != NULL;
-}
-
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
!EXT4_SB(sb)->s_chksum_driver);
- return (EXT4_SB(sb)->s_chksum_driver != NULL);
+ return ext4_has_feature_metadata_csum(sb) &&
+ (EXT4_SB(sb)->s_chksum_driver != NULL);
}
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+ return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+}
+
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
@@ -2756,13 +2781,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
}
-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+static inline loff_t ext4_isize(struct super_block *sb,
+ struct ext4_inode *raw_inode)
{
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ if (ext4_has_feature_largedir(sb) ||
+ S_ISREG(le16_to_cpu(raw_inode->i_mode)))
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f97611171023..dabad1bc8617 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -77,7 +77,14 @@
#define EXT4_RESERVE_TRANS_BLOCKS 12U
-#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
+/*
+ * Number of credits needed if we need to insert an entry into a
+ * directory. For each new index block, we need 4 blocks (old index
+ * block, new index block, bitmap block, bg summary). For normal
+ * htree directories there are 2 levels; if the largedir feature
+ * is enabled it's 3 levels.
+ */
+#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
@@ -104,20 +111,6 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
-static inline int ext4_jbd2_credits_xattr(struct inode *inode)
-{
- int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
-
- /*
- * In case of inline data, we may push out the data to a block,
- * so we need to reserve credits for this eventuality
- */
- if (ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
- return credits;
-}
-
-
/*
* Ext4 handle operation types -- for logging purposes
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3e36508610b7..e0a8425ff74d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
static inline int get_default_free_blocks_flags(struct inode *inode)
{
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
else if (ext4_should_journal_data(inode))
return EXT4_FREE_BLOCKS_FORGET;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 02ce7e7bbdf5..58294c9a7e1d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- inode_lock_shared(inode);
+ if (!inode_trylock_shared(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock_shared(inode);
+ }
/*
* Recheck under inode lock - at this point we are sure it cannot
* change anymore
@@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_dax_write_iter(iocb, from);
#endif
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iocb->private = &overwrite;
/* Check whether we do a DIO overwrite or not */
- if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
- overwrite = 1;
+ if (o_direct && !unaligned_aio) {
+ if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
+ if (ext4_should_dioread_nolock(inode))
+ overwrite = 1;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
@@ -345,13 +364,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- if (ext4_encrypted_inode(inode)) {
- int err = fscrypt_get_encryption_info(inode);
- if (err)
- return 0;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@@ -435,6 +447,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ret < 0)
return ret;
}
+
+ /* Set the flags to support nowait AIO */
+ filp->f_mode |= FMODE_AIO_NOWAIT;
+
return dquot_file_open(inode, filp);
}
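[Editor's note: the trylock conversions above are what make IOCB_NOWAIT honest for ext4. This series wires the flag up for AIO (note FMODE_AIO_NOWAIT); the synchronous preadv2/pwritev2 path followed in a later release. A userspace sketch, assuming a kernel and libc that accept RWF_NOWAIT on pwritev2:

#define _GNU_SOURCE
#include <sys/uio.h>
#include <errno.h>

/* Attempt a non-blocking write; fall back if it would block. */
ssize_t try_write(int fd, struct iovec *iov, off_t off)
{
        ssize_t n = pwritev2(fd, iov, 1, off, RWF_NOWAIT);

        if (n < 0 && errno == EAGAIN)
                /* would block on the inode lock or on allocation:
                   hand off to a worker or retry blocking */
                n = pwritev2(fd, iov, 1, off, 0);
        return n;
}
]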
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index b19436098837..7ec340898598 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -480,6 +480,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start_fsb;
ext4_fsblk_t end_fsb;
+ ext4_fsblk_t bofs;
ext4_fsblk_t eofs;
ext4_group_t start_ag;
ext4_group_t end_ag;
@@ -487,9 +488,12 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_grpblk_t last_cluster;
int error = 0;
+ bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
eofs = ext4_blocks_count(sbi->s_es);
if (keys[0].fmr_physical >= eofs)
return 0;
+ else if (keys[0].fmr_physical < bofs)
+ keys[0].fmr_physical = bofs;
if (keys[1].fmr_physical >= eofs)
keys[1].fmr_physical = eofs - 1;
start_fsb = keys[0].fmr_physical;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 9d549608fd30..aae2c3971cef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -124,7 +124,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
/*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 98ac2f1f23b3..507bfb3344d4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
* as writing the quota to disk may need the lock as well.
*/
dquot_initialize(inode);
- ext4_xattr_delete_inode(handle, inode);
dquot_free_inode(inode);
dquot_drop(inode);
@@ -743,8 +742,9 @@ out:
*/
struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
- __u32 goal, uid_t *owner, int handle_type,
- unsigned int line_no, int nblocks)
+ __u32 goal, uid_t *owner, __u32 i_flags,
+ int handle_type, unsigned int line_no,
+ int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -766,30 +766,69 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ sb = dir->i_sb;
+ sbi = EXT4_SB(sb);
+
+ if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((ext4_encrypted_inode(dir) ||
- DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
+ !(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_get_encryption_info(dir);
if (err)
return ERR_PTR(err);
if (!fscrypt_has_encryption_key(dir))
return ERR_PTR(-ENOKEY);
- if (!handle)
- nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
encrypt = 1;
}
- sb = dir->i_sb;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never
+ * more than 1k. In practice they are under
+ * 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */, NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ }
+
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ei = EXT4_I(inode);
- sbi = EXT4_SB(sb);
/*
* Initialize owners and quota early so that we don't have to account
@@ -1053,6 +1092,7 @@ got:
/* Don't inherit extent flag from directory, amongst others. */
ei->i_flags =
ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
+ ei->i_flags |= i_flags;
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
@@ -1109,13 +1149,15 @@ got:
goto fail_free_drop;
}
- err = ext4_init_acl(handle, inode, dir);
- if (err)
- goto fail_free_drop;
+ if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
+ err = ext4_init_acl(handle, inode, dir);
+ if (err)
+ goto fail_free_drop;
- err = ext4_init_security(handle, inode, dir, qstr);
- if (err)
- goto fail_free_drop;
+ err = ext4_init_security(handle, inode, dir, qstr);
+ if (err)
+ goto fail_free_drop;
+ }
if (ext4_has_feature_extents(sb)) {
/* set extent flag only for directory, file and normal symlink*/
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bc15c2c17633..7ffa290cbb8e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
int flags = EXT4_FREE_BLOCKS_VALIDATED;
int err;
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8d141c0c8ff9..28c5c3abddb3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
/* Compute min_offs. */
for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
- if (!entry->e_value_block && entry->e_value_size) {
+ if (!entry->e_value_inum && entry->e_value_size) {
size_t offs = le16_to_cpu(entry->e_value_offs);
if (offs < min_offs)
min_offs = offs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cf82d03968c..3c600f02673f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -144,16 +144,12 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
/*
* Test whether an inode is a fast symlink.
+ * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
-
- if (ext4_has_inline_data(inode))
- return 0;
-
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+ return S_ISLNK(inode->i_mode) && inode->i_size &&
+ (inode->i_size < EXT4_N_BLOCKS * 4);
}
/*
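With symlink data defined to live in i_data whenever it fits, the test reduces to the size check above: EXT4_N_BLOCKS is 15 and each slot is four bytes, so any target shorter than 60 bytes is stored in the inode body. A stand-alone version of the predicate:

#include <stdbool.h>
#include <sys/stat.h>

#define N_BLOCKS 15 /* EXT4_N_BLOCKS: 12 direct + 3 indirect slots */

/* Model of the new ext4_inode_is_fast_symlink(): the target string fits
 * in the 60-byte i_data array iff 0 < i_size < N_BLOCKS * 4. */
static bool is_fast_symlink(mode_t mode, long long size)
{
	return S_ISLNK(mode) && size > 0 && size < N_BLOCKS * 4;
}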
@@ -189,6 +185,8 @@ void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
+ int extra_credits = 3;
+ struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -213,7 +211,8 @@ void ext4_evict_inode(struct inode *inode)
*/
if (inode->i_ino != EXT4_JOURNAL_INO &&
ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_data.nrpages) {
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
@@ -238,8 +237,12 @@ void ext4_evict_inode(struct inode *inode)
* protection against it
*/
sb_start_intwrite(inode->i_sb);
+
+ if (!IS_NOQUOTA(inode))
+ extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+3);
+ ext4_blocks_for_truncate(inode)+extra_credits);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
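Previously the handle was always sized as ext4_blocks_for_truncate() plus a fixed 3 credits; now the quota updates that accompany deletion are added up front instead of being squeezed out of the fixed slop. A model of the new estimate, with EXT4_MAXQUOTAS_DEL_BLOCKS passed in as a number since its value depends on the quota configuration:

#include <stdbool.h>

/* Model of the truncate-handle sizing above: 3 base credits for the
 * orphan-list removal and dtime update, plus per-quota-file credits when
 * the inode is charged to quota. */
static int evict_handle_credits(int blocks_for_truncate,
				bool charged_to_quota,
				int maxquotas_del_blocks)
{
	int extra_credits = 3;

	if (charged_to_quota)
		extra_credits += maxquotas_del_blocks;
	return blocks_for_truncate + extra_credits;
}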
@@ -254,6 +257,16 @@ void ext4_evict_inode(struct inode *inode)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+
+ /*
+ * Set inode->i_size to 0 before calling ext4_truncate(). We need
+ * special handling of symlinks here because i_size is used to
+ * determine whether ext4_inode_info->i_data contains symlink data or
+ * block mappings. Setting i_size to 0 will remove its fast symlink
+ * status. Erase i_data so that it becomes a valid empty block map.
+ */
+ if (ext4_inode_is_fast_symlink(inode))
+ memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
inode->i_size = 0;
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
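The comment block above is the key ordering constraint: ext4_inode_is_fast_symlink() now keys off i_size, so zeroing i_size flips the interpretation of i_data from symlink text to a block map, and the stale symlink bytes must be wiped first or the truncate path would read them as block numbers. A toy model of the ordering:

#include <stdbool.h>
#include <string.h>

struct toy_inode {
	long long size;
	unsigned int data[15]; /* symlink text or block map, as in ext4 i_data */
};

/* Wipe the symlink bytes before zeroing i_size, so that i_data reads as
 * a valid (empty) block map once the fast-symlink test stops firing. */
static void prepare_for_truncate(struct toy_inode *inode, bool fast_symlink)
{
	if (fast_symlink)
		memset(inode->data, 0, sizeof(inode->data));
	inode->size = 0;
}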
@@ -271,25 +284,17 @@ void ext4_evict_inode(struct inode *inode)
}
}
- /*
- * ext4_ext_truncate() doesn't reserve any slop when it
- * restarts journal transactions; therefore there may not be
- * enough credits left in the handle to remove the inode from
- * the orphan list and set the dtime field.
- */
- if (!ext4_handle_has_enough_credits(handle, 3)) {
- err = ext4_journal_extend(handle, 3);
- if (err > 0)
- err = ext4_journal_restart(handle, 3);
- if (err != 0) {
- ext4_warning(inode->i_sb,
- "couldn't extend journal (err %d)", err);
- stop_handle:
- ext4_journal_stop(handle);
- ext4_orphan_del(NULL, inode);
- sb_end_intwrite(inode->i_sb);
- goto no_delete;
- }
+ /* Remove xattr references. */
+ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
+ extra_credits);
+ if (err) {
+ ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
+stop_handle:
+ ext4_journal_stop(handle);
+ ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
+ ext4_xattr_inode_array_free(ea_inode_array);
+ goto no_delete;
}
/*
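ext4_xattr_delete_inode() drops the victim's xattr references but appears only to collect the referenced EA inodes into ea_inode_array; they are released by ext4_xattr_inode_array_free() after the handle is closed, on both the success path below and the stop_handle error path above. A stand-alone sketch of that collect-now, release-later pattern (names and types are illustrative, not the kernel API):

#include <stdlib.h>

struct ea_inode_array {
	size_t count;
	size_t capacity;
	void **inodes; /* references collected while the handle is open */
};

static int array_append(struct ea_inode_array *a, void *ino)
{
	if (a->count == a->capacity) {
		size_t ncap = a->capacity ? 2 * a->capacity : 4;
		void **p = realloc(a->inodes, ncap * sizeof(*p));

		if (!p)
			return -1;
		a->inodes = p;
		a->capacity = ncap;
	}
	a->inodes[a->count++] = ino;
	return 0;
}

static void array_free(struct ea_inode_array *a)
{
	/* in ext4 this is where each collected EA inode would be iput() */
	free(a->inodes);
	a->inodes = NULL;
	a->count = a->capacity = 0;
}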
@@ -317,6 +322,7 @@ void ext4_evict_inode(struct inode *inode)
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
sb_end_intwrite(inode->i_sb);
+ ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -710,7 +716,7 @@ out_sem:
if (map->m_flags & EXT4_MAP_NEW &&
!(map->m_flags & EXT4_MAP_UNWRITTEN) &&
!(flags & EXT4_GET_BLOCKS_ZERO) &&
- !IS_NOQUOTA(inode) &&
+ !ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
ret = ext4_jbd2_inode_add_wait(handle, inode);
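The intent of this check is to keep the quota files themselves off the ordered-data list, but IS_NOQUOTA() has become broader than that: with this series EA inodes are also exempt from quota accounting, so a dedicated ext4_is_quota_file() test is used instead. An illustrative contrast (toy predicates, not the kernel definitions):

#include <stdbool.h>

struct quota_info {
	bool is_quota_file; /* one of the superblock's quota files */
	bool is_ea_inode;   /* xattr-value inode, also exempt from quota */
};

static bool is_noquota(const struct quota_info *i)
{
	return i->is_quota_file || i->is_ea_inode; /* any quota-exempt inode */
}

static bool is_quota_file(const struct quota_info *i)
{
	return i->is_quota_file; /* only the quota files themselves */
}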
@@ -4712,7 +4718,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- inode->i_size = ext4_isize(raw_inode);
+ inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
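ext4_isize() now takes the superblock so it can consult a filesystem-wide feature bit; presumably this pairs with the largedir feature, under which directories may also use the high 32 bits of the on-disk size. A sketch under that assumption, with stand-ins for the raw inode fields:

#include <stdbool.h>
#include <stdint.h>
#include <sys/stat.h>

/* Sketch of an i_size decode that needs superblock state (assumption:
 * the largedir feature lets directories use i_size_high too). */
static long long decode_isize(bool sb_has_largedir, mode_t mode,
			      uint32_t size_lo, uint32_t size_high)
{
	if (sb_has_largedir || S_ISREG(mode))
		return ((long long)size_high << 32) | size_lo;
	return size_lo; /* legacy: only regular files exceed 32-bit sizes */
}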
@@ -4846,6 +4852,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)