aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h23
-rw-r--r--fs/9p/vfs_file.c6
-rw-r--r--fs/9p/vfs_inode.c23
-rw-r--r--fs/9p/vfs_inode_dotl.c27
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Makefile5
-rw-r--r--fs/afs/cell.c1
-rw-r--r--fs/afs/internal.h9
-rw-r--r--fs/afs/mntpt.c149
-rw-r--r--fs/afs/super.c432
-rw-r--r--fs/afs/volume.c4
-rw-r--r--fs/aio.c94
-rw-r--r--fs/autofs/autofs_i.h3
-rw-r--r--fs/autofs/inode.c19
-rw-r--r--fs/binfmt_aout.c83
-rw-r--r--fs/binfmt_elf.c32
-rw-r--r--fs/block_dev.c36
-rw-r--r--fs/btrfs/compression.c3
-rw-r--r--fs/btrfs/ctree.h34
-rw-r--r--fs/btrfs/disk-io.c5
-rw-r--r--fs/btrfs/extent_io.c16
-rw-r--r--fs/btrfs/inode.c6
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/buffer.c12
-rw-r--r--fs/ceph/caps.c72
-rw-r--r--fs/ceph/debugfs.c27
-rw-r--r--fs/ceph/dir.c455
-rw-r--r--fs/ceph/file.c13
-rw-r--r--fs/ceph/inode.c52
-rw-r--r--fs/ceph/mds_client.c698
-rw-r--r--fs/ceph/mds_client.h43
-rw-r--r--fs/ceph/snap.c159
-rw-r--r--fs/ceph/super.c21
-rw-r--r--fs/ceph/super.h43
-rw-r--r--fs/ceph/xattr.c20
-rw-r--r--fs/cifs/Kconfig120
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h3
-rw-r--r--fs/cifs/cifsfs.c5
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h81
-rw-r--r--fs/cifs/cifsproto.h8
-rw-r--r--fs/cifs/cifssmb.c54
-rw-r--r--fs/cifs/connect.c70
-rw-r--r--fs/cifs/dir.c107
-rw-r--r--fs/cifs/file.c415
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/link.c14
-rw-r--r--fs/cifs/smb1ops.c134
-rw-r--r--fs/cifs/smb2inode.c87
-rw-r--r--fs/cifs/smb2maperror.c3
-rw-r--r--fs/cifs/smb2misc.c24
-rw-r--r--fs/cifs/smb2ops.c536
-rw-r--r--fs/cifs/smb2pdu.c335
-rw-r--r--fs/cifs/smb2pdu.h11
-rw-r--r--fs/cifs/smb2proto.h7
-rw-r--r--fs/cifs/smb2status.h6
-rw-r--r--fs/cifs/smb2transport.c25
-rw-r--r--fs/cifs/smbdirect.c6
-rw-r--r--fs/cifs/trace.h219
-rw-r--r--fs/cifs/transport.c302
-rw-r--r--fs/crypto/Kconfig6
-rw-r--r--fs/crypto/bio.c3
-rw-r--r--fs/crypto/fscrypt_private.h1
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/keyinfo.c4
-rw-r--r--fs/crypto/policy.c3
-rw-r--r--fs/dax.c25
-rw-r--r--fs/debugfs/inode.c4
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/eventpoll.c173
-rw-r--r--fs/exec.c15
-rw-r--r--fs/exofs/BUGS3
-rw-r--r--fs/exofs/Kbuild20
-rw-r--r--fs/exofs/Kconfig13
-rw-r--r--fs/exofs/Kconfig.ore14
-rw-r--r--fs/exofs/common.h262
-rw-r--r--fs/exofs/dir.c661
-rw-r--r--fs/exofs/exofs.h240
-rw-r--r--fs/exofs/file.c83
-rw-r--r--fs/exofs/inode.c1514
-rw-r--r--fs/exofs/namei.c323
-rw-r--r--fs/exofs/ore.c1178
-rw-r--r--fs/exofs/ore_raid.c756
-rw-r--r--fs/exofs/ore_raid.h62
-rw-r--r--fs/exofs/super.c1071
-rw-r--r--fs/exofs/sys.c205
-rw-r--r--fs/ext2/dir.c35
-rw-r--r--fs/ext2/ext2.h17
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c30
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c44
-rw-r--r--fs/ext2/symlink.c2
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext4/Kconfig17
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h21
-rw-r--r--fs/ext4/ext4_jbd2.h4
-rw-r--r--fs/ext4/extents.c33
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c49
-rw-r--r--fs/ext4/inode.c75
-rw-r--r--fs/ext4/ioctl.c112
-rw-r--r--fs/ext4/mballoc.c7
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c18
-rw-r--r--fs/ext4/page-io.c16
-rw-r--r--fs/ext4/readpage.c8
-rw-r--r--fs/ext4/resize.c20
-rw-r--r--fs/ext4/super.c23
-rw-r--r--fs/ext4/sysfs.c17
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/checkpoint.c20
-rw-r--r--fs/f2fs/data.c72
-rw-r--r--fs/f2fs/debug.c39
-rw-r--r--fs/f2fs/dir.c25
-rw-r--r--fs/f2fs/extent_cache.c2
-rw-r--r--fs/f2fs/f2fs.h95
-rw-r--r--fs/f2fs/file.c56
-rw-r--r--fs/f2fs/inline.c12
-rw-r--r--fs/f2fs/inode.c19
-rw-r--r--fs/f2fs/namei.c9
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/f2fs/segment.c80
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/super.c122
-rw-r--r--fs/f2fs/sysfs.c21
-rw-r--r--fs/f2fs/trace.c20
-rw-r--r--fs/f2fs/xattr.c25
-rw-r--r--fs/f2fs/xattr.h6
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/file.c16
-rw-r--r--fs/file_table.c9
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/fs_context.c642
-rw-r--r--fs/fs_parser.c447
-rw-r--r--fs/fs_types.c105
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c7
-rw-r--r--fs/fuse/dev.c115
-rw-r--r--fs/fuse/dir.c54
-rw-r--r--fs/fuse/file.c342
-rw-r--r--fs/fuse/fuse_i.h28
-rw-r--r--fs/fuse/inode.c28
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c72
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/lops.c6
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c3
-rw-r--r--fs/hpfs/hpfs.h8
-rw-r--r--fs/hugetlbfs/inode.c372
-rw-r--r--fs/inode.c8
-rw-r--r--fs/internal.h13
-rw-r--r--fs/io_uring.c2964
-rw-r--r--fs/iomap.c61
-rw-r--r--fs/jbd2/checkpoint.c17
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c90
-rw-r--r--fs/jbd2/transaction.c83
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c31
-rw-r--r--fs/kernfs/inode.c2
-rw-r--r--fs/kernfs/kernfs-internal.h3
-rw-r--r--fs/kernfs/mount.c134
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntxdr.c14
-rw-r--r--fs/locks.c37
-rw-r--r--fs/mount.h5
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c9
-rw-r--r--fs/namespace.c401
-rw-r--r--fs/nfs/callback_xdr.c64
-rw-r--r--fs/nfs/delegation.c22
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c98
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/file.c44
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c225
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h75
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c161
-rw-r--r--fs/nfs/inode.c33
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/io.c12
-rw-r--r--fs/nfs/namespace.c8
-rw-r--r--fs/nfs/nfs2xdr.c124
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3xdr.c209
-rw-r--r--fs/nfs/nfs42.h3
-rw-r--r--fs/nfs/nfs42proc.c164
-rw-r--r--fs/nfs/nfs42xdr.c130
-rw-r--r--fs/nfs/nfs4client.c33
-rw-r--r--fs/nfs/nfs4namespace.c5
-rw-r--r--fs/nfs/nfs4proc.c138
-rw-r--r--fs/nfs/nfs4session.c7
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/nfs4trace.h25
-rw-r--r--fs/nfs/nfs4xdr.c530
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h85
-rw-r--r--fs/nfs/pagelist.c47
-rw-r--r--fs/nfs/pnfs.c35
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nfs/pnfs_dev.c13
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/unlink.c8
-rw-r--r--fs/nfs/write.c19
-rw-r--r--fs/nfsd/nfs3proc.c18
-rw-r--r--fs/nfsd/nfs3xdr.c5
-rw-r--r--fs/nfsd/nfs4callback.c17
-rw-r--r--fs/nfsd/nfs4state.c8
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/notify/fanotify/Kconfig1
-rw-r--r--fs/notify/fanotify/fanotify.c267
-rw-r--r--fs/notify/fanotify/fanotify.h116
-rw-r--r--fs/notify/fanotify/fanotify_user.c383
-rw-r--r--fs/notify/fsnotify.c15
-rw-r--r--fs/notify/inotify/inotify.h1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c18
-rw-r--r--fs/notify/inotify/inotify_user.c12
-rw-r--r--fs/notify/mark.c42
-rw-r--r--fs/notify/notification.c42
-rw-r--r--fs/ocfs2/alloc.c159
-rw-r--r--fs/ocfs2/cluster/nodemanager.c14
-rw-r--r--fs/ocfs2/dlmglue.c5
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/slot_map.c8
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/overlayfs/copy_up.c59
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/util.c55
-rw-r--r--fs/pipe.c35
-rw-r--r--fs/pnode.c5
-rw-r--r--fs/pnode.h3
-rw-r--r--fs/proc/array.c16
-rw-r--r--fs/proc/base.c143
-rw-r--r--fs/proc/inode.c52
-rw-r--r--fs/proc/internal.h8
-rw-r--r--fs/proc/kcore.c27
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/root.c238
-rw-r--r--fs/proc/self.c16
-rw-r--r--fs/proc/stat.c89
-rw-r--r--fs/proc/task_mmu.c10
-rw-r--r--fs/proc/task_nommu.c4
-rw-r--r--fs/proc/thread_self.c16
-rw-r--r--fs/pstore/platform.c3
-rw-r--r--fs/pstore/ram.c64
-rw-r--r--fs/read_write.c16
-rw-r--r--fs/select.c4
-rw-r--r--fs/splice.c22
-rw-r--r--fs/stat.c12
-rw-r--r--fs/statfs.c14
-rw-r--r--fs/super.c344
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/sysfs/mount.c75
-rw-r--r--fs/timerfd.c4
-rw-r--r--fs/ubifs/Kconfig12
-rw-r--r--fs/ubifs/Makefile2
-rw-r--r--fs/ubifs/ioctl.c12
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/ubifs.h5
-rw-r--r--fs/udf/inode.c4
-rw-r--r--fs/udf/super.c51
-rw-r--r--fs/udf/truncate.c8
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/utimes.c10
-rw-r--r--fs/xfs/libxfs/xfs_ag.c6
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c74
-rw-r--r--fs/xfs/libxfs/xfs_attr.c17
-rw-r--r--fs/xfs/libxfs/xfs_attr.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c302
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c13
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c49
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c17
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c10
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c137
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c28
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c29
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c11
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
-rw-r--r--fs/xfs/libxfs/xfs_shared.h4
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c3
-rw-r--r--fs/xfs/libxfs/xfs_types.c24
-rw-r--r--fs/xfs/libxfs/xfs_types.h3
-rw-r--r--fs/xfs/scrub/agheader.c10
-rw-r--r--fs/xfs/scrub/agheader_repair.c12
-rw-r--r--fs/xfs/scrub/attr.c11
-rw-r--r--fs/xfs/scrub/bmap.c27
-rw-r--r--fs/xfs/scrub/dir.c6
-rw-r--r--fs/xfs/scrub/ialloc.c330
-rw-r--r--fs/xfs/scrub/repair.c3
-rw-r--r--fs/xfs/scrub/repair.h3
-rw-r--r--fs/xfs/scrub/rtbitmap.c5
-rw-r--r--fs/xfs/scrub/trace.h45
-rw-r--r--fs/xfs/xfs_aops.c275
-rw-r--r--fs/xfs/xfs_aops.h24
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bmap_util.c9
-rw-r--r--fs/xfs/xfs_buf.c72
-rw-r--r--fs/xfs/xfs_buf.h8
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_file.c32
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_inode.c769
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_iomap.c518
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c21
-rw-r--r--fs/xfs/xfs_log_recover.c14
-rw-r--r--fs/xfs/xfs_mount.c5
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h21
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_reflink.c150
-rw-r--r--fs/xfs/xfs_reflink.h18
-rw-r--r--fs/xfs/xfs_super.c22
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h115
-rw-r--r--fs/xfs/xfs_trans_bmap.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_refcount.c1
-rw-r--r--fs/xfs/xfs_trans_rmap.c1
-rw-r--r--fs/xfs/xfs_xattr.c3
364 files changed, 15119 insertions, 12563 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 5a0db6dec8d1..aaee1e6584e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,6 +40,9 @@
*/
#define P9_LOCK_TIMEOUT (30*HZ)
+/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
+#define V9FS_STAT2INODE_KEEP_ISIZE 1
+
extern struct file_system_type v9fs_fs_type;
extern const struct address_space_operations v9fs_addr_operations;
extern const struct file_operations v9fs_file_operations;
@@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
-void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+ struct super_block *sb, unsigned int flags);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
}
int v9fs_open_to_dotl_flags(int flags);
+
+static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+ /*
+ * 32-bit need the lock, concurrent updates could break the
+ * sequences and make i_size_read() loop forever.
+ * 64-bit updates are atomic and can skip the locking.
+ */
+ if (sizeof(i_size) > sizeof(long))
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, i_size);
+ if (sizeof(i_size) > sizeof(long))
+ spin_unlock(&inode->i_lock);
+}
#endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a25efa782fcc..9a1125305d84 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
i_size = i_size_read(inode);
if (iocb->ki_pos > i_size) {
inode_add_bytes(inode, iocb->ki_pos - i_size);
- i_size_write(inode, iocb->ki_pos);
+ /*
+ * Need to serialize against i_size_write() in
+ * v9fs_stat2inode()
+ */
+ v9fs_i_size_write(inode, iocb->ki_pos);
}
return retval;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 85ff859d3af5..72b779bc0942 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode(st, inode, sb);
+ v9fs_stat2inode(st, inode, sb, 0);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
+ v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
generic_fillattr(d_inode(dentry), stat);
p9stat_free(st);
@@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
* @stat: Plan 9 metadata (mistat) structure
* @inode: inode to populate
* @sb: superblock of filesystem
+ * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
- struct super_block *sb)
+ struct super_block *sb, unsigned int flags)
{
umode_t mode;
char ext[32];
@@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode = p9mode2perm(v9ses, stat);
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->length);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+ inode->i_blocks = (stat->length + 512 - 1) >> 9;
v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
}
@@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
{
int umode;
dev_t rdev;
- loff_t i_size;
struct p9_wstat *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_stat(fid);
@@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode(st, inode, inode->i_sb);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode(st, inode, inode->i_sb, flags);
out:
p9stat_free(st);
kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4823e1c46999..a950a927a626 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode_dotl(st, inode);
+ v9fs_stat2inode_dotl(st, inode, 0);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode_dotl(st, d_inode(dentry));
+ v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
generic_fillattr(d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
* v9fs_stat2inode_dotl - populate an inode structure with stat info
* @stat: stat structure
* @inode: inode to populate
+ * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags)
{
umode_t mode;
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
}
if (stat->st_result_mask & P9_STATS_RDEV)
inode->i_rdev = new_decode_dev(stat->st_rdev);
- if (stat->st_result_mask & P9_STATS_SIZE)
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
+ stat->st_result_mask & P9_STATS_SIZE)
+ v9fs_i_size_write(inode, stat->st_size);
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
@@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
{
- loff_t i_size;
struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
@@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode_dotl(st, inode);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode_dotl(st, inode, flags);
out:
kfree(st);
return 0;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..d13d35cf69c0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
return ret;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
@@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, d_inode(root));
+ v9fs_stat2inode_dotl(st, d_inode(root), 0);
kfree(st);
} else {
struct p9_wstat *st = NULL;
@@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, d_inode(root), sb);
+ v9fs_stat2inode(st, d_inode(root), sb, 0);
p9stat_free(st);
kfree(st);
diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a61be37..3e6d3101f3ff 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -8,6 +8,13 @@ menu "File systems"
config DCACHE_WORD_ACCESS
bool
+config VALIDATE_FS_PARSER
+ bool "Validate filesystem parameter description"
+ default y
+ help
+ Enable this to perform validation of the parameter description for a
+ filesystem when it is registered.
+
if BLOCK
config FS_IOMAP
@@ -254,12 +261,9 @@ source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
-source "fs/exofs/Kconfig"
endif # MISC_FILESYSTEMS
-source "fs/exofs/Kconfig.ore"
-
menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
default y
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..427fec226fae 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
- stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+ fs_types.o fs_context.o fs_parser.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -30,6 +31,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
@@ -124,7 +126,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
-obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index cf445dbd5f2e..9de46116c749 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -173,6 +173,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = TIME64_MAX;
+ __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags);
} else {
cell->dns_expiry = ktime_get_real_seconds();
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8871b9e8645f..bb1f244b2b3a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -36,15 +36,14 @@
struct pagevec;
struct afs_call;
-struct afs_mount_params {
- bool rwpath; /* T if the parent should be considered R/W */
+struct afs_fs_context {
bool force; /* T to force cell type */
bool autocell; /* T if set auto mount operation */
bool dyn_root; /* T if dynamic root */
+ bool no_cell; /* T if the source is "none" (for dynroot) */
afs_voltype_t type; /* type of volume requested */
- int volnamesz; /* size of volume name */
+ unsigned int volnamesz; /* size of volume name */
const char *volname; /* name of volume to mount */
- struct net *net_ns; /* Network namespace in effect */
struct afs_net *net; /* the AFS net namespace stuff */
struct afs_cell *cell; /* cell in which to find volume */
struct afs_volume *volume; /* volume record */
@@ -1274,7 +1273,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume)
return volume;
}
-extern struct afs_volume *afs_create_volume(struct afs_mount_params *);
+extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern void afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2e51c6994148..eecd8b699186 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,6 +17,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/gfp.h>
+#include <linux/fs_context.h>
#include "internal.h"
@@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
+static const char afs_root_volume[] = "root.cell";
+
/*
* no valid lookup procedure on this sort of dir
*/
@@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
}
/*
- * create a vfsmount to be automounted
+ * Set the parameters for the proposed superblock.
*/
-static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
{
- struct afs_super_info *as;
- struct vfsmount *mnt;
- struct afs_vnode *vnode;
- struct page *page;
- char *devname, *options;
- bool rwpath = false;
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb);
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt));
+ struct afs_cell *cell;
+ const char *p;
int ret;
- _enter("{%pd}", mntpt);
-
- BUG_ON(!d_inode(mntpt));
-
- ret = -ENOMEM;
- devname = (char *) get_zeroed_page(GFP_KERNEL);
- if (!devname)
- goto error_no_devname;
-
- options = (char *) get_zeroed_page(GFP_KERNEL);
- if (!options)
- goto error_no_options;
+ if (fc->net_ns != src_as->net_ns) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(src_as->net_ns);
+ }
- vnode = AFS_FS_I(d_inode(mntpt));
+ if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
+ }
+ if (ctx->cell) {
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = NULL;
+ }
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
/* if the directory is a pseudo directory, use the d_name */
- static const char afs_root_cell[] = ":root.cell.";
unsigned size = mntpt->d_name.len;
- ret = -ENOENT;
- if (size < 2 || size > AFS_MAXCELLNAME)
- goto error_no_page;
+ if (size < 2)
+ return -ENOENT;
+ p = mntpt->d_name.name;
if (mntpt->d_name.name[0] == '.') {
- devname[0] = '%';
- memcpy(devname + 1, mntpt->d_name.name + 1, size - 1);
- memcpy(devname + size, afs_root_cell,
- sizeof(afs_root_cell));
- rwpath = true;
- } else {
- devname[0] = '#';
- memcpy(devname + 1, mntpt->d_name.name, size);
- memcpy(devname + size + 1, afs_root_cell,
- sizeof(afs_root_cell));
+ size--;
+ p++;
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
+ if (size > AFS_MAXCELLNAME)
+ return -ENAMETOOLONG;
+
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ if (IS_ERR(cell)) {
+ pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
+ return PTR_ERR(cell);
+ }
+ ctx->cell = cell;
+
+ ctx->volname = afs_root_volume;
+ ctx->volnamesz = sizeof(afs_root_volume) - 1;
} else {
/* read the contents of the AFS special symlink */
+ struct page *page;
loff_t size = i_size_read(d_inode(mntpt));
char *buf;
- ret = -EINVAL;
+ if (src_as->cell)
+ ctx->cell = afs_get_cell(src_as->cell);
+
if (size > PAGE_SIZE - 1)
- goto error_no_page;
+ return -EINVAL;
page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto error_no_page;
- }
+ if (IS_ERR(page))
+ return PTR_ERR(page);
if (PageError(page)) {
ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- goto error;
+ put_page(page);
+ return ret;
}
- buf = kmap_atomic(page);
- memcpy(devname, buf, size);
- kunmap_atomic(buf);
+ buf = kmap(page);
+ ret = vfs_parse_fs_string(fc, "source", buf, size);
+ kunmap(page);
put_page(page);
- page = NULL;
+ if (ret < 0)
+ return ret;
}
- /* work out what options we want */
- as = AFS_FS_S(mntpt->d_sb);
- if (as->cell) {
- memcpy(options, "cell=", 5);
- strcpy(options + 5, as->cell->name);
- if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath)
- strcat(options, ",rwpath");
- }
+ return 0;
+}
- /* try and do the mount */
- _debug("--- attempting mount %s -o %s ---", devname, options);
- mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
- _debug("--- mount result %p ---", mnt);
+/*
+ * create a vfsmount to be automounted
+ */
+static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret;
- free_page((unsigned long) devname);
- free_page((unsigned long) options);
- _leave(" = %p", mnt);
- return mnt;
+ BUG_ON(!d_inode(mntpt));
-error:
- put_page(page);
-error_no_page:
- free_page((unsigned long) options);
-error_no_options:
- free_page((unsigned long) devname);
-error_no_devname:
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ fc = fs_context_for_submount(&afs_fs_type, mntpt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = afs_mntpt_set_params(fc, mntpt);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
}
/*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index dcd07fe99871..5adf012b8e27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,6 +1,6 @@
/* AFS superblock handling
*
- * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved.
*
* This software may be freely redistributed under the terms of the
* GNU General Public License.
@@ -21,7 +21,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
@@ -30,21 +30,22 @@
#include "internal.h"
static void afs_i_init_once(void *foo);
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data);
static void afs_kill_super(struct super_block *sb);
static struct inode *afs_alloc_inode(struct super_block *sb);
static void afs_destroy_inode(struct inode *inode);
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int afs_show_devname(struct seq_file *m, struct dentry *root);
static int afs_show_options(struct seq_file *m, struct dentry *root);
+static int afs_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_description afs_fs_parameters;
struct file_system_type afs_fs_type = {
- .owner = THIS_MODULE,
- .name = "afs",
- .mount = afs_mount,
- .kill_sb = afs_kill_super,
- .fs_flags = 0,
+ .owner = THIS_MODULE,
+ .name = "afs",
+ .init_fs_context = afs_init_fs_context,
+ .parameters = &afs_fs_parameters,
+ .kill_sb = afs_kill_super,
+ .fs_flags = 0,
};
MODULE_ALIAS_FS("afs");
@@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = {
static struct kmem_cache *afs_inode_cachep;
static atomic_t afs_count_active_inodes;
-enum {
- afs_no_opt,
- afs_opt_cell,
- afs_opt_dyn,
- afs_opt_rwpath,
- afs_opt_vol,
- afs_opt_autocell,
+enum afs_param {
+ Opt_autocell,
+ Opt_dyn,
+ Opt_source,
};
-static const match_table_t afs_options_list = {
- { afs_opt_cell, "cell=%s" },
- { afs_opt_dyn, "dyn" },
- { afs_opt_rwpath, "rwpath" },
- { afs_opt_vol, "vol=%s" },
- { afs_opt_autocell, "autocell" },
- { afs_no_opt, NULL },
+static const struct fs_parameter_spec afs_param_specs[] = {
+ fsparam_flag ("autocell", Opt_autocell),
+ fsparam_flag ("dyn", Opt_dyn),
+ fsparam_string("source", Opt_source),
+ {}
+};
+
+static const struct fs_parameter_description afs_fs_parameters = {
+ .name = "kAFS",
+ .specs = afs_param_specs,
};
/*
@@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
}
/*
- * parse the mount options
- * - this function has been shamelessly adapted from the ext3 fs which
- * shamelessly adapted it from the msdos fs
- */
-static int afs_parse_options(struct afs_mount_params *params,
- char *options, const char **devname)
-{
- struct afs_cell *cell;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int token;
-
- _enter("%s", options);
-
- options[PAGE_SIZE - 1] = 0;
-
- while ((p = strsep(&options, ","))) {
- if (!*p)
- continue;
-
- token = match_token(p, afs_options_list, args);
- switch (token) {
- case afs_opt_cell:
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(params->net,
- args[0].from,
- args[0].to - args[0].from);
- rcu_read_unlock();
- if (IS_ERR(cell))
- return PTR_ERR(cell);
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
- break;
-
- case afs_opt_rwpath:
- params->rwpath = true;
- break;
-
- case afs_opt_vol:
- *devname = args[0].from;
- break;
-
- case afs_opt_autocell:
- params->autocell = true;
- break;
-
- case afs_opt_dyn:
- params->dyn_root = true;
- break;
-
- default:
- printk(KERN_ERR "kAFS:"
- " Unknown or invalid mount option: '%s'\n", p);
- return -EINVAL;
- }
- }
-
- _leave(" = 0");
- return 0;
-}
-
-/*
- * parse a device name to get cell name, volume name, volume type and R/W
- * selector
- * - this can be one of the following:
+ * Parse the source name to get cell name, volume name, volume type and R/W
+ * selector.
+ *
+ * This can be one of the following:
* "%[cell:]volume[.]" R/W volume
- * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
- * or R/W (rwpath=1) volume
+ * "#[cell:]volume[.]" R/O or R/W volume (R/O parent),
+ * or R/W (R/W parent) volume
* "%[cell:]volume.readonly" R/O volume
* "#[cell:]volume.readonly" R/O volume
* "%[cell:]volume.backup" Backup volume
* "#[cell:]volume.backup" Backup volume
*/
-static int afs_parse_device_name(struct afs_mount_params *params,
- const char *name)
+static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_cell *cell;
- const char *cellname, *suffix;
+ const char *cellname, *suffix, *name = param->string;
int cellnamesz;
_enter(",%s", name);
@@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params,
}
if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+ /* To use dynroot, we don't want to have to provide a source */
+ if (strcmp(name, "none") == 0) {
+ ctx->no_cell = true;
+ return 0;
+ }
printk(KERN_ERR "kAFS: unparsable volume name\n");
return -EINVAL;
}
/* determine the type of volume we're looking for */
- params->type = AFSVL_ROVOL;
- params->force = false;
- if (params->rwpath || name[0] == '%') {
- params->type = AFSVL_RWVOL;
- params->force = true;
+ if (name[0] == '%') {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
name++;
/* split the cell name out if there is one */
- params->volname = strchr(name, ':');
- if (params->volname) {
+ ctx->volname = strchr(name, ':');
+ if (ctx->volname) {
cellname = name;
- cellnamesz = params->volname - name;
- params->volname++;
+ cellnamesz = ctx->volname - name;
+ ctx->volname++;
} else {
- params->volname = name;
+ ctx->volname = name;
cellname = NULL;
cellnamesz = 0;
}
/* the volume type is further affected by a possible suffix */
- suffix = strrchr(params->volname, '.');
+ suffix = strrchr(ctx->volname, '.');
if (suffix) {
if (strcmp(suffix, ".readonly") == 0) {
- params->type = AFSVL_ROVOL;
- params->force = true;
+ ctx->type = AFSVL_ROVOL;
+ ctx->force = true;
} else if (strcmp(suffix, ".backup") == 0) {
- params->type = AFSVL_BACKVOL;
- params->force = true;
+ ctx->type = AFSVL_BACKVOL;
+ ctx->force = true;
} else if (suffix[1] == 0) {
} else {
suffix = NULL;
}
}
- params->volnamesz = suffix ?
- suffix - params->volname : strlen(params->volname);
+ ctx->volnamesz = suffix ?
+ suffix - ctx->volname : strlen(ctx->volname);
_debug("cell %*.*s [%p]",
- cellnamesz, cellnamesz, cellname ?: "", params->cell);
+ cellnamesz, cellnamesz, cellname ?: "", ctx->cell);
/* lookup the cell record */
- if (cellname || !params->cell) {
- cell = afs_lookup_cell(params->net, cellname, cellnamesz,
+ if (cellname) {
+ cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
NULL, false);
if (IS_ERR(cell)) {
- printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+ pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = cell;
}
_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
- params->cell->name, params->cell,
- params->volnamesz, params->volnamesz, params->volname,
- suffix ?: "-", params->type, params->force ? " FORCE" : "");
+ ctx->cell->name, ctx->cell,
+ ctx->volnamesz, ctx->volnamesz, ctx->volname,
+ suffix ?: "-", ctx->type, ctx->force ? " FORCE" : "");
+
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+}
+
+/*
+ * Parse a single mount parameter.
+ */
+static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct afs_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, &afs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_source:
+ return afs_parse_source(fc, param);
+
+ case Opt_autocell:
+ ctx->autocell = true;
+ break;
+
+ case Opt_dyn:
+ ctx->dyn_root = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Validate the options, get the cell key and look up the volume.
+ */
+static int afs_validate_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_volume *volume;
+ struct key *key;
+
+ if (!ctx->dyn_root) {
+ if (ctx->no_cell) {
+ pr_warn("kAFS: Can only specify source 'none' with -o dyn\n");
+ return -EINVAL;
+ }
+
+ if (!ctx->cell) {
+ pr_warn("kAFS: No cell specified\n");
+ return -EDESTADDRREQ;
+ }
+
+ /* We try to do the mount securely. */
+ key = afs_request_key(ctx->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ ctx->key = key;
+
+ if (ctx->volume) {
+ afs_put_volume(ctx->cell, ctx->volume);
+ ctx->volume = NULL;
+ }
+
+ volume = afs_create_volume(ctx);
+ if (IS_ERR(volume))
+ return PTR_ERR(volume);
+
+ ctx->volume = volume;
+ }
return 0;
}
@@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params,
/*
* check a superblock to see if it's the one we're looking for
*/
-static int afs_test_super(struct super_block *sb, void *data)
+static int afs_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->volume &&
- as->volume->vid == as1->volume->vid &&
+ as->volume->vid == ctx->volume->vid &&
!as->dyn_root);
}
-static int afs_dynroot_test_super(struct super_block *sb, void *data)
+static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->dyn_root);
}
-static int afs_set_super(struct super_block *sb, void *data)
+static int afs_set_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as = data;
-
- sb->s_fs_info = as;
return set_anon_super(sb, NULL);
}
/*
* fill in the superblock
*/
-static int afs_fill_super(struct super_block *sb,
- struct afs_mount_params *params)
+static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
{
struct afs_super_info *as = AFS_FS_S(sb);
struct afs_fid fid;
@@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb,
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
@@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb,
fid.vnode = 1;
fid.vnode_hi = 0;
fid.unique = 1;
- inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
+ inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL);
}
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (params->autocell || params->dyn_root)
+ if (ctx->autocell || as->dyn_root)
set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
ret = -ENOMEM;
@@ -443,17 +458,20 @@ error:
return ret;
}
-static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params)
+static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as;
as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
if (as) {
- as->net_ns = get_net(params->net_ns);
- if (params->dyn_root)
+ as->net_ns = get_net(fc->net_ns);
+ if (ctx->dyn_root) {
as->dyn_root = true;
- else
- as->cell = afs_get_cell(params->cell);
+ } else {
+ as->cell = afs_get_cell(ctx->cell);
+ as->volume = __afs_get_volume(ctx->volume);
+ }
}
return as;
}
@@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb)
if (as->dyn_root)
afs_dynroot_depopulate(sb);
-
+
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb)
}
/*
- * get an AFS superblock
+ * Get an AFS superblock and root directory.
*/
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *options)
+static int afs_get_tree(struct fs_context *fc)
{
- struct afs_mount_params params;
+ struct afs_fs_context *ctx = fc->fs_private;
struct super_block *sb;
- struct afs_volume *candidate;
- struct key *key;
struct afs_super_info *as;
int ret;
- _enter(",,%s,%p", dev_name, options);
-
- memset(&params, 0, sizeof(params));
-
- ret = -EINVAL;
- if (current->nsproxy->net_ns != &init_net)
+ ret = afs_validate_fc(fc);
+ if (ret)
goto error;
- params.net_ns = current->nsproxy->net_ns;
- params.net = afs_net(params.net_ns);
-
- /* parse the options and device name */
- if (options) {
- ret = afs_parse_options(&params, options, &dev_name);
- if (ret < 0)
- goto error;
- }
-
- if (!params.dyn_root) {
- ret = afs_parse_device_name(&params, dev_name);
- if (ret < 0)
- goto error;
- /* try and do the mount securely */
- key = afs_request_key(params.cell);
- if (IS_ERR(key)) {
- _leave(" = %ld [key]", PTR_ERR(key));
- ret = PTR_ERR(key);
- goto error;
- }
- params.key = key;
- }
+ _enter("");
/* allocate a superblock info record */
ret = -ENOMEM;
- as = afs_alloc_sbi(&params);
+ as = afs_alloc_sbi(fc);
if (!as)
- goto error_key;
-
- if (!params.dyn_root) {
- /* Assume we're going to need a volume record; at the very
- * least we can use it to update the volume record if we have
- * one already. This checks that the volume exists within the
- * cell.
- */
- candidate = afs_create_volume(&params);
- if (IS_ERR(candidate)) {
- ret = PTR_ERR(candidate);
- goto error_as;
- }
-
- as->volume = candidate;
- }
+ goto error;
+ fc->s_fs_info = as;
/* allocate a deviceless superblock */
- sb = sget(fs_type,
- as->dyn_root ? afs_dynroot_test_super : afs_test_super,
- afs_set_super, flags, as);
+ sb = sget_fc(fc,
+ as->dyn_root ? afs_dynroot_test_super : afs_test_super,
+ afs_set_super);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- goto error_as;
+ goto error;
}
if (!sb->s_root) {
/* initial superblock/root creation */
_debug("create");
- ret = afs_fill_super(sb, &params);
+ ret = afs_fill_super(sb, ctx);
if (ret < 0)
goto error_sb;
- as = NULL;
sb->s_flags |= SB_ACTIVE;
} else {
_debug("reuse");
ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
- afs_destroy_sbi(as);
- as = NULL;
}
- afs_put_cell(params.net, params.cell);
- key_put(params.key);
+ fc->root = dget(sb->s_root);
_leave(" = 0 [%p]", sb);
- return dget(sb->s_root);
+ return 0;
error_sb:
deactivate_locked_super(sb);
- goto error_key;
-error_as:
- afs_destroy_sbi(as);
-error_key:
- key_put(params.key);
error:
- afs_put_cell(params.net, params.cell);
_leave(" = %d", ret);
- return ERR_PTR(ret);
+ return ret;
+}
+
+static void afs_free_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+
+ afs_destroy_sbi(fc->s_fs_info);
+ afs_put_volume(ctx->cell, ctx->volume);
+ afs_put_cell(ctx->net, ctx->cell);
+ key_put(ctx->key);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations afs_context_ops = {
+ .free = afs_free_fc,
+ .parse_param = afs_parse_param,
+ .get_tree = afs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int afs_init_fs_context(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx;
+ struct afs_cell *cell;
+
+ ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->type = AFSVL_ROVOL;
+ ctx->net = afs_net(fc->net_ns);
+
+ /* Default to the workstation cell. */
+ rcu_read_lock();
+ cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
+ rcu_read_unlock();
+ if (IS_ERR(cell))
+ cell = NULL;
+ ctx->cell = cell;
+
+ fc->fs_private = ctx;
+ fc->ops = &afs_context_ops;
+ return 0;
}
/*
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 00975ed3640f..f6eba2def0a1 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
/*
* Allocate a volume record and load it up from a vldb record.
*/
-static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params,
+static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
unsigned long type_mask)
{
@@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
* - Rule 3: If parent volume is R/W, then only mount R/W volume unless
* explicitly told otherwise
*/
-struct afs_volume *afs_create_volume(struct afs_mount_params *params)
+struct afs_volume *afs_create_volume(struct afs_fs_context *params)
{
struct afs_vldb_entry *vldb;
struct afs_volume *volume;
diff --git a/fs/aio.c b/fs/aio.c
index aaaaf4d12c73..38b741aef0bf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -167,9 +167,13 @@ struct kioctx {
unsigned id;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct fsync_iocb {
- struct work_struct work;
struct file *file;
+ struct work_struct work;
bool datasync;
};
@@ -183,8 +187,15 @@ struct poll_iocb {
struct work_struct work;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct aio_kiocb {
union {
+ struct file *ki_filp;
struct kiocb rw;
struct fsync_iocb fsync;
struct poll_iocb poll;
@@ -1060,6 +1071,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
{
if (refcount_read(&iocb->ki_refcnt) == 0 ||
refcount_dec_and_test(&iocb->ki_refcnt)) {
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
percpu_ref_put(&iocb->ki_ctx->reqs);
kmem_cache_free(kiocb_cachep, iocb);
}
@@ -1424,7 +1437,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- fput(kiocb->ki_filp);
aio_complete(iocb, res, res2);
}
@@ -1432,9 +1444,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
int ret;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp))
- return -EBADF;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
@@ -1451,7 +1460,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = ioprio_check_cap(iocb->aio_reqprio);
if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret);
- goto out_fput;
+ return ret;
}
req->ki_ioprio = iocb->aio_reqprio;
@@ -1460,14 +1469,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
- goto out_fput;
+ return ret;
req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
return 0;
-
-out_fput:
- fput(req->ki_filp);
- return ret;
}
static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
@@ -1521,24 +1526,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
if (ret)
return ret;
file = req->ki_filp;
-
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
+ return -EBADF;
ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1555,16 +1555,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
return ret;
file = req->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
/*
@@ -1582,9 +1580,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1594,7 +1589,6 @@ static void aio_fsync_work(struct work_struct *work)
int ret;
ret = vfs_fsync(req->file, req->datasync);
- fput(req->file);
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}
@@ -1605,13 +1599,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
iocb->aio_rw_flags))
return -EINVAL;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
- if (unlikely(!req->file->f_op->fsync)) {
- fput(req->file);
+ if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
- }
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
@@ -1621,10 +1610,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
{
- struct file *file = iocb->poll.file;
-
aio_complete(iocb, mangle_poll(mask), 0);
- fput(file);
}
static void aio_poll_complete_work(struct work_struct *work)
@@ -1680,6 +1666,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
+ unsigned long flags;
req->woken = true;
@@ -1688,10 +1675,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (!(mask & req->events))
return 0;
- /* try to complete the iocb inline if we can: */
- if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+ * call this function with IRQs disabled and because IRQs
+ * have to be disabled before ctx_lock is obtained.
+ */
+ if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
list_del(&iocb->ki_list);
- spin_unlock(&iocb->ki_ctx->ctx_lock);
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
list_del_init(&req->wait.entry);
aio_poll_complete(iocb, mask);
@@ -1743,9 +1735,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
INIT_WORK(&req->work, aio_poll_complete_work);
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
req->head = NULL;
req->woken = false;
@@ -1788,10 +1777,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
spin_unlock_irq(&ctx->ctx_lock);
out:
- if (unlikely(apt.error)) {
- fput(req->file);
+ if (unlikely(apt.error))
return apt.error;
- }
if (mask)
aio_poll_complete(aiocb, mask);
@@ -1829,6 +1816,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
if (unlikely(!req))
goto out_put_reqs_available;
+ req->ki_filp = fget(iocb->aio_fildes);
+ ret = -EBADF;
+ if (unlikely(!req->ki_filp))
+ goto out_put_req;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -2199,11 +2191,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
- compat_long_t, min_nr,
- compat_long_t, nr,
- struct io_event __user *, events,
- struct old_timespec32 __user *, timeout)
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+ __s32, min_nr,
+ __s32, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 3e59f0ed777b..70c132acdab1 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_CATATONIC 0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE 0x0004
struct autofs_sb_info {
u32 magic;
@@ -215,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
+ /* We don't expect -EAGAIN */
+ pipe->f_flags &= ~O_NONBLOCK;
return 0;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 078992eee299..80597b88718b 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -82,18 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",maxproto=%d", sbi->max_proto);
if (autofs_type_offset(sbi->type))
- seq_printf(m, ",offset");
+ seq_puts(m, ",offset");
else if (autofs_type_direct(sbi->type))
- seq_printf(m, ",direct");
+ seq_puts(m, ",direct");
else
- seq_printf(m, ",indirect");
+ seq_puts(m, ",indirect");
if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
- seq_printf(m, ",strictexpire");
+ seq_puts(m, ",strictexpire");
+ if (sbi->flags & AUTOFS_SBI_IGNORE)
+ seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
- seq_printf(m, ",pipe_ino=-1");
+ seq_puts(m, ",pipe_ino=-1");
#endif
return 0;
}
@@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+ Opt_ignore};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -124,6 +127,7 @@ static const match_table_t tokens = {
{Opt_direct, "direct"},
{Opt_offset, "offset"},
{Opt_strictexpire, "strictexpire"},
+ {Opt_ignore, "ignore"},
{Opt_err, NULL}
};
@@ -206,6 +210,9 @@ static int parse_options(char *options,
case Opt_strictexpire:
sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
+ break;
default:
return 1;
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca9725f18e00..1fefd87eb4b4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -29,97 +29,14 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/a.out-core.h>
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-#ifdef CONFIG_COREDUMP
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-static int aout_core_dump(struct coredump_params *cprm)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- void __user *dump_start;
- int dump_size;
- struct user dump;
-#ifdef __alpha__
-# define START_DATA(u) ((void __user *)u.start_data)
-#else
-# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
- u.start_code))
-#endif
-# define START_STACK(u) ((void __user *)u.start_stack)
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
- dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->siginfo->si_signo;
- aout_dump_thread(cprm->regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
- if we wrote the stack, but not the data area. */
- if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
- dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
-/* struct user */
- if (!dump_emit(cprm, &dump, sizeof(dump)))
- goto end_coredump;
-/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
- goto end_coredump;
-/* now we start writing out the user space info */
- set_fs(USER_DS);
-/* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-/* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#else
-#define aout_core_dump NULL
-#endif
-
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
};
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 54207327f98f..7d09d125f148 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -57,8 +57,6 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
- int, int, unsigned long);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
@@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
#ifndef elf_map
static unsigned long elf_map(struct file *filep, unsigned long addr,
- struct elf_phdr *eppnt, int prot, int type,
+ const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
{
unsigned long map_addr;
@@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
#endif /* !elf_map */
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
* header pointed to by elf_ex, into a newly allocated array. The caller is
* responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
*/
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, size, err = -1;
+ int retval, err = -1;
loff_t pos = elf_ex->e_phoff;
+ unsigned int size;
/*
* If the size of this structure has changed, then punt, since
@@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
goto out;
/* Sanity check the number of program headers... */
- if (elf_ex->e_phnum < 1 ||
- elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
- struct list_head *t;
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_add(&ets->list, &info->thread_list);
}
- list_for_each(t, &info->thread_list) {
+ list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- ets = list_entry(t, struct elf_thread_status, list);
sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
@@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
+ struct elf_thread_status *ets;
int i;
- struct list_head *t;
for (i = 0; i < info->numnote; i++)
if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
- list_for_each(t, &info->thread_list) {
- struct elf_thread_status *tmp =
- list_entry(t, struct elf_thread_status, list);
-
- for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm))
+ list_for_each_entry(ets, &info->thread_list, list) {
+ for (i = 0; i < ets->num_notes; i++)
+ if (!writenote(&ets->notes[i], cprm))
return 0;
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 58a4c1217fa8..78d3257435c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
ssize_t ret;
blk_qc_t qc;
int i;
+ struct bvec_iter_all iter_all;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -247,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
task_io_account_write(ret);
}
if (iocb->ki_flags & IOCB_HIPRI)
- bio.bi_opf |= REQ_HIPRI;
+ bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
for (;;) {
@@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
}
__set_current_state(TASK_RUNNING);
- bio_for_each_segment_all(bvec, &bio, i) {
+ bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
@@ -293,6 +294,14 @@ struct blkdev_dio {
static struct bio_set blkdev_dio_pool;
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+ struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@@ -327,11 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
- struct bio_vec *bvec;
- int i;
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ int i;
- bio_for_each_segment_all(bvec, bio, i)
- put_page(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
+ put_page(bvec->bv_page);
+ }
bio_put(bio);
}
}
@@ -406,10 +418,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
- if (iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
+ bool polled = false;
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio_set_polled(bio, iocb);
+ polled = true;
+ }
qc = submit_bio(bio);
+
+ if (polled)
+ WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
@@ -2076,6 +2095,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+ .iopoll = blkdev_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index eb8e20b740d6..4f2a8ae0aa42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -162,13 +162,14 @@ csum_failed:
} else {
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0f4838e00fbc..b3642367a595 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3451,31 +3451,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \
- ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \
- ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5216e7b3f9ad..6fe9197f6ee4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -833,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
@@ -2957,7 +2958,7 @@ int open_ctree(struct super_block *sb,
sb->s_bdi->congested_fn = btrfs_congested_fn;
sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b20700ad8752..ca8b8e785cf3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -152,11 +152,12 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
{
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct page *page = bvec->bv_page;
+ struct bio_vec bv;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
- start = page_offset(page) + bvec->bv_offset;
+ mp_bvec_last_segment(bvec, &bv);
+ start = page_offset(bv.bv_page) + bv.bv_offset;
bio->bi_private = NULL;
@@ -2379,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
- unsigned failed_bio_pages = bio_pages_all(failed_bio);
+ unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2451,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2522,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio)
int mirror;
int ret;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3641,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3f180b857e20..82fdda8ff5ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7829,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7840,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
@@ -7919,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
int uptodate;
int ret;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7932,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6976e2280771..67a6f7d47402 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
SetPageUptodate(bvec->bv_page);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 48318fb74938..ce357602f471 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3027,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
/* Uhhuh. We've got a bio that straddles the device size! */
truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
+ /*
+ * The bio contains more than one segment which spans EOD, just return
+ * and let IO layer turn it into an EIO
+ */
+ if (truncated_bytes > bvec->bv_len)
+ return;
+
/* Truncate the bio.. */
bio->bi_iter.bi_size -= truncated_bytes;
bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
- zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ struct bio_vec bv;
+
+ mp_bvec_last_segment(bvec, &bv);
+ zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bba28a5034ba..36a8dc699448 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt)
{
spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_min_count += delta;
- BUG_ON(mdsc->caps_min_count < 0);
+ mdsc->caps_min_count = fsopt->max_readdir;
+ if (mdsc->caps_min_count < 1024)
+ mdsc->caps_min_count = 1024;
+ mdsc->caps_use_max = fsopt->caps_max;
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_max < mdsc->caps_min_count)
+ mdsc->caps_use_max = mdsc->caps_min_count;
spin_unlock(&mdsc->caps_list_lock);
}
@@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
if (!err) {
BUG_ON(have + alloc != need);
ctx->count = need;
+ ctx->used = 0;
}
spin_lock(&mdsc->caps_list_lock);
@@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+ struct ceph_cap_reservation *ctx)
{
+ bool reclaim = false;
+ if (!ctx->count)
+ return;
+
dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
+
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ reclaim = true;
spin_unlock(&mdsc->caps_list_lock);
+
+ if (reclaim)
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
}
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
@@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
BUG_ON(list_empty(&mdsc->caps_list));
ctx->count--;
+ ctx->used++;
mdsc->caps_reserve_count--;
mdsc->caps_use_count++;
@@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
- ma->caps_wanted_delay_min * HZ);
+ opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
- ma->caps_wanted_delay_max * HZ);
+ opt->caps_wanted_delay_max * HZ);
dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}
@@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode,
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
+ spin_lock(&session->s_cap_lock);
+ list_move_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
if (cap->cap_gen < session->s_cap_gen)
cap->issued = cap->implemented = CEPH_CAP_PIN;
@@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
(!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
cap->queue_release = 1;
if (removed) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
removed = 0;
}
} else {
@@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_ceph_lock.
*/
-void ceph_queue_caps_release(struct inode *inode)
+void __ceph_remove_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct rb_node *p;
@@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ /* encode_caps_cb() also will reset these sequence
+ * numbers. make sure sequence numbers in cap flush
+ * message match later reconnect message */
+ cap->seq = 0;
+ cap->issue_seq = 0;
+ cap->mseq = 0;
__kick_flushing_caps(mdsc, session, ci,
oldest_flush_tid);
} else {
@@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap->seq = seq;
cap->issue_seq = seq;
spin_lock(&session->s_cap_lock);
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto flush_cap_releases;
+ goto done;
}
/* these will work even if we don't have a cap yet */
@@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_cap_op_name(op));
}
- goto done;
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ iput(inode);
+ ceph_put_string(extra_info.pool_ns);
+ return;
flush_cap_releases:
/*
@@ -3963,14 +3993,8 @@ flush_cap_releases:
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_send_cap_releases(mdsc, session);
-
-done:
- mutex_unlock(&session->s_mutex);
-done_unlocked:
- iput(inode);
- ceph_put_string(extra_info.pool_ns);
- return;
+ ceph_flush_cap_releases(mdsc, session);
+ goto done;
bad:
pr_err("ceph_handle_caps: corrupt message\n");
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index abdf98deeec4..98365e74cb4a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p)
return 0;
}
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
- struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_dentry_info *di;
-
- spin_lock(&mdsc->dentry_lru_lock);
- list_for_each_entry(di, &mdsc->dentry_lru, lru) {
- struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%pd\n",
- di, dentry, dentry);
- }
- spin_unlock(&mdsc->dentry_lru_lock);
-
- return 0;
-}
-
static int mds_sessions_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
@@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
@@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_mdsc);
- debugfs_remove(fsc->debugfs_dentry_lru);
}
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
@@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
if (!fsc->debugfs_caps)
goto out;
- fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &dentry_lru_show_fops);
- if (!fsc->debugfs_dentry_lru)
- goto out;
-
return 0;
out:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928cea0209..a8f429882249 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -29,6 +29,9 @@
const struct dentry_operations ceph_dentry_ops;
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
+static int __dir_lease_try_check(const struct dentry *dentry);
+
/*
* Initialize ceph dentry state.
*/
@@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry)
di->lease_session = NULL;
di->time = jiffies;
dentry->d_fsdata = di;
- ceph_dentry_lru_add(dentry);
+ INIT_LIST_HEAD(&di->lease_list);
return 0;
}
@@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
goto out;
}
if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+ __ceph_dentry_dir_lease_touch(di);
emit_dentry = true;
}
spin_unlock(&dentry->d_lock);
@@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
}
/*
+ * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
+ * Leases at front of the list will expire first. (Assume all leases have
+ * similar duration)
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+
+ di->flags |= CEPH_DENTRY_LEASE_LIST;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+ struct ceph_dentry_info *di)
+{
+ di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+ di->lease_gen = 0;
+ di->time = jiffies;
+ list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+ di, dn, dn, di->offset);
+
+ if (!list_empty(&di->lease_list)) {
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ /* don't remove dentry from dentry lease list
+ * if its lease is valid */
+ if (__dentry_lease_is_valid(di))
+ return;
+ } else {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+ }
+
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ __dentry_dir_lease_touch(mdsc, di),
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_client *mdsc;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+ return;
+ if (list_empty(&di->lease_list))
+ return;
+
+ mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_del_init(&di->lease_list);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+ KEEP = 0,
+ DELETE = 1,
+ TOUCH = 2,
+ STOP = 4,
+};
+
+struct ceph_lease_walk_control {
+ bool dir_lease;
+ bool expire_dir_lease;
+ unsigned long nr_to_scan;
+ unsigned long dir_lease_ttl;
+};
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+ struct ceph_lease_walk_control *lwc,
+ int (*check)(struct dentry*, void*))
+{
+ struct ceph_dentry_info *di, *tmp;
+ struct dentry *dentry, *last = NULL;
+ struct list_head* list;
+ LIST_HEAD(dispose);
+ unsigned long freed = 0;
+ int ret = 0;
+
+ list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_for_each_entry_safe(di, tmp, list, lease_list) {
+ if (!lwc->nr_to_scan)
+ break;
+ --lwc->nr_to_scan;
+
+ dentry = di->dentry;
+ if (last == dentry)
+ break;
+
+ if (!spin_trylock(&dentry->d_lock))
+ continue;
+
+ if (dentry->d_lockref.count < 0) {
+ list_del_init(&di->lease_list);
+ goto next;
+ }
+
+ ret = check(dentry, lwc);
+ if (ret & TOUCH) {
+ /* move it into tail of dir lease list */
+ __dentry_dir_lease_touch(mdsc, di);
+ if (!last)
+ last = dentry;
+ }
+ if (ret & DELETE) {
+ /* stale lease */
+ di->flags &= ~CEPH_DENTRY_REFERENCED;
+ if (dentry->d_lockref.count > 0) {
+ /* update_dentry_lease() will re-add
+ * it to lease list, or
+ * ceph_d_delete() will return 1 when
+ * last reference is dropped */
+ list_del_init(&di->lease_list);
+ } else {
+ di->flags |= CEPH_DENTRY_SHRINK_LIST;
+ list_move_tail(&di->lease_list, &dispose);
+ dget_dlock(dentry);
+ }
+ }
+next:
+ spin_unlock(&dentry->d_lock);
+ if (ret & STOP)
+ break;
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+
+ while (!list_empty(&dispose)) {
+ di = list_first_entry(&dispose, struct ceph_dentry_info,
+ lease_list);
+ dentry = di->dentry;
+ spin_lock(&dentry->d_lock);
+
+ list_del_init(&di->lease_list);
+ di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
+ if (di->flags & CEPH_DENTRY_REFERENCED) {
+ spin_lock(&mdsc->dentry_list_lock);
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ list_add_tail(&di->lease_list,
+ &mdsc->dentry_leases);
+ } else {
+ __dentry_dir_lease_touch(mdsc, di);
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+ } else {
+ freed++;
+ }
+
+ spin_unlock(&dentry->d_lock);
+ /* ceph_d_delete() does the trick */
+ dput(dentry);
+ }
+ return freed;
+}
+
+static int __dentry_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int ret;
+
+ if (__dentry_lease_is_valid(di))
+ return STOP;
+ ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0)
+ return TOUCH;
+ return DELETE;
+}
+
+static int __dir_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_lease_walk_control *lwc = arg;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ int ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0) {
+ if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
+ return STOP;
+ /* Move dentry to tail of dir lease list if we don't want
+ * to delete it. So dentries in the list are checked in a
+ * round robin manner */
+ if (!lwc->expire_dir_lease)
+ return TOUCH;
+ if (dentry->d_lockref.count > 0 ||
+ (di->flags & CEPH_DENTRY_REFERENCED))
+ return TOUCH;
+ /* invalidate dir lease */
+ di->lease_shared_gen = 0;
+ }
+ return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+ struct ceph_lease_walk_control lwc;
+ unsigned long count;
+ unsigned long freed;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ count = mdsc->caps_use_count - mdsc->caps_use_max;
+ else
+ count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+
+ lwc.dir_lease = false;
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
+ freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ if (!lwc.nr_to_scan) /* more invalid leases */
+ return -EAGAIN;
+
+ if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+ lwc.dir_lease = true;
+ lwc.expire_dir_lease = freed < count;
+ lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+ freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ if (!lwc.nr_to_scan) /* more to check */
+ return -EAGAIN;
+
+ return freed > 0 ? 1 : 0;
+}
+
+/*
* Ensure a dentry lease will no longer revalidate.
*/
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
- ceph_dentry(dentry)->time = jiffies;
- ceph_dentry(dentry)->lease_shared_gen = 0;
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ __dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_session *session;
+
+ if (!di->lease_gen)
+ return false;
+
+ session = di->lease_session;
+ if (session) {
+ u32 gen;
+ unsigned long ttl;
+
+ spin_lock(&session->s_gen_ttl_lock);
+ gen = session->s_cap_gen;
+ ttl = session->s_cap_ttl;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, ttl) &&
+ time_before(jiffies, di->time))
+ return true;
+ }
+ di->lease_gen = 0;
+ return false;
+}
+
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
struct inode *dir)
{
struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
struct ceph_mds_session *session = NULL;
u32 seq = 0;
+ int valid = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
- s = di->lease_session;
- spin_lock(&s->s_gen_ttl_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_gen_ttl_lock);
+ if (di && __dentry_lease_is_valid(di)) {
+ valid = 1;
- if (di->lease_gen == gen &&
- time_before(jiffies, di->time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /*
- * We should renew. If we're in RCU walk mode
- * though, we can't do that so just return
- * -ECHILD.
- */
- if (flags & LOOKUP_RCU) {
- valid = -ECHILD;
- } else {
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
- }
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(di->lease_session);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
}
}
}
@@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
}
/*
+ * Called under dentry->d_lock.
+ */
+static int __dir_lease_try_check(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *dir;
+ struct ceph_inode_info *ci;
+ int valid = 0;
+
+ if (!di->lease_shared_gen)
+ return 0;
+ if (IS_ROOT(dentry))
+ return 0;
+
+ dir = d_inode(dentry->d_parent);
+ ci = ceph_inode(dir);
+
+ if (spin_trylock(&ci->i_ceph_lock)) {
+ if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+ valid = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ valid = -EBUSY;
+ }
+
+ if (!valid)
+ di->lease_shared_gen = 0;
+ return valid;
+}
+
+/*
* Check if directory-wide content lease/cap is valid.
*/
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
@@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
spin_unlock(&ci->i_ceph_lock);
+ if (valid)
+ __ceph_dentry_dir_lease_touch(di);
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
dir, (unsigned)atomic_read(&ci->i_shared_gen),
dentry, (unsigned)di->lease_shared_gen, valid);
@@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid) {
- ceph_dentry_lru_touch(dentry);
- } else {
+ if (!valid)
ceph_dir_clear_complete(dir);
- }
if (!(flags & LOOKUP_RCU))
dput(parent);
@@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
/*
+ * Delete unused dentry that doesn't have valid lease
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ /* won't release caps */
+ if (d_really_is_negative(dentry))
+ return 0;
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+ return 0;
+ /* vaild lease? */
+ di = ceph_dentry(dentry);
+ if (di) {
+ if (__dentry_lease_is_valid(di))
+ return 0;
+ if (__dir_lease_try_check(dentry))
+ return 0;
+ }
+ return 1;
+}
+
+/*
* Release our ceph_dentry_info.
*/
static void ceph_d_release(struct dentry *dentry)
@@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- ceph_dentry_lru_del(dentry);
spin_lock(&dentry->d_lock);
+ __dentry_lease_unlist(di);
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
@@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
return size - left;
}
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
- di->offset);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-void ceph_dentry_lru_del(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
/*
* Return name hash for a given dentry. This is dependent on
@@ -1531,6 +1827,7 @@ const struct inode_operations ceph_snapdir_iops = {
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
+ .d_delete = ceph_d_delete,
.d_release = ceph_d_release,
.d_prune = ceph_d_prune,
.d_init = ceph_d_init,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 189df668b6a0..9f53c3d99304 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ off, off + len - 1);
if (ret < 0)
return ret;
@@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
if (write) {
int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2);
@@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
file, pos, (unsigned)count, snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9d1f34d46627..e3346628efe2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
- atomic_set(&ci->i_shared_gen, 0);
+ atomic_set(&ci->i_shared_gen, 1);
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
@@ -537,7 +537,7 @@ void ceph_destroy_inode(struct inode *inode)
ceph_fscache_unregister_inode_cookie(ci);
- ceph_queue_caps_release(inode);
+ __ceph_remove_caps(inode);
if (__ceph_has_any_quota(ci))
ceph_adjust_quota_realms_count(inode, false);
@@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct ceph_snap_realm *realm = ci->i_snap_realm;
-
- dout(" dropping residual ref to snap realm %p\n", realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ceph_inode_to_client(inode)->mdsc;
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ dout(" dropping residual ref to snap realm %p\n",
+ realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ } else {
+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+ ci->i_snap_realm = NULL;
+ }
}
kfree(ci->i_symlink);
@@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
iinfo->pool_ns_len);
+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -869,6 +877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rbytes = le64_to_cpu(info->rbytes);
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ci->i_dir_pin = iinfo->dir_pin;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
+ inode->i_blkbits = PAGE_SHIFT;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
inode->i_op = &ceph_file_iops;
break;
@@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
-
- if (duration == 0)
+ if (duration == 0) {
+ __ceph_dentry_dir_lease_touch(di);
goto out_unlock;
+ }
if (di->lease_gen == session->s_cap_gen &&
time_before(ttl, di->time))
@@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_session = NULL;
}
- ceph_dentry_lru_touch(dentry);
-
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
di->lease_gen = session->s_cap_gen;
@@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
di->time = ttl;
+
+ __ceph_dentry_lease_touch(di);
out_unlock:
spin_unlock(&dentry->d_lock);
if (old_lease_session)
@@ -2259,10 +2270,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
if (!err) {
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
- if (ceph_snap(inode) != CEPH_NOSNAP)
- stat->dev = ceph_snap(inode);
+ if (ceph_snap(inode) == CEPH_NOSNAP)
+ stat->dev = inode->i_sb->s_dev;
else
- stat->dev = 0;
+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
if (S_ISDIR(inode->i_mode)) {
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 163fc74bf221..21c33ed048ed 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -20,6 +20,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+
/*
* A cluster of MDS (metadata server) daemons is responsible for
* managing the file system namespace (the directory hierarchy and
@@ -46,13 +48,17 @@
*/
struct ceph_reconnect_state {
- int nr_caps;
+ struct ceph_mds_session *session;
+ int nr_caps, nr_realms;
struct ceph_pagelist *pagelist;
unsigned msg_version;
+ bool allow_multi;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
+static void ceph_cap_release_work(struct work_struct *work);
+static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
@@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops;
* mds reply parsing
*/
+static int parse_reply_info_quota(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only
+ * understand encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
/*
* parse individual inode info
*/
@@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
u64 features)
{
- int err = -EIO;
+ int err = 0;
+ u8 struct_v = 0;
+ if (features == (u64)-1) {
+ u32 struct_len;
+ u8 struct_compat;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ }
+
+ ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
info->in = *p;
*p += sizeof(struct ceph_mds_reply_inode) +
sizeof(*info->in->fragtree.splits) *
@@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end,
info->xattr_data = *p;
*p += info->xattr_len;
- if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ if (features == (u64)-1) {
+ /* inline data */
ceph_decode_64_safe(p, end, info->inline_version, bad);
ceph_decode_32_safe(p, end, info->inline_len, bad);
ceph_decode_need(p, end, info->inline_len, bad);
info->inline_data = *p;
*p += info->inline_len;
- } else
- info->inline_version = CEPH_INLINE_NONE;
+ /* quota */
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ /* pool namespace */
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ /* btime, change_attr */
+ {
+ struct ceph_timespec btime;
+ u64 change_attr;
+ ceph_decode_need(p, end, sizeof(btime), bad);
+ ceph_decode_copy(p, &btime, sizeof(btime));
+ ceph_decode_64_safe(p, end, change_attr, bad);
+ }
+
+ /* dir pin */
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, info->dir_pin, bad);
+ } else {
+ info->dir_pin = -ENODATA;
+ }
+
+ *p = end;
+ } else {
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ }
- if (features & CEPH_FEATURE_MDS_QUOTA) {
+ info->dir_pin = -ENODATA;
+ }
+ return 0;
+bad:
+ err = -EIO;
+out_bad:
+ return err;
+}
+
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_dirfrag **dirfrag,
+ u64 features)
+{
+ if (features == (u64)-1) {
u8 struct_v, struct_compat;
u32 struct_len;
-
- /*
- * both struct_v and struct_compat are expected to be >= 1
- */
ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad);
- if (!struct_v || !struct_compat)
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
goto bad;
ceph_decode_32_safe(p, end, struct_len, bad);
ceph_decode_need(p, end, struct_len, bad);
- ceph_decode_64_safe(p, end, info->max_bytes, bad);
- ceph_decode_64_safe(p, end, info->max_files, bad);
- } else {
- info->max_bytes = 0;
- info->max_files = 0;
+ end = *p + struct_len;
}
- info->pool_ns_len = 0;
- info->pool_ns_data = NULL;
- if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
- ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- if (info->pool_ns_len > 0) {
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- info->pool_ns_data = *p;
- *p += info->pool_ns_len;
- }
+ ceph_decode_need(p, end, sizeof(**dirfrag), bad);
+ *dirfrag = *p;
+ *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+ if (features == (u64)-1)
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
+static int parse_reply_info_lease(void **p, void *end,
+ struct ceph_mds_reply_lease **lease,
+ u64 features)
+{
+ if (features == (u64)-1) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
}
+ ceph_decode_need(p, end, sizeof(**lease), bad);
+ *lease = *p;
+ *p += sizeof(**lease);
+ if (features == (u64)-1)
+ *p = end;
return 0;
bad:
- return err;
+ return -EIO;
}
/*
@@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end,
if (err < 0)
goto out_bad;
- if (unlikely(*p + sizeof(*info->dirfrag) > end))
- goto bad;
- info->dirfrag = *p;
- *p += sizeof(*info->dirfrag) +
- sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
- if (unlikely(*p > end))
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dirfrag, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_32_safe(p, end, info->dname_len, bad);
ceph_decode_need(p, end, info->dname_len, bad);
info->dname = *p;
*p += info->dname_len;
- info->dlease = *p;
- *p += sizeof(*info->dlease);
+
+ err = parse_reply_info_lease(p, end, &info->dlease, features);
+ if (err < 0)
+ goto out_bad;
}
if (info->head->is_target) {
@@ -183,20 +313,16 @@ out_bad:
/*
* parse readdir results
*/
-static int parse_reply_info_dir(void **p, void *end,
+static int parse_reply_info_readdir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
u32 num, i = 0;
int err;
- info->dir_dir = *p;
- if (*p + sizeof(*info->dir_dir) > end)
- goto bad;
- *p += sizeof(*info->dir_dir) +
- sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
- if (*p > end)
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dir_dir, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
@@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end,
while (num) {
struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- rde->name_len = ceph_decode_32(p);
+ ceph_decode_32_safe(p, end, rde->name_len, bad);
ceph_decode_need(p, end, rde->name_len, bad);
rde->name = *p;
*p += rde->name_len;
dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
- rde->lease = *p;
- *p += sizeof(struct ceph_mds_reply_lease);
+ /* dentry lease */
+ err = parse_reply_info_lease(p, end, &rde->lease, features);
+ if (err)
+ goto out_bad;
/* inode */
err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
@@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
- if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (features == (u64)-1 ||
+ (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
if (*p == end) {
info->has_create_ino = false;
} else {
@@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end,
if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
- return parse_reply_info_dir(p, end, info, features);
+ return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
@@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 0;
+ s->s_cap_gen = 1;
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
@@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
+
INIT_LIST_HEAD(&s->s_cap_flushing);
mdsc->sessions[mds] = s;
@@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
dout("__unregister_session mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
+ s->s_state = 0;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
atomic_dec(&mdsc->num_sessions);
@@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- if (cap->queue_release) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
- } else {
+ if (cap->queue_release)
+ __ceph_queue_cap_release(session, cap);
+ else
old_cap = cap; /* put_cap it w/o locks held */
- }
}
if (ret < 0)
goto out;
@@ -1638,7 +1766,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
session->s_trim_caps = 0;
}
- ceph_send_cap_releases(mdsc, session);
+ ceph_flush_cap_releases(mdsc, session);
return 0;
}
@@ -1681,8 +1809,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
/*
* called under s_mutex
*/
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
@@ -1774,6 +1902,81 @@ out_err:
spin_unlock(&session->s_cap_lock);
}
+static void ceph_cap_release_work(struct work_struct *work)
+{
+ struct ceph_mds_session *session =
+ container_of(work, struct ceph_mds_session, s_cap_release_work);
+
+ mutex_lock(&session->s_mutex);
+ if (session->s_state == CEPH_MDS_SESSION_OPEN ||
+ session->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(session->s_mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (mdsc->stopping)
+ return;
+
+ get_session(session);
+ if (queue_work(mdsc->fsc->cap_wq,
+ &session->s_cap_release_work)) {
+ dout("cap release work queued\n");
+ } else {
+ ceph_put_mds_session(session);
+ dout("failed to queue cap release work\n");
+ }
+}
+
+/*
+ * caller holds session->s_cap_lock
+ */
+void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+{
+ list_add_tail(&cap->session_caps, &session->s_cap_releases);
+ session->s_num_cap_releases++;
+
+ if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
+ ceph_flush_cap_releases(session->s_mdsc, session);
+}
+
+static void ceph_cap_reclaim_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_reclaim_work);
+ int ret = ceph_trim_dentries(mdsc);
+ if (ret == -EAGAIN)
+ ceph_queue_cap_reclaim_work(mdsc);
+}
+
+void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
+{
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
+ dout("caps reclaim work queued\n");
+ } else {
+ dout("failed to queue caps release work\n");
+ }
+}
+
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+ int val;
+ if (!nr)
+ return;
+ val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+ if (!(val % CEPH_CAPS_PER_RELEASE)) {
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+ ceph_queue_cap_reclaim_work(mdsc);
+ }
+}
+
/*
* requests
*/
@@ -2653,7 +2856,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
+ err = parse_reply_info(msg, rinfo, (u64)-1);
+ else
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -2684,7 +2890,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
- ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
@@ -2693,12 +2898,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (realm)
ceph_put_snap_realm(mdsc, realm);
- if (err == 0 && req->r_target_inode &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
- spin_unlock(&ci->i_unsafe_lock);
+ if (err == 0) {
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item,
+ &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
out_err:
mutex_lock(&mdsc->mutex);
@@ -2777,6 +2988,25 @@ bad:
pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
+static int __decode_and_drop_session_metadata(void **p, void *end)
+{
+ /* map<string,string> */
+ u32 n;
+ ceph_decode_32_safe(p, end, n, bad);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ return 0;
+bad:
+ return -1;
+}
+
/*
* handle a mds session control message
*/
@@ -2784,18 +3014,36 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ int mds = session->s_mds;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mds_session_head *h;
u32 op;
u64 seq;
- int mds = session->s_mds;
- struct ceph_mds_session_head *h = msg->front.iov_base;
+ unsigned long features = 0;
int wake = 0;
/* decode */
- if (msg->front.iov_len < sizeof(*h))
- goto bad;
+ ceph_decode_need(&p, end, sizeof(*h), bad);
+ h = p;
+ p += sizeof(*h);
+
op = le32_to_cpu(h->op);
seq = le64_to_cpu(h->seq);
+ if (msg_version >= 3) {
+ u32 len;
+ /* version >= 2, metadata */
+ if (__decode_and_drop_session_metadata(&p, end) < 0)
+ goto bad;
+ /* version >= 3, feature bits */
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+ memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+ p += len;
+ }
+
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
get_session(session);
@@ -2821,6 +3069,7 @@ static void handle_session(struct ceph_mds_session *session,
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
renewed_caps(mdsc, session, 0);
wake = 1;
if (mdsc->stopping)
@@ -2947,6 +3196,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
}
+static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
+{
+ struct ceph_msg *reply;
+ struct ceph_pagelist *_pagelist;
+ struct page *page;
+ __le32 *addr;
+ int err = -ENOMEM;
+
+ if (!recon_state->allow_multi)
+ return -ENOSPC;
+
+ /* can't handle message that contains both caps and realm */
+ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
+
+ /* pre-allocate new pagelist */
+ _pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!_pagelist)
+ return -ENOMEM;
+
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
+ if (!reply)
+ goto fail_msg;
+
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
+ if (err)
+ goto fail;
+ } else {
+ /* placeholder for nr_realms (currently encoding relams) */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
+ if (err)
+ goto fail;
+
+ page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
+ addr = kmap_atomic(page);
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ *addr = cpu_to_le32(recon_state->nr_caps);
+ } else {
+ /* currently encoding relams */
+ *(addr + 1) = cpu_to_le32(recon_state->nr_realms);
+ }
+ kunmap_atomic(addr);
+
+ reply->hdr.version = cpu_to_le16(5);
+ reply->hdr.compat_version = cpu_to_le16(4);
+
+ reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
+
+ ceph_con_send(&recon_state->session->s_con, reply);
+ ceph_pagelist_release(recon_state->pagelist);
+
+ recon_state->pagelist = _pagelist;
+ recon_state->nr_caps = 0;
+ recon_state->nr_realms = 0;
+ recon_state->msg_version = 5;
+ return 0;
+fail:
+ ceph_msg_put(reply);
+fail_msg:
+ ceph_pagelist_release(_pagelist);
+ return err;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@@ -2966,9 +3291,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
- err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- if (err)
- return err;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -3008,7 +3330,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks = NULL;
- size_t struct_len, total_len = 0;
+ size_t struct_len, total_len = sizeof(u64);
u8 struct_v = 0;
encode_again:
@@ -3043,7 +3365,7 @@ encode_again:
if (recon_state->msg_version >= 3) {
/* version, compat_version and struct_len */
- total_len = 2 * sizeof(u8) + sizeof(u32);
+ total_len += 2 * sizeof(u8) + sizeof(u32);
struct_v = 2;
}
/*
@@ -3060,12 +3382,19 @@ encode_again:
struct_len += sizeof(u64); /* snap_follows */
total_len += struct_len;
- err = ceph_pagelist_reserve(pagelist, total_len);
- if (err) {
- kfree(flocks);
- goto out_err;
+
+ if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto out_freeflocks;
+ pagelist = recon_state->pagelist;
}
+ err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err)
+ goto out_freeflocks;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
if (recon_state->msg_version >= 3) {
ceph_pagelist_encode_8(pagelist, struct_v);
ceph_pagelist_encode_8(pagelist, 1);
@@ -3077,7 +3406,7 @@ encode_again:
num_fcntl_locks, num_flock_locks);
if (struct_v >= 2)
ceph_pagelist_encode_64(pagelist, snap_follows);
-
+out_freeflocks:
kfree(flocks);
} else {
u64 pathbase = 0;
@@ -3098,20 +3427,81 @@ encode_again:
}
err = ceph_pagelist_reserve(pagelist,
- pathlen + sizeof(u32) + sizeof(rec.v1));
+ sizeof(u64) + sizeof(u32) +
+ pathlen + sizeof(rec.v1));
if (err) {
- kfree(path);
- goto out_err;
+ goto out_freepath;
}
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-
+out_freepath:
kfree(path);
}
- recon_state->nr_caps++;
out_err:
+ if (err >= 0)
+ recon_state->nr_caps++;
+ return err;
+}
+
+static int encode_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_reconnect_state *recon_state)
+{
+ struct rb_node *p;
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ int err = 0;
+
+ if (recon_state->msg_version >= 4) {
+ err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
+ if (err < 0)
+ goto fail;
+ }
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ if (recon_state->msg_version >= 4) {
+ size_t need = sizeof(u8) * 2 + sizeof(u32) +
+ sizeof(sr_rec);
+
+ if (pagelist->length + need > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto fail;
+ pagelist = recon_state->pagelist;
+ }
+
+ err = ceph_pagelist_reserve(pagelist, need);
+ if (err)
+ goto fail;
+
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
+ }
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+
+ recon_state->nr_realms++;
+ }
+fail:
return err;
}
@@ -3132,18 +3522,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_msg *reply;
- struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
- int s_nr_caps;
- struct ceph_pagelist *pagelist;
- struct ceph_reconnect_state recon_state;
+ struct ceph_reconnect_state recon_state = {
+ .session = session,
+ };
LIST_HEAD(dispose);
pr_info("mds%d reconnect start\n", mds);
- pagelist = ceph_pagelist_alloc(GFP_NOFS);
- if (!pagelist)
+ recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!recon_state.pagelist)
goto fail_nopagelist;
reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
@@ -3187,63 +3576,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
/* replay unsafe requests */
replay_unsafe_requests(mdsc, session);
+ ceph_early_kick_flushing_caps(mdsc, session);
+
down_read(&mdsc->snap_rwsem);
- /* traverse this session's caps */
- s_nr_caps = session->s_nr_caps;
- err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
if (err)
goto fail;
- recon_state.nr_caps = 0;
- recon_state.pagelist = pagelist;
- if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
recon_state.msg_version = 3;
- else
+ recon_state.allow_multi = true;
+ } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
+ recon_state.msg_version = 3;
+ } else {
recon_state.msg_version = 2;
+ }
+ /* trsaverse this session's caps */
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
- if (err < 0)
- goto fail;
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
spin_unlock(&session->s_cap_lock);
- /*
- * snaprealms. we provide mds with the ino, seq (version), and
- * parent for all of our realms. If the mds has any newer info,
- * it will tell us.
- */
- for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
- struct ceph_snap_realm *realm =
- rb_entry(p, struct ceph_snap_realm, node);
- struct ceph_mds_snaprealm_reconnect sr_rec;
+ if (err < 0)
+ goto fail;
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
- sr_rec.ino = cpu_to_le64(realm->ino);
- sr_rec.seq = cpu_to_le64(realm->seq);
- sr_rec.parent = cpu_to_le64(realm->parent_ino);
- err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
- if (err)
- goto fail;
+ /* check if all realms can be encoded into current message */
+ if (mdsc->num_snap_realms) {
+ size_t total_len =
+ recon_state.pagelist->length +
+ mdsc->num_snap_realms *
+ sizeof(struct ceph_mds_snaprealm_reconnect);
+ if (recon_state.msg_version >= 4) {
+ /* number of realms */
+ total_len += sizeof(u32);
+ /* version, compat_version and struct_len */
+ total_len += mdsc->num_snap_realms *
+ (2 * sizeof(u8) + sizeof(u32));
+ }
+ if (total_len > RECONNECT_MAX_SIZE) {
+ if (!recon_state.allow_multi) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ if (recon_state.nr_caps) {
+ err = send_reconnect_partial(&recon_state);
+ if (err)
+ goto fail;
+ }
+ recon_state.msg_version = 5;
+ }
}
- reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ err = encode_snap_realms(mdsc, &recon_state);
+ if (err < 0)
+ goto fail;
- /* raced with cap release? */
- if (s_nr_caps != recon_state.nr_caps) {
- struct page *page = list_first_entry(&pagelist->head,
- struct page, lru);
+ if (recon_state.msg_version >= 5) {
+ err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ if (recon_state.nr_caps || recon_state.nr_realms) {
+ struct page *page =
+ list_first_entry(&recon_state.pagelist->head,
+ struct page, lru);
__le32 *addr = kmap_atomic(page);
- *addr = cpu_to_le32(recon_state.nr_caps);
+ if (recon_state.nr_caps) {
+ WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ } else if (recon_state.msg_version >= 4) {
+ *(addr + 1) = cpu_to_le32(recon_state.nr_realms);
+ }
kunmap_atomic(addr);
}
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ if (recon_state.msg_version >= 4)
+ reply->hdr.compat_version = cpu_to_le16(4);
- ceph_early_kick_flushing_caps(mdsc, session);
+ reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
ceph_con_send(&session->s_con, reply);
@@ -3254,7 +3670,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
return;
fail:
@@ -3262,7 +3678,7 @@ fail:
up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
fail_nomsg:
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
pr_err("error %d preparing reconnect for mds%d\n", err, mds);
return;
@@ -3580,7 +3996,6 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
dout("mdsc delayed_work\n");
- ceph_check_delayed_caps(mdsc);
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -3628,6 +4043,12 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
+ ceph_check_delayed_caps(mdsc);
+
+ ceph_queue_cap_reclaim_work(mdsc);
+
+ ceph_trim_snapid_map(mdsc);
+
schedule_delayed(mdsc);
}
@@ -3660,6 +4081,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
+ mdsc->num_snap_realms = 0;
spin_lock_init(&mdsc->snap_empty_lock);
mdsc->last_tid = 0;
mdsc->oldest_tid = 0;
@@ -3677,11 +4099,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
+ INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+
+ spin_lock_init(&mdsc->dentry_list_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_leases);
+ INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
+
+ spin_lock_init(&mdsc->snapid_map_lock);
+ mdsc->snapid_map_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snapid_map_lru);
init_rwsem(&mdsc->pool_perm_rwsem);
mdsc->pool_perm_tree = RB_ROOT;
@@ -3876,8 +4306,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
WARN_ON(!list_empty(&mdsc->cap_delay_list));
mutex_unlock(&mdsc->mutex);
+ ceph_cleanup_snapid_map(mdsc);
ceph_cleanup_empty_realms(mdsc);
+ cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
dout("stopped\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 729da155ebf0..50385a481fdb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -21,11 +21,14 @@
#define CEPHFS_FEATURE_REPLY_ENCODING 9
#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+#define CEPHFS_FEATURE_MULTI_RECONNECT 12
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
+ CEPHFS_FEATURE_MULTI_RECONNECT, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
@@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in {
char *pool_ns_data;
u64 max_bytes;
u64 max_files;
+ s32 dir_pin;
};
struct ceph_mds_reply_dir_entry {
@@ -152,6 +156,7 @@ struct ceph_mds_session {
int s_mds;
int s_state;
unsigned long s_ttl; /* time until mds kills us */
+ unsigned long s_features;
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
@@ -167,19 +172,20 @@ struct ceph_mds_session {
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
+ struct ceph_cap *s_cap_iterator;
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
- struct ceph_cap *s_cap_iterator;
+ struct work_struct s_cap_release_work;
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- refcount_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -310,6 +316,15 @@ struct ceph_pool_perm {
char pool_ns[];
};
+struct ceph_snapid_map {
+ struct rb_node node;
+ struct list_head lru;
+ atomic_t ref;
+ u64 snap;
+ dev_t dev;
+ unsigned long last_used;
+};
+
/*
* mds client state
*/
@@ -341,6 +356,7 @@ struct ceph_mds_client {
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
+ int num_snap_realms;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
@@ -362,6 +378,9 @@ struct ceph_mds_client {
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
+ struct work_struct cap_reclaim_work;
+ atomic_t cap_reclaim_pending;
+
/*
* Cap reservations
*
@@ -378,13 +397,18 @@ struct ceph_mds_client {
unreserved) */
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
+ int caps_use_max; /* max used caps */
int caps_reserve_count; /* unused, reserved */
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_list_lock;
+ struct list_head dentry_leases; /* fifo list */
+ struct list_head dentry_dir_leases; /* lru list */
+
+ spinlock_t snapid_map_lock;
+ struct rb_root snapid_map_tree;
+ struct list_head snapid_map_lru;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
@@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-
+extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f74193da0e09..89aa37fa0f84 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,13 @@
#include <linux/sort.h>
#include <linux/slab.h>
-
#include "super.h"
#include "mds_client.h"
-
#include <linux/ceph/decode.h>
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
+
/*
* Snapshots in ceph are driven in large part by cooperation from the
* client. In contrast to local file systems or file servers that
@@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
+ mdsc->num_snap_realms++;
+
dout("create_snap_realm %llx %p\n", realm->ino, realm);
return realm;
}
@@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
+ mdsc->num_snap_realms--;
if (realm->parent) {
list_del_init(&realm->child_item);
@@ -986,3 +990,154 @@ out:
up_write(&mdsc->snap_rwsem);
return;
}
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap)
+{
+ struct ceph_snapid_map *sm, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&mdsc->snapid_map_lock);
+ p = &mdsc->snapid_map_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap) {
+ p = &(*p)->rb_left;
+ } else if (snap < exist->snap) {
+ p = &(*p)->rb_right;
+ } else {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ break;
+ }
+ exist = NULL;
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ sm = kmalloc(sizeof(*sm), GFP_NOFS);
+ if (!sm)
+ return NULL;
+
+ ret = get_anon_bdev(&sm->dev);
+ if (ret < 0) {
+ kfree(sm);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&sm->lru);
+ atomic_set(&sm->ref, 1);
+ sm->snap = snap;
+
+ exist = NULL;
+ parent = NULL;
+ p = &mdsc->snapid_map_tree.rb_node;
+ spin_lock(&mdsc->snapid_map_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap)
+ p = &(*p)->rb_left;
+ else if (snap < exist->snap)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist) {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ } else {
+ rb_link_node(&sm->node, parent, p);
+ rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm)
+{
+ if (!sm)
+ return;
+ if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+ if (!RB_EMPTY_NODE(&sm->node)) {
+ sm->last_used = jiffies;
+ list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+ spin_unlock(&mdsc->snapid_map_lock);
+ } else {
+ /* already cleaned up by
+ * ceph_cleanup_snapid_map() */
+ spin_unlock(&mdsc->snapid_map_lock);
+ kfree(sm);
+ }
+ }
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ unsigned long now;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ now = jiffies;
+
+ while (!list_empty(&mdsc->snapid_map_lru)) {
+ sm = list_first_entry(&mdsc->snapid_map_lru,
+ struct ceph_snapid_map, lru);
+ if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+ break;
+
+ rb_erase(&sm->node, &mdsc->snapid_map_tree);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ }
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ struct rb_node *p;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ while ((p = rb_first(&mdsc->snapid_map_tree))) {
+ sm = rb_entry(p, struct ceph_snapid_map, node);
+ rb_erase(p, &mdsc->snapid_map_tree);
+ RB_CLEAR_NODE(p);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ free_anon_bdev(sm->dev);
+ if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+ pr_err("snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
+ }
+ }
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index da2cd8e89062..6d5bb2f74612 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,6 +133,7 @@ enum {
Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
+ Opt_caps_max,
Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb,
@@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = {
{Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_caps_max, "caps_max=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
@@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private)
return -EINVAL;
fsopt->caps_wanted_delay_max = intval;
break;
+ case Opt_caps_max:
+ if (intval < 0)
+ return -EINVAL;
+ fsopt->caps_max = intval;
+ break;
case Opt_readdir_max_entries:
if (intval < 1)
return -EINVAL;
@@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",rasize=%d", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_max)
+ seq_printf(m, ",caps_max=%d", fsopt->caps_max);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
seq_printf(m, ",caps_wanted_delay_min=%d",
fsopt->caps_wanted_delay_min);
@@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
if (!fsc->trunc_wq)
goto fail_pg_inv_wq;
+ fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ if (!fsc->cap_wq)
+ goto fail_trunc_wq;
/* set up mempools */
err = -ENOMEM;
@@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
size = sizeof (struct page *) * (page_count ? page_count : 1);
fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
- goto fail_trunc_wq;
-
- /* caps */
- fsc->min_caps = fsopt->max_readdir;
+ goto fail_cap_wq;
return fsc;
+fail_cap_wq:
+ destroy_workqueue(fsc->cap_wq);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
@@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
flush_workqueue(fsc->wb_wq);
flush_workqueue(fsc->pg_inv_wq);
flush_workqueue(fsc->trunc_wq);
+ flush_workqueue(fsc->cap_wq);
}
static void destroy_fs_client(struct ceph_fs_client *fsc)
@@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
+ destroy_workqueue(fsc->cap_wq);
mempool_destroy(fsc->wb_pagevec_pool);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..16c03188578e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -79,6 +79,7 @@ struct ceph_mount_options {
int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
+ int caps_max;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
@@ -100,17 +101,18 @@ struct ceph_fs_client {
struct ceph_client *client;
unsigned long mount_state;
- int min_caps; /* min caps i added */
loff_t max_file_size;
struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
+ atomic_long_t writeback_count;
+
struct workqueue_struct *wb_wq;
struct workqueue_struct *pg_inv_wq;
struct workqueue_struct *trunc_wq;
- atomic_long_t writeback_count;
+ struct workqueue_struct *cap_wq;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
@@ -260,17 +262,22 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ struct dentry *dentry;
struct ceph_mds_session *lease_session;
+ struct list_head lease_list;
+ unsigned flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
unsigned long time;
u64 offset;
};
+#define CEPH_DENTRY_REFERENCED 1
+#define CEPH_DENTRY_LEASE_LIST 2
+#define CEPH_DENTRY_SHRINK_LIST 4
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -318,6 +325,8 @@ struct ceph_inode_info {
/* quotas */
u64 i_max_bytes, i_max_files;
+ s32 i_dir_pin;
+
struct rb_root i_fragtree;
int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
@@ -370,7 +379,10 @@ struct ceph_inode_info {
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ union {
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+ };
int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
@@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
@@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt);
extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
* sync write (that may/may not still update size, mtime, etc.).
@@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void __ceph_remove_caps(struct inode* inode);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
-extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
@@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req,
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
+extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern int ceph_trim_dentries(struct ceph_mds_client *mdsc);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 316f6ad10644..0cc42c8879e9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
ci->i_rctime.tv_nsec);
}
-/* quotas */
+/* dir pin */
+static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
+{
+ return ci->i_dir_pin != -ENODATA;
+}
+
+static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%d", (int)ci->i_dir_pin);
+}
+/* quotas */
static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
{
bool ret = false;
@@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rbytes),
XATTR_RSTAT_FIELD(dir, rctime),
{
+ .name = "ceph.dir.pin",
+ .name_size = sizeof("ceph.dir_pin"),
+ .getxattr_cb = ceph_vxattrcb_dir_pin,
+ .exists_cb = ceph_vxattrcb_dir_pin_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ {
.name = "ceph.quota",
.name_size = sizeof("ceph.quota"),
.getxattr_cb = ceph_vxattrcb_quota,
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f1ddc9d03c10..76724efc831c 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -117,25 +117,25 @@ config CIFS_UPCALL
secure Kerberos authentication is required). If unsure, say Y.
config CIFS_XATTR
- bool "CIFS extended attributes"
- depends on CIFS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page for details).
- CIFS maps the name of extended attributes beginning with the user
- namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
- servers without the user namespace prefix, but their names are
- seen by Linux cifs clients prefaced by the user namespace prefix.
- The system namespace (used by some filesystems to store ACLs) is
- not supported at this time.
-
- If unsure, say Y.
+ bool "CIFS extended attributes"
+ depends on CIFS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page for details).
+ CIFS maps the name of extended attributes beginning with the user
+ namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
+ servers without the user namespace prefix, but their names are
+ seen by Linux cifs clients prefaced by the user namespace prefix.
+ The system namespace (used by some filesystems to store ACLs) is
+ not supported at this time.
+
+ If unsure, say Y.
config CIFS_POSIX
- bool "CIFS POSIX Extensions"
- depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
- help
- Enabling this option will cause the cifs client to attempt to
+ bool "CIFS POSIX Extensions"
+ depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
+ help
+ Enabling this option will cause the cifs client to attempt to
negotiate a newer dialect with servers, such as Samba 3.0.5
or later, that optionally can handle more POSIX like (rather
than Windows like) file behavior. It also enables
@@ -144,61 +144,62 @@ config CIFS_POSIX
CIFS POSIX ACL support. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support"
- depends on CIFS_XATTR && KEYS
- help
- Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller. See the man
- page for getcifsacl for more information. If unsure, say Y.
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
+ help
+ Allows fetching CIFS/NTFS ACL from the server. The DACL blob
+ is handed over to the application/caller. See the man
+ page for getcifsacl for more information. If unsure, say Y.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
default y
depends on CIFS
help
- Enabling this option adds helpful debugging messages to
- the cifs code which increases the size of the cifs module.
- If unsure, say Y.
+ Enabling this option adds helpful debugging messages to
+ the cifs code which increases the size of the cifs module.
+ If unsure, say Y.
+
config CIFS_DEBUG2
bool "Enable additional CIFS debugging routines"
depends on CIFS_DEBUG
help
- Enabling this option adds a few more debugging routines
- to the cifs code which slightly increases the size of
- the cifs module and can cause additional logging of debug
- messages in some error paths, slowing performance. This
- option can be turned off unless you are debugging
- cifs problems. If unsure, say N.
+ Enabling this option adds a few more debugging routines
+ to the cifs code which slightly increases the size of
+ the cifs module and can cause additional logging of debug
+ messages in some error paths, slowing performance. This
+ option can be turned off unless you are debugging
+ cifs problems. If unsure, say N.
config CIFS_DEBUG_DUMP_KEYS
bool "Dump encryption keys for offline decryption (Unsafe)"
depends on CIFS_DEBUG
help
- Enabling this will dump the encryption and decryption keys
- used to communicate on an encrypted share connection on the
- console. This allows Wireshark to decrypt and dissect
- encrypted network captures. Enable this carefully.
- If unsure, say N.
+ Enabling this will dump the encryption and decryption keys
+ used to communicate on an encrypted share connection on the
+ console. This allows Wireshark to decrypt and dissect
+ encrypted network captures. Enable this carefully.
+ If unsure, say N.
config CIFS_DFS_UPCALL
- bool "DFS feature support"
- depends on CIFS && KEYS
- select DNS_RESOLVER
- help
- Distributed File System (DFS) support is used to access shares
- transparently in an enterprise name space, even if the share
- moves to a different server. This feature also enables
- an upcall mechanism for CIFS which contacts userspace helper
- utilities to provide server name resolution (host names to
- IP addresses) which is needed in order to reconnect to
- servers if their addresses change or for implicit mounts of
- DFS junction points. If unsure, say Y.
+ bool "DFS feature support"
+ depends on CIFS && KEYS
+ select DNS_RESOLVER
+ help
+ Distributed File System (DFS) support is used to access shares
+ transparently in an enterprise name space, even if the share
+ moves to a different server. This feature also enables
+ an upcall mechanism for CIFS which contacts userspace helper
+ utilities to provide server name resolution (host names to
+ IP addresses) which is needed in order to reconnect to
+ servers if their addresses change or for implicit mounts of
+ DFS junction points. If unsure, say Y.
config CIFS_NFSD_EXPORT
- bool "Allow nfsd to export CIFS file system"
- depends on CIFS && BROKEN
- help
- Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
+ help
+ Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
@@ -209,10 +210,9 @@ config CIFS_SMB_DIRECT
say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support"
- depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
- help
- Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
- to be cached locally on disk through the general filesystem cache
- manager. If unsure, say N.
-
+ bool "Provide CIFS client caching support"
+ depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+ help
+ Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+ to be cached locally on disk through the general filesystem cache
+ manager. If unsure, say N.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index e92a2fee3c57..13c1288b04a7 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
- if (tcon->seal)
+
+ seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number);
+
+ if ((tcon->seal) ||
+ (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
+ (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
seq_printf(m, " Encrypted");
if (tcon->nocase)
seq_printf(m, " nocase");
@@ -371,6 +376,10 @@ skip_rdma:
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
#endif
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ seq_puts(m, " encrypted");
+ if (ses->sign)
+ seq_puts(m, " signed");
seq_puts(m, "\n\tShares:");
j = 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d9b99abe1243..5d83c924cc47 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -285,9 +285,9 @@ static void dump_referral(const struct dfs_info3_param *ref)
{
cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
- cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+ cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
ref->flags, ref->server_type);
- cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+ cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
ref->ref_flag, ref->path_consumed);
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 42f0d67f1054..ed49222abecb 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -58,6 +58,7 @@ struct cifs_sb_info {
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index d8bce2f862de..086ddc5108af 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,6 +43,9 @@ struct smb_snapshot_array {
/* snapshots[]; */
} __packed;
+/* query_info flags */
+#define PASSTHRU_QUERY_INFO 0x00000000
+#define PASSTHRU_FSCTL 0x00000001
struct smb_query_info {
__u32 info_type;
__u32 file_info_class;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 62d48d486d8f..f9b71c12cc9f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -381,7 +381,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_puts(s, "krb5");
+ seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -554,6 +554,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+ seq_printf(s, ",bsize=%u", cifs_sb->bsize);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
if (tcon->snapshot_time)
@@ -1007,7 +1008,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
unsigned int xid;
int rc;
- if (remap_flags & ~REMAP_FILE_ADVISORY)
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7652551a1fc4..5c0298b9998f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.17"
+#define CIFS_VERSION "2.19"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94dbdbe5be34..38feae812b47 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -216,6 +216,7 @@ struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
struct cifs_open_parms;
+struct cifs_credits;
struct smb_version_operations {
int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *,
@@ -230,12 +231,15 @@ struct smb_version_operations {
/* check response: verify signature, map error */
int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
bool);
- void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
- const int);
+ void (*add_credits)(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits,
+ const int optype);
void (*set_credits)(struct TCP_Server_Info *, const int);
int * (*get_credits_field)(struct TCP_Server_Info *, const int);
unsigned int (*get_credits)(struct mid_q_entry *);
__u64 (*get_next_mid)(struct TCP_Server_Info *);
+ void (*revert_current_mid)(struct TCP_Server_Info *server,
+ const unsigned int val);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
/*
@@ -383,8 +387,8 @@ struct smb_version_operations {
struct cifs_fid *);
/* calculate a size of SMB message */
unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
- /* check for STATUS_PENDING and process it in a positive case */
- bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+ /* check for STATUS_PENDING and process the response if yes */
+ bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
bool (*is_session_expired)(char *);
/* send oplock break response */
@@ -452,7 +456,11 @@ struct smb_version_operations {
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
- unsigned int *, unsigned int *);
+ unsigned int *, struct cifs_credits *);
+ /* adjust previously taken mtu credits to request size */
+ int (*adjust_credits)(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size);
/* check if we need to issue closedir */
bool (*dir_needs_close)(struct cifsFileInfo *);
long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
@@ -471,6 +479,14 @@ struct smb_version_operations {
struct cifs_tcon *tcon,
__le16 *path, int is_dir,
unsigned long p);
+ /* make unix special files (block, char, fifo, socket) */
+ int (*make_node)(unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ char *full_path,
+ umode_t mode,
+ dev_t device_number);
};
struct smb_version_values {
@@ -557,6 +573,7 @@ struct smb_vol {
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
bool rdma:1;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -710,6 +727,11 @@ struct TCP_Server_Info {
int nr_targets;
};
+struct cifs_credits {
+ unsigned int value;
+ unsigned int instance;
+};
+
static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
@@ -721,28 +743,28 @@ in_flight(struct TCP_Server_Info *server)
}
static inline bool
-has_credits(struct TCP_Server_Info *server, int *credits)
+has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
- return num > 0;
+ return num >= num_credits;
}
static inline void
-add_credits(struct TCP_Server_Info *server, const unsigned int add,
+add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
const int optype)
{
- server->ops->add_credits(server, add, optype);
+ server->ops->add_credits(server, credits, optype);
}
static inline void
-add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+add_credits_and_wake_if(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
- if (add) {
- server->ops->add_credits(server, add, optype);
+ if (credits->value) {
+ server->ops->add_credits(server, credits, optype);
wake_up(&server->request_q);
}
}
@@ -753,6 +775,14 @@ set_credits(struct TCP_Server_Info *server, const int val)
server->ops->set_credits(server, val);
}
+static inline int
+adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ return server->ops->adjust_credits ?
+ server->ops->adjust_credits(server, credits, payload_size) : 0;
+}
+
static inline __le64
get_next_mid64(struct TCP_Server_Info *server)
{
@@ -770,6 +800,22 @@ get_next_mid(struct TCP_Server_Info *server)
return cpu_to_le16(mid);
}
+static inline void
+revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ if (server->ops->revert_current_mid)
+ server->ops->revert_current_mid(server, val);
+}
+
+static inline void
+revert_current_mid_from_hdr(struct TCP_Server_Info *server,
+ const struct smb2_sync_hdr *shdr)
+{
+ unsigned int num = le16_to_cpu(shdr->CreditCharge);
+
+ return revert_current_mid(server, num > 0 ? num : 1);
+}
+
static inline __u16
get_mid(const struct smb_hdr *smb)
{
@@ -924,11 +970,14 @@ cap_unix(struct cifs_ses *ses)
struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
+ bool file_all_info_is_valid:1;
+
struct kref refcount;
struct cifs_fid *fid;
struct mutex fid_mutex;
struct cifs_tcon *tcon;
struct work_struct lease_break;
+ struct smb2_file_all_info file_all_info;
};
/*
@@ -1234,7 +1283,7 @@ struct cifs_readdata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1260,7 +1309,7 @@ struct cifs_writedata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1422,6 +1471,7 @@ struct mid_q_entry {
struct kref refcount;
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
+ __u16 credits; /* number of credits consumed by this mid */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
@@ -1696,6 +1746,7 @@ require use of the stronger protocol */
* GlobalMid_Lock protects:
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
+ * list operations on global DnotifyReqList
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
* tcon->open_file_lock protects the list of open files hanging off the tcon
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 336c116995d7..4f96b3b00a7a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -93,7 +93,8 @@ extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags);
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits);
extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
struct smb_rqst *rqst, int *resp_buf_type,
const int flags, struct kvec *resp_iov);
@@ -115,7 +116,7 @@ extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
unsigned int size, unsigned int *num,
- unsigned int *credits);
+ struct cifs_credits *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */, const int flags,
@@ -133,6 +134,9 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
+ bool fsuid_only,
+ struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bb54ccf8481c..f43747c062a7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -139,8 +139,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
return -ENOMEM;
if (tcon->ipc) {
- snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
goto out;
}
@@ -172,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -822,9 +822,10 @@ static void
cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
DeleteMidQEntry(mid);
- add_credits(server, 1, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
int
@@ -859,7 +860,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
iov[1].iov_base = (char *)smb + 4;
rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL,
- server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
+ server, CIFS_ASYNC_OP | CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -1605,16 +1606,17 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0)) {
+ server->ops->is_status_pending(buf, server)) {
cifs_discard_remaining_data(server);
return -1;
}
/* set up first two iov for signature check and to get credits */
rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->total_read - 4;
+ rdata->iov[0].iov_len = server->vals->header_preamble_size;
+ rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
+ rdata->iov[1].iov_len =
+ server->total_read - server->vals->header_preamble_size;
cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
@@ -1713,6 +1715,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1750,7 +1753,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, 1, 0);
+ add_credits(server, &credits, 0);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1809,7 +1812,7 @@ cifs_async_readv(struct cifs_readdata *rdata)
kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
- cifs_readv_callback, NULL, rdata, 0);
+ cifs_readv_callback, NULL, rdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -2123,14 +2126,18 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+ rc = cifs_get_writable_file(CIFS_I(inode), false,
+ &wdata2->cfile);
if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- break;
+ cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+ rc);
+ if (!is_retryable_error(rc))
+ rc = -EBADF;
+ } else {
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2,
+ cifs_writedata_release);
}
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2, cifs_writedata_release);
for (j = 0; j < nr_pages; j++) {
unlock_page(wdata2->pages[j]);
@@ -2145,6 +2152,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
kref_put(&wdata2->refcount, cifs_writedata_release);
if (is_retryable_error(rc))
continue;
+ i += nr_pages;
break;
}
@@ -2152,6 +2160,13 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
i += nr_pages;
} while (i < wdata->nr_pages);
+ /* cleanup remaining pages from the original wdata */
+ for (; i < wdata->nr_pages; i++) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ put_page(wdata->pages[i]);
+ }
+
if (rc != 0 && !is_retryable_error(rc))
mapping_set_error(inode->i_mapping, rc);
kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2226,6 +2241,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -2261,7 +2277,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, 1, 0);
+ add_credits(tcon->ses->server, &credits, 0);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2339,7 +2355,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
- cifs_writev_callback, NULL, wdata, 0);
+ cifs_writev_callback, NULL, wdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8463c940e0e5..a8e9738db691 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,7 +102,7 @@ enum {
Opt_backupuid, Opt_backupgid, Opt_uid,
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
- Opt_rsize, Opt_wsize, Opt_actimeo,
+ Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
Opt_echo_interval, Opt_max_credits,
Opt_snapshot,
@@ -204,6 +204,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_dirmode, "dirmode=%s" },
{ Opt_dirmode, "dir_mode=%s" },
{ Opt_port, "port=%s" },
+ { Opt_blocksize, "bsize=%s" },
{ Opt_rsize, "rsize=%s" },
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
@@ -348,7 +349,7 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
return -ENOMEM;
}
- snprintf(unc, len, "\\\\%s", server->hostname);
+ scnprintf(unc, len, "\\\\%s", server->hostname);
rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
kfree(unc);
@@ -592,6 +593,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
+ set_credits(server, 1);
spin_lock(&GlobalMid_Lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
@@ -1053,7 +1055,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, length))
+ server->ops->is_status_pending(buf, server))
return -1;
if (!mid)
@@ -1063,6 +1065,26 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return 0;
}
+static void
+smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return;
+
+ if (shdr->CreditRequest) {
+ spin_lock(&server->req_lock);
+ server->credits += le16_to_cpu(shdr->CreditRequest);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ }
+}
+
+
static int
cifs_demultiplex_thread(void *p)
{
@@ -1169,10 +1191,6 @@ next_pdu:
continue;
}
- if (server->large_buf)
- buf = server->bigbuf;
-
-
server->lstrp = jiffies;
for (i = 0; i < num_mids; i++) {
@@ -1192,6 +1210,7 @@ next_pdu:
} else if (server->ops->is_oplock_break &&
server->ops->is_oplock_break(bufs[i],
server)) {
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dbg(FYI, "Received oplock break\n");
} else {
cifs_dbg(VFS, "No task to wake, unknown frame "
@@ -1203,6 +1222,7 @@ next_pdu:
if (server->ops->dump_detail)
server->ops->dump_detail(bufs[i],
server);
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
}
@@ -1486,6 +1506,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol)
const char *delims = "/\\";
size_t len;
+ if (unlikely(!devname || !*devname)) {
+ cifs_dbg(VFS, "Device name not specified.\n");
+ return -EINVAL;
+ }
+
/* make sure we have a valid UNC double delimiter prefix */
len = strspn(devname, delims);
if (len != 2)
@@ -1571,7 +1596,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->cred_uid = current_uid();
vol->linux_uid = current_uid();
vol->linux_gid = current_gid();
-
+ vol->bsize = 1024 * 1024; /* can improve cp performance significantly */
/*
* default to SFM style remapping of seven reserved characters
* unless user overrides it or we negotiate CIFS POSIX where
@@ -1944,6 +1969,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
port = (unsigned short)option;
break;
+ case Opt_blocksize:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid blocksize value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ /*
+ * inode blocksize realistically should never need to be
+ * less than 16K or greater than 16M and default is 1MB.
+ * Note that small inode block sizes (e.g. 64K) can lead
+ * to very poor performance of common tools like cp and scp
+ */
+ if ((option < CIFS_MAX_MSGSIZE) ||
+ (option > (4 * SMB3_DEFAULT_IOSIZE))) {
+ cifs_dbg(VFS, "%s: Invalid blocksize\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->bsize = option;
+ break;
case Opt_rsize:
if (get_option_ul(args, &option)) {
cifs_dbg(VFS, "%s: Invalid rsize value\n",
@@ -2609,7 +2654,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
- tcp_ses->reconnect_instance = 0;
+ tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
@@ -2770,7 +2815,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info)
if (tcon == NULL)
return -ENOMEM;
- snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
+ scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
/* cannot fail */
nls_codepage = load_nls_default();
@@ -3839,6 +3884,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
+ cifs_sb->bsize = pvolume_info->bsize;
/*
* Temporarily set r/wsize for matching superblock. If we end up using
* new sb then client will later negotiate it downward if needed.
@@ -4198,7 +4244,7 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
new_unc = kmalloc(len, GFP_KERNEL);
if (!new_unc)
return -ENOMEM;
- snprintf(new_unc, len, "\\%s", tgt);
+ scnprintf(new_unc, len, "\\%s", tgt);
kfree(vol->UNC);
vol->UNC = new_unc;
@@ -4902,8 +4948,6 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
if (!server->ops->need_neg(server))
return 0;
- set_credits(server, 1);
-
rc = server->ops->negotiate(xid, ses);
if (rc == 0) {
spin_lock(&GlobalMid_Lock);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 907e85d65bb4..f26a48dd2e39 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
{
int rc = -EPERM;
unsigned int xid;
- int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- struct cifs_io_parms io_parms;
char *full_path = NULL;
- struct inode *newinode = NULL;
- __u32 oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- FILE_ALL_INFO *buf = NULL;
- unsigned int bytes_written;
- struct win_dev *pdev;
- struct kvec iov[2];
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
}
- if (tcon->unix_ext) {
- struct cifs_unix_set_info_args args = {
- .mode = mode & ~current_umask(),
- .ctime = NO_CHANGE_64,
- .atime = NO_CHANGE_64,
- .mtime = NO_CHANGE_64,
- .device = device_number,
- };
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = current_fsuid();
- args.gid = current_fsgid();
- } else {
- args.uid = INVALID_UID; /* no change */
- args.gid = INVALID_GID; /* no change */
- }
- rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc)
- goto mknod_out;
-
- rc = cifs_get_inode_info_unix(&newinode, full_path,
- inode->i_sb, xid);
-
- if (rc == 0)
- d_instantiate(direntry, newinode);
- goto mknod_out;
- }
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto mknod_out;
-
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto mknod_out;
-
-
- cifs_dbg(FYI, "sfu compat create special file\n");
-
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto mknod_out;
- }
-
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
- oparms.disposition = FILE_CREATE;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
- if (rc)
- goto mknod_out;
-
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
-
- pdev = (struct win_dev *)buf;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
- iov[1].iov_len = sizeof(struct win_dev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- }
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(direntry);
-
- /* FIXME: add code here to set EAs */
+ rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
+ full_path, mode,
+ device_number);
mknod_out:
kfree(full_path);
- kfree(buf);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 659ce1b92c44..89006e044973 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1645,8 +1645,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX && !rc)
+ if (flock->fl_flags & FL_POSIX) {
+ /*
+ * If this is a request to remove all locks because we
+ * are closing the file, it doesn't matter if the
+ * unlocking failed as both cifs.ko and the SMB server
+ * remove the lock on file close
+ */
+ if (rc) {
+ cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
+ if (!(flock->fl_flags & FL_CLOSE))
+ return rc;
+ }
rc = locks_lock_file_wait(file, flock);
+ }
return rc;
}
@@ -1842,24 +1854,30 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
return NULL;
}
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only)
+/* Return -EBADF if no handle is found and general rc otherwise */
+int
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+ struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
bool any_available = false;
- int rc;
+ int rc = -EBADF;
unsigned int refind = 0;
- /* Having a null inode here (because mapping->host was set to zero by
- the VFS or MM) should not happen but we had reports of on oops (due to
- it being zero) during stress testcases so we need to check for it */
+ *ret_file = NULL;
+
+ /*
+ * Having a null inode here (because mapping->host was set to zero by
+ * the VFS or MM) should not happen but we had reports of on oops (due
+ * to it being zero) during stress testcases so we need to check for it
+ */
if (cifs_inode == NULL) {
cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
dump_stack();
- return NULL;
+ return rc;
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
@@ -1873,7 +1891,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
refind_writable:
if (refind > MAX_REOPEN_ATT) {
spin_unlock(&tcon->open_file_lock);
- return NULL;
+ return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
@@ -1885,7 +1903,8 @@ refind_writable:
/* found a good writable file */
cifsFileInfo_get(open_file);
spin_unlock(&tcon->open_file_lock);
- return open_file;
+ *ret_file = open_file;
+ return 0;
} else {
if (!inv_file)
inv_file = open_file;
@@ -1907,22 +1926,35 @@ refind_writable:
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
- if (!rc)
- return inv_file;
- else {
- spin_lock(&tcon->open_file_lock);
- list_move_tail(&inv_file->flist,
- &cifs_inode->openFileList);
- spin_unlock(&tcon->open_file_lock);
- cifsFileInfo_put(inv_file);
- ++refind;
- inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
- goto refind_writable;
+ if (!rc) {
+ *ret_file = inv_file;
+ return 0;
}
+
+ spin_lock(&tcon->open_file_lock);
+ list_move_tail(&inv_file->flist, &cifs_inode->openFileList);
+ spin_unlock(&tcon->open_file_lock);
+ cifsFileInfo_put(inv_file);
+ ++refind;
+ inv_file = NULL;
+ spin_lock(&tcon->open_file_lock);
+ goto refind_writable;
}
- return NULL;
+ return rc;
+}
+
+struct cifsFileInfo *
+find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+{
+ struct cifsFileInfo *cfile;
+ int rc;
+
+ rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ if (rc)
+ cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
+
+ return cfile;
}
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
@@ -1959,8 +1991,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- open_file = find_writable_file(CIFS_I(mapping->host), false);
- if (open_file) {
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
@@ -1970,9 +2002,12 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
rc = 0;
else if (bytes_written < 0)
rc = bytes_written;
+ else
+ rc = -EFAULT;
} else {
- cifs_dbg(FYI, "No writeable filehandles for inode\n");
- rc = -EIO;
+ cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
+ if (!is_retryable_error(rc))
+ rc = -EIO;
}
kunmap(page);
@@ -2079,9 +2114,9 @@ static int
wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
struct address_space *mapping, struct writeback_control *wbc)
{
- int rc = 0;
- struct TCP_Server_Info *server;
- unsigned int i;
+ int rc;
+ struct TCP_Server_Info *server =
+ tlink_tcon(wdata->cfile->tlink)->ses->server;
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
@@ -2091,21 +2126,16 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
page_offset(wdata->pages[nr_pages - 1]),
(loff_t)PAGE_SIZE);
wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
+ wdata->pid = wdata->cfile->pid;
- if (wdata->cfile != NULL)
- cifsFileInfo_put(wdata->cfile);
- wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- } else {
- wdata->pid = wdata->cfile->pid;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata, cifs_writedata_release);
- }
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ return rc;
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
return rc;
}
@@ -2113,11 +2143,13 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ struct inode *inode = mapping->host;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct TCP_Server_Info *server;
bool done = false, scanned = false, range_whole = false;
pgoff_t end, index;
struct cifs_writedata *wdata;
+ struct cifsFileInfo *cfile = NULL;
int rc = 0;
int saved_rc = 0;
unsigned int xid;
@@ -2143,11 +2175,23 @@ static int cifs_writepages(struct address_space *mapping,
server = cifs_sb_master_tcon(cifs_sb)->ses->server;
retry:
while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages, wsize, credits;
+ unsigned int i, nr_pages, found_pages, wsize;
pgoff_t next = 0, tofind, saved_index = index;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+ int get_file_rc = 0;
+
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
+ rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+
+ /* in case of an error store it to return later */
+ if (rc)
+ get_file_rc = rc;
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc != 0) {
done = true;
break;
@@ -2180,13 +2224,26 @@ retry:
continue;
}
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
+ wdata->cfile = cfile;
+ cfile = NULL;
+
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
+ get_file_rc);
+ if (is_retryable_error(get_file_rc))
+ rc = get_file_rc;
+ else
+ rc = -EBADF;
+ } else
+ rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
- rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
/* send failure -- clean up the mess */
if (rc != 0) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
for (i = 0; i < nr_pages; ++i) {
if (is_retryable_error(rc))
redirty_page_for_writepage(wbc,
@@ -2238,6 +2295,8 @@ retry:
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+ if (cfile)
+ cifsFileInfo_put(cfile);
free_xid(xid);
return rc;
}
@@ -2567,47 +2626,62 @@ static int
cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
- /*
- * Wait for credits to resend this wdata.
- * Note: we are attempting to resend the whole wdata not in segments
- */
do {
- rc = server->ops->wait_mtu_credits(
- server, wdata->bytes, &wsize, &credits);
+ if (wdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(wdata->cfile, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
- if (rc)
- goto out;
- if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
- msleep(1000);
- }
- } while (wsize < wdata->bytes);
+ /*
+ * Wait for credits to resend this wdata.
+ * Note: we are attempting to resend the whole wdata not in
+ * segments
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, wdata->bytes,
+ &wsize, &credits);
+ if (rc)
+ goto fail;
+
+ if (wsize < wdata->bytes) {
+ add_credits_and_wake_if(server, &credits, 0);
+ msleep(1000);
+ }
+ } while (wsize < wdata->bytes);
+ wdata->credits = credits;
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- rc = 0;
- if (wdata->cfile->invalidHandle)
- rc = cifs_reopen_file(wdata->cfile, false);
- if (!rc)
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
- }
+ }
- if (!rc) {
- list_add_tail(&wdata->list, wdata_list);
- return 0;
- }
+ /* If the write was successfully sent, we are done */
+ if (!rc) {
+ list_add_tail(&wdata->list, wdata_list);
+ return 0;
+ }
- add_credits_and_wake_if(server, wdata->credits, 0);
-out:
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+ /* Roll back credits and retry if needed */
+ add_credits_and_wake_if(server, &wdata->credits, 0);
+ } while (rc == -EAGAIN);
+fail:
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
return rc;
}
@@ -2627,6 +2701,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct TCP_Server_Info *server;
struct page **pagevec;
size_t start;
+ unsigned int xid;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2634,12 +2709,23 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
pid = current->tgid;
server = tlink_tcon(open_file->tlink)->ses->server;
+ xid = get_xid();
do {
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc)
break;
@@ -2731,16 +2817,22 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pid = pid;
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
wdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!wdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(wdata->cfile, false)))
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
if (rc == -EAGAIN) {
@@ -2756,6 +2848,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
len -= cur_len;
} while (len > 0);
+ free_xid(xid);
return rc;
}
@@ -2816,12 +2909,12 @@ restart_loop:
wdata->bytes, &tmp_from,
ctx->cfile, cifs_sb, &tmp_list,
ctx);
+
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
}
list_splice(&tmp_list, &ctx->list);
-
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
goto restart_loop;
}
}
@@ -3028,14 +3121,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
* these pages but not on the region from pos to ppos+len-1.
*/
written = cifs_user_writev(iocb, from);
- if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ if (CIFS_CACHE_READ(cinode)) {
/*
- * Windows 7 server can delay breaking level2 oplock if a write
- * request comes - break it on the client to prevent reading
- * an old data.
+ * We have read level caching and we have just sent a write
+ * request to the server thus making data in the cache stale.
+ * Zap the cache and set oplock/lease level to NONE to avoid
+ * reading stale data from the cache. All subsequent read
+ * operations will read new data from the server.
*/
cifs_zap_mapping(inode);
- cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+ cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n",
inode);
cinode->oplock = 0;
}
@@ -3260,48 +3355,61 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
struct list_head *rdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int rsize, credits;
+ unsigned int rsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
- /*
- * Wait for credits to resend this rdata.
- * Note: we are attempting to resend the whole rdata not in segments
- */
do {
- rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+ if (rdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(rdata->cfile, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
+ /*
+ * Wait for credits to resend this rdata.
+ * Note: we are attempting to resend the whole rdata not in
+ * segments
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, rdata->bytes,
&rsize, &credits);
- if (rc)
- goto out;
+ if (rc)
+ goto fail;
- if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
- msleep(1000);
- }
- } while (rsize < rdata->bytes);
+ if (rsize < rdata->bytes) {
+ add_credits_and_wake_if(server, &credits, 0);
+ msleep(1000);
+ }
+ } while (rsize < rdata->bytes);
+ rdata->credits = credits;
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- rc = 0;
- if (rdata->cfile->invalidHandle)
- rc = cifs_reopen_file(rdata->cfile, true);
- if (!rc)
- rc = server->ops->async_readv(rdata);
- }
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
- if (!rc) {
- /* Add to aio pending list */
- list_add_tail(&rdata->list, rdata_list);
- return 0;
- }
+ /* If the read was successfully sent, we are done */
+ if (!rc) {
+ /* Add to aio pending list */
+ list_add_tail(&rdata->list, rdata_list);
+ return 0;
+ }
- add_credits_and_wake_if(server, rdata->credits, 0);
-out:
- kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
+ /* Roll back credits and retry if needed */
+ add_credits_and_wake_if(server, &rdata->credits, 0);
+ } while (rc == -EAGAIN);
+fail:
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
return rc;
}
@@ -3311,7 +3419,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
- unsigned int npages, rsize, credits;
+ unsigned int npages, rsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
size_t cur_len;
int rc;
pid_t pid;
@@ -3331,8 +3441,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
iov_iter_advance(&direct_iov, offset - ctx->pos);
do {
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -3406,15 +3524,21 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
rdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
if (rc == -EAGAIN) {
@@ -3533,8 +3657,6 @@ again:
ctx->total_len = ctx->len - iov_iter_count(to);
}
- cifs_stats_bytes_read(tcon, ctx->total_len);
-
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
@@ -4095,10 +4217,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
loff_t offset;
struct page *page, *tpage;
struct cifs_readdata *rdata;
- unsigned credits;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -4144,18 +4275,24 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
rdata->pages[rdata->nr_pages++] = page;
}
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
lru_cache_add_file(page);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 478003644916..53fdb5df0d2e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2080,7 +2080,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
return rc;
generic_fillattr(inode, stat);
- stat->blksize = CIFS_MAX_MSGSIZE;
+ stat->blksize = cifs_sb->bsize;
stat->ino = CIFS_I(inode)->uniqueid;
/* old CIFS Unix Extensions doesn't return create time */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2148b0f60e5e..62216dc8f9f5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -103,9 +103,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
return rc;
}
- snprintf(md5_str2, sizeof(md5_str2),
- CIFS_MF_SYMLINK_MD5_FORMAT,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(md5_str2, sizeof(md5_str2),
+ CIFS_MF_SYMLINK_MD5_FORMAT,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
if (strncmp(md5_str1, md5_str2, 17) != 0)
return -EINVAL;
@@ -142,10 +142,10 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
return rc;
}
- snprintf(buf, buf_len,
- CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
- link_len,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(buf, buf_len,
+ CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+ link_len,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
memcpy(buf + ofs, link_str, link_len);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 32a6c020478f..c711f1f39bf2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -117,11 +117,11 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
}
static void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+cifs_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
spin_lock(&server->req_lock);
- server->credits += add;
+ server->credits += credits->value;
server->in_flight--;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
@@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
remaining = tgt_total_cnt - total_in_tgt;
if (remaining < 0) {
- cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+ cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n",
tgt_total_cnt, total_in_tgt);
return -EPROTO;
}
@@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server)
return false;
}
+static int
+cifs_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *newinode = NULL;
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ if (tcon->unix_ext) {
+ /*
+ * SMB1 Unix Extensions: requires server support but
+ * works with all special files
+ */
+ struct cifs_unix_set_info_args args = {
+ .mode = mode & ~current_umask(),
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = dev,
+ };
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
+ } else {
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
+ }
+ rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ if (rc)
+ goto out;
+
+ rc = cifs_get_inode_info_unix(&newinode, full_path,
+ inode->i_sb, xid);
+
+ if (rc == 0)
+ d_instantiate(dentry, newinode);
+ goto out;
+ }
+
+ /*
+ * SMB1 SFU emulation: should work with all servers, but only
+ * support block and char device (no socket & fifo)
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
#endif /* CIFS_ACL */
+ .make_node = cifs_make_node,
};
struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 01a76bccdb8d..278405d26c47 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -37,6 +37,16 @@
#include "smb2pdu.h"
#include "smb2proto.h"
+static void
+free_set_inf_compound(struct smb_rqst *rqst)
+{
+ if (rqst[1].rq_iov)
+ SMB2_set_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+}
+
+
static int
smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
PATH_MAX * 2, 0, NULL);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_DELETE:
+ trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
* SMB2_open() call.
*/
+ trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_RMDIR:
memset(&si_iov, 0, sizeof(si_iov));
@@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_EOF:
memset(&si_iov, 0, sizeof(si_iov));
@@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_INFO:
memset(&si_iov, 0, sizeof(si_iov));
@@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_RENAME:
memset(&si_iov, 0, sizeof(si_iov));
@@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_HARDLINK:
memset(&si_iov, 0, sizeof(si_iov));
@@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
break;
default:
cifs_dbg(VFS, "Invalid command\n");
@@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_query_info_free(&rqst[1]);
if (rqst[2].rq_iov)
SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
break;
case SMB2_OP_DELETE:
+ if (rc)
+ trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ if (rqst[1].rq_iov)
+ SMB2_close_free(&rqst[1]);
+ break;
case SMB2_OP_MKDIR:
+ if (rc)
+ trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
if (rqst[1].rq_iov)
SMB2_close_free(&rqst[1]);
break;
case SMB2_OP_HARDLINK:
+ if (rc)
+ trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RENAME:
+ if (rc)
+ trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RMDIR:
+ if (rc)
+ trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_EOF:
+ if (rc)
+ trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_INFO:
- if (rqst[1].rq_iov)
- SMB2_set_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_set_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_set_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ free_set_inf_compound(rqst);
break;
}
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
@@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_shroot(xid, tcon, &fid);
if (rc)
goto out;
- rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, smb2_data);
+
+ if (tcon->crfid.file_all_info_is_valid) {
+ move_smb2_info_to_cifs(data,
+ &tcon->crfid.file_all_info);
+ } else {
+ rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, smb2_data);
+ if (!rc)
+ move_smb2_info_to_cifs(data, smb2_data);
+ }
close_shroot(&tcon->crfid);
- if (rc)
- goto out;
- move_smb2_info_to_cifs(data, smb2_data);
goto out;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 924269cec135..e32c264e3adb 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
"STATUS_UNFINISHED_CONTEXT_DELETED"},
{STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
- {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+ /* Note that ENOATTTR and ENODATA are the same errno */
+ {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"},
{STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
{STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
"STATUS_WRONG_CREDENTIAL_HANDLE"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b8b58fb4d3f..0e3570e40ff8 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
__u8 lease_state;
struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_pending_open *open;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
@@ -537,13 +536,25 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
- server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
-
if (ack_req)
cfile->oplock_break_cancelled = false;
else
cfile->oplock_break_cancelled = true;
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+
+ /*
+ * Set or clear flags depending on the lease state being READ.
+ * HANDLE caching flag should be added when the client starts
+ * to defer closing remote file handles with HANDLE leases.
+ */
+ if (lease_state & SMB2_LEASE_READ_CACHING_HE)
+ set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+
queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
@@ -648,13 +659,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
- if (rsp->sync_hdr.CreditRequest) {
- spin_lock(&server->req_lock);
- server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- }
-
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 6f96e2292856..1022a3771e14 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -67,10 +67,13 @@ change_conf(struct TCP_Server_Info *server)
}
static void
-smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+smb2_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
int *val, rc = -1;
+ unsigned int add = credits->value;
+ unsigned int instance = credits->instance;
+ bool reconnect_detected = false;
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
@@ -79,8 +82,11 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
server->hostname, *val);
+ if ((instance == 0) || (instance == server->reconnect_instance))
+ *val += add;
+ else
+ reconnect_detected = true;
- *val += add;
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
@@ -102,7 +108,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- if (server->tcpStatus == CifsNeedReconnect)
+ if (reconnect_detected)
+ cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
+ add, instance);
+
+ if (server->tcpStatus == CifsNeedReconnect
+ || server->tcpStatus == CifsExiting)
return;
switch (rc) {
@@ -163,7 +174,7 @@ smb2_get_credits(struct mid_q_entry *mid)
static int
smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
int rc = 0;
unsigned int scredits;
@@ -174,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
rc = wait_event_killable(server->request_q,
- has_credits(server, &server->credits));
+ has_credits(server, &server->credits, 1));
cifs_num_waiters_dec(server);
if (rc)
return rc;
@@ -189,7 +200,8 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
/* can deadlock with reopen */
if (scredits <= 8) {
*num = SMB2_MAX_BUFFER_SIZE;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = 0;
break;
}
@@ -198,8 +210,10 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
*num = min_t(unsigned int, size,
scredits * SMB2_MAX_BUFFER_SIZE);
- *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
- server->credits -= *credits;
+ credits->value =
+ DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+ credits->instance = server->reconnect_instance;
+ server->credits -= credits->value;
server->in_flight++;
break;
}
@@ -208,6 +222,38 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
return rc;
}
+static int
+smb2_adjust_credits(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
+
+ if (!credits->value || credits->value == new_val)
+ return 0;
+
+ if (credits->value < new_val) {
+ WARN_ONCE(1, "request has less credits (%d) than required (%d)",
+ credits->value, new_val);
+ return -ENOTSUPP;
+ }
+
+ spin_lock(&server->req_lock);
+
+ if (server->reconnect_instance != credits->instance) {
+ spin_unlock(&server->req_lock);
+ cifs_dbg(VFS, "trying to return %d credits to old session\n",
+ credits->value - new_val);
+ return -EAGAIN;
+ }
+
+ server->credits += credits->value - new_val;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ credits->value = new_val;
+ return 0;
+}
+
static __u64
smb2_get_next_mid(struct TCP_Server_Info *server)
{
@@ -219,6 +265,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
return mid;
}
+static void
+smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ spin_lock(&GlobalMid_Lock);
+ if (server->CurrentMid >= val)
+ server->CurrentMid -= val;
+ spin_unlock(&GlobalMid_Lock);
+}
+
static struct mid_q_entry *
smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
@@ -564,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref)
SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
cfid->fid->volatile_fid);
cfid->is_valid = false;
+ cfid->file_all_info_is_valid = false;
}
}
@@ -588,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work)
*/
int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
{
- struct cifs_open_parms oparams;
- int rc;
- __le16 srch_path = 0; /* Null - since an open of top of share */
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
+ struct cifs_open_parms oparms;
+ struct smb2_create_rsp *o_rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int resp_buftype[2];
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ int rc, flags = 0;
+ __le16 utf16_path = 0; /* Null - since an open of top of share */
u8 oplock = SMB2_OPLOCK_LEVEL_II;
mutex_lock(&tcon->crfid.fid_mutex);
@@ -602,22 +667,89 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
return 0;
}
- oparams.tcon = tcon;
- oparams.create_options = 0;
- oparams.desired_access = FILE_READ_ATTRIBUTES;
- oparams.disposition = FILE_OPEN;
- oparams.fid = pfid;
- oparams.reconnect = false;
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
- rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL);
- if (rc == 0) {
- memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
- tcon->crfid.tcon = tcon;
- tcon->crfid.is_valid = true;
- kref_init(&tcon->crfid.refcount);
- kref_get(&tcon->crfid.refcount);
- }
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ oparms.tcon = tcon;
+ oparms.create_options = 0;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = pfid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path);
+ if (rc)
+ goto oshr_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto oshr_exit;
+
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, ses, flags, 2, rqst,
+ resp_buftype, rsp_iov);
+ if (rc)
+ goto oshr_exit;
+
+ o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
+ oplock = smb2_parse_lease_state(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key);
+ else
+ goto oshr_exit;
+
+
+ memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
+ tcon->crfid.tcon = tcon;
+ tcon->crfid.is_valid = true;
+ kref_init(&tcon->crfid.refcount);
+ kref_get(&tcon->crfid.refcount);
+
+
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ goto oshr_exit;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ sizeof(struct smb2_file_all_info),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ (char *)&tcon->crfid.file_all_info);
+ if (rc)
+ goto oshr_exit;
+ tcon->crfid.file_all_info_is_valid = 1;
+
+ oshr_exit:
mutex_unlock(&tcon->crfid.fid_mutex);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -940,6 +1072,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ if (ses->server->ops->query_all_EAs) {
+ if (!ea_value) {
+ rc = ses->server->ops->query_all_EAs(xid, tcon, path,
+ ea_name, NULL, 0,
+ cifs_sb);
+ if (rc == -ENODATA)
+ goto sea_exit;
+ }
+ }
+
/* Open */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -1188,7 +1330,8 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_query_info __user *pqi;
int rc = 0;
int flags = 0;
- struct smb2_query_info_rsp *rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ struct smb2_ioctl_rsp *io_rsp = NULL;
void *buffer = NULL;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -1198,6 +1341,7 @@ smb2_ioctl_query_info(const unsigned int xid,
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_fid fid;
struct kvec qi_iov[1];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec close_iov[1];
memset(rqst, 0, sizeof(rqst));
@@ -1248,15 +1392,35 @@ smb2_ioctl_query_info(const unsigned int xid,
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
- rqst[1].rq_nvec = 1;
-
- rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- qi.file_info_class, qi.info_type,
- qi.additional_information,
+ if (qi.flags & PASSTHRU_FSCTL) {
+ /* Can eventually relax perm check since server enforces too */
+ if (!capable(CAP_SYS_ADMIN))
+ rc = -EPERM;
+ else {
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[1].rq_iov = io_iov;
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, true, NULL,
+ 0);
+ }
+ } else if (qi.flags == PASSTHRU_QUERY_INFO) {
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, qi.file_info_class,
+ qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ } else { /* unknown flags */
+ cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags);
+ rc = -EINVAL;
+ }
+
if (rc)
goto iqinf_exit;
smb2_set_next_command(tcon, &rqst[1]);
@@ -1276,24 +1440,44 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype, rsp_iov);
if (rc)
goto iqinf_exit;
- pqi = (struct smb_query_info __user *)arg;
- rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length)
- qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength);
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
+ if (qi.flags & PASSTHRU_FSCTL) {
+ pqi = (struct smb_query_info __user *)arg;
+ io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ } else {
+ pqi = (struct smb_query_info __user *)arg;
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
}
iqinf_exit:
kfree(buffer);
SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
+ if (qi.flags & PASSTHRU_FSCTL)
+ SMB2_ioctl_free(&rqst[1]);
+ else
+ SMB2_query_info_free(&rqst[1]);
+
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
@@ -1753,14 +1937,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
* the number of credits and return true. Otherwise - return false.
*/
static bool
-smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
if (shdr->Status != STATUS_PENDING)
return false;
- if (!length) {
+ if (shdr->CreditRequest) {
spin_lock(&server->req_lock);
server->credits += le16_to_cpu(shdr->CreditRequest);
spin_unlock(&server->req_lock);
@@ -2407,22 +2591,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
+ struct cifs_ses *ses = tcon->ses;
struct inode *inode;
struct cifsInodeInfo *cifsi;
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
+ struct smb_rqst rqst[2];
+ int resp_buftype[2];
+ struct kvec rsp_iov[2];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec si_iov[1];
+ unsigned int size[1];
+ void *data[1];
long rc;
unsigned int xid;
+ int num = 0, flags = 0;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
+
+
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
@@ -2433,33 +2633,73 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
*/
if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
- /*
- * need to make sure we are not asked to extend the file since the SMB3
- * fsctl does not change the file size. In the future we could change
- * this to zero the first part of the range then set the file size
- * which for a non sparse file would zero the newly extended range
- */
- if (keep_size == false)
- if (i_size_read(inode) < offset + len) {
- rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
- }
-
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information), NULL, NULL);
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[num].rq_iov = io_iov;
+ rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+ rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ true /* is_fctl */, (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information));
+ if (rc)
+ goto zero_range_exit;
+
+ /*
+ * do we also need to change the size of the file?
+ */
+ if (keep_size == false && i_size_read(inode) < offset + len) {
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num].rq_iov = si_iov;
+ rqst[num].rq_nvec = 1;
+
+ eof = cpu_to_le64(offset + len);
+ size[0] = 8; /* sizeof __le64 */
+ data[0] = &eof;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num++],
+ cfile->fid.persistent_fid,
+ cfile->fid.persistent_fid,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_related(&rqst[1]);
+ }
+
+ rc = compound_send_recv(xid, ses, flags, num, rqst,
+ resp_buftype, rsp_iov);
+
+ zero_range_exit:
+ SMB2_ioctl_free(&rqst[0]);
+ SMB2_set_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_xid(xid);
+ if (rc)
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
+ else
+ trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
return rc;
}
@@ -2508,15 +2748,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile = file->private_data;
long rc = -EOPNOTSUPP;
unsigned int xid;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
@@ -2536,6 +2781,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
/* BB: in future add else clause to extend file */
else
rc = -EOPNOTSUPP;
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
}
@@ -2551,14 +2802,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
rc = -EOPNOTSUPP;
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
- rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ } else {
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ if (i_size_read(inode) < off + len) {
+ eof = cpu_to_le64(off + len);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid,
+ &eof);
+ }
}
- /* BB: else ... in future add code to extend file and set sparse */
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
@@ -2595,6 +2863,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server,
}
static void
+smb21_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ server->ops->set_oplock_level(cinode,
+ set_level2 ? SMB2_LEASE_READ_CACHING_HE :
+ 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -3210,15 +3487,15 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0))
+ server->ops->is_status_pending(buf, server))
return -1;
/* set up first two iov to get credits */
rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
+ rdata->iov[0].iov_len = 0;
+ rdata->iov[1].iov_base = buf;
rdata->iov[1].iov_len =
- min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4;
+ min_t(unsigned int, buf_len, server->vals->read_rsp_size);
cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
@@ -3530,6 +3807,104 @@ smb2_next_header(char *buf)
return le32_to_cpu(hdr->NextCommand);
}
+static int
+smb2_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ /*
+ * Check if mounted with mount parm 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ /*
+ * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files
+ * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+ */
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -3541,6 +3916,7 @@ struct smb_version_operations smb20_operations = {
.get_credits = smb2_get_credits,
.wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3623,6 +3999,7 @@ struct smb_version_operations smb20_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb21_operations = {
@@ -3635,7 +4012,9 @@ struct smb_version_operations smb21_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3646,7 +4025,7 @@ struct smb_version_operations smb21_operations = {
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -3719,6 +4098,7 @@ struct smb_version_operations smb21_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb30_operations = {
@@ -3731,7 +4111,9 @@ struct smb_version_operations smb30_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3743,7 +4125,7 @@ struct smb_version_operations smb30_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -3824,6 +4206,7 @@ struct smb_version_operations smb30_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb311_operations = {
@@ -3836,7 +4219,9 @@ struct smb_version_operations smb311_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3848,7 +4233,7 @@ struct smb_version_operations smb311_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -3930,6 +4315,7 @@ struct smb_version_operations smb311_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 77b3aaa39b35..21ac19ff19cb 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -107,13 +107,13 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
struct TCP_Server_Info *server = tcon->ses->server;
spin_lock(&server->req_lock);
- /* Request up to 2 credits but don't go over the limit. */
+ /* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
shdr->CreditRequest = cpu_to_le16(0);
else
shdr->CreditRequest = cpu_to_le16(
min_t(int, server->max_credits -
- server->credits, 2));
+ server->credits, 10));
spin_unlock(&server->req_lock);
} else {
shdr->CreditRequest = cpu_to_le16(2);
@@ -173,8 +173,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
return -ENOMEM;
if (tcon->ipc) {
- snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
goto out;
}
@@ -206,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -490,6 +490,23 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ pneg_ctxt->Name[0] = 0x93;
+ pneg_ctxt->Name[1] = 0xAD;
+ pneg_ctxt->Name[2] = 0x25;
+ pneg_ctxt->Name[3] = 0x50;
+ pneg_ctxt->Name[4] = 0x9C;
+ pneg_ctxt->Name[5] = 0xB4;
+ pneg_ctxt->Name[6] = 0x11;
+ pneg_ctxt->Name[7] = 0xE7;
+ pneg_ctxt->Name[8] = 0xB4;
+ pneg_ctxt->Name[9] = 0x23;
+ pneg_ctxt->Name[10] = 0x83;
+ pneg_ctxt->Name[11] = 0xDE;
+ pneg_ctxt->Name[12] = 0x96;
+ pneg_ctxt->Name[13] = 0x8B;
+ pneg_ctxt->Name[14] = 0xCD;
+ pneg_ctxt->Name[15] = 0x7C;
}
static void
@@ -986,8 +1003,14 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
(char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
-
- if (rc != 0) {
+ if (rc == -EOPNOTSUPP) {
+ /*
+ * Old Windows versions or Netapp SMB server can return
+ * not supported error. Client should accept it.
+ */
+ cifs_dbg(VFS, "Server does not support validate negotiate\n");
+ return 0;
+ } else if (rc != 0) {
cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
rc = -EIO;
goto out_free_inbuf;
@@ -1605,15 +1628,25 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
iov[1].iov_base = unc_path;
iov[1].iov_len = unc_path_len;
- /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
+ /*
+ * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1
+ * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1
+ * (Samba servers don't always set the flag so also check if null user)
+ */
if ((ses->server->dialect == SMB311_PROT_ID) &&
- !smb3_encryption_required(tcon))
+ !smb3_encryption_required(tcon) &&
+ !(ses->session_flags &
+ (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
+ ((ses->user_name != NULL) || (ses->sectype == Kerberos)))
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ /* Need 64 for max size write so ask for more in case not there yet */
+ req->sync_hdr.CreditRequest = cpu_to_le16(64);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
@@ -1771,9 +1804,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
return buf;
}
-static __u8
-parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key)
+__u8
+smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key)
{
char *data_offset;
struct create_context *cc;
@@ -2170,6 +2204,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
rqst.rq_iov = iov;
rqst.rq_nvec = n_iov;
+ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
+ FILE_WRITE_ATTRIBUTES);
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
if (rc) {
@@ -2388,6 +2424,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc)
goto creat_exit;
+ trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid,
+ oparms->create_options, oparms->desired_access);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
@@ -2425,8 +2464,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
- *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key);
+ *oplock = smb2_parse_lease_state(server, rsp,
+ &oparms->fid->epoch,
+ oparms->fid->lease_key);
else
*oplock = rsp->OplockLevel;
creat_exit:
@@ -2435,65 +2475,46 @@ creat_exit:
return rc;
}
-/*
- * SMB2 IOCTL is used for both IOCTLs and FSCTLs
- */
int
-SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen,
- char **out_data, u32 *plen /* returned data len */)
+SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen)
{
- struct smb_rqst rqst;
struct smb2_ioctl_req *req;
- struct smb2_ioctl_rsp *rsp;
- struct cifs_ses *ses;
- struct kvec iov[2];
- struct kvec rsp_iov;
- int resp_buftype;
- int n_iov;
- int rc = 0;
- int flags = 0;
+ struct kvec *iov = rqst->rq_iov;
unsigned int total_len;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
-
- if (tcon)
- ses = tcon->ses;
- else
- return -EIO;
-
- if (!ses || !(ses->server))
- return -EIO;
+ int rc;
rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
if (rc)
return rc;
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
req->CtlCode = cpu_to_le32(opcode);
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ iov[0].iov_base = (char *)req;
+ /*
+ * If no input data, the size of ioctl struct in
+ * protocol spec still includes a 1 byte data buffer,
+ * but if input data passed to ioctl, we do not
+ * want to double count this, so we do not send
+ * the dummy one byte of data in iovec[0] if sending
+ * input data (in iovec[1]).
+ */
if (indatalen) {
req->InputCount = cpu_to_le32(indatalen);
/* do not set InputOffset if no input data */
req->InputOffset =
cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
+ rqst->rq_nvec = 2;
+ iov[0].iov_len = total_len - 1;
iov[1].iov_base = in_data;
iov[1].iov_len = indatalen;
- n_iov = 2;
- } else
- n_iov = 1;
+ } else {
+ rqst->rq_nvec = 1;
+ iov[0].iov_len = total_len;
+ }
req->OutputOffset = 0;
req->OutputCount = 0; /* MBZ */
@@ -2515,33 +2536,70 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
else
req->Flags = 0;
- iov[0].iov_base = (char *)req;
-
- /*
- * If no input data, the size of ioctl struct in
- * protocol spec still includes a 1 byte data buffer,
- * but if input data passed to ioctl, we do not
- * want to double count this, so we do not send
- * the dummy one byte of data in iovec[0] if sending
- * input data (in iovec[1]).
- */
-
- if (indatalen) {
- iov[0].iov_len = total_len - 1;
- } else
- iov[0].iov_len = total_len;
-
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ return 0;
+}
+
+void
+SMB2_ioctl_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl,
+ char *in_data, u32 indatalen,
+ char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb_rqst rqst;
+ struct smb2_ioctl_rsp *rsp = NULL;
+ struct cifs_ses *ses;
+ struct kvec iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec rsp_iov = {NULL, 0};
+ int resp_buftype = CIFS_NO_BUFFER;
+ int rc = 0;
+ int flags = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (tcon)
+ ses = tcon->ses;
+ else
+ return -EIO;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
memset(&rqst, 0, sizeof(struct smb_rqst));
+ memset(&iov, 0, sizeof(iov));
rqst.rq_iov = iov;
- rqst.rq_nvec = n_iov;
+ rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid,
+ opcode, is_fsctl, in_data, indatalen);
+ if (rc)
+ goto ioctl_exit;
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
- cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
if (rc != 0)
@@ -2591,6 +2649,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
}
ioctl_exit:
+ SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -2837,6 +2896,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qinf_exit;
+ trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -2847,6 +2909,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
goto qinf_exit;
}
+ trace_smb3_query_info_done(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
if (dlen) {
*dlen = le32_to_cpu(rsp->OutputBufferLength);
if (!*data) {
@@ -2924,14 +2989,16 @@ smb2_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
if (mid->mid_state == MID_RESPONSE_RECEIVED
- || mid->mid_state == MID_RESPONSE_MALFORMED)
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ || mid->mid_state == MID_RESPONSE_MALFORMED) {
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ }
DeleteMidQEntry(mid);
- add_credits(server, credits_received, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
void smb2_reconnect_server(struct work_struct *work)
@@ -3023,7 +3090,7 @@ SMB2_echo(struct TCP_Server_Info *server)
iov[0].iov_base = (char *)req;
rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
- server, CIFS_ECHO_OP);
+ server, CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -3114,6 +3181,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
+
+ trace_smb3_read_enter(0 /* xid */,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
@@ -3184,7 +3256,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
@@ -3199,7 +3271,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* result already set, check signature */
if (server->sign && !mid->decrypted) {
int rc;
@@ -3224,11 +3297,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_RESPONSE_MALFORMED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* fall through */
default:
- if (rdata->result != -ENODATA)
- rdata->result = -EIO;
+ rdata->result = -EIO;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
@@ -3255,7 +3328,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_readv - send an async read, and set up mid to handle result */
@@ -3285,17 +3358,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
rc = smb2_new_read_req(
(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
- if (rc) {
- if (rc == -EAGAIN && rdata->credits) {
- /* credits was reset by reconnect */
- rdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
+ if (rc)
return rc;
- }
if (smb3_encryption_required(io_parms.tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3305,24 +3369,24 @@ smb2_async_readv(struct cifs_readdata *rdata)
shdr = (struct smb2_sync_hdr *)buf;
- if (rdata->credits) {
+ if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest =
cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
- spin_lock(&server->req_lock);
- server->credits += rdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- rdata->credits = le16_to_cpu(shdr->CreditCharge);
+
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (rc)
+ goto async_readv_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&rdata->refcount);
rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
cifs_readv_receive, smb2_readv_callback,
- smb3_handle_read_data, rdata, flags);
+ smb3_handle_read_data, rdata, flags,
+ &rdata->credits);
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@ -3332,6 +3396,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
io_parms.offset, io_parms.length, rc);
}
+async_readv_out:
cifs_small_buf_release(buf);
return rc;
}
@@ -3378,7 +3443,10 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
- }
+ } else
+ trace_smb3_read_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc == -ENODATA ? 0 : rc;
} else
@@ -3417,14 +3485,16 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
- wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
break;
@@ -3448,7 +3518,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
/* fall through */
default:
wdata->result = -EIO;
@@ -3481,7 +3552,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_writev - send an async write, and set up mid to handle result */
@@ -3499,17 +3570,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
unsigned int total_len;
rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len);
- if (rc) {
- if (rc == -EAGAIN && wdata->credits) {
- /* credits was reset by reconnect */
- wdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
- goto async_writev_out;
- }
+ if (rc)
+ return rc;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3526,6 +3588,9 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+
+ trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
@@ -3595,23 +3660,22 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->Length = cpu_to_le32(wdata->bytes);
#endif
- if (wdata->credits) {
+ if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest =
cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
- spin_lock(&server->req_lock);
- server->credits += wdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- wdata->credits = le16_to_cpu(shdr->CreditCharge);
+
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ goto async_writev_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&wdata->refcount);
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
- wdata, flags);
+ wdata, flags, &wdata->credits);
if (rc) {
trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
@@ -3674,6 +3738,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+ trace_smb3_write_enter(xid, io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
+
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
iov[0].iov_len = total_len - 1;
@@ -3836,6 +3904,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, output_size);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -3843,18 +3914,26 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ trace_smb3_query_dir_done(xid, persistent_fid,
+ tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
rc = 0;
- } else
+ } else {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ }
goto qdir_exit;
}
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
info_buf_size);
- if (rc)
+ if (rc) {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
goto qdir_exit;
+ }
srch_inf->unicode = true;
@@ -3882,6 +3961,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_dbg(VFS, "illegal search buffer type\n");
+ trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, srch_inf->entries_in_buffer);
return rc;
qdir_exit:
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 538e2299805f..ee8977688e21 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -288,12 +288,12 @@ struct smb2_encryption_neg_context {
__le16 Ciphers[1]; /* Ciphers[0] since only one used now */
} __packed;
-#define POSIX_CTXT_DATA_LEN 8
+#define POSIX_CTXT_DATA_LEN 16
struct smb2_posix_neg_context {
__le16 ContextType; /* 0x100 */
__le16 DataLength;
__le32 Reserved;
- __le64 Reserved1; /* In case needed for future (eg version or caps) */
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
} __packed;
struct smb2_negotiate_rsp {
@@ -959,6 +959,13 @@ struct duplicate_extents_to_file {
__le64 ByteCount; /* Bytes to be copied */
} __packed;
+/*
+ * Maximum number of iovs we need for an ioctl request.
+ * [0] : struct smb2_ioctl_req
+ * [1] : in_data
+ */
+#define SMB2_IOCTL_IOV_SIZE 2
+
struct smb2_ioctl_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 87733b27a65f..3c32d0cfea69 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -144,6 +144,10 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
bool is_fsctl, char *in_data, u32 indatalen,
char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen);
+extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
@@ -223,6 +227,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key);
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 3d5f62150de4..447c0c6e4c64 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -30,9 +30,9 @@
*/
#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
-#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001)
-#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002)
-#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003)
+#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
struct ntstatus {
/* Facility is the high 12 bits of the following field */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 7b351c65ee46..d1181572758b 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
+ unsigned int credits = le16_to_cpu(shdr->CreditCharge);
if (server == NULL) {
cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
@@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
temp->mid = le64_to_cpu(shdr->MessageId);
+ temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
temp->command = shdr->Command; /* Always LE */
temp->when_alloc = jiffies;
@@ -600,6 +602,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
+ trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
@@ -615,6 +619,10 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
return -EAGAIN;
}
+ if (ses->server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return -EAGAIN;
+
if (ses->status == CifsNew) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE))
@@ -634,6 +642,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
spin_lock(&GlobalMid_Lock);
list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
+
return 0;
}
@@ -674,13 +683,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
smb2_seq_num_into_buf(ses->server, shdr);
rc = smb2_get_mid_entry(ses, shdr, &mid);
- if (rc)
+ if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
return ERR_PTR(rc);
+ }
+
rc = smb2_sign_rqst(rqst, ses->server);
if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
cifs_delete_mid(mid);
return ERR_PTR(rc);
}
+
return mid;
}
@@ -692,14 +706,21 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
+ if (server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return ERR_PTR(-EAGAIN);
+
smb2_seq_num_into_buf(server, shdr);
mid = smb2_mid_entry_alloc(shdr, server);
- if (mid == NULL)
+ if (mid == NULL) {
+ revert_current_mid_from_hdr(server, shdr);
return ERR_PTR(-ENOMEM);
+ }
rc = smb2_sign_rqst(rqst, server);
if (rc) {
+ revert_current_mid_from_hdr(server, shdr);
DeleteMidQEntry(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index a568dac7b3a1..b943b74cd246 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1550,7 +1550,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
char name[MAX_NAME_LEN];
int rc;
- snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
info->request_cache =
kmem_cache_create(
name,
@@ -1566,7 +1566,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->request_mempool)
goto out1;
- snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
info->response_cache =
kmem_cache_create(
name,
@@ -1582,7 +1582,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->response_mempool)
goto out3;
- snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
info->workqueue = create_workqueue(name);
if (!info->workqueue)
goto out4;
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 59be48206932..99c4d799c24b 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -58,6 +58,9 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
DEFINE_SMB3_RW_ERR_EVENT(write_err);
DEFINE_SMB3_RW_ERR_EVENT(read_err);
+DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_RW_ERR_EVENT(zero_err);
+DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
/* For logging successful read or write */
@@ -100,8 +103,16 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
+DEFINE_SMB3_RW_DONE_EVENT(write_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
DEFINE_SMB3_RW_DONE_EVENT(write_done);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_RW_DONE_EVENT(zero_done);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
/*
* For handle based calls other than read and write, and get/set info
@@ -148,6 +159,48 @@ DEFINE_SMB3_FD_ERR_EVENT(close_err);
/*
* For handle based query/set info calls
*/
+DECLARE_EVENT_CLASS(smb3_inf_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u8 infclass,
+ __u32 type),
+ TP_ARGS(xid, fid, tid, sesid, infclass, type),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u8, infclass)
+ __field(__u32, type)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->infclass = infclass;
+ __entry->type = type;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->infclass, __entry->type)
+)
+
+#define DEFINE_SMB3_INF_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u8 infclass, \
+ __u32 type), \
+ TP_ARGS(xid, fid, tid, sesid, infclass, type))
+
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter);
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_done);
+
DECLARE_EVENT_CLASS(smb3_inf_err_class,
TP_PROTO(unsigned int xid,
__u64 fid,
@@ -195,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ const char *full_path),
+ TP_ARGS(xid, tid, sesid, full_path),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __string(path, full_path)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __assign_str(path, full_path);
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __get_str(path))
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ const char *full_path), \
+ TP_ARGS(xid, tid, sesid, full_path))
+
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid),
+ TP_ARGS(xid, tid, sesid),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid), \
+ TP_ARGS(xid, tid, sesid))
+
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int rc),
+ TP_ARGS(xid, tid, sesid, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, rc))
+
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+
/*
* For logging SMB3 Status code and Command for responses which return errors
*/
@@ -270,6 +440,7 @@ DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \
__u64 mid), \
TP_ARGS(tid, sesid, cmd, mid))
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter);
DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
DEFINE_SMB3_CMD_DONE_EVENT(ses_expired);
@@ -378,19 +549,19 @@ DECLARE_EVENT_CLASS(smb3_tcon_class,
__field(unsigned int, xid)
__field(__u32, tid)
__field(__u64, sesid)
- __field(const char *, unc_name)
+ __string(name, unc_name)
__field(int, rc)
),
TP_fast_assign(
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __entry->unc_name = unc_name;
+ __assign_str(name, unc_name);
__entry->rc = rc;
),
TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
__entry->xid, __entry->sesid, __entry->tid,
- __entry->unc_name, __entry->rc)
+ __get_str(name), __entry->rc)
)
#define DEFINE_SMB3_TCON_EVENT(name) \
@@ -406,8 +577,47 @@ DEFINE_SMB3_TCON_EVENT(tcon);
/*
- * For smb2/smb3 open call
+ * For smb2/smb3 open (including create and mkdir) calls
*/
+
+DECLARE_EVENT_CLASS(smb3_open_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access),
+ TP_ARGS(xid, tid, sesid, create_options, desired_access),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_open_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access), \
+ TP_ARGS(xid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter);
+DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter);
+
DECLARE_EVENT_CLASS(smb3_open_err_class,
TP_PROTO(unsigned int xid,
__u32 tid,
@@ -626,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \
TP_ARGS(currmid, hostname, credits))
DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 53532bd3f50d..1de8e996e566 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -33,6 +33,7 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
+#include <linux/signal.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -291,6 +292,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int n_vec;
unsigned int send_length = 0;
unsigned int i, j;
+ sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
@@ -301,8 +303,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smbd_send(server, rqst);
goto smbd_done;
}
+
if (ssocket == NULL)
- return -ENOTSOCK;
+ return -EAGAIN;
+
+ if (signal_pending(current)) {
+ cifs_dbg(FYI, "signal is pending before sending any data\n");
+ return -EINTR;
+ }
/* cork the socket */
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -312,6 +320,16 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
send_length += smb_rqst_len(server, &rqst[j]);
rfc1002_marker = cpu_to_be32(send_length);
+ /*
+ * We should not allow signals to interrupt the network send because
+ * any partial send will cause session reconnects thus increasing
+ * latency of system calls and overload a server with unnecessary
+ * requests.
+ */
+
+ sigfillset(&mask);
+ sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
/* Generate a rfc1002 marker for SMB2+ */
if (server->vals->header_preamble_size == 0) {
struct kvec hiov = {
@@ -321,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
send_length += 4;
@@ -343,7 +361,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
@@ -365,7 +383,25 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
}
}
-uncork:
+unmask:
+ sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+ /*
+ * If signal is pending but we have already sent the whole packet to
+ * the server we need to return success status to allow a corresponding
+ * mid entry to be kept in the pending requests queue thus allowing
+ * to handle responses from the server by the client.
+ *
+ * If only part of the packet has been sent there is no need to hide
+ * interrupt because the session will be reconnected anyway, so there
+ * won't be any response from the server to handle.
+ */
+
+ if (signal_pending(current) && (total_len != send_length)) {
+ cifs_dbg(FYI, "signal is pending after attempt to send\n");
+ rc = -EINTR;
+ }
+
/* uncork it */
val = 0;
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -450,29 +486,55 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
}
static int
-wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
- int *credits)
+wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
+ const int timeout, const int flags,
+ unsigned int *instance)
{
int rc;
+ int *credits;
+ int optype;
+ long int t;
+
+ if (timeout < 0)
+ t = MAX_JIFFY_OFFSET;
+ else
+ t = msecs_to_jiffies(timeout);
+
+ optype = flags & CIFS_OP_MASK;
+
+ *instance = 0;
+
+ credits = server->ops->get_credits_field(server, optype);
+ /* Since an echo is already inflight, no need to wait to send another */
+ if (*credits <= 0 && optype == CIFS_ECHO_OP)
+ return -EAGAIN;
spin_lock(&server->req_lock);
- if (timeout == CIFS_ASYNC_OP) {
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
server->in_flight++;
*credits -= 1;
+ *instance = server->reconnect_instance;
spin_unlock(&server->req_lock);
return 0;
}
while (1) {
- if (*credits <= 0) {
+ if (*credits < num_credits) {
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
- rc = wait_event_killable(server->request_q,
- has_credits(server, credits));
+ rc = wait_event_killable_timeout(server->request_q,
+ has_credits(server, credits, num_credits), t);
cifs_num_waiters_dec(server);
- if (rc)
- return rc;
+ if (!rc) {
+ trace_smb3_credit_timeout(server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
spin_lock(&server->req_lock);
} else {
if (server->tcpStatus == CifsExiting) {
@@ -481,14 +543,53 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
/*
+ * For normal commands, reserve the last MAX_COMPOUND
+ * credits to compound requests.
+ * Otherwise these compounds could be permanently
+ * starved for credits by single-credit requests.
+ *
+ * To prevent spinning CPU, block this thread until
+ * there are >MAX_COMPOUND credits available.
+ * But only do this is we already have a lot of
+ * credits in flight to avoid triggering this check
+ * for servers that are slow to hand out credits on
+ * new sessions.
+ */
+ if (!optype && num_credits == 1 &&
+ server->in_flight > 2 * MAX_COMPOUND &&
+ *credits <= MAX_COMPOUND) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable_timeout(
+ server->request_q,
+ has_credits(server, credits,
+ MAX_COMPOUND + 1),
+ t);
+ cifs_num_waiters_dec(server);
+ if (!rc) {
+ trace_smb3_credit_timeout(
+ server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
+ spin_lock(&server->req_lock);
+ continue;
+ }
+
+ /*
* Can not count locking commands against total
* as they are allowed to block on server.
*/
/* update # of requests on the wire to server */
- if (timeout != CIFS_BLOCKING_OP) {
- *credits -= 1;
- server->in_flight++;
+ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
+ *credits -= num_credits;
+ server->in_flight += num_credits;
+ *instance = server->reconnect_instance;
}
spin_unlock(&server->req_lock);
break;
@@ -498,24 +599,45 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
static int
-wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
- const int optype)
+wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+ unsigned int *instance)
{
- int *val;
+ return wait_for_free_credits(server, 1, -1, flags,
+ instance);
+}
- val = server->ops->get_credits_field(server, optype);
- /* Since an echo is already inflight, no need to wait to send another */
- if (*val <= 0 && optype == CIFS_ECHO_OP)
- return -EAGAIN;
- return wait_for_free_credits(server, timeout, val);
+static int
+wait_for_compound_request(struct TCP_Server_Info *server, int num,
+ const int flags, unsigned int *instance)
+{
+ int *credits;
+
+ credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);
+
+ spin_lock(&server->req_lock);
+ if (*credits < num) {
+ /*
+ * Return immediately if not too many requests in flight since
+ * we will likely be stuck on waiting for credits.
+ */
+ if (server->in_flight < num - *credits) {
+ spin_unlock(&server->req_lock);
+ return -ENOTSUPP;
+ }
+ }
+ spin_unlock(&server->req_lock);
+
+ return wait_for_free_credits(server, num, 60000, flags,
+ instance);
}
int
cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
*num = size;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = server->reconnect_instance;
return 0;
}
@@ -602,27 +724,43 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
int
cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags)
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits)
{
- int rc, timeout, optype;
+ int rc;
struct mid_q_entry *mid;
- unsigned int credits = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
+ unsigned int instance;
+ int optype;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
if ((flags & CIFS_HAS_CREDITS) == 0) {
- rc = wait_for_free_request(server, timeout, optype);
+ rc = wait_for_free_request(server, flags, &instance);
if (rc)
return rc;
- credits = 1;
- }
+ credits.value = 1;
+ credits.instance = instance;
+ } else
+ instance = exist_credits->instance;
mutex_lock(&server->srv_mutex);
+
+ /*
+ * We can't use credits obtained from the previous session to send this
+ * request. Check if there were reconnects after we obtained credits and
+ * return -EAGAIN in such cases to let callers handle it.
+ */
+ if (instance != server->reconnect_instance) {
+ mutex_unlock(&server->srv_mutex);
+ add_credits_and_wake_if(server, &credits, optype);
+ return -EAGAIN;
+ }
+
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
mutex_unlock(&server->srv_mutex);
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return PTR_ERR(mid);
}
@@ -647,6 +785,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_in_send_dec(server);
if (rc < 0) {
+ revert_current_mid(server, mid->credits);
server->sequence_number -= 2;
cifs_delete_mid(mid);
}
@@ -656,7 +795,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc == 0)
return 0;
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return rc;
}
@@ -786,8 +925,12 @@ static void
cifs_compound_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->server;
+ struct cifs_credits credits;
+
+ credits.value = server->ops->get_credits(mid);
+ credits.instance = server->reconnect_instance;
- add_credits(server, server->ops->get_credits(mid), mid->optype);
+ add_credits(server, &credits, mid->optype);
}
static void
@@ -809,14 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
const int flags, const int num_rqst, struct smb_rqst *rqst,
int *resp_buf_type, struct kvec *resp_iov)
{
- int i, j, rc = 0;
- int timeout, optype;
+ int i, j, optype, rc = 0;
struct mid_q_entry *midQ[MAX_COMPOUND];
bool cancelled_mid[MAX_COMPOUND] = {false};
- unsigned int credits[MAX_COMPOUND] = {0};
+ struct cifs_credits credits[MAX_COMPOUND] = {
+ { .value = 0, .instance = 0 }
+ };
+ unsigned int instance;
char *buf;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
for (i = 0; i < num_rqst; i++)
@@ -831,30 +975,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -ENOENT;
/*
- * Ensure we obtain 1 credit per request in the compound chain.
- * It can be optimized further by waiting for all the credits
- * at once but this can wait long enough if we don't have enough
- * credits due to some heavy operations in progress or the server
- * not granting us much, so a fallback to the current approach is
- * needed anyway.
+ * Wait for all the requests to become available.
+ * This approach still leaves the possibility to be stuck waiting for
+ * credits if the server doesn't grant credits to the outstanding
+ * requests and if the client is completely idle, not generating any
+ * other requests.
+ * This can be handled by the eventual session reconnect.
*/
+ rc = wait_for_compound_request(ses->server, num_rqst, flags,
+ &instance);
+ if (rc)
+ return rc;
+
for (i = 0; i < num_rqst; i++) {
- rc = wait_for_free_request(ses->server, timeout, optype);
- if (rc) {
- /*
- * We haven't sent an SMB packet to the server yet but
- * we already obtained credits for i requests in the
- * compound chain - need to return those credits back
- * for future use. Note that we need to call add_credits
- * multiple times to match the way we obtained credits
- * in the first place and to account for in flight
- * requests correctly.
- */
- for (j = 0; j < i; j++)
- add_credits(ses->server, 1, optype);
- return rc;
- }
- credits[i] = 1;
+ credits[i].value = 1;
+ credits[i].instance = instance;
}
/*
@@ -865,16 +1000,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
mutex_lock(&ses->server->srv_mutex);
+ /*
+ * All the parts of the compound chain belong obtained credits from the
+ * same session. We can not use credits obtained from the previous
+ * session to send this request. Check if there were reconnects after
+ * we obtained credits and return -EAGAIN in such cases to let callers
+ * handle it.
+ */
+ if (instance != ses->server->reconnect_instance) {
+ mutex_unlock(&ses->server->srv_mutex);
+ for (j = 0; j < num_rqst; j++)
+ add_credits(ses->server, &credits[j], optype);
+ return -EAGAIN;
+ }
+
for (i = 0; i < num_rqst; i++) {
midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
if (IS_ERR(midQ[i])) {
+ revert_current_mid(ses->server, i);
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
for (j = 0; j < num_rqst; j++)
- add_credits(ses->server, credits[j], optype);
+ add_credits(ses->server, &credits[j], optype);
return PTR_ERR(midQ[i]);
}
@@ -897,15 +1047,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (i = 0; i < num_rqst; i++)
cifs_save_when_sent(midQ[i]);
- if (rc < 0)
+ if (rc < 0) {
+ revert_current_mid(ses->server, num_rqst);
ses->server->sequence_number -= 2;
+ }
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
/* Sending failed for some reason - return credits back */
for (i = 0; i < num_rqst; i++)
- add_credits(ses->server, credits[i], optype);
+ add_credits(ses->server, &credits[i], optype);
goto out;
}
@@ -924,7 +1076,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
smb311_update_preauth_hash(ses, rqst[0].rq_iov,
rqst[0].rq_nvec);
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
for (i = 0; i < num_rqst; i++) {
@@ -942,7 +1094,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
- credits[i] = 0;
+ credits[i].value = 0;
}
spin_unlock(&GlobalMid_Lock);
}
@@ -1061,13 +1213,14 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
int
SendReceive(const unsigned int xid, struct cifs_ses *ses,
struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned, const int timeout)
+ int *pbytes_returned, const int flags)
{
int rc = 0;
struct mid_q_entry *midQ;
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
if (ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1091,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- rc = wait_for_free_request(ses->server, timeout, 0);
+ rc = wait_for_free_request(ses->server, flags, &credits.instance);
if (rc)
return rc;
@@ -1105,7 +1258,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1130,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
goto out;
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
rc = wait_for_response(ses->server, midQ);
@@ -1141,7 +1294,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -1149,7 +1302,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1165,7 +1318,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_check_receive(midQ, ses->server, 0);
out:
cifs_delete_mid(midQ);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1207,6 +1360,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ unsigned int instance;
if (tcon == NULL || tcon->ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1232,7 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance);
if (rc)
return rc;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 02b7d91c9231..f0de238000c0 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,16 +1,16 @@
config FS_ENCRYPTION
- tristate "FS Encryption (Per-file encryption)"
+ bool "FS Encryption (Per-file encryption)"
select CRYPTO
select CRYPTO_AES
select CRYPTO_CBC
select CRYPTO_ECB
select CRYPTO_XTS
select CRYPTO_CTS
- select CRYPTO_CTR
select CRYPTO_SHA256
select KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
+ decrypted pages in the page cache. Currently Ext4,
+ F2FS and UBIFS make use of this feature.
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0959044c5cee..5759bcd018cd 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
int ret = fscrypt_decrypt_page(page->mapping->host, page,
PAGE_SIZE, 0, page->index);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7424f851eb5c..7da276159593 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,7 +12,6 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
-#define __FS_HAS_ENCRYPTION 1
#include <linux/fscrypt.h>
#include <crypto/hash.h>
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 926e5df20ec3..56debb1fcf5e 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
return err;
if (!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
return 0;
}
@@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ENCRYPTED(new_dir) &&
!fscrypt_has_permitted_context(new_dir,
d_inode(old_dentry)))
- return -EPERM;
+ return -EXDEV;
if ((flags & RENAME_EXCHANGE) &&
IS_ENCRYPTED(old_dir) &&
!fscrypt_has_permitted_context(old_dir,
d_inode(new_dentry)))
- return -EPERM;
+ return -EXDEV;
}
return 0;
}
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 1e11a683f63d..322ce9686bdb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -47,7 +47,7 @@ static int derive_key_aes(const u8 *master_key,
tfm = NULL;
goto out;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
@@ -257,7 +257,7 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
mode->friendly_name,
crypto_skcipher_alg(tfm)->base.cra_driver_name);
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f490de921ce8..bd7eaf9b3f00 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
* malicious offline violations of this constraint, while the link and rename
* checks are needed to prevent online violations of this constraint.
*
- * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
diff --git a/fs/dax.c b/fs/dax.c
index 6959837cc465..ca0671d55aa6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -788,7 +788,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide start/end to follow_pte_pmd it will
+ * Note because we provide range to follow_pte_pmd it will
* call mmu_notifier_invalidate_range_start() on our behalf
* before taking any lock.
*/
@@ -843,9 +843,8 @@ unlock_pte:
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn;
+ unsigned long pfn, index, count;
long ret = 0;
- size_t size;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -894,17 +893,18 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_unlock_irq(xas);
/*
- * Even if dax_writeback_mapping_range() was given a wbc->range_start
- * in the middle of a PMD, the 'index' we are given will be aligned to
- * the start index of the PMD, as will the pfn we pull from 'entry'.
+ * If dax_writeback_mapping_range() was given a wbc->range_start
+ * in the middle of a PMD, the 'index' we use needs to be
+ * aligned to the start of the PMD.
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
pfn = dax_to_pfn(entry);
- size = PAGE_SIZE << dax_entry_order(entry);
+ count = 1UL << dax_entry_order(entry);
+ index = xas->xa_index & ~(count - 1);
- dax_entry_mkclean(mapping, xas->xa_index, pfn);
- dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
+ dax_entry_mkclean(mapping, index, pfn);
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -917,8 +917,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
dax_wake_entry(xas, entry, false);
- trace_dax_writeback_one(mapping->host, xas->xa_index,
- size >> PAGE_SHIFT);
+ trace_dax_writeback_one(mapping->host, index, count);
return ret;
put_unlocked:
@@ -1220,9 +1219,7 @@ static vm_fault_t dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
+ return vmf_error(error);
}
/*
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 29c68c5d44d5..95b5e78c22b1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -423,8 +423,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
* debugfs core.
*
* It is your responsibility to protect your struct file_operation
- * methods against file removals by means of debugfs_use_file_start()
- * and debugfs_use_file_finish(). ->open() is still protected by
+ * methods against file removals by means of debugfs_file_get()
+ * and debugfs_file_put(). ->open() is still protected by
* debugfs though.
*
* Any struct file_operations defined by means of
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..553a3f3300ae 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
+ s->s_d_op = &simple_dentry_operations;
s->s_time_gran = 1;
error = -ENOMEM;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ec2fb6fe6d37..9bb015bc4a83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
- bio_for_each_segment_all(bvec, bio, i) {
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
if (dio->op == REQ_OP_READ && !PageCompound(page) &&
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 76976d6e50f9..c98ad9777ad9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con)
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
0);
memset(&tv, 0, sizeof(tv));
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
if (result == -EINPROGRESS)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4dd842f72846..f664da55234e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -610,7 +610,8 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm,
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -1590,7 +1591,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
if (*key_size == 0)
*key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail either to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to content with concurrent
+ * events from another file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth to mention is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if poll table was inited
+ * with several wait queues entries. Plural wakeup from different CPUs of a
+ * single wait queue is serialized by wq.lock, but the case when multiple wait
+ * queues are used should be detected accordingly. This is detected using
+ * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
- WRITE_ONCE(ep->ovflist, epi);
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1531,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
goto send_events;
}
diff --git a/fs/exec.c b/fs/exec.c
index fb72d36f7823..2e0033348d8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
if (bytes < 0) {
ret = bytes;
- goto out;
+ goto out_free;
}
if (bytes == 0)
@@ -1189,7 +1189,7 @@ no_thread_group:
flush_itimer_signals();
#endif
- if (atomic_read(&oldsighand->count) != 1) {
+ if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
@@ -1199,7 +1199,7 @@ no_thread_group:
if (!newsighand)
return -ENOMEM;
- atomic_set(&newsighand->count, 1);
+ refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
@@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
/*
* Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
@@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt);
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- unsigned long old, new;
-
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- do {
- old = READ_ONCE(mm->flags);
- new = (old & ~MMF_DUMPABLE_MASK) | value;
- } while (cmpxchg(&mm->flags, old, new) != old);
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
deleted file mode 100644
index 1b2d4c63a579..000000000000
--- a/fs/exofs/BUGS
+++ /dev/null
@@ -1,3 +0,0 @@
-- Out-of-space may cause a severe problem if the object (and directory entry)
- were written, but the inode attributes failed. Then if the filesystem was
- unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
deleted file mode 100644
index a364fd0965ec..000000000000
--- a/fs/exofs/Kbuild
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Kbuild for the EXOFS module
-#
-# Copyright (C) 2008 Panasas Inc. All rights reserved.
-#
-# Authors:
-# Boaz Harrosh <ooo@electrozaur.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2
-#
-# Kbuild - Gets included from the Kernels Makefile and build system
-#
-
-# ore module library
-libore-y := ore.o ore_raid.o
-obj-$(CONFIG_ORE) += libore.o
-
-exofs-y := inode.o file.o namei.o dir.o super.o sys.o
-obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
deleted file mode 100644
index 86194b2f799d..000000000000
--- a/fs/exofs/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config EXOFS_FS
- tristate "exofs: OSD based file system support"
- depends on SCSI_OSD_ULD
- help
- EXOFS is a file system that uses an OSD storage device,
- as its backing storage.
-
-# Debugging-related stuff
-config EXOFS_DEBUG
- bool "Enable debugging"
- depends on EXOFS_FS
- help
- This option enables EXOFS debug prints.
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
deleted file mode 100644
index 2daf2329c28d..000000000000
--- a/fs/exofs/Kconfig.ore
+++ /dev/null
@@ -1,14 +0,0 @@
-# ORE - Objects Raid Engine (libore.ko)
-#
-# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
-# for every ORE user we do it like this. Any user should add itself here
-# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
-# selected here, and we default to "ON". So in effect it is like been
-# selected by any of the users.
-config ORE
- tristate
- depends on EXOFS_FS || PNFS_OBJLAYOUT
- select ASYNC_XOR
- select RAID6_PQ
- select ASYNC_PQ
- default SCSI_OSD_ULD
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
deleted file mode 100644
index 7d88ef566213..000000000000
--- a/fs/exofs/common.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * common.h - Common definitions for both Kernel and user-mode utilities
- *
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef __EXOFS_COM_H__
-#define __EXOFS_COM_H__
-
-#include <linux/types.h>
-
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_sec.h>
-
-/****************************************************************************
- * Object ID related defines
- * NOTE: inode# = object ID - EXOFS_OBJ_OFF
- ****************************************************************************/
-#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
-#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
-#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
-#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
-#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
-
-/* exofs Application specific page/attribute */
-/* Inode attrs */
-# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
-# define EXOFS_ATTR_INODE_DATA 1
-# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
-# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
-/* Partition attrs */
-# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
-# define EXOFS_ATTR_SB_STATS 1
-
-/*
- * The maximum number of files we can have is limited by the size of the
- * inode number. This is the largest object ID that the file system supports.
- * Object IDs 0, 1, and 2 are always in use (see above defines).
- */
-enum {
- EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
- (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
- EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
-};
-
-/****************************************************************************
- * Misc.
- ****************************************************************************/
-#define EXOFS_BLKSHIFT 12
-#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
-
-/****************************************************************************
- * superblock-related things
- ****************************************************************************/
-#define EXOFS_SUPER_MAGIC 0x5DF5
-
-/*
- * The file system control block - stored in object EXOFS_SUPER_ID's data.
- * This is where the in-memory superblock is stored on disk.
- */
-enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
-struct exofs_fscb {
- __le64 s_nextid; /* Only used after mkfs */
- __le64 s_numfiles; /* Only used after mkfs */
- __le32 s_version; /* == EXOFS_FSCB_VER */
- __le16 s_magic; /* Magic signature */
- __le16 s_newfs; /* Non-zero if this is a new fs */
-
- /* From here on it's a static part, only written by mkexofs */
- __le64 s_dev_table_oid; /* Resurved, not used */
- __le64 s_dev_table_count; /* == 0 means no dev_table */
-} __packed;
-
-/*
- * This struct is set on the FS partition's attributes.
- * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
- * with the create command, to atomically persist the sb writeable information.
- */
-struct exofs_sb_stats {
- __le64 s_nextid; /* Highest object ID used */
- __le64 s_numfiles; /* Number of files on fs */
-} __packed;
-
-/*
- * Describes the raid used in the FS. It is part of the device table.
- * This here is taken from the pNFS-objects definition. In exofs we
- * use one raid policy through-out the filesystem. (NOTE: the funny
- * alignment at beginning. We take care of it at exofs_device_table.
- */
-struct exofs_dt_data_map {
- __le32 cb_num_comps;
- __le64 cb_stripe_unit;
- __le32 cb_group_width;
- __le32 cb_group_depth;
- __le32 cb_mirror_cnt;
- __le32 cb_raid_algorithm;
-} __packed;
-
-/*
- * This is an osd device information descriptor. It is a single entry in
- * the exofs device table. It describes an osd target lun which
- * contains data belonging to this FS. (Same partition_id on all devices)
- */
-struct exofs_dt_device_info {
- __le32 systemid_len;
- u8 systemid[OSD_SYSTEMID_LEN];
- __le64 long_name_offset; /* If !0 then offset-in-file */
- __le32 osdname_len; /* */
- u8 osdname[44]; /* Embbeded, Usually an asci uuid */
-} __packed;
-
-/*
- * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
- * It contains the raid used for this multy-device FS and an array of
- * participating devices.
- */
-struct exofs_device_table {
- __le32 dt_version; /* == EXOFS_DT_VER */
- struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
-
- /* Resurved space For future use. Total includeing this:
- * (8 * sizeof(le64))
- */
- __le64 __Resurved[4];
-
- __le64 dt_num_devices; /* Array size */
- struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
-} __packed;
-
-/****************************************************************************
- * inode-related things
- ****************************************************************************/
-#define EXOFS_IDATA 5
-
-/*
- * The file control block - stored in an object's attributes. This is where
- * the in-memory inode is stored on disk.
- */
-struct exofs_fcb {
- __le64 i_size; /* Size of the file */
- __le16 i_mode; /* File mode */
- __le16 i_links_count; /* Links count */
- __le32 i_uid; /* Owner Uid */
- __le32 i_gid; /* Group Id */
- __le32 i_atime; /* Access time */
- __le32 i_ctime; /* Creation time */
- __le32 i_mtime; /* Modification time */
- __le32 i_flags; /* File flags (unused for now)*/
- __le32 i_generation; /* File version (for NFS) */
- __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
-};
-
-#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
-
-/* This is the Attribute the fcb is stored in */
-static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_DATA,
- EXOFS_INO_ATTR_SIZE);
-
-/****************************************************************************
- * dentry-related things
- ****************************************************************************/
-#define EXOFS_NAME_LEN 255
-
-/*
- * The on-disk directory entry
- */
-struct exofs_dir_entry {
- __le64 inode_no; /* inode number */
- __le16 rec_len; /* directory entry length */
- u8 name_len; /* name length */
- u8 file_type; /* umm...file type */
- char name[EXOFS_NAME_LEN]; /* file name */
-};
-
-enum {
- EXOFS_FT_UNKNOWN,
- EXOFS_FT_REG_FILE,
- EXOFS_FT_DIR,
- EXOFS_FT_CHRDEV,
- EXOFS_FT_BLKDEV,
- EXOFS_FT_FIFO,
- EXOFS_FT_SOCK,
- EXOFS_FT_SYMLINK,
- EXOFS_FT_MAX
-};
-
-#define EXOFS_DIR_PAD 4
-#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
-#define EXOFS_DIR_REC_LEN(name_len) \
- (((name_len) + offsetof(struct exofs_dir_entry, name) + \
- EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-
-/*
- * The on-disk (optional) layout structure.
- * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
- * attribute, attached to any inode, usually to a directory.
- */
-
-enum exofs_inode_layout_gen_functions {
- LAYOUT_MOVING_WINDOW = 0,
- LAYOUT_IMPLICT = 1,
-};
-
-struct exofs_on_disk_inode_layout {
- __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
- __le16 pad;
- union {
- /* gen_func == LAYOUT_MOVING_WINDOW (default) */
- struct exofs_layout_sliding_window {
- __le32 num_devices; /* first n devices in global-table*/
- } sliding_window __packed;
-
- /* gen_func == LAYOUT_IMPLICT */
- struct exofs_layout_implict_list {
- struct exofs_dt_data_map data_map;
- /* Variable array of size data_map.cb_num_comps. These
- * are device indexes of the devices in the global table
- */
- __le32 dev_indexes[];
- } implict __packed;
- };
-} __packed;
-
-static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
-{
- return sizeof(struct exofs_on_disk_inode_layout) +
- max_devs * sizeof(__le32);
-}
-
-#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
deleted file mode 100644
index f0138674c1ed..000000000000
--- a/fs/exofs/dir.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/iversion.h>
-#include "exofs.h"
-
-static inline unsigned exofs_chunk_size(struct inode *inode)
-{
- return inode->i_sb->s_blocksize;
-}
-
-static inline void exofs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
-{
- loff_t last_byte = inode->i_size;
-
- last_byte -= page_nr << PAGE_SHIFT;
- if (last_byte > PAGE_SIZE)
- last_byte = PAGE_SIZE;
- return last_byte;
-}
-
-static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
-{
- struct address_space *mapping = page->mapping;
- struct inode *dir = mapping->host;
- int err = 0;
-
- inode_inc_iversion(dir);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
-
- if (pos+len > dir->i_size) {
- i_size_write(dir, pos+len);
- mark_inode_dirty(dir);
- }
- set_page_dirty(page);
-
- if (IS_DIRSYNC(dir))
- err = write_one_page(page);
- else
- unlock_page(page);
-
- return err;
-}
-
-static bool exofs_check_page(struct page *page)
-{
- struct inode *dir = page->mapping->host;
- unsigned chunk_size = exofs_chunk_size(dir);
- char *kaddr = page_address(page);
- unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
- struct exofs_dir_entry *p;
- char *error;
-
- /* if the page is the last one in the directory */
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
- if (limit & (chunk_size - 1))
- goto Ebadsize;
- if (!limit)
- goto out;
- }
- for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
- p = (struct exofs_dir_entry *)(kaddr + offs);
- rec_len = le16_to_cpu(p->rec_len);
-
- if (rec_len < EXOFS_DIR_REC_LEN(1))
- goto Eshort;
- if (rec_len & 3)
- goto Ealign;
- if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
- goto Enamelen;
- if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
- goto Espan;
- }
- if (offs != limit)
- goto Eend;
-out:
- SetPageChecked(page);
- return true;
-
-Ebadsize:
- EXOFS_ERR("ERROR [exofs_check_page]: "
- "size of directory(0x%lx) is not a multiple of chunk size\n",
- dir->i_ino
- );
- goto fail;
-Eshort:
- error = "rec_len is smaller than minimal";
- goto bad_entry;
-Ealign:
- error = "unaligned directory entry";
- goto bad_entry;
-Enamelen:
- error = "rec_len is too small for name_len";
- goto bad_entry;
-Espan:
- error = "directory entry across blocks";
- goto bad_entry;
-bad_entry:
- EXOFS_ERR(
- "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
- "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
- _LLU(le64_to_cpu(p->inode_no)),
- rec_len, p->name_len);
- goto fail;
-Eend:
- p = (struct exofs_dir_entry *)(kaddr + offs);
- EXOFS_ERR("ERROR [exofs_check_page]: "
- "entry in directory(0x%lx) spans the page boundary"
- "offset=%lu, inode=0x%llx\n",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
- _LLU(le64_to_cpu(p->inode_no)));
-fail:
- SetPageError(page);
- return false;
-}
-
-static struct page *exofs_get_page(struct inode *dir, unsigned long n)
-{
- struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
-
- if (!IS_ERR(page)) {
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !exofs_check_page(page))
- goto fail;
- }
- }
- return page;
-
-fail:
- exofs_put_page(page);
- return ERR_PTR(-EIO);
-}
-
-static inline int exofs_match(int len, const unsigned char *name,
- struct exofs_dir_entry *de)
-{
- if (len != de->name_len)
- return 0;
- if (!de->inode_no)
- return 0;
- return !memcmp(name, de->name, len);
-}
-
-static inline
-struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
-{
- return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
-}
-
-static inline unsigned
-exofs_validate_entry(char *base, unsigned offset, unsigned mask)
-{
- struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
- struct exofs_dir_entry *p =
- (struct exofs_dir_entry *)(base + (offset&mask));
- while ((char *)p < (char *)de) {
- if (p->rec_len == 0)
- break;
- p = exofs_next_entry(p);
- }
- return (char *)p - base;
-}
-
-static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
- [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
- [EXOFS_FT_REG_FILE] = DT_REG,
- [EXOFS_FT_DIR] = DT_DIR,
- [EXOFS_FT_CHRDEV] = DT_CHR,
- [EXOFS_FT_BLKDEV] = DT_BLK,
- [EXOFS_FT_FIFO] = DT_FIFO,
- [EXOFS_FT_SOCK] = DT_SOCK,
- [EXOFS_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
-};
-
-static inline
-void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
-{
- umode_t mode = inode->i_mode;
- de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
-}
-
-static int
-exofs_readdir(struct file *file, struct dir_context *ctx)
-{
- loff_t pos = ctx->pos;
- struct inode *inode = file_inode(file);
- unsigned int offset = pos & ~PAGE_MASK;
- unsigned long n = pos >> PAGE_SHIFT;
- unsigned long npages = dir_pages(inode);
- unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
-
- if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
- return 0;
-
- for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
- struct exofs_dir_entry *de;
- struct page *page = exofs_get_page(inode, n);
-
- if (IS_ERR(page)) {
- EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
- inode->i_ino);
- ctx->pos += PAGE_SIZE - offset;
- return PTR_ERR(page);
- }
- kaddr = page_address(page);
- if (unlikely(need_revalidate)) {
- if (offset) {
- offset = exofs_validate_entry(kaddr, offset,
- chunk_mask);
- ctx->pos = (n<<PAGE_SHIFT) + offset;
- }
- file->f_version = inode_query_iversion(inode);
- need_revalidate = false;
- }
- de = (struct exofs_dir_entry *)(kaddr + offset);
- limit = kaddr + exofs_last_byte(inode, n) -
- EXOFS_DIR_REC_LEN(1);
- for (; (char *)de <= limit; de = exofs_next_entry(de)) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: "
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- exofs_put_page(page);
- return -EIO;
- }
- if (de->inode_no) {
- unsigned char t;
-
- if (de->file_type < EXOFS_FT_MAX)
- t = exofs_filetype_table[de->file_type];
- else
- t = DT_UNKNOWN;
-
- if (!dir_emit(ctx, de->name, de->name_len,
- le64_to_cpu(de->inode_no),
- t)) {
- exofs_put_page(page);
- return 0;
- }
- }
- ctx->pos += le16_to_cpu(de->rec_len);
- }
- exofs_put_page(page);
- }
- return 0;
-}
-
-struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
- struct dentry *dentry, struct page **res_page)
-{
- const unsigned char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
- unsigned long start, n;
- unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
- struct exofs_i_info *oi = exofs_i(dir);
- struct exofs_dir_entry *de;
-
- if (npages == 0)
- goto out;
-
- *res_page = NULL;
-
- start = oi->i_dir_start_lookup;
- if (start >= npages)
- start = 0;
- n = start;
- do {
- char *kaddr;
- page = exofs_get_page(dir, n);
- if (!IS_ERR(page)) {
- kaddr = page_address(page);
- de = (struct exofs_dir_entry *) kaddr;
- kaddr += exofs_last_byte(dir, n) - reclen;
- while ((char *) de <= kaddr) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: zero-length entry in "
- "directory(0x%lx)\n",
- dir->i_ino);
- exofs_put_page(page);
- goto out;
- }
- if (exofs_match(namelen, name, de))
- goto found;
- de = exofs_next_entry(de);
- }
- exofs_put_page(page);
- }
- if (++n >= npages)
- n = 0;
- } while (n != start);
-out:
- return NULL;
-
-found:
- *res_page = page;
- oi->i_dir_start_lookup = n;
- return de;
-}
-
-struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
-{
- struct page *page = exofs_get_page(dir, 0);
- struct exofs_dir_entry *de = NULL;
-
- if (!IS_ERR(page)) {
- de = exofs_next_entry(
- (struct exofs_dir_entry *)page_address(page));
- *p = page;
- }
- return de;
-}
-
-ino_t exofs_parent_ino(struct dentry *child)
-{
- struct page *page;
- struct exofs_dir_entry *de;
- ino_t ino;
-
- de = exofs_dotdot(d_inode(child), &page);
- if (!de)
- return 0;
-
- ino = le64_to_cpu(de->inode_no);
- exofs_put_page(page);
- return ino;
-}
-
-ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
-{
- ino_t res = 0;
- struct exofs_dir_entry *de;
- struct page *page;
-
- de = exofs_find_entry(dir, dentry, &page);
- if (de) {
- res = le64_to_cpu(de->inode_no);
- exofs_put_page(page);
- }
- return res;
-}
-
-int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
- struct page *page, struct inode *inode)
-{
- loff_t pos = page_offset(page) +
- (char *) de - (char *) page_address(page);
- unsigned len = le16_to_cpu(de->rec_len);
- int err;
-
- lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
- if (err)
- EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
- err);
-
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
- if (likely(!err))
- err = exofs_commit_chunk(page, pos, len);
- exofs_put_page(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- mark_inode_dirty(dir);
- return err;
-}
-
-int exofs_add_link(struct dentry *dentry, struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const unsigned char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned chunk_size = exofs_chunk_size(dir);
- unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
- unsigned short rec_len, name_len;
- struct page *page = NULL;
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- struct exofs_dir_entry *de;
- unsigned long npages = dir_pages(dir);
- unsigned long n;
- char *kaddr;
- loff_t pos;
- int err;
-
- for (n = 0; n <= npages; n++) {
- char *dir_end;
-
- page = exofs_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
- lock_page(page);
- kaddr = page_address(page);
- dir_end = kaddr + exofs_last_byte(dir, n);
- de = (struct exofs_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - reclen;
- while ((char *)de <= kaddr) {
- if ((char *)de == dir_end) {
- name_len = 0;
- rec_len = chunk_size;
- de->rec_len = cpu_to_le16(chunk_size);
- de->inode_no = 0;
- goto got_it;
- }
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_add_link: "
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- err = -EIO;
- goto out_unlock;
- }
- err = -EEXIST;
- if (exofs_match(namelen, name, de))
- goto out_unlock;
- name_len = EXOFS_DIR_REC_LEN(de->name_len);
- rec_len = le16_to_cpu(de->rec_len);
- if (!de->inode_no && rec_len >= reclen)
- goto got_it;
- if (rec_len >= name_len + reclen)
- goto got_it;
- de = (struct exofs_dir_entry *) ((char *) de + rec_len);
- }
- unlock_page(page);
- exofs_put_page(page);
- }
-
- EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
- dentry, inode->i_ino);
- return -EINVAL;
-
-got_it:
- pos = page_offset(page) +
- (char *)de - (char *)page_address(page);
- err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
- &page, NULL);
- if (err)
- goto out_unlock;
- if (de->inode_no) {
- struct exofs_dir_entry *de1 =
- (struct exofs_dir_entry *)((char *)de + name_len);
- de1->rec_len = cpu_to_le16(rec_len - name_len);
- de->rec_len = cpu_to_le16(name_len);
- de = de1;
- }
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
- err = exofs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- mark_inode_dirty(dir);
- sbi->s_numfiles++;
-
-out_put:
- exofs_put_page(page);
-out:
- return err;
-out_unlock:
- unlock_page(page);
- goto out_put;
-}
-
-int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- char *kaddr = page_address(page);
- unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
- unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
- loff_t pos;
- struct exofs_dir_entry *pde = NULL;
- struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
- int err;
-
- while (de < dir) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_delete_entry:"
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- err = -EIO;
- goto out;
- }
- pde = de;
- de = exofs_next_entry(de);
- }
- if (pde)
- from = (char *)pde - (char *)page_address(page);
- pos = page_offset(page) + from;
- lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
- &page, NULL);
- if (err)
- EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
- err);
- if (pde)
- pde->rec_len = cpu_to_le16(to - from);
- dir->inode_no = 0;
- if (likely(!err))
- err = exofs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
- mark_inode_dirty(inode);
- sbi->s_numfiles--;
-out:
- exofs_put_page(page);
- return err;
-}
-
-/* kept aligned on 4 bytes */
-#define THIS_DIR ".\0\0"
-#define PARENT_DIR "..\0"
-
-int exofs_make_empty(struct inode *inode, struct inode *parent)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page = grab_cache_page(mapping, 0);
- unsigned chunk_size = exofs_chunk_size(inode);
- struct exofs_dir_entry *de;
- int err;
- void *kaddr;
-
- if (!page)
- return -ENOMEM;
-
- err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
- &page, NULL);
- if (err) {
- unlock_page(page);
- goto fail;
- }
-
- kaddr = kmap_atomic(page);
- de = (struct exofs_dir_entry *)kaddr;
- de->name_len = 1;
- de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
- memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
-
- de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
- de->name_len = 2;
- de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
- de->inode_no = cpu_to_le64(parent->i_ino);
- memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
- exofs_set_de_type(de, inode);
- kunmap_atomic(kaddr);
- err = exofs_commit_chunk(page, 0, chunk_size);
-fail:
- put_page(page);
- return err;
-}
-
-int exofs_empty_dir(struct inode *inode)
-{
- struct page *page = NULL;
- unsigned long i, npages = dir_pages(inode);
-
- for (i = 0; i < npages; i++) {
- char *kaddr;
- struct exofs_dir_entry *de;
- page = exofs_get_page(inode, i);
-
- if (IS_ERR(page))
- continue;
-
- kaddr = page_address(page);
- de = (struct exofs_dir_entry *)kaddr;
- kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
-
- while ((char *)de <= kaddr) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_empty_dir: "
- "zero-length directory entry"
- "kaddr=%p, de=%p\n", kaddr, de);
- goto not_empty;
- }
- if (de->inode_no != 0) {
- /* check for . and .. */
- if (de->name[0] != '.')
- goto not_empty;
- if (de->name_len > 2)
- goto not_empty;
- if (de->name_len < 2) {
- if (le64_to_cpu(de->inode_no) !=
- inode->i_ino)
- goto not_empty;
- } else if (de->name[1] != '.')
- goto not_empty;
- }
- de = exofs_next_entry(de);
- }
- exofs_put_page(page);
- }
- return 1;
-
-not_empty:
- exofs_put_page(page);
- return 0;
-}
-
-const struct file_operations exofs_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = exofs_readdir,
-};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
deleted file mode 100644
index 5dc392404559..000000000000
--- a/fs/exofs/exofs.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __EXOFS_H__
-#define __EXOFS_H__
-
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/backing-dev.h>
-#include <scsi/osd_ore.h>
-
-#include "common.h"
-
-#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define EXOFS_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define EXOFS_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-struct exofs_dev {
- struct ore_dev ored;
- unsigned did;
- unsigned urilen;
- uint8_t *uri;
- struct kobject ed_kobj;
-};
-/*
- * our extension to the in-memory superblock
- */
-struct exofs_sb_info {
- struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
- int s_timeout; /* timeout for OSD operations */
- uint64_t s_nextid; /* highest object ID used */
- uint32_t s_numfiles; /* number of files on fs */
- spinlock_t s_next_gen_lock; /* spinlock for gen # update */
- u32 s_next_generation; /* next gen # to use */
- atomic_t s_curr_pending; /* number of pending commands */
-
- struct ore_layout layout; /* Default files layout */
- struct ore_comp one_comp; /* id & cred of partition id=0*/
- struct ore_components oc; /* comps for the partition */
- struct kobject s_kobj; /* holds per-sbi kobject */
-};
-
-/*
- * our extension to the in-memory inode
- */
-struct exofs_i_info {
- struct inode vfs_inode; /* normal in-memory inode */
- wait_queue_head_t i_wq; /* wait queue for inode */
- unsigned long i_flags; /* various atomic flags */
- uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
- uint32_t i_dir_start_lookup; /* which page to start lookup */
- uint64_t i_commit_size; /* the object's written length */
- struct ore_comp one_comp; /* same component for all devices */
- struct ore_components oc; /* inode view of the device table */
-};
-
-static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
-{
- return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
-}
-
-/*
- * our inode flags
- */
-#define OBJ_2BCREATED 0 /* object will be created soon*/
-#define OBJ_CREATED 1 /* object has been created on the osd*/
-
-static inline int obj_2bcreated(struct exofs_i_info *oi)
-{
- return test_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline void set_obj_2bcreated(struct exofs_i_info *oi)
-{
- set_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline int obj_created(struct exofs_i_info *oi)
-{
- return test_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-static inline void set_obj_created(struct exofs_i_info *oi)
-{
- set_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi);
-static inline int wait_obj_created(struct exofs_i_info *oi)
-{
- if (likely(obj_created(oi)))
- return 0;
-
- return __exofs_wait_obj_created(oi);
-}
-
-/*
- * get to our inode from the vfs inode
- */
-static inline struct exofs_i_info *exofs_i(struct inode *inode)
-{
- return container_of(inode, struct exofs_i_info, vfs_inode);
-}
-
-/*
- * Maximum count of links to a file
- */
-#define EXOFS_LINK_MAX 32000
-
-/*************************
- * function declarations *
- *************************/
-
-/* inode.c */
-unsigned exofs_max_io_pages(struct ore_layout *layout,
- unsigned expected_pages);
-int exofs_setattr(struct dentry *, struct iattr *);
-int exofs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata);
-extern struct inode *exofs_iget(struct super_block *, unsigned long);
-struct inode *exofs_new_inode(struct inode *, umode_t);
-extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
-extern void exofs_evict_inode(struct inode *);
-
-/* dir.c: */
-int exofs_add_link(struct dentry *, struct inode *);
-ino_t exofs_inode_by_name(struct inode *, struct dentry *);
-int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
-int exofs_make_empty(struct inode *, struct inode *);
-struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
- struct page **);
-int exofs_empty_dir(struct inode *);
-struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
-ino_t exofs_parent_ino(struct dentry *child);
-int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
- struct inode *);
-
-/* super.c */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
- const struct osd_obj_id *obj);
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
-
-/* sys.c */
-int exofs_sysfs_init(void);
-void exofs_sysfs_uninit(void);
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
- struct exofs_dt_device_info *dt_dev);
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
-int exofs_sysfs_odev_add(struct exofs_dev *edev,
- struct exofs_sb_info *sbi);
-void exofs_sysfs_dbg_print(void);
-
-/*********************
- * operation vectors *
- *********************/
-/* dir.c: */
-extern const struct file_operations exofs_dir_operations;
-
-/* file.c */
-extern const struct inode_operations exofs_file_inode_operations;
-extern const struct file_operations exofs_file_operations;
-
-/* inode.c */
-extern const struct address_space_operations exofs_aops;
-
-/* namei.c */
-extern const struct inode_operations exofs_dir_inode_operations;
-extern const struct inode_operations exofs_special_inode_operations;
-
-/* exofs_init_comps will initialize an ore_components device array
- * pointing to a single ore_comp struct, and a round-robin view
- * of the device table.
- * The first device of each inode is the [inode->ino % num_devices]
- * and the rest of the devices sequentially following where the
- * first device is after the last device.
- * It is assumed that the global device array at @sbi is twice
- * bigger and that the device table repeats twice.
- * See: exofs_read_lookup_dev_table()
- */
-static inline void exofs_init_comps(struct ore_components *oc,
- struct ore_comp *one_comp,
- struct exofs_sb_info *sbi, osd_id oid)
-{
- unsigned dev_mod = (unsigned)oid, first_dev;
-
- one_comp->obj.partition = sbi->one_comp.obj.partition;
- one_comp->obj.id = oid;
- exofs_make_credential(one_comp->cred, &one_comp->obj);
-
- oc->first_dev = 0;
- oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
- sbi->layout.group_count;
- oc->single_comp = EC_SINGLE_COMP;
- oc->comps = one_comp;
-
- /* Round robin device view of the table */
- first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
- oc->ods = &sbi->oc.ods[first_dev];
-}
-
-#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
deleted file mode 100644
index a94594ea2aa3..000000000000
--- a/fs/exofs/file.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "exofs.h"
-
-static int exofs_release_file(struct inode *inode, struct file *filp)
-{
- return 0;
-}
-
-/* exofs_file_fsync - flush the inode to disk
- *
- * Note, in exofs all metadata is written as part of inode, regardless.
- * The writeout is synchronous
- */
-static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- int ret;
-
- ret = file_write_and_wait_range(filp, start, end);
- if (ret)
- return ret;
-
- inode_lock(inode);
- ret = sync_inode_metadata(filp->f_mapping->host, 1);
- inode_unlock(inode);
- return ret;
-}
-
-static int exofs_flush(struct file *file, fl_owner_t id)
-{
- int ret = vfs_fsync(file, 0);
- /* TODO: Flush the OSD target */
- return ret;
-}
-
-const struct file_operations exofs_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .open = generic_file_open,
- .release = exofs_release_file,
- .fsync = exofs_file_fsync,
- .flush = exofs_flush,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
-};
-
-const struct inode_operations exofs_file_inode_operations = {
- .setattr = exofs_setattr,
-};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
deleted file mode 100644
index 5f81fcd383a4..000000000000
--- a/fs/exofs/inode.c
+++ /dev/null
@@ -1,1514 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-unsigned exofs_max_io_pages(struct ore_layout *layout,
- unsigned expected_pages)
-{
- unsigned pages = min_t(unsigned, expected_pages,
- layout->max_io_length / PAGE_SIZE);
-
- return pages;
-}
-
-struct page_collect {
- struct exofs_sb_info *sbi;
- struct inode *inode;
- unsigned expected_pages;
- struct ore_io_state *ios;
-
- struct page **pages;
- unsigned alloc_pages;
- unsigned nr_pages;
- unsigned long length;
- loff_t pg_first; /* keep 64bit also in 32-arches */
- bool read_4_write; /* This means two things: that the read is sync
- * And the pages should not be unlocked.
- */
- struct page *that_locked_page;
-};
-
-static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
- struct inode *inode)
-{
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-
- pcol->sbi = sbi;
- pcol->inode = inode;
- pcol->expected_pages = expected_pages;
-
- pcol->ios = NULL;
- pcol->pages = NULL;
- pcol->alloc_pages = 0;
- pcol->nr_pages = 0;
- pcol->length = 0;
- pcol->pg_first = -1;
- pcol->read_4_write = false;
- pcol->that_locked_page = NULL;
-}
-
-static void _pcol_reset(struct page_collect *pcol)
-{
- pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
-
- pcol->pages = NULL;
- pcol->alloc_pages = 0;
- pcol->nr_pages = 0;
- pcol->length = 0;
- pcol->pg_first = -1;
- pcol->ios = NULL;
- pcol->that_locked_page = NULL;
-
- /* this is probably the end of the loop but in writes
- * it might not end here. don't be left with nothing
- */
- if (!pcol->expected_pages)
- pcol->expected_pages =
- exofs_max_io_pages(&pcol->sbi->layout, ~0);
-}
-
-static int pcol_try_alloc(struct page_collect *pcol)
-{
- unsigned pages;
-
- /* TODO: easily support bio chaining */
- pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
-
- for (; pages; pages >>= 1) {
- pcol->pages = kmalloc_array(pages, sizeof(struct page *),
- GFP_KERNEL);
- if (likely(pcol->pages)) {
- pcol->alloc_pages = pages;
- return 0;
- }
- }
-
- EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
- pcol->expected_pages);
- return -ENOMEM;
-}
-
-static void pcol_free(struct page_collect *pcol)
-{
- kfree(pcol->pages);
- pcol->pages = NULL;
-
- if (pcol->ios) {
- ore_put_io_state(pcol->ios);
- pcol->ios = NULL;
- }
-}
-
-static int pcol_add_page(struct page_collect *pcol, struct page *page,
- unsigned len)
-{
- if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
- return -ENOMEM;
-
- pcol->pages[pcol->nr_pages++] = page;
- pcol->length += len;
- return 0;
-}
-
-enum {PAGE_WAS_NOT_IN_IO = 17};
-static int update_read_page(struct page *page, int ret)
-{
- switch (ret) {
- case 0:
- /* Everything is OK */
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- break;
- case -EFAULT:
- /* In this case we were trying to read something that wasn't on
- * disk yet - return a page full of zeroes. This should be OK,
- * because the object should be empty (if there was a write
- * before this read, the read would be waiting with the page
- * locked */
- clear_highpage(page);
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- EXOFS_DBGMSG("recovered read error\n");
- /* fall through */
- case PAGE_WAS_NOT_IN_IO:
- ret = 0; /* recovered error */
- break;
- default:
- SetPageError(page);
- }
- return ret;
-}
-
-static void update_write_page(struct page *page, int ret)
-{
- if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
- return; /* don't pass start don't collect $200 */
-
- if (ret) {
- mapping_set_error(page->mapping, ret);
- SetPageError(page);
- }
- end_page_writeback(page);
-}
-
-/* Called at the end of reads, to optionally unlock pages and update their
- * status.
- */
-static int __readpages_done(struct page_collect *pcol)
-{
- int i;
- u64 good_bytes;
- u64 length = 0;
- int ret = ore_check_io(pcol->ios, NULL);
-
- if (likely(!ret)) {
- good_bytes = pcol->length;
- ret = PAGE_WAS_NOT_IN_IO;
- } else {
- good_bytes = 0;
- }
-
- EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
- " length=0x%lx nr_pages=%u\n",
- pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
- pcol->nr_pages);
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
- struct inode *inode = page->mapping->host;
- int page_stat;
-
- if (inode != pcol->inode)
- continue; /* osd might add more pages at end */
-
- if (likely(length < good_bytes))
- page_stat = 0;
- else
- page_stat = ret;
-
- EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
- inode->i_ino, page->index,
- page_stat ? "bad_bytes" : "good_bytes");
-
- ret = update_read_page(page, page_stat);
- if (!pcol->read_4_write)
- unlock_page(page);
- length += PAGE_SIZE;
- }
-
- pcol_free(pcol);
- EXOFS_DBGMSG2("readpages_done END\n");
- return ret;
-}
-
-/* callback of async reads */
-static void readpages_done(struct ore_io_state *ios, void *p)
-{
- struct page_collect *pcol = p;
-
- __readpages_done(pcol);
- atomic_dec(&pcol->sbi->s_curr_pending);
- kfree(pcol);
-}
-
-static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
-{
- int i;
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
-
- if (rw == READ)
- update_read_page(page, ret);
- else
- update_write_page(page, ret);
-
- unlock_page(page);
- }
-}
-
-static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
- struct page_collect *pcol_src, struct page_collect *pcol)
-{
- /* length was wrong or offset was not page aligned */
- BUG_ON(pcol_src->nr_pages < ios->nr_pages);
-
- if (pcol_src->nr_pages > ios->nr_pages) {
- struct page **src_page;
- unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
- unsigned long len_less = pcol_src->length - ios->length;
- unsigned i;
- int ret;
-
- /* This IO was trimmed */
- pcol_src->nr_pages = ios->nr_pages;
- pcol_src->length = ios->length;
-
- /* Left over pages are passed to the next io */
- pcol->expected_pages += pages_less;
- pcol->nr_pages = pages_less;
- pcol->length = len_less;
- src_page = pcol_src->pages + pcol_src->nr_pages;
- pcol->pg_first = (*src_page)->index;
-
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < pages_less; ++i)
- pcol->pages[i] = *src_page++;
-
- EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
- "pages_less=0x%x expected_pages=0x%x "
- "next_offset=0x%llx next_len=0x%lx\n",
- pcol_src->nr_pages, pages_less, pcol->expected_pages,
- pcol->pg_first * PAGE_SIZE, pcol->length);
- }
- return 0;
-}
-
-static int read_exec(struct page_collect *pcol)
-{
- struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct ore_io_state *ios;
- struct page_collect *pcol_copy = NULL;
- int ret;
-
- if (!pcol->pages)
- return 0;
-
- if (!pcol->ios) {
- int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
- pcol->pg_first << PAGE_SHIFT,
- pcol->length, &pcol->ios);
-
- if (ret)
- return ret;
- }
-
- ios = pcol->ios;
- ios->pages = pcol->pages;
-
- if (pcol->read_4_write) {
- ore_read(pcol->ios);
- return __readpages_done(pcol);
- }
-
- pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
- if (!pcol_copy) {
- ret = -ENOMEM;
- goto err;
- }
-
- *pcol_copy = *pcol;
- ios->done = readpages_done;
- ios->private = pcol_copy;
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
-
- ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
- if (unlikely(ret))
- goto err;
-
- EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
- pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
- ret = ore_read(ios);
- if (unlikely(ret))
- goto err;
-
- atomic_inc(&pcol->sbi->s_curr_pending);
-
- return 0;
-
-err:
- if (!pcol_copy) /* Failed before ownership transfer */
- pcol_copy = pcol;
- _unlock_pcol_pages(pcol_copy, ret, READ);
- pcol_free(pcol_copy);
- kfree(pcol_copy);
-
- return ret;
-}
-
-/* readpage_strip is called either directly from readpage() or by the VFS from
- * within read_cache_pages(), to add one more page to be read. It will try to
- * collect as many contiguous pages as posible. If a discontinuity is
- * encountered, or it runs out of resources, it will submit the previous segment
- * and will start a new collection. Eventually caller must submit the last
- * segment if present.
- */
-static int readpage_strip(void *data, struct page *page)
-{
- struct page_collect *pcol = data;
- struct inode *inode = pcol->inode;
- struct exofs_i_info *oi = exofs_i(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t len;
- int ret;
-
- BUG_ON(!PageLocked(page));
-
- /* FIXME: Just for debugging, will be removed */
- if (PageUptodate(page))
- EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
- page->index);
-
- pcol->that_locked_page = page;
-
- if (page->index < end_index)
- len = PAGE_SIZE;
- else if (page->index == end_index)
- len = i_size & ~PAGE_MASK;
- else
- len = 0;
-
- if (!len || !obj_created(oi)) {
- /* this will be out of bounds, or doesn't exist yet.
- * Current page is cleared and the request is split
- */
- clear_highpage(page);
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
-
- if (!pcol->read_4_write)
- unlock_page(page);
- EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
- "read_4_write=%d index=0x%lx end_index=0x%lx "
- "splitting\n", inode->i_ino, len,
- pcol->read_4_write, page->index, end_index);
-
- return read_exec(pcol);
- }
-
-try_again:
-
- if (unlikely(pcol->pg_first == -1)) {
- pcol->pg_first = page->index;
- } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
- page->index)) {
- /* Discontinuity detected, split the request */
- ret = read_exec(pcol);
- if (unlikely(ret))
- goto fail;
- goto try_again;
- }
-
- if (!pcol->pages) {
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- goto fail;
- }
-
- if (len != PAGE_SIZE)
- zero_user(page, len, PAGE_SIZE - len);
-
- EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
- inode->i_ino, page->index, len);
-
- ret = pcol_add_page(pcol, page, len);
- if (ret) {
- EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
- "this_len=0x%zx nr_pages=%u length=0x%lx\n",
- page, len, pcol->nr_pages, pcol->length);
-
- /* split the request, and start again with current page */
- ret = read_exec(pcol);
- if (unlikely(ret))
- goto fail;
-
- goto try_again;
- }
-
- return 0;
-
-fail:
- /* SetPageError(page); ??? */
- unlock_page(page);
- return ret;
-}
-
-static int exofs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, nr_pages, mapping->host);
-
- ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
- if (ret) {
- EXOFS_ERR("read_cache_pages => %d\n", ret);
- return ret;
- }
-
- ret = read_exec(&pcol);
- if (unlikely(ret))
- return ret;
-
- return read_exec(&pcol);
-}
-
-static int _readpage(struct page *page, bool read_4_write)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, 1, page->mapping->host);
-
- pcol.read_4_write = read_4_write;
- ret = readpage_strip(&pcol, page);
- if (ret) {
- EXOFS_ERR("_readpage => %d\n", ret);
- return ret;
- }
-
- return read_exec(&pcol);
-}
-
-/*
- * We don't need the file
- */
-static int exofs_readpage(struct file *file, struct page *page)
-{
- return _readpage(page, false);
-}
-
-/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct ore_io_state *ios, void *p)
-{
- struct page_collect *pcol = p;
- int i;
- u64 good_bytes;
- u64 length = 0;
- int ret = ore_check_io(ios, NULL);
-
- atomic_dec(&pcol->sbi->s_curr_pending);
-
- if (likely(!ret)) {
- good_bytes = pcol->length;
- ret = PAGE_WAS_NOT_IN_IO;
- } else {
- good_bytes = 0;
- }
-
- EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
- " length=0x%lx nr_pages=%u\n",
- pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
- pcol->nr_pages);
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
- struct inode *inode = page->mapping->host;
- int page_stat;
-
- if (inode != pcol->inode)
- continue; /* osd might add more pages to a bio */
-
- if (likely(length < good_bytes))
- page_stat = 0;
- else
- page_stat = ret;
-
- update_write_page(page, page_stat);
- unlock_page(page);
- EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
- inode->i_ino, page->index, page_stat);
-
- length += PAGE_SIZE;
- }
-
- pcol_free(pcol);
- kfree(pcol);
- EXOFS_DBGMSG2("writepages_done END\n");
-}
-
-static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
-{
- struct page_collect *pcol = priv;
- pgoff_t index = offset / PAGE_SIZE;
-
- if (!pcol->that_locked_page ||
- (pcol->that_locked_page->index != index)) {
- struct page *page;
- loff_t i_size = i_size_read(pcol->inode);
-
- if (offset >= i_size) {
- *uptodate = true;
- EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
- return ZERO_PAGE(0);
- }
-
- page = find_get_page(pcol->inode->i_mapping, index);
- if (!page) {
- page = find_or_create_page(pcol->inode->i_mapping,
- index, GFP_NOFS);
- if (unlikely(!page)) {
- EXOFS_DBGMSG("grab_cache_page Failed "
- "index=0x%llx\n", _LLU(index));
- return NULL;
- }
- unlock_page(page);
- }
- *uptodate = PageUptodate(page);
- EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
- return page;
- } else {
- EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
- pcol->that_locked_page->index);
- *uptodate = true;
- return pcol->that_locked_page;
- }
-}
-
-static void __r4w_put_page(void *priv, struct page *page)
-{
- struct page_collect *pcol = priv;
-
- if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
- EXOFS_DBGMSG2("index=0x%lx\n", page->index);
- put_page(page);
- return;
- }
- EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
- ZERO_PAGE(0) == page ? -1 : page->index);
-}
-
-static const struct _ore_r4w_op _r4w_op = {
- .get_page = &__r4w_get_page,
- .put_page = &__r4w_put_page,
-};
-
-static int write_exec(struct page_collect *pcol)
-{
- struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct ore_io_state *ios;
- struct page_collect *pcol_copy = NULL;
- int ret;
-
- if (!pcol->pages)
- return 0;
-
- BUG_ON(pcol->ios);
- ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
- pcol->pg_first << PAGE_SHIFT,
- pcol->length, &pcol->ios);
- if (unlikely(ret))
- goto err;
-
- pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
- if (!pcol_copy) {
- EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
- ret = -ENOMEM;
- goto err;
- }
-
- *pcol_copy = *pcol;
-
- ios = pcol->ios;
- ios->pages = pcol_copy->pages;
- ios->done = writepages_done;
- ios->r4w = &_r4w_op;
- ios->private = pcol_copy;
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
-
- ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
- if (unlikely(ret))
- goto err;
-
- EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
- pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
- ret = ore_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("write_exec: ore_write() Failed\n");
- goto err;
- }
-
- atomic_inc(&pcol->sbi->s_curr_pending);
- return 0;
-
-err:
- if (!pcol_copy) /* Failed before ownership transfer */
- pcol_copy = pcol;
- _unlock_pcol_pages(pcol_copy, ret, WRITE);
- pcol_free(pcol_copy);
- kfree(pcol_copy);
-
- return ret;
-}
-
-/* writepage_strip is called either directly from writepage() or by the VFS from
- * within write_cache_pages(), to add one more page to be written to storage.
- * It will try to collect as many contiguous pages as possible. If a
- * discontinuity is encountered or it runs out of resources it will submit the
- * previous segment and will start a new collection.
- * Eventually caller must submit the last segment if present.
- */
-static int writepage_strip(struct page *page,
- struct writeback_control *wbc_unused, void *data)
-{
- struct page_collect *pcol = data;
- struct inode *inode = pcol->inode;
- struct exofs_i_info *oi = exofs_i(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t len;
- int ret;
-
- BUG_ON(!PageLocked(page));
-
- ret = wait_obj_created(oi);
- if (unlikely(ret))
- goto fail;
-
- if (page->index < end_index)
- /* in this case, the page is within the limits of the file */
- len = PAGE_SIZE;
- else {
- len = i_size & ~PAGE_MASK;
-
- if (page->index > end_index || !len) {
- /* in this case, the page is outside the limits
- * (truncate in progress)
- */
- ret = write_exec(pcol);
- if (unlikely(ret))
- goto fail;
- if (PageError(page))
- ClearPageError(page);
- unlock_page(page);
- EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
- "outside the limits\n",
- inode->i_ino, page->index);
- return 0;
- }
- }
-
-try_again:
-
- if (unlikely(pcol->pg_first == -1)) {
- pcol->pg_first = page->index;
- } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
- page->index)) {
- /* Discontinuity detected, split the request */
- ret = write_exec(pcol);
- if (unlikely(ret))
- goto fail;
-
- EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
- inode->i_ino, page->index);
- goto try_again;
- }
-
- if (!pcol->pages) {
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- goto fail;
- }
-
- EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
- inode->i_ino, page->index, len);
-
- ret = pcol_add_page(pcol, page, len);
- if (unlikely(ret)) {
- EXOFS_DBGMSG2("Failed pcol_add_page "
- "nr_pages=%u total_length=0x%lx\n",
- pcol->nr_pages, pcol->length);
-
- /* split the request, next loop will start again */
- ret = write_exec(pcol);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("write_exec failed => %d", ret);
- goto fail;
- }
-
- goto try_again;
- }
-
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
-
- return 0;
-
-fail:
- EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
- inode->i_ino, page->index, ret);
- mapping_set_error(page->mapping, -EIO);
- unlock_page(page);
- return ret;
-}
-
-static int exofs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct page_collect pcol;
- long start, end, expected_pages;
- int ret;
-
- start = wbc->range_start >> PAGE_SHIFT;
- end = (wbc->range_end == LLONG_MAX) ?
- start + mapping->nrpages :
- wbc->range_end >> PAGE_SHIFT;
-
- if (start || end)
- expected_pages = end - start + 1;
- else
- expected_pages = mapping->nrpages;
-
- if (expected_pages < 32L)
- expected_pages = 32L;
-
- EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
- "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
- mapping->host->i_ino, wbc->range_start, wbc->range_end,
- mapping->nrpages, start, end, expected_pages);
-
- _pcol_init(&pcol, expected_pages, mapping->host);
-
- ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (unlikely(ret)) {
- EXOFS_ERR("write_cache_pages => %d\n", ret);
- return ret;
- }
-
- ret = write_exec(&pcol);
- if (unlikely(ret))
- return ret;
-
- if (wbc->sync_mode == WB_SYNC_ALL) {
- return write_exec(&pcol); /* pump the last reminder */
- } else if (pcol.nr_pages) {
- /* not SYNC let the reminder join the next writeout */
- unsigned i;
-
- for (i = 0; i < pcol.nr_pages; i++) {
- struct page *page = pcol.pages[i];
-
- end_page_writeback(page);
- set_page_dirty(page);
- unlock_page(page);
- }
- }
- return 0;
-}
-
-/*
-static int exofs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, 1, page->mapping->host);
-
- ret = writepage_strip(page, NULL, &pcol);
- if (ret) {
- EXOFS_ERR("exofs_writepage => %d\n", ret);
- return ret;
- }
-
- return write_exec(&pcol);
-}
-*/
-/* i_mutex held using inode->i_size directly */
-static void _write_failed(struct inode *inode, loff_t to)
-{
- if (to > inode->i_size)
- truncate_pagecache(inode, inode->i_size);
-}
-
-int exofs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- int ret = 0;
- struct page *page;
-
- page = *pagep;
- if (page == NULL) {
- page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT,
- flags);
- if (!page) {
- EXOFS_DBGMSG("grab_cache_page_write_begin failed\n");
- return -ENOMEM;
- }
- *pagep = page;
- }
-
- /* read modify write */
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- loff_t i_size = i_size_read(mapping->host);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
-
- if (page->index > end_index) {
- clear_highpage(page);
- SetPageUptodate(page);
- } else {
- ret = _readpage(page, true);
- if (ret) {
- unlock_page(page);
- EXOFS_DBGMSG("__readpage failed\n");
- }
- }
- }
- return ret;
-}
-
-static int exofs_write_begin_export(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- *pagep = NULL;
-
- return exofs_write_begin(file, mapping, pos, len, flags, pagep,
- fsdata);
-}
-
-static int exofs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = mapping->host;
- loff_t last_pos = pos + copied;
-
- if (!PageUptodate(page)) {
- if (copied < len) {
- _write_failed(inode, pos + len);
- copied = 0;
- goto out;
- }
- SetPageUptodate(page);
- }
- if (last_pos > inode->i_size) {
- i_size_write(inode, last_pos);
- mark_inode_dirty(inode);
- }
- set_page_dirty(page);
-out:
- unlock_page(page);
- put_page(page);
- return copied;
-}
-
-static int exofs_releasepage(struct page *page, gfp_t gfp)
-{
- EXOFS_DBGMSG("page 0x%lx\n", page->index);
- WARN_ON(1);
- return 0;
-}
-
-static void exofs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
- page->index, offset, length);
- WARN_ON(1);
-}
-
-
- /* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- return 0;
-}
-
-const struct address_space_operations exofs_aops = {
- .readpage = exofs_readpage,
- .readpages = exofs_readpages,
- .writepage = NULL,
- .writepages = exofs_writepages,
- .write_begin = exofs_write_begin_export,
- .write_end = exofs_write_end,
- .releasepage = exofs_releasepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
- .invalidatepage = exofs_invalidatepage,
-
- /* Not implemented Yet */
- .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
- .direct_IO = exofs_direct_IO,
-
- /* With these NULL has special meaning or default is not exported */
- .migratepage = NULL,
- .launder_page = NULL,
- .is_partially_uptodate = NULL,
- .error_remove_page = NULL,
-};
-
-/******************************************************************************
- * INODE OPERATIONS
- *****************************************************************************/
-
-/*
- * Test whether an inode is a fast symlink.
- */
-static inline int exofs_inode_is_fast_symlink(struct inode *inode)
-{
- struct exofs_i_info *oi = exofs_i(inode);
-
- return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
-}
-
-static int _do_truncate(struct inode *inode, loff_t newsize)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- int ret;
-
- inode->i_mtime = inode->i_ctime = current_time(inode);
-
- ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
- if (likely(!ret))
- truncate_setsize(inode, newsize);
-
- EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
- inode->i_ino, newsize, ret);
- return ret;
-}
-
-/*
- * Set inode attributes - update size attribute on OSD if needed,
- * otherwise just call generic functions.
- */
-int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = d_inode(dentry);
- int error;
-
- /* if we are about to modify an object, and it hasn't been
- * created yet, wait
- */
- error = wait_obj_created(exofs_i(inode));
- if (unlikely(error))
- return error;
-
- error = setattr_prepare(dentry, iattr);
- if (unlikely(error))
- return error;
-
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
- error = _do_truncate(inode, iattr->ia_size);
- if (unlikely(error))
- return error;
- }
-
- setattr_copy(inode, iattr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_FILE_LAYOUT,
- 0);
-static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_DIR_LAYOUT,
- 0);
-
-/*
- * Read the Linux inode info from the OSD, and return it as is. In exofs the
- * inode info is in an application specific page/attribute of the osd-object.
- */
-static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
- struct exofs_fcb *inode)
-{
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct osd_attr attrs[] = {
- [0] = g_attr_inode_data,
- [1] = g_attr_inode_file_layout,
- [2] = g_attr_inode_dir_layout,
- };
- struct ore_io_state *ios;
- struct exofs_on_disk_inode_layout *layout;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
- _LLU(oi->one_comp.obj.id), ret);
- memset(inode, 0, sizeof(*inode));
- inode->i_mode = 0040000 | (0777 & ~022);
- /* If object is lost on target we might as well enable it's
- * delete.
- */
- ret = 0;
- goto out;
- }
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
- goto out;
- }
- WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
- memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
-
- ret = extract_attr_from_ios(ios, &attrs[1]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
- goto out;
- }
- if (attrs[1].len) {
- layout = attrs[1].val_ptr;
- if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
- EXOFS_ERR("%s: unsupported files layout %d\n",
- __func__, layout->gen_func);
- ret = -ENOTSUPP;
- goto out;
- }
- }
-
- ret = extract_attr_from_ios(ios, &attrs[2]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
- goto out;
- }
- if (attrs[2].len) {
- layout = attrs[2].val_ptr;
- if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
- EXOFS_ERR("%s: unsupported meta-data layout %d\n",
- __func__, layout->gen_func);
- ret = -ENOTSUPP;
- goto out;
- }
- }
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static void __oi_init(struct exofs_i_info *oi)
-{
- init_waitqueue_head(&oi->i_wq);
- oi->i_flags = 0;
-}
-/*
- * Fill in an inode read from the OSD and set it up for use
- */
-struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
-{
- struct exofs_i_info *oi;
- struct exofs_fcb fcb;
- struct inode *inode;
- int ret;
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
- oi = exofs_i(inode);
- __oi_init(oi);
- exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
- exofs_oi_objno(oi));
-
- /* read the inode from the osd */
- ret = exofs_get_inode(sb, oi, &fcb);
- if (ret)
- goto bad_inode;
-
- set_obj_created(oi);
-
- /* copy stuff from on-disk struct to in-memory struct */
- inode->i_mode = le16_to_cpu(fcb.i_mode);
- i_uid_write(inode, le32_to_cpu(fcb.i_uid));
- i_gid_write(inode, le32_to_cpu(fcb.i_gid));
- set_nlink(inode, le16_to_cpu(fcb.i_links_count));
- inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
- inode->i_ctime.tv_nsec =
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
- oi->i_commit_size = le64_to_cpu(fcb.i_size);
- i_size_write(inode, oi->i_commit_size);
- inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_generation = le32_to_cpu(fcb.i_generation);
-
- oi->i_dir_start_lookup = 0;
-
- if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
- ret = -ESTALE;
- goto bad_inode;
- }
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (fcb.i_data[0])
- inode->i_rdev =
- old_decode_dev(le32_to_cpu(fcb.i_data[0]));
- else
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(fcb.i_data[1]));
- } else {
- memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
- }
-
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &exofs_file_inode_operations;
- inode->i_fop = &exofs_file_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &exofs_dir_inode_operations;
- inode->i_fop = &exofs_dir_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- } else if (S_ISLNK(inode->i_mode)) {
- if (exofs_inode_is_fast_symlink(inode)) {
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)oi->i_data;
- } else {
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &exofs_aops;
- }
- } else {
- inode->i_op = &exofs_special_inode_operations;
- if (fcb.i_data[0])
- init_special_inode(inode, inode->i_mode,
- old_decode_dev(le32_to_cpu(fcb.i_data[0])));
- else
- init_special_inode(inode, inode->i_mode,
- new_decode_dev(le32_to_cpu(fcb.i_data[1])));
- }
-
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(ret);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi)
-{
- if (!obj_created(oi)) {
- EXOFS_DBGMSG("!obj_created\n");
- BUG_ON(!obj_2bcreated(oi));
- wait_event(oi->i_wq, obj_created(oi));
- EXOFS_DBGMSG("wait_event done\n");
- }
- return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
-}
-
-/*
- * Callback function from exofs_new_inode(). The important thing is that we
- * set the obj_created flag so that other methods know that the object exists on
- * the OSD.
- */
-static void create_done(struct ore_io_state *ios, void *p)
-{
- struct inode *inode = p;
- struct exofs_i_info *oi = exofs_i(inode);
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- int ret;
-
- ret = ore_check_io(ios, NULL);
- ore_put_io_state(ios);
-
- atomic_dec(&sbi->s_curr_pending);
-
- if (unlikely(ret)) {
- EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
- _LLU(exofs_oi_objno(oi)),
- _LLU(oi->one_comp.obj.partition));
- /*TODO: When FS is corrupted creation can fail, object already
- * exist. Get rid of this asynchronous creation, if exist
- * increment the obj counter and try the next object. Until we
- * succeed. All these dangling objects will be made into lost
- * files by chkfs.exofs
- */
- }
-
- set_obj_created(oi);
-
- wake_up(&oi->i_wq);
-}
-
-/*
- * Set up a new inode and create an object for it on the OSD
- */
-struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct inode *inode;
- struct exofs_i_info *oi;
- struct ore_io_state *ios;
- int ret;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- oi = exofs_i(inode);
- __oi_init(oi);
-
- set_obj_2bcreated(oi);
-
- inode_init_owner(inode, dir, mode);
- inode->i_ino = sbi->s_nextid++;
- inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- oi->i_commit_size = inode->i_size = 0;
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
- insert_inode_hash(inode);
-
- exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
- exofs_oi_objno(oi));
- exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
-
- mark_inode_dirty(inode);
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
- return ERR_PTR(ret);
- }
-
- ios->done = create_done;
- ios->private = inode;
-
- ret = ore_create(ios);
- if (ret) {
- ore_put_io_state(ios);
- return ERR_PTR(ret);
- }
- atomic_inc(&sbi->s_curr_pending);
-
- return inode;
-}
-
-/*
- * struct to pass two arguments to update_inode's callback
- */
-struct updatei_args {
- struct exofs_sb_info *sbi;
- struct exofs_fcb fcb;
-};
-
-/*
- * Callback function from exofs_update_inode().
- */
-static void updatei_done(struct ore_io_state *ios, void *p)
-{
- struct updatei_args *args = p;
-
- ore_put_io_state(ios);
-
- atomic_dec(&args->sbi->s_curr_pending);
-
- kfree(args);
-}
-
-/*
- * Write the inode to the OSD. Just fill up the struct, and set the attribute
- * synchronously or asynchronously depending on the do_sync flag.
- */
-static int exofs_update_inode(struct inode *inode, int do_sync)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct super_block *sb = inode->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- struct osd_attr attr;
- struct exofs_fcb *fcb;
- struct updatei_args *args;
- int ret;
-
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args) {
- EXOFS_DBGMSG("Failed kzalloc of args\n");
- return -ENOMEM;
- }
-
- fcb = &args->fcb;
-
- fcb->i_mode = cpu_to_le16(inode->i_mode);
- fcb->i_uid = cpu_to_le32(i_uid_read(inode));
- fcb->i_gid = cpu_to_le32(i_gid_read(inode));
- fcb->i_links_count = cpu_to_le16(inode->i_nlink);
- fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- oi->i_commit_size = i_size_read(inode);
- fcb->i_size = cpu_to_le64(oi->i_commit_size);
- fcb->i_generation = cpu_to_le32(inode->i_generation);
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- fcb->i_data[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- fcb->i_data[1] = 0;
- } else {
- fcb->i_data[0] = 0;
- fcb->i_data[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- fcb->i_data[2] = 0;
- }
- } else
- memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- goto free_args;
- }
-
- attr = g_attr_inode_data;
- attr.val_ptr = fcb;
- ios->out_attr_len = 1;
- ios->out_attr = &attr;
-
- wait_obj_created(oi);
-
- if (!do_sync) {
- args->sbi = sbi;
- ios->done = updatei_done;
- ios->private = args;
- }
-
- ret = ore_write(ios);
- if (!do_sync && !ret) {
- atomic_inc(&sbi->s_curr_pending);
- goto out; /* deallocation in updatei_done */
- }
-
- ore_put_io_state(ios);
-free_args:
- kfree(args);
-out:
- EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
- inode->i_ino, do_sync, ret);
- return ret;
-}
-
-int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
- return exofs_update_inode(inode, 1);
-}
-
-/*
- * Callback function from exofs_delete_inode() - don't have much cleaning up to
- * do.
- */
-static void delete_done(struct ore_io_state *ios, void *p)
-{
- struct exofs_sb_info *sbi = p;
-
- ore_put_io_state(ios);
-
- atomic_dec(&sbi->s_curr_pending);
-}
-
-/*
- * Called when the refcount of an inode reaches zero. We remove the object
- * from the OSD here. We make sure the object was created before we try and
- * delete it.
- */
-void exofs_evict_inode(struct inode *inode)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct super_block *sb = inode->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- int ret;
-
- truncate_inode_pages_final(&inode->i_data);
-
- /* TODO: should do better here */
- if (inode->i_nlink || is_bad_inode(inode))
- goto no_delete;
-
- inode->i_size = 0;
- clear_inode(inode);
-
- /* if we are deleting an obj that hasn't been created yet, wait.
- * This also makes sure that create_done cannot be called with an
- * already evicted inode.
- */
- wait_obj_created(oi);
- /* ignore the error, attempt a remove anyway */
-
- /* Now Remove the OSD objects */
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
- return;
- }
-
- ios->done = delete_done;
- ios->private = sbi;
-
- ret = ore_remove(ios);
- if (ret) {
- EXOFS_ERR("%s: ore_remove failed\n", __func__);
- ore_put_io_state(ios);
- return;
- }
- atomic_inc(&sbi->s_curr_pending);
-
- return;
-
-no_delete:
- clear_inode(inode);
-}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
deleted file mode 100644
index 7295cd722770..000000000000
--- a/fs/exofs/namei.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "exofs.h"
-
-static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
-{
- int err = exofs_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
- inode_dec_link_count(inode);
- iput(inode);
- return err;
-}
-
-static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct inode *inode;
- ino_t ino;
-
- if (dentry->d_name.len > EXOFS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
- ino = exofs_inode_by_name(dir, dentry);
- inode = ino ? exofs_iget(dir->i_sb, ino) : NULL;
- return d_splice_alias(inode, dentry);
-}
-
-static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
-{
- struct inode *inode = exofs_new_inode(dir, mode);
- int err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &exofs_file_inode_operations;
- inode->i_fop = &exofs_file_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- mark_inode_dirty(inode);
- err = exofs_add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t rdev)
-{
- struct inode *inode;
- int err;
-
- inode = exofs_new_inode(dir, mode);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, inode->i_mode, rdev);
- mark_inode_dirty(inode);
- err = exofs_add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int exofs_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
-{
- struct super_block *sb = dir->i_sb;
- int err = -ENAMETOOLONG;
- unsigned l = strlen(symname)+1;
- struct inode *inode;
- struct exofs_i_info *oi;
-
- if (l > sb->s_blocksize)
- goto out;
-
- inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out;
-
- oi = exofs_i(inode);
- if (l > sizeof(oi->i_data)) {
- /* slow symlink */
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &exofs_aops;
- memset(oi->i_data, 0, sizeof(oi->i_data));
-
- err = page_symlink(inode, symname, l);
- if (err)
- goto out_fail;
- } else {
- /* fast symlink */
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)oi->i_data;
- memcpy(oi->i_data, symname, l);
- inode->i_size = l-1;
- }
- mark_inode_dirty(inode);
-
- err = exofs_add_nondir(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- iput(inode);
- goto out;
-}
-
-static int exofs_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode->i_ctime = current_time(inode);
- inode_inc_link_count(inode);
- ihold(inode);
-
- return exofs_add_nondir(dentry, inode);
-}
-
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct inode *inode;
- int err;
-
- inode_inc_link_count(dir);
-
- inode = exofs_new_inode(dir, S_IFDIR | mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_dir;
-
- inode->i_op = &exofs_dir_inode_operations;
- inode->i_fop = &exofs_dir_operations;
- inode->i_mapping->a_ops = &exofs_aops;
-
- inode_inc_link_count(inode);
-
- err = exofs_make_empty(inode, dir);
- if (err)
- goto out_fail;
-
- err = exofs_add_link(dentry, inode);
- if (err)
- goto out_fail;
-
- d_instantiate(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- inode_dec_link_count(inode);
- iput(inode);
-out_dir:
- inode_dec_link_count(dir);
- goto out;
-}
-
-static int exofs_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- struct exofs_dir_entry *de;
- struct page *page;
- int err = -ENOENT;
-
- de = exofs_find_entry(dir, dentry, &page);
- if (!de)
- goto out;
-
- err = exofs_delete_entry(de, page);
- if (err)
- goto out;
-
- inode->i_ctime = dir->i_ctime;
- inode_dec_link_count(inode);
- err = 0;
-out:
- return err;
-}
-
-static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- int err = -ENOTEMPTY;
-
- if (exofs_empty_dir(inode)) {
- err = exofs_unlink(dir, dentry);
- if (!err) {
- inode->i_size = 0;
- inode_dec_link_count(inode);
- inode_dec_link_count(dir);
- }
- }
- return err;
-}
-
-static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
-{
- struct inode *old_inode = d_inode(old_dentry);
- struct inode *new_inode = d_inode(new_dentry);
- struct page *dir_page = NULL;
- struct exofs_dir_entry *dir_de = NULL;
- struct page *old_page;
- struct exofs_dir_entry *old_de;
- int err = -ENOENT;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
- if (!old_de)
- goto out;
-
- if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
- dir_de = exofs_dotdot(old_inode, &dir_page);
- if (!dir_de)
- goto out_old;
- }
-
- if (new_inode) {
- struct page *new_page;
- struct exofs_dir_entry *new_de;
-
- err = -ENOTEMPTY;
- if (dir_de && !exofs_empty_dir(new_inode))
- goto out_dir;
-
- err = -ENOENT;
- new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
- if (!new_de)
- goto out_dir;
- err = exofs_set_link(new_dir, new_de, new_page, old_inode);
- new_inode->i_ctime = current_time(new_inode);
- if (dir_de)
- drop_nlink(new_inode);
- inode_dec_link_count(new_inode);
- if (err)
- goto out_dir;
- } else {
- err = exofs_add_link(new_dentry, old_inode);
- if (err)
- goto out_dir;
- if (dir_de)
- inode_inc_link_count(new_dir);
- }
-
- old_inode->i_ctime = current_time(old_inode);
-
- exofs_delete_entry(old_de, old_page);
- mark_inode_dirty(old_inode);
-
- if (dir_de) {
- err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
- inode_dec_link_count(old_dir);
- if (err)
- goto out_dir;
- }
- return 0;
-
-
-out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
-out_old:
- kunmap(old_page);
- put_page(old_page);
-out:
- return err;
-}
-
-const struct inode_operations exofs_dir_inode_operations = {
- .create = exofs_create,
- .lookup = exofs_lookup,
- .link = exofs_link,
- .unlink = exofs_unlink,
- .symlink = exofs_symlink,
- .mkdir = exofs_mkdir,
- .rmdir = exofs_rmdir,
- .mknod = exofs_mknod,
- .rename = exofs_rename,
- .setattr = exofs_setattr,
-};
-
-const struct inode_operations exofs_special_inode_operations = {
- .setattr = exofs_setattr,
-};
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
deleted file mode 100644
index 5331a15a61f1..000000000000
--- a/fs/exofs/ore.c
+++ /dev/null
@@ -1,1178 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <asm/div64.h>
-#include <linux/lcm.h>
-
-#include "ore_raid.h"
-
-MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
-
-/* ore_verify_layout does a couple of things:
- * 1. Given a minimum number of needed parameters fixes up the rest of the
- * members to be operatonals for the ore. The needed parameters are those
- * that are defined by the pnfs-objects layout STD.
- * 2. Check to see if the current ore code actually supports these parameters
- * for example stripe_unit must be a multple of the system PAGE_SIZE,
- * and etc...
- * 3. Cache some havily used calculations that will be needed by users.
- */
-
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
-
-int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
-{
- u64 stripe_length;
-
- switch (layout->raid_algorithm) {
- case PNFS_OSD_RAID_0:
- layout->parity = 0;
- break;
- case PNFS_OSD_RAID_5:
- layout->parity = 1;
- break;
- case PNFS_OSD_RAID_PQ:
- layout->parity = 2;
- break;
- case PNFS_OSD_RAID_4:
- default:
- ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
- layout->raid_algorithm);
- return -EINVAL;
- }
- if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
- ORE_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(layout->stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
- if (layout->group_width) {
- if (!layout->group_depth) {
- ORE_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- if (total_comps < (layout->group_width * layout->mirrors_p1)) {
- ORE_ERR("Data Map wrong, "
- "numdevs=%d < group_width=%d * mirrors=%d\n",
- total_comps, layout->group_width,
- layout->mirrors_p1);
- return -EINVAL;
- }
- layout->group_count = total_comps / layout->mirrors_p1 /
- layout->group_width;
- } else {
- if (layout->group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %lld\n",
- _LLU(layout->group_depth));
- }
- layout->group_width = total_comps / layout->mirrors_p1;
- layout->group_depth = -1;
- layout->group_count = 1;
- }
-
- stripe_length = (u64)layout->group_width * layout->stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
- _LLU(stripe_length));
- return -EINVAL;
- }
-
- layout->max_io_length =
- (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
- (layout->group_width - layout->parity);
- if (layout->parity) {
- unsigned stripe_length =
- (layout->group_width - layout->parity) *
- layout->stripe_unit;
-
- layout->max_io_length /= stripe_length;
- layout->max_io_length *= stripe_length;
- }
- ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
-
- return 0;
-}
-EXPORT_SYMBOL(ore_verify_layout);
-
-static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
-{
- return ios->oc->comps[index & ios->oc->single_comp].cred;
-}
-
-static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
-{
- return &ios->oc->comps[index & ios->oc->single_comp].obj;
-}
-
-static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
-{
- ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
- ios->oc->first_dev, ios->oc->numdevs, index,
- ios->oc->ods);
-
- return ore_comp_dev(ios->oc, index);
-}
-
-int _ore_get_io_state(struct ore_layout *layout,
- struct ore_components *oc, unsigned numdevs,
- unsigned sgs_per_dev, unsigned num_par_pages,
- struct ore_io_state **pios)
-{
- struct ore_io_state *ios;
- size_t size_ios, size_extra, size_total;
- void *ios_extra;
-
- /*
- * The desired layout looks like this, with the extra_allocation
- * items pointed at from fields within ios or per_dev:
-
- struct __alloc_all_io_state {
- struct ore_io_state ios;
- struct ore_per_dev_state per_dev[numdevs];
- union {
- struct osd_sg_entry sglist[sgs_per_dev * numdevs];
- struct page *pages[num_par_pages];
- } extra_allocation;
- } whole_allocation;
-
- */
-
- /* This should never happen, so abort early if it ever does. */
- if (sgs_per_dev && num_par_pages) {
- ORE_DBGMSG("Tried to use both pages and sglist\n");
- *pios = NULL;
- return -EINVAL;
- }
-
- if (numdevs > (INT_MAX - sizeof(*ios)) /
- sizeof(struct ore_per_dev_state))
- return -ENOMEM;
- size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs;
-
- if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry))
- return -ENOMEM;
- if (num_par_pages > INT_MAX / sizeof(struct page *))
- return -ENOMEM;
- size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs),
- sizeof(struct page *) * num_par_pages);
-
- size_total = size_ios + size_extra;
-
- if (likely(size_total <= PAGE_SIZE)) {
- ios = kzalloc(size_total, GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total);
- *pios = NULL;
- return -ENOMEM;
- }
- ios_extra = (char *)ios + size_ios;
- } else {
- ios = kzalloc(size_ios, GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
- size_ios);
- *pios = NULL;
- return -ENOMEM;
- }
- ios_extra = kzalloc(size_extra, GFP_KERNEL);
- if (unlikely(!ios_extra)) {
- ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
- size_extra);
- kfree(ios);
- *pios = NULL;
- return -ENOMEM;
- }
-
- /* In this case the per_dev[0].sgilist holds the pointer to
- * be freed
- */
- ios->extra_part_alloc = true;
- }
-
- if (num_par_pages) {
- ios->parity_pages = ios_extra;
- ios->max_par_pages = num_par_pages;
- }
- if (sgs_per_dev) {
- struct osd_sg_entry *sgilist = ios_extra;
- unsigned d;
-
- for (d = 0; d < numdevs; ++d) {
- ios->per_dev[d].sglist = sgilist;
- sgilist += sgs_per_dev;
- }
- ios->sgs_per_dev = sgs_per_dev;
- }
-
- ios->layout = layout;
- ios->oc = oc;
- *pios = ios;
- return 0;
-}
-
-/* Allocate an io_state for only a single group of devices
- *
- * If a user needs to call ore_read/write() this version must be used becase it
- * allocates extra stuff for striping and raid.
- * The ore might decide to only IO less then @length bytes do to alignmets
- * and constrains as follows:
- * - The IO cannot cross group boundary.
- * - In raid5/6 The end of the IO must align at end of a stripe eg.
- * (@offset + @length) % strip_size == 0. Or the complete range is within a
- * single stripe.
- * - Memory condition only permitted a shorter IO. (A user can use @length=~0
- * And check the returned ios->length for max_io_size.)
- *
- * The caller must check returned ios->length (and/or ios->nr_pages) and
- * re-issue these pages that fall outside of ios->length
- */
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
- bool is_reading, u64 offset, u64 length,
- struct ore_io_state **pios)
-{
- struct ore_io_state *ios;
- unsigned numdevs = layout->group_width * layout->mirrors_p1;
- unsigned sgs_per_dev = 0, max_par_pages = 0;
- int ret;
-
- if (layout->parity && length) {
- unsigned data_devs = layout->group_width - layout->parity;
- unsigned stripe_size = layout->stripe_unit * data_devs;
- unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
- u32 remainder;
- u64 num_stripes;
- u64 num_raid_units;
-
- num_stripes = div_u64_rem(length, stripe_size, &remainder);
- if (remainder)
- ++num_stripes;
-
- num_raid_units = num_stripes * layout->parity;
-
- if (is_reading) {
- /* For reads add per_dev sglist array */
- /* TODO: Raid 6 we need twice more. Actually:
- * num_stripes / LCMdP(W,P);
- * if (W%P != 0) num_stripes *= parity;
- */
-
- /* first/last seg is split */
- num_raid_units += layout->group_width;
- sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
- } else {
- /* For Writes add parity pages array. */
- max_par_pages = num_raid_units * pages_in_unit *
- sizeof(struct page *);
- }
- }
-
- ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
- pios);
- if (unlikely(ret))
- return ret;
-
- ios = *pios;
- ios->reading = is_reading;
- ios->offset = offset;
-
- if (length) {
- ore_calc_stripe_info(layout, offset, length, &ios->si);
- ios->length = ios->si.length;
- ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
- ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
- if (layout->parity)
- _ore_post_alloc_raid_stuff(ios);
- }
-
- return 0;
-}
-EXPORT_SYMBOL(ore_get_rw_state);
-
-/* Allocate an io_state for all the devices in the comps array
- *
- * This version of io_state allocation is used mostly by create/remove
- * and trunc where we currently need all the devices. The only wastful
- * bit is the read/write_attributes with no IO. Those sites should
- * be converted to use ore_get_rw_state() with length=0
- */
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
- struct ore_io_state **pios)
-{
- return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
-}
-EXPORT_SYMBOL(ore_get_io_state);
-
-void ore_put_io_state(struct ore_io_state *ios)
-{
- if (ios) {
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[i];
-
- if (per_dev->or)
- osd_end_request(per_dev->or);
- if (per_dev->bio)
- bio_put(per_dev->bio);
- }
-
- _ore_free_raid_stuff(ios);
- kfree(ios);
- }
-}
-EXPORT_SYMBOL(ore_put_io_state);
-
-static void _sync_done(struct ore_io_state *ios, void *p)
-{
- struct completion *waiting = p;
-
- complete(waiting);
-}
-
-static void _last_io(struct kref *kref)
-{
- struct ore_io_state *ios = container_of(
- kref, struct ore_io_state, kref);
-
- ios->done(ios, ios->private);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct ore_io_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-int ore_io_execute(struct ore_io_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- bool sync = (ios->done == NULL);
- int i, ret;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
- if (unlikely(ret)) {
- ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
- ret);
- return ret;
- }
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
- ret = 0;
-
- if (sync) {
- wait_for_completion(&wait);
- ret = ore_check_io(ios, NULL);
- }
- return ret;
-}
-
-static void _clear_bio(struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
-
- bio_for_each_segment_all(bv, bio, i) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
-{
- enum osd_err_priority acumulated_osd_err = 0;
- int acumulated_lin_err = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct ore_per_dev_state *per_dev = &ios->per_dev[i];
- struct osd_request *or = per_dev->or;
- int ret;
-
- if (unlikely(!or))
- continue;
-
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
-
- if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
- per_dev->bio) {
- /* start read offset passed endof file.
- * Note: if we do not have bio it means read-attributes
- * In this case we should return error to caller.
- */
- _clear_bio(per_dev->bio);
- ORE_DBGMSG("start read offset passed end of file "
- "offset=0x%llx, length=0x%llx\n",
- _LLU(per_dev->offset),
- _LLU(per_dev->length));
-
- continue; /* we recovered */
- }
-
- if (on_dev_error) {
- u64 residual = ios->reading ?
- or->in.residual : or->out.residual;
- u64 offset = (ios->offset + ios->length) - residual;
- unsigned dev = per_dev->dev - ios->oc->first_dev;
- struct ore_dev *od = ios->oc->ods[dev];
-
- on_dev_error(ios, od, dev, osi.osd_err_pri,
- offset, residual);
- }
- if (osi.osd_err_pri >= acumulated_osd_err) {
- acumulated_osd_err = osi.osd_err_pri;
- acumulated_lin_err = ret;
- }
- }
-
- return acumulated_lin_err;
-}
-EXPORT_SYMBOL(ore_check_io);
-
-/*
- * L - logical offset into the file
- *
- * D - number of Data devices
- * D = group_width - parity
- *
- * U - The number of bytes in a stripe within a group
- * U = stripe_unit * D
- *
- * T - The number of bytes striped within a group of component objects
- * (before advancing to the next group)
- * T = U * group_depth
- *
- * S - The number of bytes striped across all component objects
- * before the pattern repeats
- * S = T * group_count
- *
- * M - The "major" (i.e., across all components) cycle number
- * M = L / S
- *
- * G - Counts the groups from the beginning of the major cycle
- * G = (L - (M * S)) / T [or (L % S) / T]
- *
- * H - The byte offset within the group
- * H = (L - (M * S)) % T [or (L % S) % T]
- *
- * N - The "minor" (i.e., across the group) stripe number
- * N = H / U
- *
- * C - The component index coresponding to L
- *
- * C = (H - (N * U)) / stripe_unit + G * D
- * [or (L % U) / stripe_unit + G * D]
- *
- * O - The component offset coresponding to L
- * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
- *
- * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
- * divide by parity
- * LCMdP = lcm(group_width, parity) / parity
- *
- * R - The parity Rotation stripe
- * (Note parity cycle always starts at a group's boundary)
- * R = N % LCMdP
- *
- * I = the first parity device index
- * I = (group_width + group_width - R*parity - parity) % group_width
- *
- * Craid - The component index Rotated
- * Craid = (group_width + C - R*parity) % group_width
- * (We add the group_width to avoid negative numbers modulo math)
- */
-void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- u64 length, struct ore_striping_info *si)
-{
- u32 stripe_unit = layout->stripe_unit;
- u32 group_width = layout->group_width;
- u64 group_depth = layout->group_depth;
- u32 parity = layout->parity;
-
- u32 D = group_width - parity;
- u32 U = D * stripe_unit;
- u64 T = U * group_depth;
- u64 S = T * layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodS = file_offset - M * S;
- u32 G = div64_u64(LmodS, T);
- u64 H = LmodS - G * T;
-
- u32 N = div_u64(H, U);
- u32 Nlast;
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- u32 first_dev = C - C % group_width;
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
- si->cur_comp = C - first_dev;
- si->cur_pg = si->unit_off / PAGE_SIZE;
-
- if (parity) {
- u32 LCMdP = lcm(group_width, parity) / parity;
- /* R = N % LCMdP; */
- u32 RxP = (N % LCMdP) * parity;
-
- si->par_dev = (group_width + group_width - parity - RxP) %
- group_width + first_dev;
- si->dev = (group_width + group_width + C - RxP) %
- group_width + first_dev;
- si->bytes_in_stripe = U;
- si->first_stripe_start = M * S + G * T + N * U;
- } else {
- /* Make the math correct see _prepare_one_group */
- si->par_dev = group_width;
- si->dev = C;
- }
-
- si->dev *= layout->mirrors_p1;
- si->par_dev *= layout->mirrors_p1;
- si->offset = file_offset;
- si->length = T - H;
- if (si->length > length)
- si->length = length;
-
- Nlast = div_u64(H + si->length + U - 1, U);
- si->maxdevUnits = Nlast - N;
-
- si->M = M;
-}
-EXPORT_SYMBOL(ore_calc_stripe_info);
-
-int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct page **pages,
- struct ore_per_dev_state *per_dev, int cur_len)
-{
- unsigned pg = *cur_pg;
- struct request_queue *q =
- osd_request_queue(_ios_od(ios, per_dev->dev));
- unsigned len = cur_len;
- int ret;
-
- if (per_dev->bio == NULL) {
- unsigned bio_size;
-
- if (!ios->reading) {
- bio_size = ios->si.maxdevUnits;
- } else {
- bio_size = (ios->si.maxdevUnits + 1) *
- (ios->layout->group_width - ios->layout->parity) /
- ios->layout->group_width;
- }
- bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
-
- per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
- if (unlikely(!per_dev->bio)) {
- ORE_DBGMSG("Failed to allocate BIO size=%u\n",
- bio_size);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
- pglen, pgbase);
- if (unlikely(pglen != added_len)) {
- /* If bi_vcnt == bi_max then this is a SW BUG */
- ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
- "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
- per_dev->bio->bi_vcnt,
- per_dev->bio->bi_max_vecs,
- BIO_MAX_PAGES_KMALLOC, cur_len);
- ret = -ENOMEM;
- goto out;
- }
- _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
-
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- per_dev->length += len;
- *cur_pg = pg;
- ret = 0;
-out: /* we fail the complete unit on an error eg don't advance
- * per_dev->length and cur_pg. This means that we might have a bigger
- * bio than the CDB requested length (per_dev->length). That's fine
- * only the oposite is fatal.
- */
- return ret;
-}
-
-static int _add_parity_units(struct ore_io_state *ios,
- struct ore_striping_info *si,
- unsigned dev, unsigned first_dev,
- unsigned mirrors_p1, unsigned devs_in_group,
- unsigned cur_len)
-{
- unsigned do_parity;
- int ret = 0;
-
- for (do_parity = ios->layout->parity; do_parity; --do_parity) {
- struct ore_per_dev_state *per_dev;
-
- per_dev = &ios->per_dev[dev - first_dev];
- if (!per_dev->length && !per_dev->offset) {
- /* Only/always the parity unit of the first
- * stripe will be empty. So this is a chance to
- * initialize the per_dev info.
- */
- per_dev->dev = dev;
- per_dev->offset = si->obj_offset - si->unit_off;
- }
-
- ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
- do_parity == 1);
- if (unlikely(ret))
- break;
-
- if (do_parity != 1) {
- dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
- si->cur_comp = (si->cur_comp + 1) %
- ios->layout->group_width;
- }
- }
-
- return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
- struct ore_striping_info *si = &ios->si;
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned group_width = ios->layout->group_width;
- unsigned devs_in_group = group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned cur_pg = ios->pages_consumed;
- u64 length = ios->length;
- int ret = 0;
-
- if (!ios->pages) {
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- BUG_ON(length > si->length);
-
- while (length) {
- struct ore_per_dev_state *per_dev =
- &ios->per_dev[dev - first_dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length && !per_dev->offset) {
- /* First time initialize the per_dev info. */
- per_dev->dev = dev;
- if (dev == si->dev) {
- WARN_ON(dev == si->par_dev);
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off && (page_off != ios->pgbase));
- } else {
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
- per_dev, cur_len);
- if (unlikely(ret))
- goto out;
-
- length -= cur_len;
-
- dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
- si->cur_comp = (si->cur_comp + 1) % group_width;
- if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
- if (!length && ios->sp2d) {
- /* If we are writing and this is the very last
- * stripe. then operate on parity dev.
- */
- dev = si->par_dev;
- /* If last stripe operate on parity comp */
- si->cur_comp = group_width - ios->layout->parity;
- }
-
- /* In writes cur_len just means if it's the
- * last one. See _ore_add_parity_unit.
- */
- ret = _add_parity_units(ios, si, dev, first_dev,
- mirrors_p1, devs_in_group,
- ios->sp2d ? length : cur_len);
- if (unlikely(ret))
- goto out;
-
- /* Rotate next par_dev backwards with wraping */
- si->par_dev = (devs_in_group + si->par_dev -
- ios->layout->parity * mirrors_p1) %
- devs_in_group + first_dev;
- /* Next stripe, start fresh */
- si->cur_comp = 0;
- si->cur_pg = 0;
- si->obj_offset += cur_len;
- si->unit_off = 0;
- }
- }
-out:
- ios->numdevs = devs_in_group;
- ios->pages_consumed = cur_pg;
- return ret;
-}
-
-int ore_create(struct ore_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->oc->numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, i));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_create_object(or, _ios_obj(ios, i));
- }
- ret = ore_io_execute(ios);
-
-out:
- return ret;
-}
-EXPORT_SYMBOL(ore_create);
-
-int ore_remove(struct ore_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->oc->numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, i));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_remove_object(or, _ios_obj(ios, i));
- }
- ret = ore_io_execute(ios);
-
-out:
- return ret;
-}
-EXPORT_SYMBOL(ore_remove);
-
-static int _write_mirror(struct ore_io_state *ios, int cur_comp)
-{
- struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret = 0;
-
- if (ios->pages && !master_dev->length)
- return 0; /* Just an empty slot */
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, dev));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- struct bio *bio;
-
- if (per_dev != master_dev) {
- bio = bio_clone_fast(master_dev->bio,
- GFP_KERNEL, NULL);
- if (unlikely(!bio)) {
- ORE_DBGMSG(
- "Failed to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto out;
- }
-
- bio->bi_disk = NULL;
- bio->bi_next = NULL;
- per_dev->offset = master_dev->offset;
- per_dev->length = master_dev->length;
- per_dev->bio = bio;
- per_dev->dev = dev;
- } else {
- bio = master_dev->bio;
- /* FIXME: bio_set_dir() */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- }
-
- osd_req_write(or, _ios_obj(ios, cur_comp),
- per_dev->offset, bio, per_dev->length);
- ORE_DBGMSG("write(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- _LLU(per_dev->offset),
- _LLU(per_dev->length), dev);
- } else if (ios->kern_buff) {
- per_dev->offset = ios->si.obj_offset;
- per_dev->dev = ios->si.dev + dev;
-
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (ios->si.unit_off + ios->length >
- ios->layout->stripe_unit));
-
- ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
- per_dev->offset,
- ios->kern_buff, ios->length);
- if (unlikely(ret))
- goto out;
- ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- _LLU(per_dev->offset),
- _LLU(ios->length), per_dev->dev);
- } else {
- osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
- ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- ios->out_attr_len, dev);
- }
-
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr,
- ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr,
- ios->in_attr_len);
- }
-
-out:
- return ret;
-}
-
-int ore_write(struct ore_io_state *ios)
-{
- int i;
- int ret;
-
- if (unlikely(ios->sp2d && !ios->r4w)) {
- /* A library is attempting a RAID-write without providing
- * a pages lock interface.
- */
- WARN_ON_ONCE(1);
- return -ENOTSUPP;
- }
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _write_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_write);
-
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
-{
- struct osd_request *or;
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
- unsigned first_dev = (unsigned)obj->id;
-
- if (ios->pages && !per_dev->length)
- return 0; /* Just an empty slot */
-
- first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
- or = osd_start_request(_ios_od(ios, first_dev));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- if (per_dev->cur_sg) {
- /* finalize the last sg_entry */
- _ore_add_sg_seg(per_dev, 0, false);
- if (unlikely(!per_dev->cur_sg))
- return 0; /* Skip parity only device */
-
- osd_req_read_sg(or, obj, per_dev->bio,
- per_dev->sglist, per_dev->cur_sg);
- } else {
- /* The no raid case */
- osd_req_read(or, obj, per_dev->offset,
- per_dev->bio, per_dev->length);
- }
-
- ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
- " dev=%d sg_len=%d\n", _LLU(obj->id),
- _LLU(per_dev->offset), _LLU(per_dev->length),
- first_dev, per_dev->cur_sg);
- } else {
- BUG_ON(ios->kern_buff);
-
- osd_req_get_attributes(or, obj);
- ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
- _LLU(obj->id),
- ios->in_attr_len, first_dev);
- }
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
-
- return 0;
-}
-
-int ore_read(struct ore_io_state *ios)
-{
- int i;
- int ret;
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _ore_read_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_read);
-
-int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
-{
- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
- void *iter = NULL;
- int nelem;
-
- do {
- nelem = 1;
- osd_req_decode_get_attr_list(ios->per_dev[0].or,
- &cur_attr, &nelem, &iter);
- if ((cur_attr.attr_page == attr->attr_page) &&
- (cur_attr.attr_id == attr->attr_id)) {
- attr->len = cur_attr.len;
- attr->val_ptr = cur_attr.val_ptr;
- return 0;
- }
- } while (iter);
-
- return -EIO;
-}
-EXPORT_SYMBOL(extract_attr_from_ios);
-
-static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
- struct osd_attr *attr)
-{
- int last_comp = cur_comp + ios->layout->mirrors_p1;
-
- for (; cur_comp < last_comp; ++cur_comp) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, cur_comp));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
- osd_req_add_set_attr_list(or, attr, 1);
- }
-
- return 0;
-}
-
-struct _trunc_info {
- struct ore_striping_info si;
- u64 prev_group_obj_off;
- u64 next_group_obj_off;
-
- unsigned first_group_dev;
- unsigned nex_group_dev;
-};
-
-static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
- struct _trunc_info *ti)
-{
- unsigned stripe_unit = layout->stripe_unit;
-
- ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
-
- ti->prev_group_obj_off = ti->si.M * stripe_unit;
- ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
-
- ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
- ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-}
-
-int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
- u64 size)
-{
- struct ore_io_state *ios;
- struct exofs_trunc_attr {
- struct osd_attr attr;
- __be64 newsize;
- } *size_attrs;
- struct _trunc_info ti;
- int i, ret;
-
- ret = ore_get_io_state(layout, oc, &ios);
- if (unlikely(ret))
- return ret;
-
- _calc_trunk_info(ios->layout, size, &ti);
-
- size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
- GFP_KERNEL);
- if (unlikely(!size_attrs)) {
- ret = -ENOMEM;
- goto out;
- }
-
- ios->numdevs = ios->oc->numdevs;
-
- for (i = 0; i < ios->numdevs; ++i) {
- struct exofs_trunc_attr *size_attr = &size_attrs[i];
- u64 obj_size;
-
- if (i < ti.first_group_dev)
- obj_size = ti.prev_group_obj_off;
- else if (i >= ti.nex_group_dev)
- obj_size = ti.next_group_obj_off;
- else if (i < ti.si.dev) /* dev within this group */
- obj_size = ti.si.obj_offset +
- ios->layout->stripe_unit - ti.si.unit_off;
- else if (i == ti.si.dev)
- obj_size = ti.si.obj_offset;
- else /* i > ti.dev */
- obj_size = ti.si.obj_offset - ti.si.unit_off;
-
- size_attr->newsize = cpu_to_be64(obj_size);
- size_attr->attr = g_attr_logical_length;
- size_attr->attr.val_ptr = &size_attr->newsize;
-
- ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
- _LLU(oc->comps->obj.id), _LLU(obj_size), i);
- ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
- &size_attr->attr);
- if (unlikely(ret))
- goto out;
- }
- ret = ore_io_execute(ios);
-
-out:
- kfree(size_attrs);
- ore_put_io_state(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_truncate);
-
-const struct osd_attr g_attr_logical_length = ATTR_DEF(
- OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-EXPORT_SYMBOL(g_attr_logical_length);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
deleted file mode 100644
index 199590f36203..000000000000
--- a/fs/exofs/ore_raid.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/*
- * Copyright (C) 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- * "Free Software Foundation <info@fsf.org>"
- */
-
-#include <linux/gfp.h>
-#include <linux/async_tx.h>
-
-#include "ore_raid.h"
-
-#undef ORE_DBGMSG2
-#define ORE_DBGMSG2 ORE_DBGMSG
-
-static struct page *_raid_page_alloc(void)
-{
- return alloc_page(GFP_KERNEL);
-}
-
-static void _raid_page_free(struct page *p)
-{
- __free_page(p);
-}
-
-/* This struct is forward declare in ore_io_state, but is private to here.
- * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
- *
- * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
- * Ascending page index access is sp2d(p-minor, c-major). But storage is
- * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
- * API.
- */
-struct __stripe_pages_2d {
- /* Cache some hot path repeated calculations */
- unsigned parity;
- unsigned data_devs;
- unsigned pages_in_unit;
-
- bool needed ;
-
- /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
- struct __1_page_stripe {
- bool alloc;
- unsigned write_count;
- struct async_submit_ctl submit;
- struct dma_async_tx_descriptor *tx;
-
- /* The size of this array is data_devs + parity */
- struct page **pages;
- struct page **scribble;
- /* bool array, size of this array is data_devs */
- char *page_is_read;
- } _1p_stripes[];
-};
-
-/* This can get bigger then a page. So support multiple page allocations
- * _sp2d_free should be called even if _sp2d_alloc fails (by returning
- * none-zero).
- */
-static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
- unsigned parity, struct __stripe_pages_2d **psp2d)
-{
- struct __stripe_pages_2d *sp2d;
- unsigned data_devs = group_width - parity;
-
- /*
- * Desired allocation layout is, though when larger than PAGE_SIZE,
- * each struct __alloc_1p_arrays is separately allocated:
-
- struct _alloc_all_bytes {
- struct __alloc_stripe_pages_2d {
- struct __stripe_pages_2d sp2d;
- struct __1_page_stripe _1p_stripes[pages_in_unit];
- } __asp2d;
- struct __alloc_1p_arrays {
- struct page *pages[group_width];
- struct page *scribble[group_width];
- char page_is_read[data_devs];
- } __a1pa[pages_in_unit];
- } *_aab;
-
- struct __alloc_1p_arrays *__a1pa;
- struct __alloc_1p_arrays *__a1pa_end;
-
- */
-
- char *__a1pa;
- char *__a1pa_end;
-
- const size_t sizeof_stripe_pages_2d =
- sizeof(struct __stripe_pages_2d) +
- sizeof(struct __1_page_stripe) * pages_in_unit;
- const size_t sizeof__a1pa =
- ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs,
- sizeof(void *));
- const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit;
- const size_t alloc_total = sizeof_stripe_pages_2d +
- sizeof__a1pa_arrays;
-
- unsigned num_a1pa, alloc_size, i;
-
- /* FIXME: check these numbers in ore_verify_layout */
- BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE);
- BUG_ON(sizeof__a1pa > PAGE_SIZE);
-
- /*
- * If alloc_total would be larger than PAGE_SIZE, only allocate
- * as many a1pa items as would fill the rest of the page, instead
- * of the full pages_in_unit count.
- */
- if (alloc_total > PAGE_SIZE) {
- num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa;
- alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa;
- } else {
- num_a1pa = pages_in_unit;
- alloc_size = alloc_total;
- }
-
- *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!sp2d)) {
- ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
- return -ENOMEM;
- }
- /* From here Just call _sp2d_free */
-
- /* Find start of a1pa area. */
- __a1pa = (char *)sp2d + sizeof_stripe_pages_2d;
- /* Find end of the _allocated_ a1pa area. */
- __a1pa_end = __a1pa + alloc_size;
-
- /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */
- for (i = 0; i < pages_in_unit; ++i) {
- struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i];
-
- if (unlikely(__a1pa >= __a1pa_end)) {
- num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
- pages_in_unit - i);
- alloc_size = sizeof__a1pa * num_a1pa;
-
- __a1pa = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!__a1pa)) {
- ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
- num_a1pa);
- return -ENOMEM;
- }
- __a1pa_end = __a1pa + alloc_size;
- /* First *pages is marked for kfree of the buffer */
- stripe->alloc = true;
- }
-
- /*
- * Attach all _lp_stripes pointers to the allocation for
- * it which was either part of the original PAGE_SIZE
- * allocation or the subsequent allocation in this loop.
- */
- stripe->pages = (void *)__a1pa;
- stripe->scribble = stripe->pages + group_width;
- stripe->page_is_read = (char *)stripe->scribble + group_width;
- __a1pa += sizeof__a1pa;
- }
-
- sp2d->parity = parity;
- sp2d->data_devs = data_devs;
- sp2d->pages_in_unit = pages_in_unit;
- return 0;
-}
-
-static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
- const struct _ore_r4w_op *r4w, void *priv)
-{
- unsigned data_devs = sp2d->data_devs;
- unsigned group_width = data_devs + sp2d->parity;
- int p, c;
-
- if (!sp2d->needed)
- return;
-
- for (c = data_devs - 1; c >= 0; --c)
- for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->page_is_read[c]) {
- struct page *page = _1ps->pages[c];
-
- r4w->put_page(priv, page);
- _1ps->page_is_read[c] = false;
- }
- }
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
- _1ps->write_count = 0;
- _1ps->tx = NULL;
- }
-
- sp2d->needed = false;
-}
-
-static void _sp2d_free(struct __stripe_pages_2d *sp2d)
-{
- unsigned i;
-
- if (!sp2d)
- return;
-
- for (i = 0; i < sp2d->pages_in_unit; ++i) {
- if (sp2d->_1p_stripes[i].alloc)
- kfree(sp2d->_1p_stripes[i].pages);
- }
-
- kfree(sp2d);
-}
-
-static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
-{
- unsigned p;
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count)
- return p;
- }
-
- return ~0;
-}
-
-static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
-{
- int p;
-
- for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count)
- return p;
- }
-
- return ~0;
-}
-
-static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
-{
- unsigned p;
- unsigned tx_flags = ASYNC_TX_ACK;
-
- if (sp2d->parity == 1)
- tx_flags |= ASYNC_TX_XOR_ZERO_DST;
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (!_1ps->write_count)
- continue;
-
- init_async_submit(&_1ps->submit, tx_flags,
- NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
-
- if (sp2d->parity == 1)
- _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
- _1ps->pages, 0, sp2d->data_devs,
- PAGE_SIZE, &_1ps->submit);
- else /* parity == 2 */
- _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
- sp2d->data_devs + sp2d->parity,
- PAGE_SIZE, &_1ps->submit);
- }
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- /* NOTE: We wait for HW synchronously (I don't have such HW
- * to test with.) Is parallelism needed with today's multi
- * cores?
- */
- async_tx_issue_pending(_1ps->tx);
- }
-}
-
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page)
-{
- struct __1_page_stripe *_1ps;
-
- sp2d->needed = true;
-
- _1ps = &sp2d->_1p_stripes[si->cur_pg];
- _1ps->pages[si->cur_comp] = page;
- ++_1ps->write_count;
-
- si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
- /* si->cur_comp is advanced outside at main loop */
-}
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool not_last)
-{
- struct osd_sg_entry *sge;
-
- ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
- "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
- per_dev->dev, cur_len, not_last, per_dev->cur_sg,
- _LLU(per_dev->offset), per_dev->length,
- per_dev->last_sgs_total);
-
- if (!per_dev->cur_sg) {
- sge = per_dev->sglist;
-
- /* First time we prepare two entries */
- if (per_dev->length) {
- ++per_dev->cur_sg;
- sge->offset = per_dev->offset;
- sge->len = per_dev->length;
- } else {
- /* Here the parity is the first unit of this object.
- * This happens every time we reach a parity device on
- * the same stripe as the per_dev->offset. We need to
- * just skip this unit.
- */
- per_dev->offset += cur_len;
- return;
- }
- } else {
- /* finalize the last one */
- sge = &per_dev->sglist[per_dev->cur_sg - 1];
- sge->len = per_dev->length - per_dev->last_sgs_total;
- }
-
- if (not_last) {
- /* Partly prepare the next one */
- struct osd_sg_entry *next_sge = sge + 1;
-
- ++per_dev->cur_sg;
- next_sge->offset = sge->offset + sge->len + cur_len;
- /* Save cur len so we know how mutch was added next time */
- per_dev->last_sgs_total = per_dev->length;
- next_sge->len = 0;
- } else if (!sge->len) {
- /* Optimize for when the last unit is a parity */
- --per_dev->cur_sg;
- }
-}
-
-static int _alloc_read_4_write(struct ore_io_state *ios)
-{
- struct ore_layout *layout = ios->layout;
- int ret;
- /* We want to only read those pages not in cache so worst case
- * is a stripe populated with every other page
- */
- unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
-
- ret = _ore_get_io_state(layout, ios->oc,
- layout->group_width * layout->mirrors_p1,
- sgs_per_dev, 0, &ios->ios_read_4_write);
- return ret;
-}
-
-/* @si contains info of the to-be-inserted page. Update of @si should be
- * maintained by caller. Specificaly si->dev, si->obj_offset, ...
- */
-static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
- struct page *page, unsigned pg_len)
-{
- struct request_queue *q;
- struct ore_per_dev_state *per_dev;
- struct ore_io_state *read_ios;
- unsigned first_dev = si->dev - (si->dev %
- (ios->layout->group_width * ios->layout->mirrors_p1));
- unsigned comp = si->dev - first_dev;
- unsigned added_len;
-
- if (!ios->ios_read_4_write) {
- int ret = _alloc_read_4_write(ios);
-
- if (unlikely(ret))
- return ret;
- }
-
- read_ios = ios->ios_read_4_write;
- read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
-
- per_dev = &read_ios->per_dev[comp];
- if (!per_dev->length) {
- per_dev->bio = bio_kmalloc(GFP_KERNEL,
- ios->sp2d->pages_in_unit);
- if (unlikely(!per_dev->bio)) {
- ORE_DBGMSG("Failed to allocate BIO size=%u\n",
- ios->sp2d->pages_in_unit);
- return -ENOMEM;
- }
- per_dev->offset = si->obj_offset;
- per_dev->dev = si->dev;
- } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
- u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
-
- _ore_add_sg_seg(per_dev, gap, true);
- }
- q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
- added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
- si->obj_offset % PAGE_SIZE);
- if (unlikely(added_len != pg_len)) {
- ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
- per_dev->bio->bi_vcnt);
- return -ENOMEM;
- }
-
- per_dev->length += pg_len;
- return 0;
-}
-
-/* read the beginning of an unaligned first page */
-static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
-{
- struct ore_striping_info si;
- unsigned pg_len;
-
- ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
-
- pg_len = si.obj_offset % PAGE_SIZE;
- si.obj_offset -= pg_len;
-
- ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
- _LLU(si.obj_offset), pg_len, page->index, si.dev);
-
- return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-/* read the end of an incomplete last page */
-static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
-{
- struct ore_striping_info si;
- struct page *page;
- unsigned pg_len, p, c;
-
- ore_calc_stripe_info(ios->layout, *offset, 0, &si);
-
- p = si.cur_pg;
- c = si.cur_comp;
- page = ios->sp2d->_1p_stripes[p].pages[c];
-
- pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
- *offset += pg_len;
-
- ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
- p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
-
- BUG_ON(!page);
-
- return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
-{
- struct bio_vec *bv;
- unsigned i, d;
-
- /* loop on all devices all pages */
- for (d = 0; d < ios->numdevs; d++) {
- struct bio *bio = ios->per_dev[d].bio;
-
- if (!bio)
- continue;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- }
- }
-}
-
-/* read_4_write is hacked to read the start of the first stripe and/or
- * the end of the last stripe. If needed, with an sg-gap at each device/page.
- * It is assumed to be called after the to_be_written pages of the first stripe
- * are populating ios->sp2d[][]
- *
- * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
- * These pages are held at sp2d[p].pages[c] but with
- * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
- * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
- * @uptodate=true, so we don't need to read it, only unlock, after IO.
- *
- * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
- * to-be-written count, we should consider the xor-in-place mode.
- * need_to_read_pages_count is the actual number of pages not present in cache.
- * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
- * approximation? In this mode the read pages are put in the empty places of
- * ios->sp2d[p][*], xor is calculated the same way. These pages are
- * allocated/freed and don't go through cache
- */
-static int _read_4_write_first_stripe(struct ore_io_state *ios)
-{
- struct ore_striping_info read_si;
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- u64 offset = ios->si.first_stripe_start;
- unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
- if (offset == ios->offset) /* Go to start collect $200 */
- goto read_last_stripe;
-
- min_p = _sp2d_min_pg(sp2d);
- max_p = _sp2d_max_pg(sp2d);
-
- ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
- offset, ios->offset, min_p, max_p);
-
- for (c = 0; ; c++) {
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- read_si.obj_offset += min_p * PAGE_SIZE;
- offset += min_p * PAGE_SIZE;
- for (p = min_p; p <= max_p; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- struct page **pp = &_1ps->pages[c];
- bool uptodate;
-
- if (*pp) {
- if (ios->offset % PAGE_SIZE)
- /* Read the remainder of the page */
- _add_to_r4w_first_page(ios, *pp);
- /* to-be-written pages start here */
- goto read_last_stripe;
- }
-
- *pp = ios->r4w->get_page(ios->private, offset,
- &uptodate);
- if (unlikely(!*pp))
- return -ENOMEM;
-
- if (!uptodate)
- _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
-
- /* Mark read-pages to be cache_released */
- _1ps->page_is_read[c] = true;
- read_si.obj_offset += PAGE_SIZE;
- offset += PAGE_SIZE;
- }
- offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
- }
-
-read_last_stripe:
- return 0;
-}
-
-static int _read_4_write_last_stripe(struct ore_io_state *ios)
-{
- struct ore_striping_info read_si;
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- u64 offset;
- u64 last_stripe_end;
- unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
- unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
- offset = ios->offset + ios->length;
- if (offset % PAGE_SIZE)
- _add_to_r4w_last_page(ios, &offset);
- /* offset will be aligned to next page */
-
- last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
- * bytes_in_stripe;
- if (offset == last_stripe_end) /* Optimize for the aligned case */
- goto read_it;
-
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- p = read_si.cur_pg;
- c = read_si.cur_comp;
-
- if (min_p == sp2d->pages_in_unit) {
- /* Didn't do it yet */
- min_p = _sp2d_min_pg(sp2d);
- max_p = _sp2d_max_pg(sp2d);
- }
-
- ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
- offset, last_stripe_end, min_p, max_p);
-
- while (offset < last_stripe_end) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if ((min_p <= p) && (p <= max_p)) {
- struct page *page;
- bool uptodate;
-
- BUG_ON(_1ps->pages[c]);
- page = ios->r4w->get_page(ios->private, offset,
- &uptodate);
- if (unlikely(!page))
- return -ENOMEM;
-
- _1ps->pages[c] = page;
- /* Mark read-pages to be cache_released */
- _1ps->page_is_read[c] = true;
- if (!uptodate)
- _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
- }
-
- offset += PAGE_SIZE;
- if (p == (sp2d->pages_in_unit - 1)) {
- ++c;
- p = 0;
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- } else {
- read_si.obj_offset += PAGE_SIZE;
- ++p;
- }
- }
-
-read_it:
- return 0;
-}
-
-static int _read_4_write_execute(struct ore_io_state *ios)
-{
- struct ore_io_state *ios_read;
- unsigned i;
- int ret;
-
- ios_read = ios->ios_read_4_write;
- if (!ios_read)
- return 0;
-
- /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
- * to check for per_dev->bio
- */
- ios_read->pages = ios->pages;
-
- /* Now read these devices */
- for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
- ret = _ore_read_mirror(ios_read, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios_read); /* Synchronus execution */
- if (unlikely(ret)) {
- ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
- return ret;
- }
-
- _mark_read4write_pages_uptodate(ios_read, ret);
- ore_put_io_state(ios_read);
- ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
- return 0;
-}
-
-/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
-int _ore_add_parity_unit(struct ore_io_state *ios,
- struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev,
- unsigned cur_len, bool do_xor)
-{
- if (ios->reading) {
- if (per_dev->cur_sg >= ios->sgs_per_dev) {
- ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
- per_dev->cur_sg, ios->sgs_per_dev);
- return -ENOMEM;
- }
- _ore_add_sg_seg(per_dev, cur_len, true);
- } else {
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- struct page **pages = ios->parity_pages + ios->cur_par_page;
- unsigned num_pages;
- unsigned array_start = 0;
- unsigned i;
- int ret;
-
- si->cur_pg = _sp2d_min_pg(sp2d);
- num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
-
- if (!per_dev->length) {
- per_dev->offset += si->cur_pg * PAGE_SIZE;
- /* If first stripe, Read in all read4write pages
- * (if needed) before we calculate the first parity.
- */
- if (do_xor)
- _read_4_write_first_stripe(ios);
- }
- if (!cur_len && do_xor)
- /* If last stripe r4w pages of last stripe */
- _read_4_write_last_stripe(ios);
- _read_4_write_execute(ios);
-
- for (i = 0; i < num_pages; i++) {
- pages[i] = _raid_page_alloc();
- if (unlikely(!pages[i]))
- return -ENOMEM;
-
- ++(ios->cur_par_page);
- }
-
- BUG_ON(si->cur_comp < sp2d->data_devs);
- BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
-
- ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
- per_dev, num_pages * PAGE_SIZE);
- if (unlikely(ret))
- return ret;
-
- if (do_xor) {
- _gen_xor_unit(sp2d);
- _sp2d_reset(sp2d, ios->r4w, ios->private);
- }
- }
- return 0;
-}
-
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
-{
- if (ios->parity_pages) {
- struct ore_layout *layout = ios->layout;
- unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-
- if (_sp2d_alloc(pages_in_unit, layout->group_width,
- layout->parity, &ios->sp2d)) {
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-void _ore_free_raid_stuff(struct ore_io_state *ios)
-{
- if (ios->sp2d) { /* writing and raid */
- unsigned i;
-
- for (i = 0; i < ios->cur_par_page; i++) {
- struct page *page = ios->parity_pages[i];
-
- if (page)
- _raid_page_free(page);
- }
- if (ios->extra_part_alloc)
- kfree(ios->parity_pages);
- /* If IO returned an error pages might need unlocking */
- _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
- _sp2d_free(ios->sp2d);
- } else {
- /* Will only be set if raid reading && sglist is big */
- if (ios->extra_part_alloc)
- kfree(ios->per_dev[0].sglist);
- }
- if (ios->ios_read_4_write)
- ore_put_io_state(ios->ios_read_4_write);
-}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
deleted file mode 100644
index a6e746775570..000000000000
--- a/fs/exofs/ore_raid.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) from 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- * "Free Software Foundation <info@fsf.org>"
- */
-
-#include <scsi/osd_ore.h>
-
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
-
-/* ios_raid.c stuff needed by ios.c */
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
-void _ore_free_raid_stuff(struct ore_io_state *ios);
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool not_last);
-int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool do_xor);
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page);
-static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page)
-{
- if (!sp2d) /* Inline the fast path */
- return; /* Hay no raid stuff */
- _ore_add_stripe_page(sp2d, si, page);
-}
-
-/* ios.c stuff needed by ios_raid.c */
-int _ore_get_io_state(struct ore_layout *layout,
- struct ore_components *oc, unsigned numdevs,
- unsigned sgs_per_dev, unsigned num_par_pages,
- struct ore_io_state **pios);
-int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct page **pages,
- struct ore_per_dev_state *per_dev, int cur_len);
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
-int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
deleted file mode 100644
index fc80c7233fa5..000000000000
--- a/fs/exofs/super.c
+++ /dev/null
@@ -1,1071 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/string.h>
-#include <linux/parser.h>
-#include <linux/vfs.h>
-#include <linux/random.h>
-#include <linux/module.h>
-#include <linux/exportfs.h>
-#include <linux/slab.h>
-#include <linux/iversion.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-/******************************************************************************
- * MOUNT OPTIONS
- *****************************************************************************/
-
-/*
- * struct to hold what we get from mount options
- */
-struct exofs_mountopt {
- bool is_osdname;
- const char *dev_name;
- uint64_t pid;
- int timeout;
-};
-
-/*
- * exofs-specific mount-time options.
- */
-enum { Opt_name, Opt_pid, Opt_to, Opt_err };
-
-/*
- * Our mount-time options. These should ideally be 64-bit unsigned, but the
- * kernel's parsing functions do not currently support that. 32-bit should be
- * sufficient for most applications now.
- */
-static match_table_t tokens = {
- {Opt_name, "osdname=%s"},
- {Opt_pid, "pid=%u"},
- {Opt_to, "to=%u"},
- {Opt_err, NULL}
-};
-
-/*
- * The main option parsing method. Also makes sure that all of the mandatory
- * mount options were set.
- */
-static int parse_options(char *options, struct exofs_mountopt *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- bool s_pid = false;
-
- EXOFS_DBGMSG("parse_options %s\n", options);
- /* defaults */
- memset(opts, 0, sizeof(*opts));
- opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- char str[32];
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_name:
- kfree(opts->dev_name);
- opts->dev_name = match_strdup(&args[0]);
- if (unlikely(!opts->dev_name)) {
- EXOFS_ERR("Error allocating dev_name");
- return -ENOMEM;
- }
- opts->is_osdname = true;
- break;
- case Opt_pid:
- if (0 == match_strlcpy(str, &args[0], sizeof(str)))
- return -EINVAL;
- opts->pid = simple_strtoull(str, NULL, 0);
- if (opts->pid < EXOFS_MIN_PID) {
- EXOFS_ERR("Partition ID must be >= %u",
- EXOFS_MIN_PID);
- return -EINVAL;
- }
- s_pid = true;
- break;
- case Opt_to:
- if (match_int(&args[0], &option))
- return -EINVAL;
- if (option <= 0) {
- EXOFS_ERR("Timeout must be > 0");
- return -EINVAL;
- }
- opts->timeout = option * HZ;
- break;
- }
- }
-
- if (!s_pid) {
- EXOFS_ERR("Need to specify the following options:\n");
- EXOFS_ERR(" -o pid=pid_no_to_use\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-/******************************************************************************
- * INODE CACHE
- *****************************************************************************/
-
-/*
- * Our inode cache. Isn't it pretty?
- */
-static struct kmem_cache *exofs_inode_cachep;
-
-/*
- * Allocate an inode in the cache
- */
-static struct inode *exofs_alloc_inode(struct super_block *sb)
-{
- struct exofs_i_info *oi;
-
- oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
- if (!oi)
- return NULL;
-
- inode_set_iversion(&oi->vfs_inode, 1);
- return &oi->vfs_inode;
-}
-
-static void exofs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
-}
-
-/*
- * Remove an inode from the cache
- */
-static void exofs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, exofs_i_callback);
-}
-
-/*
- * Initialize the inode
- */
-static void exofs_init_once(void *foo)
-{
- struct exofs_i_info *oi = foo;
-
- inode_init_once(&oi->vfs_inode);
-}
-
-/*
- * Create and initialize the inode cache
- */
-static int init_inodecache(void)
-{
- exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache",
- sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
- SLAB_ACCOUNT,
- offsetof(struct exofs_i_info, i_data),
- sizeof_field(struct exofs_i_info, i_data),
- exofs_init_once);
- if (exofs_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Destroy the inode cache
- */
-static void destroy_inodecache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(exofs_inode_cachep);
-}
-
-/******************************************************************************
- * Some osd helpers
- *****************************************************************************/
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
- u64 offset, void *p, unsigned length)
-{
- struct osd_request *or = osd_start_request(od);
-/* struct osd_sense_info osi = {.key = 0};*/
- int ret;
-
- if (unlikely(!or)) {
- EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
- return -ENOMEM;
- }
- ret = osd_req_read_kern(or, obj, offset, p, length);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
- goto out;
- }
-
- ret = osd_finalize_request(or, 0, cred, NULL);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
- goto out;
- }
-
- ret = osd_execute_request(or);
- if (unlikely(ret))
- EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
- /* osd_req_decode_sense(or, ret); */
-
-out:
- osd_end_request(or);
- EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%p ret=>%d\n",
- _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
- return ret;
-}
-
-static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
- EXOFS_APAGE_SB_DATA,
- EXOFS_ATTR_SB_STATS,
- sizeof(struct exofs_sb_stats));
-
-static int __sbi_read_stats(struct exofs_sb_info *sbi)
-{
- struct osd_attr attrs[] = {
- [0] = g_attr_sb_stats,
- };
- struct ore_io_state *ios;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("Error reading super_block stats => %d\n", ret);
- goto out;
- }
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
- goto out;
- }
- if (attrs[0].len) {
- struct exofs_sb_stats *ess;
-
- if (unlikely(attrs[0].len != sizeof(*ess))) {
- EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
- "size(%d) != expected(%zd)\n",
- __func__, attrs[0].len, sizeof(*ess));
- goto out;
- }
-
- ess = attrs[0].val_ptr;
- sbi->s_nextid = le64_to_cpu(ess->s_nextid);
- sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
- }
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static void stats_done(struct ore_io_state *ios, void *p)
-{
- ore_put_io_state(ios);
- /* Good thanks nothing to do anymore */
-}
-
-/* Asynchronously write the stats attribute */
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
-{
- struct osd_attr attrs[] = {
- [0] = g_attr_sb_stats,
- };
- struct ore_io_state *ios;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
- sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
- attrs[0].val_ptr = &sbi->s_ess;
-
-
- ios->done = stats_done;
- ios->private = sbi;
- ios->out_attr = attrs;
- ios->out_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_write failed.\n", __func__);
- ore_put_io_state(ios);
- }
-
- return ret;
-}
-
-/******************************************************************************
- * SUPERBLOCK FUNCTIONS
- *****************************************************************************/
-static const struct super_operations exofs_sops;
-static const struct export_operations exofs_export_ops;
-
-/*
- * Write the superblock to the OSD
- */
-static int exofs_sync_fs(struct super_block *sb, int wait)
-{
- struct exofs_sb_info *sbi;
- struct exofs_fscb *fscb;
- struct ore_comp one_comp;
- struct ore_components oc;
- struct ore_io_state *ios;
- int ret = -ENOMEM;
-
- fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
- if (unlikely(!fscb))
- return -ENOMEM;
-
- sbi = sb->s_fs_info;
-
- /* NOTE: We no longer dirty the super_block anywhere in exofs. The
- * reason we write the fscb here on unmount is so we can stay backwards
- * compatible with fscb->s_version == 1. (What we are not compatible
- * with is if a new version FS crashed and then we try to mount an old
- * version). Otherwise the exofs_fscb is read-only from mkfs time. All
- * the writeable info is set in exofs_sbi_write_stats() above.
- */
-
- exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
-
- ret = ore_get_io_state(&sbi->layout, &oc, &ios);
- if (unlikely(ret))
- goto out;
-
- ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
- memset(fscb, 0, ios->length);
- fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
- fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
- fscb->s_magic = cpu_to_le16(sb->s_magic);
- fscb->s_newfs = 0;
- fscb->s_version = EXOFS_FSCB_VER;
-
- ios->offset = 0;
- ios->kern_buff = fscb;
-
- ret = ore_write(ios);
- if (unlikely(ret))
- EXOFS_ERR("%s: ore_write failed.\n", __func__);
-
-out:
- EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
- ore_put_io_state(ios);
- kfree(fscb);
- return ret;
-}
-
-static void _exofs_print_device(const char *msg, const char *dev_path,
- struct osd_dev *od, u64 pid)
-{
- const struct osd_dev_info *odi = osduld_device_info(od);
-
- printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
- msg, dev_path ?: "", odi->osdname, _LLU(pid));
-}
-
-static void exofs_free_sbi(struct exofs_sb_info *sbi)
-{
- unsigned numdevs = sbi->oc.numdevs;
-
- while (numdevs) {
- unsigned i = --numdevs;
- struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
-
- if (od) {
- ore_comp_set_dev(&sbi->oc, i, NULL);
- osduld_put_device(od);
- }
- }
- kfree(sbi->oc.ods);
- kfree(sbi);
-}
-
-/*
- * This function is called when the vfs is freeing the superblock. We just
- * need to free our own part.
- */
-static void exofs_put_super(struct super_block *sb)
-{
- int num_pend;
- struct exofs_sb_info *sbi = sb->s_fs_info;
-
- /* make sure there are no pending commands */
- for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
- num_pend = atomic_read(&sbi->s_curr_pending)) {
- wait_queue_head_t wq;
-
- printk(KERN_NOTICE "%s: !!Pending operations in flight. "
- "This is a BUG. please report to osd-dev@open-osd.org\n",
- __func__);
- init_waitqueue_head(&wq);
- wait_event_timeout(wq,
- (atomic_read(&sbi->s_curr_pending) == 0),
- msecs_to_jiffies(100));
- }
-
- _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
- sbi->one_comp.obj.partition);
-
- exofs_sysfs_sb_del(sbi);
- exofs_free_sbi(sbi);
- sb->s_fs_info = NULL;
-}
-
-static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
- struct exofs_device_table *dt)
-{
- int ret;
-
- sbi->layout.stripe_unit =
- le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->layout.group_width =
- le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->layout.group_depth =
- le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->layout.mirrors_p1 =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
- sbi->layout.raid_algorithm =
- le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-
- ret = ore_verify_layout(numdevs, &sbi->layout);
-
- EXOFS_DBGMSG("exofs: layout: "
- "num_comps=%u stripe_unit=0x%x group_width=%u "
- "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
- numdevs,
- sbi->layout.stripe_unit,
- sbi->layout.group_width,
- _LLU(sbi->layout.group_depth),
- sbi->layout.mirrors_p1,
- sbi->layout.raid_algorithm);
- return ret;
-}
-
-static unsigned __ra_pages(struct ore_layout *layout)
-{
- const unsigned _MIN_RA = 32; /* min 128K read-ahead */
- unsigned ra_pages = layout->group_width * layout->stripe_unit /
- PAGE_SIZE;
- unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
-
- ra_pages *= 2; /* two stripes */
- if (ra_pages < _MIN_RA)
- ra_pages = roundup(_MIN_RA, ra_pages / 2);
-
- if (ra_pages > max_io_pages)
- ra_pages = max_io_pages;
-
- return ra_pages;
-}
-
-/* @odi is valid only as long as @fscb_dev is valid */
-static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
- struct osd_dev_info *odi)
-{
- odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
- if (likely(odi->systemid_len))
- memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
-
- odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
- odi->osdname = dt_dev->osdname;
-
- /* FIXME support long names. Will need a _put function */
- if (dt_dev->long_name_offset)
- return -EINVAL;
-
- /* Make sure osdname is printable!
- * mkexofs should give us space for a null-terminator else the
- * device-table is invalid.
- */
- if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
- odi->osdname_len = sizeof(dt_dev->osdname) - 1;
- dt_dev->osdname[odi->osdname_len] = 0;
-
- /* If it's all zeros something is bad we read past end-of-obj */
- return !(odi->systemid_len || odi->osdname_len);
-}
-
-static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
- struct exofs_dev **peds)
-{
- /* Twice bigger table: See exofs_init_comps() and comment at
- * exofs_read_lookup_dev_table()
- */
- const size_t numores = numdevs * 2 - 1;
- struct exofs_dev *eds;
- unsigned i;
-
- sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) +
- numdevs * sizeof(struct exofs_dev), GFP_KERNEL);
- if (unlikely(!sbi->oc.ods)) {
- EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
- numdevs);
- return -ENOMEM;
- }
-
- /* Start of allocated struct exofs_dev entries */
- *peds = eds = (void *)sbi->oc.ods[numores];
- /* Initialize pointers into struct exofs_dev */
- for (i = 0; i < numdevs; ++i)
- sbi->oc.ods[i] = &eds[i].ored;
- return 0;
-}
-
-static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
- struct osd_dev *fscb_od,
- unsigned table_count)
-{
- struct ore_comp comp;
- struct exofs_device_table *dt;
- struct exofs_dev *eds;
- unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
- sizeof(*dt);
- unsigned numdevs, i;
- int ret;
-
- dt = kmalloc(table_bytes, GFP_KERNEL);
- if (unlikely(!dt)) {
- EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
- table_bytes);
- return -ENOMEM;
- }
-
- sbi->oc.numdevs = 0;
-
- comp.obj.partition = sbi->one_comp.obj.partition;
- comp.obj.id = EXOFS_DEVTABLE_ID;
- exofs_make_credential(comp.cred, &comp.obj);
-
- ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
- table_bytes);
- if (unlikely(ret)) {
- EXOFS_ERR("ERROR: reading device table\n");
- goto out;
- }
-
- numdevs = le64_to_cpu(dt->dt_num_devices);
- if (unlikely(!numdevs)) {
- ret = -EINVAL;
- goto out;
- }
- WARN_ON(table_count != numdevs);
-
- ret = _read_and_match_data_map(sbi, numdevs, dt);
- if (unlikely(ret))
- goto out;
-
- ret = __alloc_dev_table(sbi, numdevs, &eds);
- if (unlikely(ret))
- goto out;
- /* exofs round-robins the device table view according to inode
- * number. We hold a: twice bigger table hence inodes can point
- * to any device and have a sequential view of the table
- * starting at this device. See exofs_init_comps()
- */
- memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
- (numdevs - 1) * sizeof(sbi->oc.ods[0]));
-
- /* create sysfs subdir under which we put the device table
- * And cluster layout. A Superblock is identified by the string:
- * "dev[0].osdname"_"pid"
- */
- exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
-
- for (i = 0; i < numdevs; i++) {
- struct exofs_fscb fscb;
- struct osd_dev_info odi;
- struct osd_dev *od;
-
- if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
- EXOFS_ERR("E