aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorTrond Myklebust <Trond.Myklebust@netapp.com>2011-11-02 23:56:40 -0400
committerTrond Myklebust <Trond.Myklebust@netapp.com>2011-11-02 23:56:40 -0400
commit31cbecb4ab538f433145bc5a46f3bea9b9627031 (patch)
treed6206d42dea7298f7ef05fd1f7bf474245f0d43a /fs
parent2b72c9ccd22c4a3299e5a358dcd639fb253730f4 (diff)
parent278c023a99b0d6b471d0f4a79835c703482e29ac (diff)
downloadlinux-stericsson-31cbecb4ab538f433145bc5a46f3bea9b9627031.tar.gz
Merge branch 'osd-devel' into nfs-for-next
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c33
-rw-r--r--fs/9p/vfs_dir.c14
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/aio.c4
-rw-r--r--fs/attr.c5
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/xattr.c50
-rw-r--r--fs/buffer.c9
-rw-r--r--fs/ceph/addr.c193
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/inode.c46
-rw-r--r--fs/ceph/ioctl.c34
-rw-r--r--fs/ceph/ioctl.h55
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/super.c61
-rw-r--r--fs/ceph/super.h19
-rw-r--r--fs/cifs/README14
-rw-r--r--fs/cifs/cifs_debug.c9
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifsacl.c347
-rw-r--r--fs/cifs/cifsencrypt.c105
-rw-r--r--fs/cifs/cifsfs.c26
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h55
-rw-r--r--fs/cifs/cifspdu.h48
-rw-r--r--fs/cifs/cifsproto.h48
-rw-r--r--fs/cifs/cifssmb.c454
-rw-r--r--fs/cifs/connect.c699
-rw-r--r--fs/cifs/dir.c22
-rw-r--r--fs/cifs/export.c4
-rw-r--r--fs/cifs/file.c1126
-rw-r--r--fs/cifs/inode.c54
-rw-r--r--fs/cifs/link.c17
-rw-r--r--fs/cifs/misc.c66
-rw-r--r--fs/cifs/sess.c4
-rw-r--r--fs/cifs/smbencrypt.c121
-rw-r--r--fs/cifs/transport.c19
-rw-r--r--fs/cifs/xattr.c42
-rw-r--r--fs/coda/coda_linux.h5
-rw-r--r--fs/compat.c15
-rw-r--r--fs/configfs/inode.c3
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/direct-io.c646
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/eventpoll.c27
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exofs/Kbuild3
-rw-r--r--fs/exofs/Kconfig9
-rw-r--r--fs/exofs/exofs.h26
-rw-r--r--fs/exofs/inode.c233
-rw-r--r--fs/exofs/ore.c656
-rw-r--r--fs/exofs/ore_raid.c660
-rw-r--r--fs/exofs/ore_raid.h79
-rw-r--r--fs/exofs/super.c205
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/xattr_security.c34
-rw-r--r--fs/ext3/xattr_security.c36
-rw-r--r--fs/ext4/ext4.h44
-rw-r--r--fs/ext4/file.c47
-rw-r--r--fs/ext4/inode.c6
-rw-r--r--fs/ext4/xattr_security.c36
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/fat.h9
-rw-r--r--fs/gfs2/acl.c5
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c199
-rw-r--r--fs/gfs2/dir.c50
-rw-r--r--fs/gfs2/file.c299
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c89
-rw-r--r--fs/gfs2/glops.h2
-rw-r--r--fs/gfs2/incore.h23
-rw-r--r--fs/gfs2/inode.c150
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/lops.c66
-rw-r--r--fs/gfs2/ops_fstype.c6
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c573
-rw-r--r--fs/gfs2/rgrp.h31
-rw-r--r--fs/gfs2/super.c134
-rw-r--r--fs/gfs2/trans.c5
-rw-r--r--fs/gfs2/trans.h22
-rw-r--r--fs/gfs2/xattr.c28
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/inode.c2
-rw-r--r--fs/jffs2/security.c35
-rw-r--r--fs/jfs/xattr.c57
-rw-r--r--fs/lockd/host.c25
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/locks.c225
-rw-r--r--fs/logfs/logfs.h1
-rw-r--r--fs/logfs/super.c22
-rw-r--r--fs/namei.c17
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/file.c10
-rw-r--r--fs/nfs/nfs4_fs.h24
-rw-r--r--fs/nfs/objlayout/objio_osd.c872
-rw-r--r--fs/nfs/objlayout/objlayout.c209
-rw-r--r--fs/nfs/objlayout/objlayout.h48
-rw-r--r--fs/nfsd/export.c16
-rw-r--r--fs/nfsd/nfs4callback.c20
-rw-r--r--fs/nfsd/nfs4proc.c374
-rw-r--r--fs/nfsd/nfs4recover.c53
-rw-r--r--fs/nfsd/nfs4state.c1794
-rw-r--r--fs/nfsd/nfs4xdr.c380
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/nfsd.h33
-rw-r--r--fs/nfsd/nfsfh.c39
-rw-r--r--fs/nfsd/state.h174
-rw-r--r--fs/nfsd/vfs.c31
-rw-r--r--fs/nfsd/vfs.h29
-rw-r--r--fs/nfsd/xdr4.h28
-rw-r--r--fs/nilfs2/nilfs.h8
-rw-r--r--fs/ntfs/debug.h15
-rw-r--r--fs/ocfs2/super.h14
-rw-r--r--fs/ocfs2/xattr.c38
-rw-r--r--fs/open.c4
-rw-r--r--fs/partitions/ldm.c16
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/posix_acl.c2
-rw-r--r--fs/proc/base.c13
-rw-r--r--fs/proc/stat.c41
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/pstore/inode.c40
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c93
-rw-r--r--fs/read_write.c82
-rw-r--r--fs/reiserfs/journal.c9
-rw-r--r--fs/reiserfs/resize.c4
-rw-r--r--fs/reiserfs/xattr_security.c4
-rw-r--r--fs/squashfs/Kconfig6
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysfs/dir.c196
-rw-r--r--fs/sysfs/file.c56
-rw-r--r--fs/sysfs/inode.c16
-rw-r--r--fs/sysfs/sysfs.h17
-rw-r--r--fs/udf/udfdecl.h4
-rw-r--r--fs/ufs/ufs.h9
-rw-r--r--fs/xattr.c63
-rw-r--r--fs/xfs/kmem.h7
-rw-r--r--fs/xfs/xfs_alloc.c4
-rw-r--r--fs/xfs/xfs_aops.c127
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_attr.c89
-rw-r--r--fs/xfs/xfs_attr_leaf.c7
-rw-r--r--fs/xfs/xfs_bmap.c2531
-rw-r--r--fs/xfs/xfs_bmap.h318
-rw-r--r--fs/xfs/xfs_btree.c11
-rw-r--r--fs/xfs/xfs_buf.c244
-rw-r--r--fs/xfs/xfs_buf.h51
-rw-r--r--fs/xfs/xfs_buf_item.c12
-rw-r--r--fs/xfs/xfs_da_btree.c54
-rw-r--r--fs/xfs/xfs_dfrag.c6
-rw-r--r--fs/xfs/xfs_dir2_leaf.c6
-rw-r--r--fs/xfs/xfs_discard.c20
-rw-r--r--fs/xfs/xfs_dquot.c32
-rw-r--r--fs/xfs/xfs_export.c12
-rw-r--r--fs/xfs/xfs_file.c168
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsops.c60
-rw-r--r--fs/xfs/xfs_ialloc.c15
-rw-r--r--fs/xfs/xfs_iget.c2
-rw-r--r--fs/xfs/xfs_inode.c43
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_iomap.c39
-rw-r--r--fs/xfs/xfs_iops.c53
-rw-r--r--fs/xfs/xfs_log.c20
-rw-r--r--fs/xfs/xfs_log_recover.c43
-rw-r--r--fs/xfs/xfs_message.h42
-rw-r--r--fs/xfs/xfs_mount.c36
-rw-r--r--fs/xfs/xfs_qm.c12
-rw-r--r--fs/xfs/xfs_qm_syscalls.c2
-rw-r--r--fs/xfs/xfs_rename.c8
-rw-r--r--fs/xfs/xfs_rtalloc.c48
-rw-r--r--fs/xfs/xfs_rw.c23
-rw-r--r--fs/xfs/xfs_rw.h2
-rw-r--r--fs/xfs/xfs_super.c13
-rw-r--r--fs/xfs/xfs_sync.c16
-rw-r--r--fs/xfs/xfs_trace.h39
-rw-r--r--fs/xfs/xfs_trans.c13
-rw-r--r--fs/xfs/xfs_trans.h8
-rw-r--r--fs/xfs/xfs_trans_ail.c43
-rw-r--r--fs/xfs/xfs_trans_buf.c24
-rw-r--r--fs/xfs/xfs_trans_inode.c25
-rw-r--r--fs/xfs/xfs_trans_priv.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c109
192 files changed, 10003 insertions, 8095 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef9661886112..2b78014a124a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
options = tmp_options;
while ((p = strsep(&options, ",")) != NULL) {
- int token;
+ int token, r;
if (!*p)
continue;
token = match_token(p, tokens, args);
- if (token < Opt_uname) {
- int r = match_int(&args[0], &option);
+ switch (token) {
+ case Opt_debug:
+ r = match_int(&args[0], &option);
if (r < 0) {
P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ "integer field, but no integer?\n");
ret = r;
continue;
}
- }
- switch (token) {
- case Opt_debug:
v9ses->debug = option;
#ifdef CONFIG_NET_9P_DEBUG
p9_debug_level = option;
@@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
break;
case Opt_dfltuid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->dfltuid = option;
break;
case Opt_dfltgid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->dfltgid = option;
break;
case Opt_afid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->afid = option;
break;
case Opt_uname:
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9d..598fff1a54e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
while (rdir->head < rdir->tail) {
p9stat_init(&st);
- err = p9stat_read(rdir->buf + rdir->head,
- rdir->tail - rdir->head, &st,
- fid->clnt->proto_version);
+ err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+ rdir->tail - rdir->head, &st);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
@@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
while (err == 0) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
- filp->f_pos);
+ filp->f_pos);
if (err <= 0)
goto unlock_and_exit;
@@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
while (rdir->head < rdir->tail) {
- err = p9dirent_read(rdir->buf + rdir->head,
- rdir->tail - rdir->head,
- &curdirent,
- fid->clnt->proto_version);
+ err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
+ rdir->tail - rdir->head,
+ &curdirent);
if (err < 0) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e3c03db3c788..b5a1076aaa6c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
case S_IFSOCK:
if (v9fs_proto_dotl(v9ses)) {
inode->i_op = &v9fs_file_inode_operations_dotl;
- inode->i_fop = &v9fs_file_operations_dotl;
} else if (v9fs_proto_dotu(v9ses)) {
inode->i_op = &v9fs_file_inode_operations;
- inode->i_fop = &v9fs_file_operations;
} else {
P9_DPRINTK(P9_DEBUG_ERROR,
"special files without extended mode\n");
diff --git a/fs/Kconfig b/fs/Kconfig
index 9fe0b349f4cd..5f4c45d4aa10 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,7 +109,7 @@ source "fs/proc/Kconfig"
source "fs/sysfs/Kconfig"
config TMPFS
- bool "Virtual memory file system support (former shm fs)"
+ bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
help
Tmpfs is a file system which keeps all files in virtual memory.
diff --git a/fs/Makefile b/fs/Makefile
index afc109691a9b..d2c3353d5477 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
-obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/aio.c b/fs/aio.c
index e29ec485af25..632b235f4fbe 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1387,13 +1387,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
ret = compat_rw_copy_check_uvector(type,
(struct compat_iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec);
+ &kiocb->ki_iovec, 1);
else
#endif
ret = rw_copy_check_uvector(type,
(struct iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec);
+ &kiocb->ki_iovec, 1);
if (ret < 0)
goto out;
diff --git a/fs/attr.c b/fs/attr.c
index 538e27959d3f..7ee7ba488313 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -13,6 +13,7 @@
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>
+#include <linux/evm.h>
/**
* inode_change_ok - check if attribute changes to an inode are allowed
@@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
else
error = simple_setattr(dentry, attr);
- if (!error)
+ if (!error) {
fsnotify_change(dentry, ia_valid);
+ evm_inode_post_setattr(dentry, ia_valid);
+ }
return error;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb7..1266f6e9cdb2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1821,7 +1821,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
switch (origin) {
case SEEK_END:
case SEEK_CUR:
- offset = generic_file_llseek_unlocked(file, offset, origin);
+ offset = generic_file_llseek(file, offset, origin);
goto out;
case SEEK_DATA:
case SEEK_HOLE:
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a0..426aa464f1af 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -383,36 +383,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
XATTR_REPLACE);
}
-int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ struct btrfs_trans_handle *trans = fs_info;
char *name;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
- }
-
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- } else {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
- err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = __btrfs_setxattr(trans, inode, name,
+ xattr->value, xattr->value_len, 0);
kfree(name);
+ if (err < 0)
+ break;
}
-
- kfree(suffix);
- kfree(value);
return err;
}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &btrfs_initxattrs, trans);
+}
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a80b048ade8..70a19745cb61 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
if (all_mapped) {
+ char b[BDEVNAME_SIZE];
+
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
- printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
+ printk("device %s blocksize: %d\n", bdevname(bdev, b),
+ 1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
@@ -1470,13 +1473,13 @@ static void discard_buffer(struct buffer_head * bh)
}
/**
- * block_invalidatepage - invalidate part of all of a buffer-backed page
+ * block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
* @offset: the index of the truncation point
*
* block_invalidatepage() is called when all or part of the page has become
- * invalidatedby a truncate operation.
+ * invalidated by a truncate operation.
*
* block_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5a3953db8118..4144caf2f9d3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page)
}
/*
- * Build a vector of contiguous pages from the provided page list.
+ * Finish an async read(ahead) op.
*/
-static struct page **page_vector_from_list(struct list_head *page_list,
- unsigned *nr_pages)
+static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
- struct page **pages;
- struct page *page;
- int next_index, contig_pages = 0;
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_reply_head *replyhead;
+ int rc, bytes;
+ int i;
- /* build page vector */
- pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le32_to_cpu(msg->hdr.data_len);
- BUG_ON(list_empty(page_list));
- next_index = list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index == next_index) {
- dout("readpages page %d %p\n", contig_pages, page);
- pages[contig_pages] = page;
- contig_pages++;
- next_index++;
- } else {
- break;
+ dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+
+ /* unlock all pages, zeroing any data we didn't read */
+ for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
+ struct page *page = req->r_pages[i];
+
+ if (bytes < (int)PAGE_CACHE_SIZE) {
+ /* zero (remainder of) page */
+ int s = bytes < 0 ? 0 : bytes;
+ zero_user_segment(page, s, PAGE_CACHE_SIZE);
}
+ dout("finish_read %p uptodate %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- *nr_pages = contig_pages;
- return pages;
+ kfree(req->r_pages);
}
/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
+ * start an async read(ahead) operation. return nr_pages we submitted
+ * a read for on success, or negative error code.
*/
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
+static int start_read(struct inode *inode, struct list_head *page_list, int max)
{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
- int rc = 0;
- struct page **pages;
- loff_t offset;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct ceph_osd_request *req;
+ u64 off;
u64 len;
+ int i;
+ struct page **pages;
+ pgoff_t next_index;
+ int nr_pages = 0;
+ int ret;
- dout("readpages %p file %p nr_pages %d\n",
- inode, file, nr_pages);
-
- pages = page_vector_from_list(page_list, &nr_pages);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ off = page->index << PAGE_CACHE_SHIFT;
- /* guess read extent */
- offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ /* count pages */
+ next_index = page->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index != next_index)
+ break;
+ nr_pages++;
+ next_index++;
+ if (max && nr_pages == max)
+ break;
+ }
len = nr_pages << PAGE_CACHE_SHIFT;
- rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- offset, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- pages, nr_pages, 0);
- if (rc == -ENOENT)
- rc = 0;
- if (rc < 0)
- goto out;
-
- for (; !list_empty(page_list) && len > 0;
- rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
- struct page *page =
- list_entry(page_list->prev, struct page, lru);
+ dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
+ off, len);
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+ off, &len,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ NULL, false, 1, 0);
+ if (!req)
+ return -ENOMEM;
+ /* build page vector */
+ nr_pages = len >> PAGE_CACHE_SHIFT;
+ pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+ ret = -ENOMEM;
+ if (!pages)
+ goto out;
+ for (i = 0; i < nr_pages; ++i) {
+ page = list_entry(page_list->prev, struct page, lru);
+ BUG_ON(PageLocked(page));
list_del(&page->lru);
-
- if (rc < (int)PAGE_CACHE_SIZE) {
- /* zero (remainder of) page */
- int s = rc < 0 ? 0 : rc;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
- }
-
- if (add_to_page_cache_lru(page, mapping, page->index,
+
+ dout("start_read %p adding %p idx %lu\n", inode, page,
+ page->index);
+ if (add_to_page_cache_lru(page, &inode->i_data, page->index,
GFP_NOFS)) {
page_cache_release(page);
- dout("readpages %p add_to_page_cache failed %p\n",
+ dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
- continue;
+ nr_pages = i;
+ goto out_pages;
}
- dout("readpages %p adding %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- page_cache_release(page);
+ pages[i] = page;
}
- rc = 0;
+ req->r_pages = pages;
+ req->r_num_pages = nr_pages;
+ req->r_callback = finish_read;
+ req->r_inode = inode;
+
+ dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (ret < 0)
+ goto out_pages;
+ ceph_osdc_put_request(req);
+ return nr_pages;
+out_pages:
+ ceph_release_page_vector(pages, nr_pages);
+out:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ int rc = 0;
+ int max = 0;
+
+ if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ >> PAGE_SHIFT;
+
+ dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+ max);
+ while (!list_empty(page_list)) {
+ rc = start_read(inode, page_list, max);
+ if (rc < 0)
+ goto out;
+ BUG_ON(rc == 0);
+ }
out:
- kfree(pages);
+ dout("readpages %p file %p ret %d\n", inode, file, rc);
return rc;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8d74ad7ba556..b8731bf3ef1f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -945,7 +945,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
seq, issue_seq, mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
if (!msg)
return -ENOMEM;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 095799ba9dd1..5dde7d51dc11 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,7 +9,6 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
-#include <linux/pagevec.h>
#include "super.h"
#include "mds_client.h"
@@ -1364,49 +1363,6 @@ void ceph_queue_invalidate(struct inode *inode)
}
/*
- * invalidate any pages that are not dirty or under writeback. this
- * includes pages that are clean and mapped.
- */
-static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
-{
- struct pagevec pvec;
- pgoff_t next = 0;
- int i;
-
- pagevec_init(&pvec, 0);
- while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- pgoff_t index;
- int skip_page =
- (PageDirty(page) || PageWriteback(page));
-
- if (!skip_page)
- skip_page = !trylock_page(page);
-
- /*
- * We really shouldn't be looking at the ->index of an
- * unlocked page. But we're not allowed to lock these
- * pages. So we rely upon nobody altering the ->index
- * of this (pinned-by-us) page.
- */
- index = page->index;
- if (index > next)
- next = index;
- next++;
-
- if (skip_page)
- continue;
-
- generic_error_remove_page(mapping, page);
- unlock_page(page);
- }
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
-/*
* Invalidate inode pages in a worker thread. (This can't be done
* in the message handler context.)
*/
@@ -1429,7 +1385,7 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&inode->i_lock);
- ceph_invalidate_nondirty_pages(inode->i_mapping);
+ truncate_inode_pages(&inode->i_data, 0);
spin_lock(&inode->i_lock);
if (orig_gen == ci->i_rdcache_gen &&
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 3b256b50f7d8..5a14c29cbba6 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
+ struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_ioctl_layout nl;
int err, i;
- /* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
- if ((l.object_size & ~PAGE_MASK) ||
- (l.stripe_unit & ~PAGE_MASK) ||
- !l.stripe_unit ||
- (l.object_size &&
- (unsigned)l.object_size % (unsigned)l.stripe_unit))
+ /* validate changed params against current layout */
+ err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ if (!err) {
+ nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ nl.preferred_osd =
+ (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+ } else
+ return err;
+
+ if (l.stripe_count)
+ nl.stripe_count = l.stripe_count;
+ if (l.stripe_unit)
+ nl.stripe_unit = l.stripe_unit;
+ if (l.object_size)
+ nl.object_size = l.object_size;
+ if (l.data_pool)
+ nl.data_pool = l.data_pool;
+ if (l.preferred_osd)
+ nl.preferred_osd = l.preferred_osd;
+
+ if ((nl.object_size & ~PAGE_MASK) ||
+ (nl.stripe_unit & ~PAGE_MASK) ||
+ ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
return -EINVAL;
/* make sure it's a valid data pool */
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 0c5167e43180..be4a60487333 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -6,7 +6,31 @@
#define CEPH_IOCTL_MAGIC 0x97
-/* just use u64 to align sanely on all archs */
+/*
+ * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
+ * CEPH_IOC_SET_LAYOUT - set file layout
+ * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
+ *
+ * The file layout specifies how file data is striped over objects in
+ * the distributed object store, which object pool they belong to (if
+ * it differs from the default), and an optional 'preferred osd' to
+ * store them on.
+ *
+ * Files get a new layout based on the policy set on the containing
+ * directory or one of its ancestors. The GET_LAYOUT ioctl will let
+ * you examine the layout for a file or the policy on a directory.
+ *
+ * SET_LAYOUT will let you set a layout on a newly created file. This
+ * only works immediately after the file is created and before any
+ * data is written to it.
+ *
+ * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
+ * on a directory that will apply to any new files created in that
+ * directory (or any child directory that doesn't specify a layout of
+ * its own).
+ */
+
+/* use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
__u64 stripe_unit, stripe_count, object_size;
__u64 data_pool;
@@ -21,6 +45,8 @@ struct ceph_ioctl_layout {
struct ceph_ioctl_layout)
/*
+ * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
+ *
* Extract identity, address of the OSD and object storing a given
* file offset.
*/
@@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc {
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
struct ceph_ioctl_dataloc)
+/*
+ * CEPH_IOC_LAZYIO - relax consistency
+ *
+ * Normally Ceph switches to synchronous IO when multiple clients have
+ * the file open (and or more for write). Reads and writes bypass the
+ * page cache and go directly to the OSD. Setting this flag on a file
+ * descriptor will allow buffered IO for this file in cases where the
+ * application knows it won't interfere with other nodes (or doesn't
+ * care).
+ */
#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+
+/*
+ * CEPH_IOC_SYNCIO - force synchronous IO
+ *
+ * This ioctl sets a file flag that forces the synchronous IO that
+ * bypasses the page cache, even if it is not necessary. This is
+ * essentially the opposite behavior of IOC_LAZYIO. This forces the
+ * same read/write path as a file opened by multiple clients when one
+ * or more of those clients is opened for write.
+ *
+ * Note that this type of sync IO takes a different path than a file
+ * opened with O_SYNC/D_SYNC (writes hit the page cache and are
+ * immediately flushed on page boundaries). It is very similar to
+ * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes
+ * are not copied (user page must remain stable) and O_DIRECT writes
+ * have alignment restrictions (on the buffer and file offset).
+ */
#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 86c59e16ba74..1d72f15fe9f4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
+ false);
if (!msg) {
pr_err("create_session_msg ENOMEM creating msg\n");
return NULL;
@@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
while (session->s_num_cap_releases < session->s_nr_caps + extra) {
spin_unlock(&session->s_cap_lock);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS);
+ GFP_NOFS, false);
if (!msg)
goto out_unlocked;
dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1652,7 +1653,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
@@ -2518,7 +2519,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail_nopagelist;
ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
@@ -2831,7 +2832,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
dnamelen = dentry->d_name.len;
len += dnamelen;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
return;
lease = msg->front.iov_base;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 88bacaf385d9..788f5ad8e66d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
enum {
Opt_wsize,
Opt_rsize,
+ Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
Opt_cap_release_safety,
@@ -136,6 +137,7 @@ enum {
static match_table_t fsopt_tokens = {
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
+ {Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_rsize:
fsopt->rsize = intval;
break;
+ case Opt_rasize:
+ fsopt->rasize = intval;
+ break;
case Opt_caps_wanted_delay_min:
fsopt->caps_wanted_delay_min = intval;
break;
@@ -289,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
- fsopt->sb_flags = flags;
- fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+ fsopt->sb_flags = flags;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
- fsopt->rsize = CEPH_RSIZE_DEFAULT;
- fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ fsopt->rsize = CEPH_RSIZE_DEFAULT;
+ fsopt->rasize = CEPH_RASIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- fsopt->congestion_kb = default_congestion_kb();
-
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
- err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
- dev_name);
- goto out;
- }
+ fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);
+ goto out;
+ }
dev_name_end = *path;
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
@@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
seq_printf(m, ",rsize=%d", fsopt->rsize);
+ if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
+ seq_printf(m, ",rasize=%d", fsopt->rsize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
@@ -422,20 +430,23 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
+ const unsigned supported_features =
+ CEPH_FEATURE_FLOCK |
+ CEPH_FEATURE_DIRLAYOUTHASH;
+ const unsigned required_features = 0;
int err = -ENOMEM;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
if (!fsc)
return ERR_PTR(-ENOMEM);
- fsc->client = ceph_create_client(opt, fsc);
+ fsc->client = ceph_create_client(opt, fsc, supported_features,
+ required_features);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH;
fsc->client->monc.want_mdsmap = 1;
fsc->mount_options = fsopt;
@@ -774,10 +785,10 @@ static int ceph_register_bdi(struct super_block *sb,
{
int err;
- /* set ra_pages based on rsize mount option? */
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ /* set ra_pages based on rasize mount option? */
+ if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
else
fsc->backing_dev_info.ra_pages =
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a23eed526f05..b01442aaf278 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,7 +36,8 @@
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT 0 /* max read size */
+#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -45,8 +46,9 @@ struct ceph_mount_options {
int flags;
int sb_flags;
- int wsize;
- int rsize; /* max readahead */
+ int wsize; /* max write size */
+ int rsize; /* max read size */
+ int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
int cap_release_safety;
@@ -344,9 +346,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
* x86_64+ino32 64 32
* x86_64 64 64
*/
-static inline u32 ceph_ino_to_ino32(ino_t ino)
+static inline u32 ceph_ino_to_ino32(__u64 vino)
{
- ino ^= ino >> (sizeof(ino) * 8 - 32);
+ u32 ino = vino & 0xffffffff;
+ ino ^= vino >> 32;
if (!ino)
ino = 1;
return ino;
@@ -357,11 +360,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino)
*/
static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
{
- ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
#if BITS_PER_LONG == 32
- ino = ceph_ino_to_ino32(ino);
+ return ceph_ino_to_ino32(vino.ino);
+#else
+ return (ino_t)vino.ino;
#endif
- return ino;
}
/*
diff --git a/fs/cifs/README b/fs/cifs/README
index c5c2c5e5f0f2..895da1dc1550 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -745,4 +745,18 @@ installed and something like the following lines should be added to the
create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
create dns_resolver * * /usr/local/sbin/cifs.upcall %k
+CIFS kernel module parameters
+=============================
+These module parameters can be specified or modified either during the time of
+module loading or during the runtime by using the interface
+ /proc/module/cifs/parameters/<param>
+
+i.e. echo "value" > /sys/module/cifs/parameters/<param>
+
+1. echo_retries - The number of echo attempts before giving up and
+ reconnecting to the server. The default is 5. The value 0
+ means never reconnect.
+
+2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+ [Y/y/1]. To disable use any of [N/n/0].
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 6d40656e1e29..84e8c0724704 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = {
static int cifs_oplock_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%d\n", oplockEnabled);
+ seq_printf(m, "%d\n", enable_oplocks);
return 0;
}
@@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file,
char c;
int rc;
+ printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface "
+ "will be removed in kernel version 3.4. Please migrate to "
+ "using the 'enable_oplocks' module parameter in cifs.ko.\n");
rc = get_user(c, buffer);
if (rc)
return rc;
if (c == '0' || c == 'n' || c == 'N')
- oplockEnabled = 0;
+ enable_oplocks = false;
else if (c == '1' || c == 'y' || c == 'Y')
- oplockEnabled = 1;
+ enable_oplocks = true;
return count;
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7260e11e21f8..500d65859279 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,8 @@
#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
+#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
+#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -55,6 +57,8 @@ struct cifs_sb_info {
atomic_t active;
uid_t mnt_uid;
gid_t mnt_gid;
+ uid_t mnt_backupuid;
+ gid_t mnt_backupgid;
mode_t mnt_file_mode;
mode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index d0f59faefb78..72ddf23ef6f7 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
spin_unlock(&sidgidlock);
+ root = &siduidtree;
+ spin_lock(&uidsidlock);
+ shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
+ spin_unlock(&uidsidlock);
+
+ root = &sidgidtree;
+ spin_lock(&gidsidlock);
+ shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
+ spin_unlock(&gidsidlock);
+
return nr_rem;
}
+static void
+sid_rb_insert(struct rb_root *root, unsigned long cid,
+ struct cifs_sid_id **psidid, char *typestr)
+{
+ char *strptr;
+ struct rb_node *node = root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rb_node **linkto = &(root->rb_node);
+ struct cifs_sid_id *lsidid;
+
+ while (node) {
+ lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+ parent = node;
+ if (cid > lsidid->id) {
+ linkto = &(node->rb_left);
+ node = node->rb_left;
+ }
+ if (cid < lsidid->id) {
+ linkto = &(node->rb_right);
+ node = node->rb_right;
+ }
+ }
+
+ (*psidid)->id = cid;
+ (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
+ (*psidid)->refcount = 0;
+
+ sprintf((*psidid)->sidstr, "%s", typestr);
+ strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
+ sprintf(strptr, "%ld", cid);
+
+ clear_bit(SID_ID_PENDING, &(*psidid)->state);
+ clear_bit(SID_ID_MAPPED, &(*psidid)->state);
+
+ rb_link_node(&(*psidid)->rbnode, parent, linkto);
+ rb_insert_color(&(*psidid)->rbnode, root);
+}
+
+static struct cifs_sid_id *
+sid_rb_search(struct rb_root *root, unsigned long cid)
+{
+ struct rb_node *node = root->rb_node;
+ struct cifs_sid_id *lsidid;
+
+ while (node) {
+ lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+ if (cid > lsidid->id)
+ node = node->rb_left;
+ else if (cid < lsidid->id)
+ node = node->rb_right;
+ else /* node found */
+ return lsidid;
+ }
+
+ return NULL;
+}
+
static struct shrinker cifs_shrinker = {
.shrink = cifs_idmap_shrinker,
.seeks = DEFAULT_SEEKS,
@@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
memcpy(payload, data, datalen);
key->payload.data = payload;
+ key->datalen = datalen;
return 0;
}
@@ -224,6 +292,120 @@ sidid_pending_wait(void *unused)
}
static int
+id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
+{
+ int rc = 0;
+ struct key *sidkey;
+ const struct cred *saved_cred;
+ struct cifs_sid *lsid;
+ struct cifs_sid_id *psidid, *npsidid;
+ struct rb_root *cidtree;
+ spinlock_t *cidlock;
+
+ if (sidtype == SIDOWNER) {
+ cidlock = &siduidlock;
+ cidtree = &uidtree;
+ } else if (sidtype == SIDGROUP) {
+ cidlock = &sidgidlock;
+ cidtree = &gidtree;
+ } else
+ return -EINVAL;
+
+ spin_lock(cidlock);
+ psidid = sid_rb_search(cidtree, cid);
+
+ if (!psidid) { /* node does not exist, allocate one & attempt adding */
+ spin_unlock(cidlock);
+ npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
+ if (!npsidid)
+ return -ENOMEM;
+
+ npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
+ if (!npsidid->sidstr) {
+ kfree(npsidid);
+ return -ENOMEM;
+ }
+
+ spin_lock(cidlock);
+ psidid = sid_rb_search(cidtree, cid);
+ if (psidid) { /* node happened to get inserted meanwhile */
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ kfree(npsidid->sidstr);
+ kfree(npsidid);
+ } else {
+ psidid = npsidid;
+ sid_rb_insert(cidtree, cid, &psidid,
+ sidtype == SIDOWNER ? "oi:" : "gi:");
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ }
+ } else {
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ }
+
+ /*
+ * If we are here, it is safe to access psidid and its fields
+ * since a reference was taken earlier while holding the spinlock.
+ * A reference on the node is put without holding the spinlock
+ * and it is OK to do so in this case, shrinker will not erase
+ * this node until all references are put and we do not access
+ * any fields of the node after a reference is put .
+ */
+ if (test_bit(SID_ID_MAPPED, &psidid->state)) {
+ memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
+ psidid->time = jiffies; /* update ts for accessing */
+ goto id_sid_out;
+ }
+
+ if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) {
+ rc = -EINVAL;
+ goto id_sid_out;
+ }
+
+ if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
+ saved_cred = override_creds(root_cred);
+ sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
+ if (IS_ERR(sidkey)) {
+ rc = -EINVAL;
+ cFYI(1, "%s: Can't map and id to a SID", __func__);
+ } else {
+ lsid = (struct cifs_sid *)sidkey->payload.data;
+ memcpy(&psidid->sid, lsid,
+ sidkey->datalen < sizeof(struct cifs_sid) ?
+ sidkey->datalen : sizeof(struct cifs_sid));
+ memcpy(ssid, &psidid->sid,
+ sidkey->datalen < sizeof(struct cifs_sid) ?
+ sidkey->datalen : sizeof(struct cifs_sid));
+ set_bit(SID_ID_MAPPED, &psidid->state);
+ key_put(sidkey);
+ kfree(psidid->sidstr);
+ }
+ psidid->time = jiffies; /* update ts for accessing */
+ revert_creds(saved_cred);
+ clear_bit(SID_ID_PENDING, &psidid->state);
+ wake_up_bit(&psidid->state, SID_ID_PENDING);
+ } else {
+ rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
+ sidid_pending_wait, TASK_INTERRUPTIBLE);
+ if (rc) {
+ cFYI(1, "%s: sidid_pending_wait interrupted %d",
+ __func__, rc);
+ --psidid->refcount;
+ return rc;
+ }
+ if (test_bit(SID_ID_MAPPED, &psidid->state))
+ memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
+ else
+ rc = -EINVAL;
+ }
+id_sid_out:
+ --psidid->refcount;
+ return rc;
+}
+
+static int
sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
@@ -383,6 +565,10 @@ init_cifs_idmap(void)
spin_lock_init(&sidgidlock);
gidtree = RB_ROOT;
+ spin_lock_init(&uidsidlock);
+ siduidtree = RB_ROOT;
+ spin_lock_init(&gidsidlock);
+ sidgidtree = RB_ROOT;
register_shrinker(&cifs_shrinker);
cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring));
@@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void)
while ((node = rb_first(root)))
rb_erase(node, root);
spin_unlock(&sidgidlock);
+
+ root = &siduidtree;
+ spin_lock(&uidsidlock);
+ while ((node = rb_first(root)))
+ rb_erase(node, root);
+ spin_unlock(&uidsidlock);
+
+ root = &sidgidtree;
+ spin_lock(&gidsidlock);
+ while ((node = rb_first(root)))
+ rb_erase(node, root);
+ spin_unlock(&gidsidlock);
}
/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
acl_size = sizeof(struct cifs_acl);
num_aces = le32_to_cpu(pdacl->num_aces);
- if (num_aces > 0) {
+ if (num_aces > 0) {
umode_t user_mask = S_IRWXU;
umode_t group_mask = S_IRWXG;
umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
@@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
else
cFYI(1, "no ACL"); /* BB grant all or default perms? */
-/* cifscred->uid = owner_sid_ptr->rid;
- cifscred->gid = group_sid_ptr->rid;
- memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
- sizeof(struct cifs_sid));
- memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
- sizeof(struct cifs_sid)); */
-
return rc;
}
-
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
- struct inode *inode, __u64 nmode)
+ __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
{
int rc = 0;
__u32 dacloffset;
__u32 ndacloffset;
__u32 sidsoffset;
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+ struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
- if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
- return -EIO;
-
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ if (nmode != NO_CHANGE_64) { /* chmod */
+ owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
-
- dacloffset = le32_to_cpu(pntsd->dacloffset);
- dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-
- ndacloffset = sizeof(struct cifs_ntsd);
- ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
- ndacl_ptr->revision = dacl_ptr->revision;
- ndacl_ptr->size = 0;
- ndacl_ptr->num_aces = 0;
-
- rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode);
-
- sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
-
- /* copy security descriptor control portion and owner and group sid */
- copy_sec_desc(pntsd, pnntsd, sidsoffset);
+ dacloffset = le32_to_cpu(pntsd->dacloffset);
+ dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ ndacloffset = sizeof(struct cifs_ntsd);
+ ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+ ndacl_ptr->revision = dacl_ptr->revision;
+ ndacl_ptr->size = 0;
+ ndacl_ptr->num_aces = 0;
+
+ rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
+ nmode);
+ sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
+ /* copy sec desc control portion & owner and group sids */
+ copy_sec_desc(pntsd, pnntsd, sidsoffset);
+ *aclflag = CIFS_ACL_DACL;
+ } else {
+ memcpy(pnntsd, pntsd, secdesclen);
+ if (uid != NO_CHANGE_32) { /* chown */
+ owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+ le32_to_cpu(pnntsd->osidoffset));
+ nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ GFP_KERNEL);
+ if (!nowner_sid_ptr)
+ return -ENOMEM;
+ rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
+ if (rc) {
+ cFYI(1, "%s: Mapping error %d for owner id %d",
+ __func__, rc, uid);
+ kfree(nowner_sid_ptr);
+ return rc;
+ }
+ memcpy(owner_sid_ptr, nowner_sid_ptr,
+ sizeof(struct cifs_sid));
+ kfree(nowner_sid_ptr);
+ *aclflag = CIFS_ACL_OWNER;
+ }
+ if (gid != NO_CHANGE_32) { /* chgrp */
+ group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+ le32_to_cpu(pnntsd->gsidoffset));
+ ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ GFP_KERNEL);
+ if (!ngroup_sid_ptr)
+ return -ENOMEM;
+ rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
+ if (rc) {
+ cFYI(1, "%s: Mapping error %d for group id %d",
+ __func__, rc, gid);
+ kfree(ngroup_sid_ptr);
+ return rc;
+ }
+ memcpy(group_sid_ptr, ngroup_sid_ptr,
+ sizeof(struct cifs_sid));
+ kfree(ngroup_sid_ptr);
+ *aclflag = CIFS_ACL_GROUP;
+ }
+ }
return rc;
}
@@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
{
struct cifs_ntsd *pntsd = NULL;
int oplock = 0;
- int xid, rc;
+ int xid, rc, create_options = 0;
__u16 fid;
struct cifs_tcon *tcon;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
tcon = tlink_tcon(tlink);
xid = GetXid();
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
- &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
+ create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (!rc) {
rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
CIFSSMBClose(xid, tcon, fid);
@@ -991,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
return pntsd;
}
-static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
- struct cifs_ntsd *pnntsd, u32 acllen)
+ /* Set an ACL on the server */
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+ struct inode *inode, const char *path, int aclflag)
{
int oplock = 0;
- int xid, rc;
+ int xid, rc, access_flags, create_options = 0;
__u16 fid;
struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -1006,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
tcon = tlink_tcon(tlink);
xid = GetXid();
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
- &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
+ access_flags = WRITE_OWNER;
+ else
+ access_flags = WRITE_DAC;
+
+ rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
+ create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc) {
cERROR(1, "Unable to open file to set ACL");
goto out;
}
- rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
+ rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
cFYI(DBG2, "SetCIFSACL rc = %d", rc);
CIFSSMBClose(xid, tcon, fid);
@@ -1024,17 +1265,6 @@ out:
return rc;
}
-/* Set an ACL on the server */
-int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
- struct inode *inode, const char *path)
-{
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-
- cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
-
- return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
-}
-
/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
int
cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
@@ -1066,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
}
/* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
+int
+id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
+ uid_t uid, gid_t gid)
{
int rc = 0;
+ int aclflag = CIFS_ACL_DACL; /* default flag to set */
__u32 secdesclen = 0;
struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
@@ -1098,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
return -ENOMEM;
}
- rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
+ rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
+ &aclflag);
cFYI(DBG2, "build_sec_desc rc: %d", rc);
if (!rc) {
/* Set the security descriptor */
- rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
+ rc = set_cifs_acl(pnntsd, secdesclen, inode,
+ path, aclflag);
cFYI(DBG2, "set_cifs_acl rc: %d", rc);
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 30acd22147e1..2cfb695d1f89 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -37,83 +37,8 @@
* the sequence number before this function is called. Also, this function
* should be called with the server->srv_mutex held.
*/
-static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
- struct TCP_Server_Info *server, char *signature)
-{
- int rc;
-
- if (cifs_pdu == NULL || signature == NULL || server == NULL)
- return -EINVAL;
-
- if (!server->secmech.sdescmd5) {
- cERROR(1, "%s: Can't generate signature\n", __func__);
- return -1;
- }
-
- rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
- if (rc) {
- cERROR(1, "%s: Could not init md5\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
- server->session_key.response, server->session_key.len);
- if (rc) {
- cERROR(1, "%s: Could not update with response\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
- cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
- if (rc) {
- cERROR(1, "%s: Could not update with payload\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
- if (rc)
- cERROR(1, "%s: Could not generate md5 hash\n", __func__);
-
- return rc;
-}
-
-/* must be called with server->srv_mutex held */
-int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
- __u32 *pexpected_response_sequence_number)
-{
- int rc = 0;
- char smb_signature[20];
-
- if ((cifs_pdu == NULL) || (server == NULL))
- return -EINVAL;
-
- if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
- server->tcpStatus == CifsNeedNegotiate)
- return rc;
-
- if (!server->session_estab) {
- strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
- return rc;
- }
-
- cifs_pdu->Signature.Sequence.SequenceNumber =
- cpu_to_le32(server->sequence_number);
- cifs_pdu->Signature.Sequence.Reserved = 0;
-
- *pexpected_response_sequence_number = server->sequence_number++;
- server->sequence_number++;
-
- rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
- if (rc)
- memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
- else
- memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
-
- return rc;
-}
-
-static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
- struct TCP_Server_Info *server, char *signature)
+static int cifs_calc_signature(const struct kvec *iov, int n_vec,
+ struct TCP_Server_Info *server, char *signature)
{
int i;
int rc;
@@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
{
int rc = 0;
char smb_signature[20];
- struct smb_hdr *cifs_pdu = iov[0].iov_base;
+ struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
if ((cifs_pdu == NULL) || (server == NULL))
return -EINVAL;
@@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
return rc;
if (!server->session_estab) {
- strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
+ memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
return rc;
}
@@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
*pexpected_response_sequence_number = server->sequence_number++;
server->sequence_number++;
- rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
+ rc = cifs_calc_signature(iov, n_vec, server, smb_signature);
if (rc)
memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
else
@@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
return rc;
}
-int cifs_verify_signature(struct smb_hdr *cifs_pdu,
+/* must be called with server->srv_mutex held */
+int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
+ __u32 *pexpected_response_sequence_number)
+{
+ struct kvec iov;
+
+ iov.iov_base = cifs_pdu;
+ iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
+
+ return cifs_sign_smb2(&iov, 1, server,
+ pexpected_response_sequence_number);
+}
+
+int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
struct TCP_Server_Info *server,
__u32 expected_sequence_number)
{
unsigned int rc;
char server_response_sig[8];
char what_we_think_sig_should_be[20];
+ struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
if (cifs_pdu == NULL || server == NULL)
return -EINVAL;
@@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
cifs_pdu->Signature.Sequence.Reserved = 0;
mutex_lock(&server->srv_mutex);
- rc = cifs_calculate_signature(cifs_pdu, server,
- what_we_think_sig_should_be);
+ rc = cifs_calc_signature(iov, nr_iov, server,
+ what_we_think_sig_should_be);
mutex_unlock(&server->srv_mutex);
if (rc)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 54b8f1e7da94..8f1fe324162b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,7 @@
int cifsFYI = 0;
int cifsERROR = 1;
int traceSMB = 0;
-unsigned int oplockEnabled = 1;
+bool enable_oplocks = true;
unsigned int linuxExtEnabled = 1;
unsigned int lookupCacheEnabled = 1;
unsigned int multiuser_mount = 0;
@@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0);
MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
-module_param(cifs_max_pending, int, 0);
+module_param(cifs_max_pending, int, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
"Default: 50 Range: 2 to 256");
unsigned short echo_retries = 5;
@@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644);
MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
"reconnecting server. Default: 5. 0 means "
"never reconnect.");
+module_param(enable_oplocks, bool, 0644);
+MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
+ "y/Y/1");
+
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
@@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb)
else
sb->s_d_op = &cifs_dentry_ops;
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cFYI(1, "export ops supported");
sb->s_export_op = &cifs_export_ops;
}
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
return 0;
@@ -432,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
seq_printf(s, ",mfsymlinks");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
seq_printf(s, ",fsc");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
+ seq_printf(s, ",nostrictsync");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+ seq_printf(s, ",noperm");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+ seq_printf(s, ",strictcache");
seq_printf(s, ",rsize=%d", cifs_sb->rsize);
seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
char *full_path = NULL;
char *s, *p;
char sep;
- int xid;
full_path = cifs_build_path_to_root(vol, cifs_sb,
cifs_sb_master_tcon(cifs_sb));
@@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
cFYI(1, "Get root dentry for %s", full_path);
- xid = GetXid();
sep = CIFS_DIR_SEP(cifs_sb);
dentry = dget(sb->s_root);
p = s = full_path;
@@ -570,7 +578,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
- _FreeXid(xid);
kfree(full_path);
return dentry;
}
@@ -723,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (rc < 0)
return (loff_t)rc;
}
- return generic_file_llseek_unlocked(file, offset, origin);
+ return generic_file_llseek(file, offset, origin);
}
static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -942,7 +949,8 @@ cifs_init_once(void *inode)
struct cifsInodeInfo *cifsi = inode;
inode_init_once(&cifsi->vfs_inode);
- INIT_LIST_HEAD(&cifsi->lockList);
+ INIT_LIST_HEAD(&cifsi->llist);
+ mutex_init(&cifsi->lock_mutex);
}
static int
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 95da8027983d..d9dbaf869cd1 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
#define CIFS_VERSION "1.75"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 95dad9d14cf1..8238aa13e01c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -167,6 +167,8 @@ struct smb_vol {
uid_t cred_uid;
uid_t linux_uid;
gid_t linux_gid;
+ uid_t backupuid;
+ gid_t backupgid;
mode_t file_mode;
mode_t dir_mode;
unsigned secFlg;
@@ -179,6 +181,8 @@ struct smb_vol {
bool noperm:1;
bool no_psx_acl:1; /* set if posix acl support should be disabled */
bool cifs_acl:1;
+ bool backupuid_specified; /* mount option backupuid is specified */
+ bool backupgid_specified; /* mount option backupgid is specified */
bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
bool server_ino:1; /* use inode numbers from server ie UniqueId */
bool direct_io:1;
@@ -219,7 +223,8 @@ struct smb_vol {
CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
- CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO)
+ CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
+ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
MS_NODEV | MS_SYNCHRONOUS)
@@ -286,7 +291,13 @@ struct TCP_Server_Info {
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
+ bool large_buf; /* is current buffer large? */
struct delayed_work echo; /* echo ping workqueue job */
+ struct kvec *iov; /* reusable kvec array for receives */
+ unsigned int nr_iov; /* number of kvecs in array */
+ char *smallbuf; /* pointer to current "small" buffer */
+ char *bigbuf; /* pointer to current "big" buffer */
+ unsigned int total_read; /* total amount of data read in this pass */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
@@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
*/
struct cifsLockInfo {
struct list_head llist; /* pointer to next cifsLockInfo */
+ struct list_head blist; /* pointer to locks blocked on this */
+ wait_queue_head_t block_q;
__u64 offset;
__u64 length;
+ __u32 pid;
__u8 type;
+ __u16 netfid;
};
/*
@@ -520,8 +535,6 @@ struct cifsFileInfo {
struct dentry *dentry;
unsigned int f_flags;
struct tcon_link *tlink;
- struct mutex lock_mutex;
- struct list_head llist; /* list of byte range locks we have. */
bool invalidHandle:1; /* file closed via session abend */
bool oplock_break_cancelled:1;
int count; /* refcount protected by cifs_file_list_lock */
@@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
*/
struct cifsInodeInfo {
- struct list_head lockList;
+ struct list_head llist; /* brlocks for this inode */
+ bool can_cache_brlcks;
+ struct mutex lock_mutex; /* protect two fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
struct mid_q_entry;
/*
- * This is the prototype for the mid callback function. When creating one,
- * take special care to avoid deadlocks. Things to bear in mind:
+ * This is the prototype for the mid receive function. This function is for
+ * receiving the rest of the SMB frame, starting with the WordCount (which is
+ * just after the MID in struct smb_hdr). Note:
+ *
+ * - This will be called by cifsd, with no locks held.
+ * - The mid will still be on the pending_mid_q.
+ * - mid->resp_buf will point to the current buffer.
+ *
+ * Returns zero on a successful receive, or an error. The receive state in
+ * the TCP_Server_Info will also be updated.
+ */
+typedef int (mid_receive_t)(struct TCP_Server_Info *server,
+ struct mid_q_entry *mid);
+
+/*
+ * This is the prototype for the mid callback function. This is called once the
+ * mid has been received off of the socket. When creating one, take special
+ * care to avoid deadlocks. Things to bear in mind:
*
* - it will be called by cifsd, with no locks held
* - the mid will be removed from any lists
@@ -662,9 +693,10 @@ struct mid_q_entry {
unsigned long when_sent; /* time when smb send finished */
unsigned long when_received; /* when demux complete (taken off wire) */
#endif
+ mid_receive_t *receive; /* call receive callback */
mid_callback_t *callback; /* call completion callback */
void *callback_data; /* general purpose pointer for callback */
- struct smb_hdr *resp_buf; /* response buffer */
+ struct smb_hdr *resp_buf; /* pointer to received SMB header */
int midState; /* wish this were enum but can not pass to wait_event */
__u8 command; /* smb command code */
bool largeBuf:1; /* if valid response, is pointer to large buf */
@@ -964,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
to be established on existing mount if we
have the uid/password or Kerberos credential
or equivalent for current user */
-GLOBAL_EXTERN unsigned int oplockEnabled;
+/* enable or disable oplocks */
+GLOBAL_EXTERN bool enable_oplocks;
GLOBAL_EXTERN unsigned int lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
@@ -978,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
/* reconnect after this many failed echo attempts */
GLOBAL_EXTERN unsigned short echo_retries;
+#ifdef CONFIG_CIFS_ACL
GLOBAL_EXTERN struct rb_root uidtree;
GLOBAL_EXTERN struct rb_root gidtree;
GLOBAL_EXTERN spinlock_t siduidlock;
GLOBAL_EXTERN spinlock_t sidgidlock;
+GLOBAL_EXTERN struct rb_root siduidtree;
+GLOBAL_EXTERN struct rb_root sidgidtree;
+GLOBAL_EXTERN spinlock_t uidsidlock;
+GLOBAL_EXTERN spinlock_t gidsidlock;
+#endif /* CONFIG_CIFS_ACL */
void cifs_oplock_break(struct work_struct *work);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de3aa285de03..3fb03e2c8e86 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp {
__le16 DataLengthHigh;
__u64 Reserved2;
__u16 ByteCount;
- __u8 Pad; /* BB check for whether padded to DWORD
- boundary and optimum performance here */
- char Data[1];
+ /* read response data immediately follows */
} __attribute__((packed)) READ_RSP;
typedef struct locking_andx_range {
@@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */
/* SETFSInfo Levels */
#define SMB_SET_CIFS_UNIX_INFO 0x200
+/* level 0x203 is defined above in list of QFS info levels */
+/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */
+
+/* Level 0x200 request structure follows */
typedef struct smb_com_transaction2_setfsi_req {
struct smb_hdr hdr; /* wct = 15 */
__le16 TotalParameterCount;
@@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req {
__le64 ClientUnixCap; /* Data end */
} __attribute__((packed)) TRANSACTION2_SETFSI_REQ;
+/* level 0x203 request structure follows */
+typedef struct smb_com_transaction2_setfs_enc_req {
+ struct smb_hdr hdr; /* wct = 15 */
+ __le16 TotalParameterCount;
+ __le16 TotalDataCount;
+ __le16 MaxParameterCount;
+ __le16 MaxDataCount;
+ __u8 MaxSetupCount;
+ __u8 Reserved;
+ __le16 Flags;
+ __le32 Timeout;
+ __u16 Reserved2;
+ __le16 ParameterCount; /* 4 */
+ __le16 ParameterOffset;
+ __le16 DataCount; /* 12 */
+ __le16 DataOffset;
+ __u8 SetupCount; /* one */
+ __u8 Reserved3;
+ __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */
+ __le16 ByteCount;
+ __u8 Pad;
+ __u16 Reserved4; /* Parameters start. */
+ __le16 InformationLevel;/* Parameters end. */
+ /* NTLMSSP Blob, Data start. */
+} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ;
+
+/* response for setfsinfo levels 0x200 and 0x203 */
typedef struct smb_com_transaction2_setfsi_rsp {
struct smb_hdr hdr; /* wct = 10 */
struct trans2_resp t2;
__u16 ByteCount;
} __attribute__((packed)) TRANSACTION2_SETFSI_RSP;
-
typedef struct smb_com_transaction2_get_dfs_refer_req {
struct smb_hdr hdr; /* wct = 15 */
__le16 TotalParameterCount;
@@ -2098,13 +2126,13 @@ typedef struct {
#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
QFS PROXY call */
#ifdef CONFIG_CIFS_POSIX
-/* Can not set pathnames cap yet until we send new posix create SMB since
- otherwise server can treat such handles opened with older ntcreatex
- (by a new client which knows how to send posix path ops)
- as non-posix handles (can affect write behavior with byte range locks.
- We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */
+/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
+ LockingX instead of posix locking call on unix sess (and we do not expect
+ LockingX to use different (ie Windows) semantics than posix locking on
+ the same session (if WINE needs to do this later, we can add this cap
+ back in later */
/* #define CIFS_UNIX_CAP_MASK 0x000000fb */
-#define CIFS_UNIX_CAP_MASK 0x000000db
+#define CIFS_UNIX_CAP_MASK 0x000003db
#else
#define CIFS_UNIX_CAP_MASK 0x00000013
#endif /* CONFIG_CIFS_POSIX */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8df28e925e5b..ef4f631e4c01 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
struct TCP_Server_Info *server);
extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
- unsigned int nvec, mid_callback_t *callback,
- void *cbdata, bool ignore_pend);
+ unsigned int nvec, mid_receive_t *receive,
+ mid_callback_t *callback, void *cbdata,
+ bool ignore_pend);
extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
struct smb_hdr * /* input */ ,
struct smb_hdr * /* out */ ,
@@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
extern bool is_valid_oplock_break(struct smb_hdr *smb,
struct TCP_Server_Info *);
+extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
@@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
const char *path, const __u16 *pfid);
-extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
+ uid_t, gid_t);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
- const char *);
+ const char *, int);
+extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
+extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+ unsigned int to_read);
+extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
+ struct kvec *iov_orig, unsigned int nr_segs,
+ unsigned int to_read);
extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb);
extern int cifs_match_super(struct super_block *, void *);
@@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage,
int remap_special_chars);
+extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
+ const __u8 lock_type, const __u32 num_unlock,
+ const __u32 num_lock, LOCKING_ANDX_RANGE *buf);
extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
- const __u16 netfid, const __u64 len,
+ const __u16 netfid, const __u32 netpid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
const bool waitFlag, const __u8 oplock_level);
extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
- const __u16 smb_file_id, const int get_flag,
- const __u64 len, struct file_lock *,
+ const __u16 smb_file_id, const __u32 netpid,
+ const int get_flag, const __u64 len, struct file_lock *,
const __u16 lock_type, const bool waitFlag);
extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon);
extern int CIFSSMBEcho(struct TCP_Server_Info *server);
@@ -380,7 +392,7 @@ extern void tconInfoFree(struct cifs_tcon *);
extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
__u32 *);
-extern int cifs_verify_signature(struct smb_hdr *,
+extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
struct TCP_Server_Info *server,
__u32 expected_sequence_number);
extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
@@ -419,7 +431,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon,
extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16,
- struct cifs_ntsd *, __u32);
+ struct cifs_ntsd *, __u32, int);
extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
char *acl_inf, const int buflen, const int acl_type,
@@ -440,6 +452,24 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
+/* asynchronous read support */
+struct cifs_readdata {
+ struct cifsFileInfo *cfile;
+ struct address_space *mapping;
+ __u64 offset;
+ unsigned int bytes;
+ pid_t pid;
+ int result;
+ struct list_head pages;
+ struct work_struct work;
+ unsigned int nr_iov;
+ struct kvec iov[1];
+};
+
+struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
+void cifs_readdata_free(struct cifs_readdata *rdata);
+int cifs_async_readv(struct cifs_readdata *rdata);
+
/* asynchronous write support */
struct cifs_writedata {
struct kref refcount;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a80f7bd97b90..6600aa2d2ef3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -33,6 +33,8 @@
#include <linux/slab.h>
#include <linux/posix_acl_xattr.h>
#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/task_io_accounting_ops.h>
#include <asm/uaccess.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -40,6 +42,7 @@
#include "cifsproto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
+#include "fscache.h"
#ifdef CONFIG_CIFS_POSIX
static struct {
@@ -83,6 +86,9 @@ static struct {
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */
+/* Forward declarations */
+static void cifs_readv_complete(struct work_struct *work);
+
/* Mark as invalid, all open files on tree connections since they
were closed when session to server was lost */
static void mark_open_files_invalid(struct cifs_tcon *pTcon)
@@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
}
server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
- server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
- (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+ server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
/* even though we do not use raw we might as well set this
accurately, in case we ever find a need for it */
@@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
little endian */
server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
/* probably no need to store and check maxvcs */
- server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
- (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
iov.iov_base = smb;
iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
- rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true);
+ rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback,
+ server, true);
if (rc)
cFYI(1, "Echo request failed: %d", rc);
@@ -1376,6 +1381,359 @@ openRetry:
return rc;
}
+struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages)
+{
+ struct cifs_readdata *rdata;
+
+ /* readdata + 1 kvec for each page */
+ rdata = kzalloc(sizeof(*rdata) +
+ sizeof(struct kvec) * nr_pages, GFP_KERNEL);
+ if (rdata != NULL) {
+ INIT_WORK(&rdata->work, cifs_readv_complete);
+ INIT_LIST_HEAD(&rdata->pages);
+ }
+ return rdata;
+}
+
+void
+cifs_readdata_free(struct cifs_readdata *rdata)
+{
+ cifsFileInfo_put(rdata->cfile);
+ kfree(rdata);
+}
+
+/*
+ * Discard any remaining data in the current SMB. To do this, we borrow the
+ * current bigbuf.
+ */
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ READ_RSP *rsp = (READ_RSP *)server->smallbuf;
+ unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length);
+ int remaining = rfclen + 4 - server->total_read;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ while (remaining > 0) {
+ int length;
+
+ length = cifs_read_from_socket(server, server->bigbuf,
+ min_t(unsigned int, remaining,
+ CIFSMaxBufSize + MAX_CIFS_HDR_SIZE));
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ remaining -= length;
+ }
+
+ dequeue_mid(mid, rdata->result);
+ return 0;
+}
+
+static int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length, len;
+ unsigned int data_offset, remaining, data_len;
+ struct cifs_readdata *rdata = mid->callback_data;
+ READ_RSP *rsp = (READ_RSP *)server->smallbuf;
+ unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
+ u64 eof;
+ pgoff_t eof_index;
+ struct page *page, *tpage;
+
+ cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__,
+ mid->mid, rdata->offset, rdata->bytes);
+
+ /*
+ * read the rest of READ_RSP header (sans Data array), or whatever we
+ * can if there's not enough data. At this point, we've read down to
+ * the Mid.
+ */
+ len = min_t(unsigned int, rfclen, sizeof(*rsp)) -
+ sizeof(struct smb_hdr) + 1;
+
+ rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1;
+ rdata->iov[0].iov_len = len;
+
+ length = cifs_readv_from_socket(server, rdata->iov, 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ /* Was the SMB read successful? */
+ rdata->result = map_smb_to_linux_error(&rsp->hdr, false);
+ if (rdata->result != 0) {
+ cFYI(1, "%s: server returned error %d", __func__,
+ rdata->result);
+ return cifs_readv_discard(server, mid);
+ }
+
+ /* Is there enough to get to the rest of the READ_RSP header? */
+ if (server->total_read < sizeof(READ_RSP)) {
+ cFYI(1, "%s: server returned short header. got=%u expected=%zu",
+ __func__, server->total_read, sizeof(READ_RSP));
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ data_offset = le16_to_cpu(rsp->DataOffset) + 4;
+ if (data_offset < server->total_read) {
+ /*
+ * win2k8 sometimes sends an offset of 0 when the read
+ * is beyond the EOF. Treat it as if the data starts just after
+ * the header.
+ */
+ cFYI(1, "%s: data offset (%u) inside read response header",
+ __func__, data_offset);
+ data_offset = server->total_read;
+ } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ /* data_offset is beyond the end of smallbuf */
+ cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
+ __func__, data_offset);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
+ server->total_read, data_offset);
+
+ len = data_offset - server->total_read;
+ if (len > 0) {
+ /* read any junk before data into the rest of smallbuf */
+ rdata->iov[0].iov_base = server->smallbuf + server->total_read;
+ rdata->iov[0].iov_len = len;
+ length = cifs_readv_from_socket(server, rdata->iov, 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ }
+
+ /* set up first iov for signature check */
+ rdata->iov[0].iov_base = server->smallbuf;
+ rdata->iov[0].iov_len = server->total_read;
+ cFYI(1, "0: iov_base=%p iov_len=%zu",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+
+ /* how much data is in the response? */
+ data_len = le16_to_cpu(rsp->DataLengthHigh) << 16;
+ data_len += le16_to_cpu(rsp->DataLength);
+ if (data_offset + data_len > rfclen) {
+ /* data_len is corrupt -- discard frame */
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ /* marshal up the page array */
+ len = 0;
+ remaining = data_len;
+ rdata->nr_iov = 1;
+
+ /* determine the eof that the server (probably) has */
+ eof = CIFS_I(rdata->mapping->host)->server_eof;
+ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+ cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ if (remaining >= PAGE_CACHE_SIZE) {
+ /* enough data to fill the page */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ ++rdata->nr_iov;
+ len += PAGE_CACHE_SIZE;
+ remaining -= PAGE_CACHE_SIZE;
+ } else if (remaining > 0) {
+ /* enough for partial page, fill and zero the rest */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = remaining;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+ '\0', PAGE_CACHE_SIZE - remaining);
+ ++rdata->nr_iov;
+ len += remaining;
+ remaining = 0;
+ } else if (page->index > eof_index) {
+ /*
+ * The VFS will not try to do readahead past the
+ * i_size, but it's possible that we have outstanding
+ * writes with gaps in the middle and the i_size hasn't
+ * caught up yet. Populate those with zeroed out pages
+ * to prevent the VFS from repeatedly attempting to
+ * fill them until the writes are flushed.
+ */
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
+ } else {
+ /* no need to hold page hostage */
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+
+ /* issue the read if we have any iovecs left to fill */
+ if (rdata->nr_iov > 1) {
+ length = cifs_readv_from_socket(server, &rdata->iov[1],
+ rdata->nr_iov - 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ } else {
+ length = 0;
+ }
+
+ rdata->bytes = length;
+
+ cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read,
+ rfclen, remaining);
+
+ /* discard anything left over */
+ if (server->total_read < rfclen)
+ return cifs_readv_discard(server, mid);
+
+ dequeue_mid(mid, false);
+ return length;
+}
+
+static void
+cifs_readv_complete(struct work_struct *work)
+{
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
+ struct page *page, *tpage;
+
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+
+ if (rdata->result == 0) {
+ kunmap(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
+ unlock_page(page);
+
+ if (rdata->result == 0)
+ cifs_readpage_to_fscache(rdata->mapping->host, page);
+
+ page_cache_release(page);
+ }
+ cifs_readdata_free(rdata);
+}
+
+static void
+cifs_readv_callback(struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__,
+ mid->mid, mid->midState, rdata->result, rdata->bytes);
+
+ switch (mid->midState) {
+ case MID_RESPONSE_RECEIVED:
+ /* result already set, check signature */
+ if (server->sec_mode &
+ (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (cifs_verify_signature(rdata->iov, rdata->nr_iov,
+ server, mid->sequence_number + 1))
+ cERROR(1, "Unexpected SMB signature");
+ }
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->bytes);
+ cifs_stats_bytes_read(tcon, rdata->bytes);
+ break;
+ case MID_REQUEST_SUBMITTED:
+ case MID_RETRY_NEEDED:
+ rdata->result = -EAGAIN;
+ break;
+ default:
+ rdata->result = -EIO;
+ }
+
+ queue_work(system_nrt_wq, &rdata->work);
+ DeleteMidQEntry(mid);
+ atomic_dec(&server->inFlight);
+ wake_up(&server->request_q);
+}
+
+/* cifs_async_readv - send an async write, and set up mid to handle result */
+int
+cifs_async_readv(struct cifs_readdata *rdata)
+{
+ int rc;
+ READ_REQ *smb = NULL;
+ int wct;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+
+ cFYI(1, "%s: offset=%llu bytes=%u", __func__,
+ rdata->offset, rdata->bytes);
+
+ if (tcon->ses->capabilities & CAP_LARGE_FILES)
+ wct = 12;
+ else {
+ wct = 10; /* old style read */
+ if ((rdata->offset >> 32) > 0) {
+ /* can not handle this big offset for old */
+ return -EIO;
+ }
+ }
+
+ rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb);
+ if (rc)
+ return rc;
+
+ smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+
+ smb->AndXCommand = 0xFF; /* none */
+ smb->Fid = rdata->cfile->netfid;
+ smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+ if (wct == 12)
+ smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+ smb->Remaining = 0;
+ smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
+ smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+ if (wct == 12)
+ smb->ByteCount = 0;
+ else {
+ /* old style read */
+ struct smb_com_readx_req *smbr =
+ (struct smb_com_readx_req *)smb;
+ smbr->ByteCount = 0;
+ }
+
+ /* 4 for RFC1001 length + 1 for BCC */
+ rdata->iov[0].iov_base = smb;
+ rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+
+ rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
+ cifs_readv_receive, cifs_readv_callback,
+ rdata, false);
+
+ if (rc == 0)
+ cifs_stats_inc(&tcon->num_reads);
+
+ cifs_small_buf_release(smb);
+ return rc;
+}
+
int
CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
char **buf, int *pbuf_type)
@@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
- cifs_writev_callback, wdata, false);
+ NULL, cifs_writev_callback, wdata, false);
if (rc == 0)
cifs_stats_inc(&tcon->num_writes);
@@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
return rc;
}
+int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
+ const __u8 lock_type, const __u32 num_unlock,
+ const __u32 num_lock, LOCKING_ANDX_RANGE *buf)
+{
+ int rc = 0;
+ LOCK_REQ *pSMB = NULL;
+ struct kvec iov[2];
+ int resp_buf_type;
+ __u16 count;
+
+ cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock);
+
+ rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
+ if (rc)
+ return rc;
+
+ pSMB->Timeout = 0;
+ pSMB->NumberOfLocks = cpu_to_le16(num_lock);
+ pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock);
+ pSMB->LockType = lock_type;
+ pSMB->AndXCommand = 0xFF; /* none */
+ pSMB->Fid = netfid; /* netfid stays le */
+
+ count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+ inc_rfc1001_len(pSMB, count);
+ pSMB->ByteCount = cpu_to_le16(count);
+
+ iov[0].iov_base = (char *)pSMB;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 -
+ (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+ iov[1].iov_base = (char *)buf;
+ iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+
+ cifs_stats_inc(&tcon->num_locks);
+ rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+ if (rc)
+ cFYI(1, "Send error in cifs_lockv = %d", rc);
+
+ return rc;
+}
int
CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
- const __u16 smb_file_id, const __u64 len,
+ const __u16 smb_file_id, const __u32 netpid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
const bool waitFlag, const __u8 oplock_level)
@@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
pSMB->Fid = smb_file_id; /* netfid stays le */
if ((numLock != 0) || (numUnlock != 0)) {
- pSMB->Locks[0].Pid = cpu_to_le16(current->tgid);
+ pSMB->Locks[0].Pid = cpu_to_le16(netpid);
/* BB where to store pid high? */
pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len);
pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32));
@@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
int
CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
- const __u16 smb_file_id, const int get_flag, const __u64 len,
- struct file_lock *pLockData, const __u16 lock_type,
- const bool waitFlag)
+ const __u16 smb_file_id, const __u32 netpid, const int get_flag,
+ const __u64 len, struct file_lock *pLockData,
+ const __u16 lock_type, const bool waitFlag)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
@@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
} else
pSMB->Timeout = 0;
- parm_data->pid = cpu_to_le32(current->tgid);
+ parm_data->pid = cpu_to_le32(netpid);
parm_data->start = cpu_to_le64(pLockData->fl_start);
parm_data->length = cpu_to_le64(len); /* normalize negative numbers */
@@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon,
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le32(2);
/* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
@@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count,
pSMB->Reserved = 0;
pSMB->TotalParameterCount = cpu_to_le32(parm_len);
pSMB->TotalDataCount = 0;
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->DataCount = pSMB->TotalDataCount;
temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
@@ -3467,7 +3863,7 @@ qsec_out:
int
CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
- struct cifs_ntsd *pntsd, __u32 acllen)
+ struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
{
__u16 byte_count, param_count, data_count, param_offset, data_offset;
int rc = 0;
@@ -3504,7 +3900,7 @@ setCifsAclRetry:
pSMB->Fid = fid; /* file handle always le */
pSMB->Reserved2 = 0;
- pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL);
+ pSMB->AclFlags = cpu_to_le32(aclflag);
if (pntsd && acllen) {
memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
@@ -3977,8 +4373,7 @@ findFirstRetry:
params = 12 + name_len /* includes null */ ;
pSMB->TotalDataCount = 0; /* no EAs */
pSMB->MaxParameterCount = cpu_to_le16(10);
- pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4052,8 +4447,7 @@ findFirstRetry:
psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
- if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
- lnoff) {
+ if (CIFSMaxBufSize < lnoff) {
cERROR(1, "ignoring corrupt resume name");
psrch_inf->last_entry = NULL;
return rc;
@@ -4097,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
byte_count = 0;
pSMB->TotalDataCount = 0; /* no EAs */
pSMB->MaxParameterCount = cpu_to_le16(8);
- pSMB->MaxDataCount =
- cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) &
- 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4181,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
psrch_inf->index_of_last_entry +=
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
- if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
- lnoff) {
+ if (CIFSMaxBufSize < lnoff) {
cERROR(1, "ignoring corrupt resume name");
psrch_inf->last_entry = NULL;
return rc;
@@ -5840,7 +6231,7 @@ QAllEAsRetry:
if (ea_name) {
if (ea_name_len == name_len &&
- strncmp(ea_name, temp_ptr, name_len) == 0) {
+ memcmp(ea_name, temp_ptr, name_len) == 0) {
temp_ptr += name_len + 1;
rc = value_len;
if (buf_size == 0)
@@ -6035,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
pSMB->TotalParameterCount = 0 ;
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le32(2);
- /* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = 0; /* same in little endian or be */
-/* BB VERIFY verify which is correct for above BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71beb0201970..d545a95c30ed 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -181,7 +181,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
-EINVAL = invalid transact2
*/
-static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
+static int check2ndT2(struct smb_hdr *pSMB)
{
struct smb_t2_rsp *pSMBt;
int remaining;
@@ -214,9 +214,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
cFYI(1, "missing %d bytes from transact2, check next response",
remaining);
- if (total_data_size > maxBufSize) {
+ if (total_data_size > CIFSMaxBufSize) {
cERROR(1, "TotalDataSize %d is over maximum buffer %d",
- total_data_size, maxBufSize);
+ total_data_size, CIFSMaxBufSize);
return -EINVAL;
}
return remaining;
@@ -320,27 +320,24 @@ requeue_echo:
}
static bool
-allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
- bool is_large_buf)
+allocate_buffers(struct TCP_Server_Info *server)
{
- char *bbuf = *bigbuf, *sbuf = *smallbuf;
-
- if (bbuf == NULL) {
- bbuf = (char *)cifs_buf_get();
- if (!bbuf) {
+ if (!server->bigbuf) {
+ server->bigbuf = (char *)cifs_buf_get();
+ if (!server->bigbuf) {
cERROR(1, "No memory for large SMB response");
msleep(3000);
/* retry will check if exiting */
return false;
}
- } else if (is_large_buf) {
+ } else if (server->large_buf) {
/* we are reusing a dirty large buf, clear its start */
- memset(bbuf, 0, size);
+ memset(server->bigbuf, 0, sizeof(struct smb_hdr));
}
- if (sbuf == NULL) {
- sbuf = (char *)cifs_small_buf_get();
- if (!sbuf) {
+ if (!server->smallbuf) {
+ server->smallbuf = (char *)cifs_small_buf_get();
+ if (!server->smallbuf) {
cERROR(1, "No memory for SMB response");
msleep(1000);
/* retry will check if exiting */
@@ -349,36 +346,116 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
/* beginning of smb buffer is cleared in our buf_get */
} else {
/* if existing small buf clear beginning */
- memset(sbuf, 0, size);
+ memset(server->smallbuf, 0, sizeof(struct smb_hdr));
}
- *bigbuf = bbuf;
- *smallbuf = sbuf;
-
return true;
}
-static int
-read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
- struct kvec *iov, unsigned int to_read,
- unsigned int *ptotal_read, bool is_header_read)
+static bool
+server_unresponsive(struct TCP_Server_Info *server)
+{
+ if (echo_retries > 0 && server->tcpStatus == CifsGood &&
+ time_after(jiffies, server->lstrp +
+ (echo_retries * SMB_ECHO_INTERVAL))) {
+ cERROR(1, "Server %s has not responded in %d seconds. "
+ "Reconnecting...", server->hostname,
+ (echo_retries * SMB_ECHO_INTERVAL / HZ));
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * kvec_array_init - clone a kvec array, and advance into it
+ * @new: pointer to memory for cloned array
+ * @iov: pointer to original array
+ * @nr_segs: number of members in original array
+ * @bytes: number of bytes to advance into the cloned array
+ *
+ * This function will copy the array provided in iov to a section of memory
+ * and advance the specified number of bytes into the new array. It returns
+ * the number of segments in the new array. "new" must be at least as big as
+ * the original iov array.
+ */
+static unsigned int
+kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
+ size_t bytes)
+{
+ size_t base = 0;
+
+ while (bytes || !iov->iov_len) {
+ int copy = min(bytes, iov->iov_len);
+
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ memcpy(new, iov, sizeof(*iov) * nr_segs);
+ new->iov_base += base;
+ new->iov_len -= base;
+ return nr_segs;
+}
+
+static struct kvec *
+get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
{
- int length, rc = 0;
- unsigned int total_read;
- char *buf = iov->iov_base;
+ struct kvec *new_iov;
+
+ if (server->iov && nr_segs <= server->nr_iov)
+ return server->iov;
+
+ /* not big enough -- allocate a new one and release the old */
+ new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
+ if (new_iov) {
+ kfree(server->iov);
+ server->iov = new_iov;
+ server->nr_iov = nr_segs;
+ }
+ return new_iov;
+}
+
+int
+cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
+ unsigned int nr_segs, unsigned int to_read)
+{
+ int length = 0;
+ int total_read;
+ unsigned int segs;
+ struct msghdr smb_msg;
+ struct kvec *iov;
+
+ iov = get_server_iovec(server, nr_segs);
+ if (!iov)
+ return -ENOMEM;
+
+ smb_msg.msg_control = NULL;
+ smb_msg.msg_controllen = 0;
+
+ for (total_read = 0; to_read; total_read += length, to_read -= length) {
+ if (server_unresponsive(server)) {
+ total_read = -EAGAIN;
+ break;
+ }
+
+ segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+
+ length = kernel_recvmsg(server->ssocket, &smb_msg,
+ iov, segs, to_read, 0);
- for (total_read = 0; total_read < to_read; total_read += length) {
- length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
- to_read - total_read, 0);
if (server->tcpStatus == CifsExiting) {
- /* then will exit */
- rc = 2;
+ total_read = -ESHUTDOWN;
break;
} else if (server->tcpStatus == CifsNeedReconnect) {
cifs_reconnect(server);
- /* Reconnect wakes up rspns q */
- /* Now we will reread sock */
- rc = 1;
+ total_read = -EAGAIN;
break;
} else if (length == -ERESTARTSYS ||
length == -EAGAIN ||
@@ -390,56 +467,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
*/
usleep_range(1000, 2000);
length = 0;
- if (!is_header_read)
- continue;
- /* Special handling for header read */
- if (total_read) {
- iov->iov_base = (to_read - total_read) +
- buf;
- iov->iov_len = to_read - total_read;
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
- rc = 3;
- } else
- rc = 1;
- break;
+ continue;
} else if (length <= 0) {
- cERROR(1, "Received no data, expecting %d",
- to_read - total_read);
+ cFYI(1, "Received no data or error: expecting %d "
+ "got %d", to_read, length);
cifs_reconnect(server);
- rc = 1;
+ total_read = -EAGAIN;
break;
}
}
+ return total_read;
+}
- *ptotal_read = total_read;
- return rc;
+int
+cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+ unsigned int to_read)
+{
+ struct kvec iov;
+
+ iov.iov_base = buf;
+ iov.iov_len = to_read;
+
+ return cifs_readv_from_socket(server, &iov, 1, to_read);
}
static bool
-check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
+is_smb_response(struct TCP_Server_Info *server, unsigned char type)
{
- char temp = *buf;
- unsigned int pdu_length = be32_to_cpu(
- ((struct smb_hdr *)buf)->smb_buf_length);
-
/*
* The first byte big endian of the length field,
* is actually not part of the length but the type
* with the most common, zero, as regular data.
*/
- if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
- return false;
- } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
- cFYI(1, "Good RFC 1002 session rsp");
- return false;
- } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
+ switch (type) {
+ case RFC1002_SESSION_MESSAGE:
+ /* Regular SMB response */
+ return true;
+ case RFC1002_SESSION_KEEP_ALIVE:
+ cFYI(1, "RFC 1002 session keep alive");
+ break;
+ case RFC1002_POSITIVE_SESSION_RESPONSE:
+ cFYI(1, "RFC 1002 positive session response");
+ break;
+ case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
* SMB negprot response.
*/
- cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
- pdu_length);
+ cFYI(1, "RFC 1002 negative session response");
/* give server a second to clean up */
msleep(1000);
/*
@@ -448,87 +523,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
* is since we do not begin with RFC1001 session
* initialize frame).
*/
- cifs_set_port((struct sockaddr *)
- &server->dstaddr, CIFS_PORT);
+ cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
cifs_reconnect(server);
wake_up(&server->response_q);
- return false;
- } else if (temp != (char) 0) {
- cERROR(1, "Unknown RFC 1002 frame");
- cifs_dump_mem(" Received Data: ", buf, 4);
- cifs_reconnect(server);
- return false;
- }
-
- /* else we have an SMB response */
- if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
- (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
- cERROR(1, "Invalid size SMB length %d pdu_length %d",
- 4, pdu_length+4);
+ break;
+ default:
+ cERROR(1, "RFC 1002 unknown response type 0x%x", type);
cifs_reconnect(server);
- wake_up(&server->response_q);
- return false;
}
- return true;
+ return false;
}
static struct mid_q_entry *
-find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
- int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
+find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
{
- struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL;
+ struct mid_q_entry *mid;
spin_lock(&GlobalMid_Lock);
- list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) {
- if (mid->mid != buf->Mid ||
- mid->midState != MID_REQUEST_SUBMITTED ||
- mid->command != buf->Command)
- continue;
-
- if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
- /* We have a multipart transact2 resp */
- *is_multi_rsp = true;
- if (mid->resp_buf) {
- /* merge response - fix up 1st*/
- *length = coalesce_t2(buf, mid->resp_buf);
- if (*length > 0) {
- *length = 0;
- mid->multiRsp = true;
- break;
- }
- /* All parts received or packet is malformed. */
- mid->multiEnd = true;
- goto multi_t2_fnd;
- }
- if (!is_large_buf) {
- /*FIXME: switch to already allocated largebuf?*/
- cERROR(1, "1st trans2 resp needs bigbuf");
- } else {
- /* Have first buffer */
- mid->resp_buf = buf;
- mid->largeBuf = true;
- *bigbuf = NULL;
- }
- break;
+ list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+ if (mid->mid == buf->Mid &&
+ mid->midState == MID_REQUEST_SUBMITTED &&
+ mid->command == buf->Command) {
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
}
- mid->resp_buf = buf;
- mid->largeBuf = is_large_buf;
-multi_t2_fnd:
- if (*length == 0)
- mid->midState = MID_RESPONSE_RECEIVED;
- else
- mid->midState = MID_RESPONSE_MALFORMED;
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return NULL;
+}
+
+void
+dequeue_mid(struct mid_q_entry *mid, bool malformed)
+{
#ifdef CONFIG_CIFS_STATS2
- mid->when_received = jiffies;
+ mid->when_received = jiffies;
#endif
- list_del_init(&mid->qhead);
- ret = mid;
- break;
- }
+ spin_lock(&GlobalMid_Lock);
+ if (!malformed)
+ mid->midState = MID_RESPONSE_RECEIVED;
+ else
+ mid->midState = MID_RESPONSE_MALFORMED;
+ list_del_init(&mid->qhead);
spin_unlock(&GlobalMid_Lock);
+}
- return ret;
+static void
+handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ struct smb_hdr *buf, int malformed)
+{
+ if (malformed == 0 && check2ndT2(buf) > 0) {
+ mid->multiRsp = true;
+ if (mid->resp_buf) {
+ /* merge response - fix up 1st*/
+ malformed = coalesce_t2(buf, mid->resp_buf);
+ if (malformed > 0)
+ return;
+
+ /* All parts received or packet is malformed. */
+ mid->multiEnd = true;
+ return dequeue_mid(mid, malformed);
+ }
+ if (!server->large_buf) {
+ /*FIXME: switch to already allocated largebuf?*/
+ cERROR(1, "1st trans2 resp needs bigbuf");
+ } else {
+ /* Have first buffer */
+ mid->resp_buf = buf;
+ mid->largeBuf = true;
+ server->bigbuf = NULL;
+ }
+ return;
+ }
+ mid->resp_buf = buf;
+ mid->largeBuf = server->large_buf;
+ /* Was previous buf put in mpx struct for multi-rsp? */
+ if (!mid->multiRsp) {
+ /* smb buffer will be freed by user thread */
+ if (server->large_buf)
+ server->bigbuf = NULL;
+ else
+ server->smallbuf = NULL;
+ }
+ dequeue_mid(mid, malformed);
}
static void clean_demultiplex_info(struct TCP_Server_Info *server)
@@ -618,6 +695,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
}
kfree(server->hostname);
+ kfree(server->iov);
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
@@ -627,20 +705,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
}
static int
+standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ char *buf = server->smallbuf;
+ struct smb_hdr *smb_buffer = (struct smb_hdr *)buf;
+ unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
+
+ /* make sure this will fit in a large buffer */
+ if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cERROR(1, "SMB response too long (%u bytes)",
+ pdu_length);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return -EAGAIN;
+ }
+
+ /* switch to large buffer if too big for a small one */
+ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
+ server->large_buf = true;
+ memcpy(server->bigbuf, server->smallbuf, server->total_read);
+ buf = server->bigbuf;
+ smb_buffer = (struct smb_hdr *)buf;
+ }
+
+ /* now read the rest */
+ length = cifs_read_from_socket(server,
+ buf + sizeof(struct smb_hdr) - 1,
+ pdu_length - sizeof(struct smb_hdr) + 1 + 4);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ dump_smb(smb_buffer, server->total_read);
+
+ /*
+ * We know that we received enough to get to the MID as we
+ * checked the pdu_length earlier. Now check to see
+ * if the rest of the header is OK. We borrow the length
+ * var for the rest of the loop to avoid a new stack var.
+ *
+ * 48 bytes is enough to display the header and a little bit
+ * into the payload for debugging purposes.
+ */
+ length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read);
+ if (length != 0)
+ cifs_dump_mem("Bad SMB: ", buf,
+ min_t(unsigned int, server->total_read, 48));
+
+ if (mid)
+ handle_mid(mid, server, smb_buffer, length);
+
+ return length;
+}
+
+static int
cifs_demultiplex_thread(void *p)
{
int length;
struct TCP_Server_Info *server = p;
- unsigned int pdu_length, total_read;
- char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL;
+ unsigned int pdu_length;
+ char *buf = NULL;
struct smb_hdr *smb_buffer = NULL;
- struct msghdr smb_msg;
- struct kvec iov;
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mid_entry;
- bool isLargeBuf = false;
- bool isMultiRsp = false;
- int rc;
current->flags |= PF_MEMALLOC;
cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -655,111 +783,65 @@ cifs_demultiplex_thread(void *p)
if (try_to_freeze())
continue;
- if (!allocate_buffers(&bigbuf, &smallbuf,
- sizeof(struct smb_hdr), isLargeBuf))
+ if (!allocate_buffers(server))
continue;
- isLargeBuf = false;
- isMultiRsp = false;
- smb_buffer = (struct smb_hdr *)smallbuf;
- buf = smallbuf;
- iov.iov_base = buf;
- iov.iov_len = 4;
- smb_msg.msg_control = NULL;
- smb_msg.msg_controllen = 0;
+ server->large_buf = false;
+ smb_buffer = (struct smb_hdr *)server->smallbuf;
+ buf = server->smallbuf;
pdu_length = 4; /* enough to get RFC1001 header */
-incomplete_rcv:
- if (echo_retries > 0 && server->tcpStatus == CifsGood &&
- time_after(jiffies, server->lstrp +
- (echo_retries * SMB_ECHO_INTERVAL))) {
- cERROR(1, "Server %s has not responded in %d seconds. "
- "Reconnecting...", server->hostname,
- (echo_retries * SMB_ECHO_INTERVAL / HZ));
- cifs_reconnect(server);
- wake_up(&server->response_q);
- continue;
- }
-
- rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
- &total_read, true /* header read */);
- if (rc == 3)
- goto incomplete_rcv;
- else if (rc == 2)
- break;
- else if (rc == 1)
+ length = cifs_read_from_socket(server, buf, pdu_length);
+ if (length < 0)
continue;
+ server->total_read = length;
/*
* The right amount was read from socket - 4 bytes,
* so we can now interpret the length field.
*/
-
- /*
- * Note that RFC 1001 length is big endian on the wire,
- * but we convert it here so it is always manipulated
- * as host byte order.
- */
pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
- cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
- if (!check_rfc1002_header(server, buf))
+ cFYI(1, "RFC1002 header 0x%x", pdu_length);
+ if (!is_smb_response(server, buf[0]))
continue;
- /* else length ok */
- if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
- isLargeBuf = true;
- memcpy(bigbuf, smallbuf, 4);
- smb_buffer = (struct smb_hdr *)bigbuf;
- buf = bigbuf;
+ /* make sure we have enough to get to the MID */
+ if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) {
+ cERROR(1, "SMB response too short (%u bytes)",
+ pdu_length);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ continue;
}
- iov.iov_base = 4 + buf;
- iov.iov_len = pdu_length;
- rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
- &total_read, false);
- if (rc == 2)
- break;
- else if (rc == 1)
+ /* read down to the MID */
+ length = cifs_read_from_socket(server, buf + 4,
+ sizeof(struct smb_hdr) - 1 - 4);
+ if (length < 0)
continue;
+ server->total_read += length;
- total_read += 4; /* account for rfc1002 hdr */
+ mid_entry = find_mid(server, smb_buffer);
- dump_smb(smb_buffer, total_read);
+ if (!mid_entry || !mid_entry->receive)
+ length = standard_receive3(server, mid_entry);
+ else
+ length = mid_entry->receive(server, mid_entry);
- /*
- * We know that we received enough to get to the MID as we
- * checked the pdu_length earlier. Now check to see
- * if the rest of the header is OK. We borrow the length
- * var for the rest of the loop to avoid a new stack var.
- *
- * 48 bytes is enough to display the header and a little bit
- * into the payload for debugging purposes.
- */
- length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
- if (length != 0)
- cifs_dump_mem("Bad SMB: ", buf,
- min_t(unsigned int, total_read, 48));
+ if (length < 0)
+ continue;
- server->lstrp = jiffies;
+ if (server->large_buf) {
+ buf = server->bigbuf;
+ smb_buffer = (struct smb_hdr *)buf;
+ }
- mid_entry = find_cifs_mid(server, smb_buffer, &length,
- isLargeBuf, &isMultiRsp, &bigbuf);
+ server->lstrp = jiffies;
if (mid_entry != NULL) {
- mid_entry->callback(mid_entry);
- /* Was previous buf put in mpx struct for multi-rsp? */
- if (!isMultiRsp) {
- /* smb buffer will be freed by user thread */
- if (isLargeBuf)
- bigbuf = NULL;
- else
- smallbuf = NULL;
- }
- } else if (length != 0) {
- /* response sanity checks failed */
- continue;
- } else if (!is_valid_oplock_break(smb_buffer, server) &&
- !isMultiRsp) {
+ if (!mid_entry->multiRsp || mid_entry->multiEnd)
+ mid_entry->callback(mid_entry);
+ } else if (!is_valid_oplock_break(smb_buffer, server)) {
cERROR(1, "No task to wake, unknown frame received! "
"NumMids %d", atomic_read(&midCount));
cifs_dump_mem("Received Data is: ", buf,
@@ -773,9 +855,9 @@ incomplete_rcv:
} /* end while !EXITING */
/* buffer usually freed in free_mid - need to free it here on exit */
- cifs_buf_release(bigbuf);
- if (smallbuf) /* no sense logging a debug message if NULL */
- cifs_small_buf_release(smallbuf);
+ cifs_buf_release(server->bigbuf);
+ if (server->smallbuf) /* no sense logging a debug message if NULL */
+ cifs_small_buf_release(server->smallbuf);
task_to_wake = xchg(&server->tsk, NULL);
clean_demultiplex_info(server);
@@ -827,6 +909,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
{
char *value, *data, *end;
char *mountdata_copy = NULL, *options;
+ int err;
unsigned int temp_len, i, j;
char separator[2];
short int override_uid = -1;
@@ -883,6 +966,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
cFYI(1, "Null separator not allowed");
}
}
+ vol->backupuid_specified = false; /* no backup intent for a user */
+ vol->backupgid_specified = false; /* no backup intent for a group */
while ((data = strsep(&options, separator)) != NULL) {
if (!*data)
@@ -1442,6 +1527,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->mfsymlinks = true;
} else if (strnicmp(data, "multiuser", 8) == 0) {
vol->multiuser = true;
+ } else if (!strnicmp(data, "backupuid", 9) && value && *value) {
+ err = kstrtouint(value, 0, &vol->backupuid);
+ if (err < 0) {
+ cERROR(1, "%s: Invalid backupuid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->backupuid_specified = true;
+ } else if (!strnicmp(data, "backupgid", 9) && value && *value) {
+ err = kstrtouint(value, 0, &vol->backupgid);
+ if (err < 0) {
+ cERROR(1, "%s: Invalid backupgid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->backupgid_specified = true;
} else
printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
data);
@@ -2209,16 +2310,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
(new->mnt_cifs_flags & CIFS_MOUNT_MASK))
return 0;
- if (old->rsize != new->rsize)
- return 0;
-
/*
- * We want to share sb only if we don't specify wsize or specified wsize
- * is greater or equal than existing one.
+ * We want to share sb only if we don't specify an r/wsize or
+ * specified r/wsize is greater than or equal to existing one.
*/
if (new->wsize && new->wsize < old->wsize)
return 0;
+ if (new->rsize && new->rsize < old->rsize)
+ return 0;
+
if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
return 0;
@@ -2656,14 +2757,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
CIFS_MOUNT_POSIX_PATHS;
}
- if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) {
- if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
- cifs_sb->rsize = 127 * 1024;
- cFYI(DBG2, "larger reads not supported by srv");
- }
- }
-
-
cFYI(1, "Negotiate caps 0x%x", (int)cap);
#ifdef CONFIG_CIFS_DEBUG2
if (cap & CIFS_UNIX_FCNTL_CAP)
@@ -2708,31 +2801,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
- if (pvolume_info->rsize > CIFSMaxBufSize) {
- cERROR(1, "rsize %d too large, using MaxBufSize",
- pvolume_info->rsize);
- cifs_sb->rsize = CIFSMaxBufSize;
- } else if ((pvolume_info->rsize) &&
- (pvolume_info->rsize <= CIFSMaxBufSize))
- cifs_sb->rsize = pvolume_info->rsize;
- else /* default */
- cifs_sb->rsize = CIFSMaxBufSize;
-
- if (cifs_sb->rsize < 2048) {
- cifs_sb->rsize = 2048;
- /* Windows ME may prefer this */
- cFYI(1, "readsize set to minimum: 2048");
- }
-
/*
- * Temporarily set wsize for matching superblock. If we end up using
- * new sb then cifs_negotiate_wsize will later negotiate it downward
- * if needed.
+ * Temporarily set r/wsize for matching superblock. If we end up using
+ * new sb then client will later negotiate it downward if needed.
*/
+ cifs_sb->rsize = pvolume_info->rsize;
cifs_sb->wsize = pvolume_info->wsize;
cifs_sb->mnt_uid = pvolume_info->linux_uid;
cifs_sb->mnt_gid = pvolume_info->linux_gid;
+ if (pvolume_info->backupuid_specified)
+ cifs_sb->mnt_backupuid = pvolume_info->backupuid;
+ if (pvolume_info->backupgid_specified)
+ cifs_sb->mnt_backupgid = pvolume_info->backupgid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
cFYI(1, "file mode: 0x%x dir mode: 0x%x",
@@ -2763,6 +2844,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
if (pvolume_info->cifs_acl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+ if (pvolume_info->backupuid_specified)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
+ if (pvolume_info->backupgid_specified)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
if (pvolume_info->override_uid)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
if (pvolume_info->override_gid)
@@ -2795,29 +2880,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
}
/*
- * When the server supports very large writes via POSIX extensions, we can
- * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including
- * the RFC1001 length.
+ * When the server supports very large reads and writes via POSIX extensions,
+ * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
+ * including the RFC1001 length.
*
* Note that this might make for "interesting" allocation problems during
* writeback however as we have to allocate an array of pointers for the
* pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ *
+ * For reads, there is a similar problem as we need to allocate an array
+ * of kvecs to handle the receive, though that should only need to be done
+ * once.
*/
#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
/*
- * When the server doesn't allow large posix writes, only allow a wsize of
- * 128k minus the size of the WRITE_AND_X header. That allows for a write up
- * to the maximum size described by RFC1002.
+ * When the server doesn't allow large posix writes, only allow a rsize/wsize
+ * of 2^17-1 minus the size of the call header. That allows for a read or
+ * write up to the maximum size described by RFC1002.
*/
-#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
/*
* The default wsize is 1M. find_get_pages seems to return a maximum of 256
* pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
* a single wsize request with a single call.
*/
-#define CIFS_DEFAULT_WSIZE (1024 * 1024)
+#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+
+/*
+ * Windows only supports a max of 60k reads. Default to that when posix
+ * extensions aren't in force.
+ */
+#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
@@ -2825,7 +2922,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
- CIFS_DEFAULT_WSIZE;
+ CIFS_DEFAULT_IOSIZE;
/* can server support 24-bit write sizes? (via UNIX extensions) */
if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -2848,6 +2945,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
return wsize;
}
+static unsigned int
+cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
+{
+ __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int rsize, defsize;
+
+ /*
+ * Set default value...
+ *
+ * HACK alert! Ancient servers have very small buffers. Even though
+ * MS-CIFS indicates that servers are only limited by the client's
+ * bufsize for reads, testing against win98se shows that it throws
+ * INVALID_PARAMETER errors if you try to request too large a read.
+ *
+ * If the server advertises a MaxBufferSize of less than one page,
+ * assume that it also can't satisfy reads larger than that either.
+ *
+ * FIXME: Is there a better heuristic for this?
+ */
+ if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
+ defsize = CIFS_DEFAULT_IOSIZE;
+ else if (server->capabilities & CAP_LARGE_READ_X)
+ defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
+ else if (server->maxBuf >= PAGE_CACHE_SIZE)
+ defsize = CIFSMaxBufSize;
+ else
+ defsize = server->maxBuf - sizeof(READ_RSP);
+
+ rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize;
+
+ /*
+ * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
+ * the client's MaxBufferSize.
+ */
+ if (!(server->capabilities & CAP_LARGE_READ_X))
+ rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+
+ /* hard limit of CIFS_MAX_RSIZE */
+ rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
+
+ return rsize;
+}
+
static int
is_path_accessible(int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -2877,9 +3018,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
{
kfree(volume_info->username);
kzfree(volume_info->password);
- kfree(volume_info->UNC);
if (volume_info->UNCip != volume_info->UNC + 2)
kfree(volume_info->UNCip);
+ kfree(volume_info->UNC);
kfree(volume_info->domainname);
kfree(volume_info->iocharset);
kfree(volume_info->prepath);
@@ -3041,6 +3182,22 @@ cifs_get_volume_info(char *mount_data, const char *devname)
return volume_info;
}
+/* make sure ra_pages is a multiple of rsize */
+static inline unsigned int
+cifs_ra_pages(struct cifs_sb_info *cifs_sb)
+{
+ unsigned int reads;
+ unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+
+ if (rsize_pages >= default_backing_dev_info.ra_pages)
+ return default_backing_dev_info.ra_pages;
+ else if (rsize_pages == 0)
+ return rsize_pages;
+
+ reads = default_backing_dev_info.ra_pages / rsize_pages;
+ return reads * rsize_pages;
+}
+
int
cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
{
@@ -3059,8 +3216,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
if (rc)
return rc;
- cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-
#ifdef CONFIG_CIFS_DFS_UPCALL
try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3125,15 +3280,11 @@ try_mount_again:
CIFSSMBQFSAttributeInfo(xid, tcon);
}
- if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
- cifs_sb->rsize = 1024 * 127;
- cFYI(DBG2, "no very large read support, rsize now 127K");
- }
- if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
- cifs_sb->rsize = min(cifs_sb->rsize,
- (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
-
cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
+ cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info);
+
+ /* tune readahead according to rsize */
+ cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb);
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 72d448bf96ce..d7eeb9d3ed6f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
}
tcon = tlink_tcon(tlink);
- if (oplockEnabled)
+ if (enable_oplocks)
oplock = REQ_OPLOCK;
if (nd)
@@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
if (tcon->ses->capabilities & CAP_NT_SMBS)
rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
desiredAccess, create_options,
@@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
{
int rc = -EPERM;
int xid;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
@@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
return rc;
}
- /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
- GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+ GENERIC_WRITE, create_options,
&fileHandle, &oplock, buf, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc)
@@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
if (direntry->d_inode) {
if (cifs_revalidate_dentry(direntry))
return 0;
- else
+ else {
+ /*
+ * Forcibly invalidate automounting directory inodes
+ * (remote DFS directories) so to have them
+ * instantiated again for automount
+ */
+ if (IS_AUTOMOUNT(direntry->d_inode))
+ return 0;
return 1;
+ }
}
/*
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 55d87ac52000..9c7ecdccf2f3 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
#include "cifs_debug.h"
#include "cifsfs.h"
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
static struct dentry *cifs_get_parent(struct dentry *dentry)
{
/* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
.encode_fs = */
};
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9f41a10523a1..ea096ce5d4f7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -32,6 +32,7 @@
#include <linux/delay.h>
#include <linux/mount.h>
#include <linux/slab.h>
+#include <linux/swap.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
int rc;
int desiredAccess;
int disposition;
+ int create_options = CREATE_NOT_DIR;
FILE_ALL_INFO *buf;
desiredAccess = cifs_convert_flags(f_flags);
@@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (!buf)
return -ENOMEM;
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
if (tcon->ses->capabilities & CAP_NT_SMBS)
rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
- desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+ desiredAccess, create_options, pnetfid, poplock, buf,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
else
@@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
pCifsFile->invalidHandle = false;
pCifsFile->tlink = cifs_get_tlink(tlink);
mutex_init(&pCifsFile->fh_mutex);
- mutex_init(&pCifsFile->lock_mutex);
- INIT_LIST_HEAD(&pCifsFile->llist);
INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
spin_lock(&cifs_file_list_lock);
@@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
spin_unlock(&cifs_file_list_lock);
cifs_set_oplock_level(pCifsInode, oplock);
+ pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll;
file->private_data = pCifsFile;
return pCifsFile;
}
+static void cifs_del_lock_waiters(struct cifsLockInfo *lock);
+
/*
* Release a reference on the file private data. This may involve closing
* the filehandle out on the server. Must be called without holding
@@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
/* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- mutex_lock(&cifs_file->lock_mutex);
- list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
+ mutex_lock(&cifsi->lock_mutex);
+ list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) {
+ if (li->netfid != cifs_file->netfid)
+ continue;
list_del(&li->llist);
+ cifs_del_lock_waiters(li);
kfree(li);
}
- mutex_unlock(&cifs_file->lock_mutex);
+ mutex_unlock(&cifsi->lock_mutex);
cifs_put_tlink(cifs_file->tlink);
dput(cifs_file->dentry);
@@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
inode, file->f_flags, full_path);
- if (oplockEnabled)
+ if (enable_oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
@@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
char *full_path = NULL;
int desiredAccess;
int disposition = FILE_OPEN;
+ int create_options = CREATE_NOT_DIR;
__u16 netfid;
xid = GetXid();
@@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
cFYI(1, "inode = 0x%p file flags 0x%x for %s",
inode, pCifsFile->f_flags, full_path);
- if (oplockEnabled)
+ if (enable_oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
@@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
/* Can not refresh inode by passing in file_info buf to be returned
by SMBOpen and then calling get_inode_info with returned buf
since file might have write behind data that needs to be flushed
@@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
that inode was not dirty locally we could do this */
rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
+ create_options, &netfid, &oplock, NULL,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc) {
@@ -631,219 +644,687 @@ int cifs_closedir(struct inode *inode, struct file *file)
return rc;
}
-static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
- __u64 offset, __u8 lockType)
+static struct cifsLockInfo *
+cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid)
{
struct cifsLockInfo *li =
kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
- if (li == NULL)
- return -ENOMEM;
+ if (!li)
+ return li;
+ li->netfid = netfid;
li->offset = offset;
li->length = len;
- li->type = lockType;
- mutex_lock(&fid->lock_mutex);
- list_add(&li->llist, &fid->llist);
- mutex_unlock(&fid->lock_mutex);
+ li->type = type;
+ li->pid = current->tgid;
+ INIT_LIST_HEAD(&li->blist);
+ init_waitqueue_head(&li->block_q);
+ return li;
+}
+
+static void
+cifs_del_lock_waiters(struct cifsLockInfo *lock)
+{
+ struct cifsLockInfo *li, *tmp;
+ list_for_each_entry_safe(li, tmp, &lock->blist, blist) {
+ list_del_init(&li->blist);
+ wake_up(&li->block_q);
+ }
+}
+
+static bool
+cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
+ __u64 length, __u8 type, __u16 netfid,
+ struct cifsLockInfo **conf_lock)
+{
+ struct cifsLockInfo *li, *tmp;
+
+ list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ if (offset + length <= li->offset ||
+ offset >= li->offset + li->length)
+ continue;
+ else if ((type & LOCKING_ANDX_SHARED_LOCK) &&
+ ((netfid == li->netfid && current->tgid == li->pid) ||
+ type == li->type))
+ continue;
+ else {
+ *conf_lock = li;
+ return true;
+ }
+ }
+ return false;
+}
+
+static int
+cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
+ __u8 type, __u16 netfid, struct file_lock *flock)
+{
+ int rc = 0;
+ struct cifsLockInfo *conf_lock;
+ bool exist;
+
+ mutex_lock(&cinode->lock_mutex);
+
+ exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid,
+ &conf_lock);
+ if (exist) {
+ flock->fl_start = conf_lock->offset;
+ flock->fl_end = conf_lock->offset + conf_lock->length - 1;
+ flock->fl_pid = conf_lock->pid;
+ if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK)
+ flock->fl_type = F_RDLCK;
+ else
+ flock->fl_type = F_WRLCK;
+ } else if (!cinode->can_cache_brlcks)
+ rc = 1;
+ else
+ flock->fl_type = F_UNLCK;
+
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+static int
+cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset,
+ __u8 type, __u16 netfid)
+{
+ struct cifsLockInfo *li;
+
+ li = cifs_lock_init(len, offset, type, netfid);
+ if (!li)
+ return -ENOMEM;
+
+ mutex_lock(&cinode->lock_mutex);
+ list_add_tail(&li->llist, &cinode->llist);
+ mutex_unlock(&cinode->lock_mutex);
return 0;
}
-int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
+static int
+cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
+ __u8 type, __u16 netfid, bool wait)
{
- int rc, xid;
- __u32 numLock = 0;
- __u32 numUnlock = 0;
- __u64 length;
- bool wait_flag = false;
- struct cifs_sb_info *cifs_sb;
+ struct cifsLockInfo *lock, *conf_lock;
+ bool exist;
+ int rc = 0;
+
+ lock = cifs_lock_init(length, offset, type, netfid);
+ if (!lock)
+ return -ENOMEM;
+
+try_again:
+ exist = false;
+ mutex_lock(&cinode->lock_mutex);
+
+ exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid,
+ &conf_lock);
+ if (!exist && cinode->can_cache_brlcks) {
+ list_add_tail(&lock->llist, &cinode->llist);
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+ }
+
+ if (!exist)
+ rc = 1;
+ else if (!wait)
+ rc = -EACCES;
+ else {
+ list_add_tail(&lock->blist, &conf_lock->blist);
+ mutex_unlock(&cinode->lock_mutex);
+ rc = wait_event_interruptible(lock->block_q,
+ (lock->blist.prev == &lock->blist) &&
+ (lock->blist.next == &lock->blist));
+ if (!rc)
+ goto try_again;
+ else {
+ mutex_lock(&cinode->lock_mutex);
+ list_del_init(&lock->blist);
+ mutex_unlock(&cinode->lock_mutex);
+ }
+ }
+
+ kfree(lock);
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+static int
+cifs_posix_lock_test(struct file *file, struct file_lock *flock)
+{
+ int rc = 0;
+ struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ unsigned char saved_type = flock->fl_type;
+
+ mutex_lock(&cinode->lock_mutex);
+ posix_test_lock(file, flock);
+
+ if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
+ flock->fl_type = saved_type;
+ rc = 1;
+ }
+
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+static int
+cifs_posix_lock_set(struct file *file, struct file_lock *flock)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ int rc;
+
+ mutex_lock(&cinode->lock_mutex);
+ if (!cinode->can_cache_brlcks) {
+ mutex_unlock(&cinode->lock_mutex);
+ return 1;
+ }
+ rc = posix_lock_file_wait(file, flock);
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+static int
+cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
+{
+ int xid, rc = 0, stored_rc;
+ struct cifsLockInfo *li, *tmp;
struct cifs_tcon *tcon;
- __u16 netfid;
- __u8 lockType = LOCKING_ANDX_LARGE_FILES;
- bool posix_locking = 0;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ unsigned int num, max_num;
+ LOCKING_ANDX_RANGE *buf, *cur;
+ int types[] = {LOCKING_ANDX_LARGE_FILES,
+ LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+ int i;
+
+ xid = GetXid();
+ tcon = tlink_tcon(cfile->tlink);
+
+ mutex_lock(&cinode->lock_mutex);
+ if (!cinode->can_cache_brlcks) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return rc;
+ }
+
+ max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
+ buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ if (!buf) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return rc;
+ }
+
+ for (i = 0; i < 2; i++) {
+ cur = buf;
+ num = 0;
+ list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ if (li->type != types[i])
+ continue;
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ li->type, 0, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ }
+
+ if (num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ types[i], 0, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ }
+ }
+
+ cinode->can_cache_brlcks = false;
+ mutex_unlock(&cinode->lock_mutex);
+
+ kfree(buf);
+ FreeXid(xid);
+ return rc;
+}
+
+/* copied from fs/locks.c with a name change */
+#define cifs_for_each_lock(inode, lockp) \
+ for (lockp = &inode->i_flock; *lockp != NULL; \
+ lockp = &(*lockp)->fl_next)
+
+static int
+cifs_push_posix_locks(struct cifsFileInfo *cfile)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct file_lock *flock, **before;
+ struct cifsLockInfo *lck, *tmp;
+ int rc = 0, xid, type;
+ __u64 length;
+ struct list_head locks_to_send;
- length = 1 + pfLock->fl_end - pfLock->fl_start;
- rc = -EACCES;
xid = GetXid();
- cFYI(1, "Lock parm: 0x%x flockflags: "
- "0x%x flocktype: 0x%x start: %lld end: %lld",
- cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
- pfLock->fl_end);
+ mutex_lock(&cinode->lock_mutex);
+ if (!cinode->can_cache_brlcks) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return rc;
+ }
+
+ INIT_LIST_HEAD(&locks_to_send);
- if (pfLock->fl_flags & FL_POSIX)
+ lock_flocks();
+ cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ flock = *before;
+ length = 1 + flock->fl_end - flock->fl_start;
+ if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+ type = CIFS_RDLCK;
+ else
+ type = CIFS_WRLCK;
+
+ lck = cifs_lock_init(length, flock->fl_start, type,
+ cfile->netfid);
+ if (!lck) {
+ rc = -ENOMEM;
+ goto send_locks;
+ }
+ lck->pid = flock->fl_pid;
+
+ list_add_tail(&lck->llist, &locks_to_send);
+ }
+
+send_locks:
+ unlock_flocks();
+
+ list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+ struct file_lock tmp_lock;
+ int stored_rc;
+
+ tmp_lock.fl_start = lck->offset;
+ stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid,
+ 0, lck->length, &tmp_lock,
+ lck->type, 0);
+ if (stored_rc)
+ rc = stored_rc;
+ list_del(&lck->llist);
+ kfree(lck);
+ }
+
+ cinode->can_cache_brlcks = false;
+ mutex_unlock(&cinode->lock_mutex);
+
+ FreeXid(xid);
+ return rc;
+}
+
+static int
+cifs_push_locks(struct cifsFileInfo *cfile)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+
+ if ((tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ return cifs_push_posix_locks(cfile);
+
+ return cifs_push_mandatory_locks(cfile);
+}
+
+static void
+cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
+ bool *wait_flag)
+{
+ if (flock->fl_flags & FL_POSIX)
cFYI(1, "Posix");
- if (pfLock->fl_flags & FL_FLOCK)
+ if (flock->fl_flags & FL_FLOCK)
cFYI(1, "Flock");
- if (pfLock->fl_flags & FL_SLEEP) {
+ if (flock->fl_flags & FL_SLEEP) {
cFYI(1, "Blocking lock");
- wait_flag = true;
+ *wait_flag = true;
}
- if (pfLock->fl_flags & FL_ACCESS)
+ if (flock->fl_flags & FL_ACCESS)
cFYI(1, "Process suspended by mandatory locking - "
- "not implemented yet");
- if (pfLock->fl_flags & FL_LEASE)
+ "not implemented yet");
+ if (flock->fl_flags & FL_LEASE)
cFYI(1, "Lease on file - not implemented yet");
- if (pfLock->fl_flags &
+ if (flock->fl_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
- cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
+ cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
- if (pfLock->fl_type == F_WRLCK) {
+ *type = LOCKING_ANDX_LARGE_FILES;
+ if (flock->fl_type == F_WRLCK) {
cFYI(1, "F_WRLCK ");
- numLock = 1;
- } else if (pfLock->fl_type == F_UNLCK) {
+ *lock = 1;
+ } else if (flock->fl_type == F_UNLCK) {
cFYI(1, "F_UNLCK");
- numUnlock = 1;
- /* Check if unlock includes more than
- one lock range */
- } else if (pfLock->fl_type == F_RDLCK) {
+ *unlock = 1;
+ /* Check if unlock includes more than one lock range */
+ } else if (flock->fl_type == F_RDLCK) {
cFYI(1, "F_RDLCK");
- lockType |= LOCKING_ANDX_SHARED_LOCK;
- numLock = 1;
- } else if (pfLock->fl_type == F_EXLCK) {
+ *type |= LOCKING_ANDX_SHARED_LOCK;
+ *lock = 1;
+ } else if (flock->fl_type == F_EXLCK) {
cFYI(1, "F_EXLCK");
- numLock = 1;
- } else if (pfLock->fl_type == F_SHLCK) {
+ *lock = 1;
+ } else if (flock->fl_type == F_SHLCK) {
cFYI(1, "F_SHLCK");
- lockType |= LOCKING_ANDX_SHARED_LOCK;
- numLock = 1;
+ *type |= LOCKING_ANDX_SHARED_LOCK;
+ *lock = 1;
} else
cFYI(1, "Unknown type of lock");
+}
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
- netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
+static int
+cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
+ bool wait_flag, bool posix_lck, int xid)
+{
+ int rc = 0;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ __u16 netfid = cfile->netfid;
- if ((tcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- posix_locking = 1;
- /* BB add code here to normalize offset and length to
- account for negative length which we can not accept over the
- wire */
- if (IS_GETLK(cmd)) {
- if (posix_locking) {
- int posix_lock_type;
- if (lockType & LOCKING_ANDX_SHARED_LOCK)
- posix_lock_type = CIFS_RDLCK;
- else
- posix_lock_type = CIFS_WRLCK;
- rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
- length, pfLock, posix_lock_type,
- wait_flag);
- FreeXid(xid);
+ if (posix_lck) {
+ int posix_lock_type;
+
+ rc = cifs_posix_lock_test(file, flock);
+ if (!rc)
return rc;
- }
- /* BB we could chain these into one lock request BB */
- rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
- 0, 1, lockType, 0 /* wait flag */, 0);
- if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 1 /* numUnlock */ ,
- 0 /* numLock */ , lockType,
- 0 /* wait flag */, 0);
- pfLock->fl_type = F_UNLCK;
- if (rc != 0)
- cERROR(1, "Error unlocking previously locked "
- "range %d during test of lock", rc);
- rc = 0;
+ if (type & LOCKING_ANDX_SHARED_LOCK)
+ posix_lock_type = CIFS_RDLCK;
+ else
+ posix_lock_type = CIFS_WRLCK;
+ rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+ 1 /* get */, length, flock,
+ posix_lock_type, wait_flag);
+ return rc;
+ }
- } else {
- /* if rc == ERR_SHARING_VIOLATION ? */
- rc = 0;
+ rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid,
+ flock);
+ if (!rc)
+ return rc;
- if (lockType & LOCKING_ANDX_SHARED_LOCK) {
- pfLock->fl_type = F_WRLCK;
- } else {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 0, 1,
- lockType | LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */, 0);
- if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid,
- length, pfLock->fl_start, 1, 0,
- lockType |
- LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */, 0);
- pfLock->fl_type = F_RDLCK;
- if (rc != 0)
- cERROR(1, "Error unlocking "
- "previously locked range %d "
- "during test of lock", rc);
- rc = 0;
- } else {
- pfLock->fl_type = F_WRLCK;
- rc = 0;
- }
- }
- }
+ /* BB we could chain these into one lock request BB */
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
+ flock->fl_start, 0, 1, type, 0, 0);
+ if (rc == 0) {
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
+ length, flock->fl_start, 1, 0,
+ type, 0, 0);
+ flock->fl_type = F_UNLCK;
+ if (rc != 0)
+ cERROR(1, "Error unlocking previously locked "
+ "range %d during test of lock", rc);
+ rc = 0;
+ return rc;
+ }
- FreeXid(xid);
+ if (type & LOCKING_ANDX_SHARED_LOCK) {
+ flock->fl_type = F_WRLCK;
+ rc = 0;
return rc;
}
- if (!numLock && !numUnlock) {
- /* if no lock or unlock then nothing
- to do since we do not know what it is */
- FreeXid(xid);
- return -EOPNOTSUPP;
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
+ flock->fl_start, 0, 1,
+ type | LOCKING_ANDX_SHARED_LOCK, 0, 0);
+ if (rc == 0) {
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
+ length, flock->fl_start, 1, 0,
+ type | LOCKING_ANDX_SHARED_LOCK,
+ 0, 0);
+ flock->fl_type = F_RDLCK;
+ if (rc != 0)
+ cERROR(1, "Error unlocking previously locked "
+ "range %d during test of lock", rc);
+ } else
+ flock->fl_type = F_WRLCK;
+
+ rc = 0;
+ return rc;
+}
+
+static void
+cifs_move_llist(struct list_head *source, struct list_head *dest)
+{
+ struct list_head *li, *tmp;
+ list_for_each_safe(li, tmp, source)
+ list_move(li, dest);
+}
+
+static void
+cifs_free_llist(struct list_head *llist)
+{
+ struct cifsLockInfo *li, *tmp;
+ list_for_each_entry_safe(li, tmp, llist, llist) {
+ cifs_del_lock_waiters(li);
+ list_del(&li->llist);
+ kfree(li);
}
+}
+
+static int
+cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
+{
+ int rc = 0, stored_rc;
+ int types[] = {LOCKING_ANDX_LARGE_FILES,
+ LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+ unsigned int i;
+ unsigned int max_num, num;
+ LOCKING_ANDX_RANGE *buf, *cur;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsLockInfo *li, *tmp;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct list_head tmp_llist;
+
+ INIT_LIST_HEAD(&tmp_llist);
+
+ max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
+ buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_lock(&cinode->lock_mutex);
+ for (i = 0; i < 2; i++) {
+ cur = buf;
+ num = 0;
+ list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ if (flock->fl_start > li->offset ||
+ (flock->fl_start + length) <
+ (li->offset + li->length))
+ continue;
+ if (current->tgid != li->pid)
+ continue;
+ if (cfile->netfid != li->netfid)
+ continue;
+ if (types[i] != li->type)
+ continue;
+ if (!cinode->can_cache_brlcks) {
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh =
+ cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh =
+ cpu_to_le32((u32)(li->offset>>32));
+ /*
+ * We need to save a lock here to let us add
+ * it again to the inode list if the unlock
+ * range request fails on the server.
+ */
+ list_move(&li->llist, &tmp_llist);
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon,
+ cfile->netfid,
+ li->type, num,
+ 0, buf);
+ if (stored_rc) {
+ /*
+ * We failed on the unlock range
+ * request - add all locks from
+ * the tmp list to the head of
+ * the inode list.
+ */
+ cifs_move_llist(&tmp_llist,
+ &cinode->llist);
+ rc = stored_rc;
+ } else
+ /*
+ * The unlock range request
+ * succeed - free the tmp list.
+ */
+ cifs_free_llist(&tmp_llist);
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ } else {
+ /*
+ * We can cache brlock requests - simply remove
+ * a lock from the inode list.
+ */
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
+ }
+ }
+ if (num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ types[i], num, 0, buf);
+ if (stored_rc) {
+ cifs_move_llist(&tmp_llist, &cinode->llist);
+ rc = stored_rc;
+ } else
+ cifs_free_llist(&tmp_llist);
+ }
+ }
+
+ mutex_unlock(&cinode->lock_mutex);
+ kfree(buf);
+ return rc;
+}
- if (posix_locking) {
+static int
+cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
+ bool wait_flag, bool posix_lck, int lock, int unlock, int xid)
+{
+ int rc = 0;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ __u16 netfid = cfile->netfid;
+
+ if (posix_lck) {
int posix_lock_type;
- if (lockType & LOCKING_ANDX_SHARED_LOCK)
+
+ rc = cifs_posix_lock_set(file, flock);
+ if (!rc || rc < 0)
+ return rc;
+
+ if (type & LOCKING_ANDX_SHARED_LOCK)
posix_lock_type = CIFS_RDLCK;
else
posix_lock_type = CIFS_WRLCK;
- if (numUnlock == 1)
+ if (unlock == 1)
posix_lock_type = CIFS_UNLCK;
- rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */,
- length, pfLock, posix_lock_type,
- wait_flag);
- } else {
- struct cifsFileInfo *fid = file->private_data;
-
- if (numLock) {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 0, numLock, lockType,
- wait_flag, 0);
+ rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+ 0 /* set */, length, flock,
+ posix_lock_type, wait_flag);
+ goto out;
+ }
- if (rc == 0) {
- /* For Windows locks we must store them. */
- rc = store_file_lock(fid, length,
- pfLock->fl_start, lockType);
- }
- } else if (numUnlock) {
- /* For each stored lock that this unlock overlaps
- completely, unlock it. */
- int stored_rc = 0;
- struct cifsLockInfo *li, *tmp;
+ if (lock) {
+ rc = cifs_lock_add_if(cinode, flock->fl_start, length,
+ type, netfid, wait_flag);
+ if (rc < 0)
+ return rc;
+ else if (!rc)
+ goto out;
- rc = 0;
- mutex_lock(&fid->lock_mutex);
- list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
- if (pfLock->fl_start <= li->offset &&
- (pfLock->fl_start + length) >=
- (li->offset + li->length)) {
- stored_rc = CIFSSMBLock(xid, tcon,
- netfid, li->length,
- li->offset, 1, 0,
- li->type, false, 0);
- if (stored_rc)
- rc = stored_rc;
- else {
- list_del(&li->llist);
- kfree(li);
- }
- }
- }
- mutex_unlock(&fid->lock_mutex);
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
+ flock->fl_start, 0, 1, type, wait_flag, 0);
+ if (rc == 0) {
+ /* For Windows locks we must store them. */
+ rc = cifs_lock_add(cinode, length, flock->fl_start,
+ type, netfid);
}
+ } else if (unlock)
+ rc = cifs_unlock_range(cfile, flock, xid);
+
+out:
+ if (flock->fl_flags & FL_POSIX)
+ posix_lock_file_wait(file, flock);
+ return rc;
+}
+
+int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
+{
+ int rc, xid;
+ int lock = 0, unlock = 0;
+ bool wait_flag = false;
+ bool posix_lck = false;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsInodeInfo *cinode;
+ struct cifsFileInfo *cfile;
+ __u16 netfid;
+ __u8 type;
+
+ rc = -EACCES;
+ xid = GetXid();
+
+ cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld "
+ "end: %lld", cmd, flock->fl_flags, flock->fl_type,
+ flock->fl_start, flock->fl_end);
+
+ cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag);
+
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ cfile = (struct cifsFileInfo *)file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+ netfid = cfile->netfid;
+ cinode = CIFS_I(file->f_path.dentry->d_inode);
+
+ if ((tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ posix_lck = true;
+ /*
+ * BB add code here to normalize offset and length to account for
+ * negative length which we can not accept over the wire.
+ */
+ if (IS_GETLK(cmd)) {
+ rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid);
+ FreeXid(xid);
+ return rc;
}
- if (pfLock->fl_flags & FL_POSIX)
- posix_lock_file_wait(file, pfLock);
+ if (!lock && !unlock) {
+ /*
+ * if no lock or unlock then nothing to do since we do not
+ * know what it is
+ */
+ FreeXid(xid);
+ return -EOPNOTSUPP;
+ }
+
+ rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock,
+ xid);
FreeXid(xid);
return rc;
}
@@ -1714,6 +2195,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
struct smb_com_read_rsp *pSMBr;
struct cifs_io_parms io_parms;
char *read_data;
+ unsigned int rsize;
__u32 pid;
if (!nr_segs)
@@ -1726,6 +2208,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
xid = GetXid();
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ /* FIXME: set up handlers for larger reads and/or convert to async */
+ rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
open_file = file->private_data;
pTcon = tlink_tcon(open_file->tlink);
@@ -1738,7 +2223,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
cFYI(1, "attempting read on write only file instance");
for (total_read = 0; total_read < len; total_read += bytes_read) {
- cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+ cur_len = min_t(const size_t, len - total_read, rsize);
rc = -EAGAIN;
read_data = NULL;
@@ -1830,6 +2315,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
unsigned int bytes_read = 0;
unsigned int total_read;
unsigned int current_read_size;
+ unsigned int rsize;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *pTcon;
int xid;
@@ -1842,6 +2328,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
xid = GetXid();
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ /* FIXME: set up handlers for larger reads and/or convert to async */
+ rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
if (file->private_data == NULL) {
rc = -EBADF;
FreeXid(xid);
@@ -1861,14 +2350,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
for (total_read = 0, current_offset = read_data;
read_size > total_read;
total_read += bytes_read, current_offset += bytes_read) {
- current_read_size = min_t(const int, read_size - total_read,
- cifs_sb->rsize);
+ current_read_size = min_t(uint, read_size - total_read, rsize);
+
/* For windows me and 9x we do not want to request more
than it negotiated since it will refuse the read then */
if ((pTcon->ses) &&
!(pTcon->ses->capabilities & CAP_LARGE_FILES)) {
- current_read_size = min_t(const int, current_read_size,
- pTcon->ses->server->maxBuf - 128);
+ current_read_size = min_t(uint, current_read_size,
+ CIFSMaxBufSize);
}
rc = -EAGAIN;
while (rc == -EAGAIN) {
@@ -1957,82 +2446,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return rc;
}
-
-static void cifs_copy_cache_pages(struct address_space *mapping,
- struct list_head *pages, int bytes_read, char *data)
-{
- struct page *page;
- char *target;
-
- while (bytes_read > 0) {
- if (list_empty(pages))
- break;
-
- page = list_entry(pages->prev, struct page, lru);
- list_del(&page->lru);
-
- if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL)) {
- page_cache_release(page);
- cFYI(1, "Add page cache failed");
- data += PAGE_CACHE_SIZE;
- bytes_read -= PAGE_CACHE_SIZE;
- continue;
- }
- page_cache_release(page);
-
- target = kmap_atomic(page, KM_USER0);
-
- if (PAGE_CACHE_SIZE > bytes_read) {
- memcpy(target, data, bytes_read);
- /* zero the tail end of this partial page */
- memset(target + bytes_read, 0,
- PAGE_CACHE_SIZE - bytes_read);
- bytes_read = 0;
- } else {
- memcpy(target, data, PAGE_CACHE_SIZE);
- bytes_read -= PAGE_CACHE_SIZE;
- }
- kunmap_atomic(target, KM_USER0);
-
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- data += PAGE_CACHE_SIZE;
-
- /* add page to FS-Cache */
- cifs_readpage_to_fscache(mapping->host, page);
- }
- return;
-}
-
static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned num_pages)
{
- int rc = -EACCES;
- int xid;
- loff_t offset;
- struct page *page;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *pTcon;
- unsigned int bytes_read = 0;
- unsigned int read_size, i;
- char *smb_read_data = NULL;
- struct smb_com_read_rsp *pSMBr;
- struct cifsFileInfo *open_file;
- struct cifs_io_parms io_parms;
- int buf_type = CIFS_NO_BUFFER;
- __u32 pid;
+ int rc;
+ struct list_head tmplist;
+ struct cifsFileInfo *open_file = file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ unsigned int rsize = cifs_sb->rsize;
+ pid_t pid;
- xid = GetXid();
- if (file->private_data == NULL) {
- rc = -EBADF;
- FreeXid(xid);
- return rc;
- }
- open_file = file->private_data;
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- pTcon = tlink_tcon(open_file->tlink);
+ /*
+ * Give up immediately if rsize is too small to read an entire page.
+ * The VFS will fall back to readpage. We should never reach this
+ * point however since we set ra_pages to 0 when the rsize is smaller
+ * than a cache page.
+ */
+ if (unlikely(rsize < PAGE_CACHE_SIZE))
+ return 0;
/*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2041,125 +2472,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
&num_pages);
if (rc == 0)
- goto read_complete;
+ return rc;
- cFYI(DBG2, "rpages: num pages %d", num_pages);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
- for (i = 0; i < num_pages; ) {
- unsigned contig_pages;
- struct page *tmp_page;
- unsigned long expected_index;
+ rc = 0;
+ INIT_LIST_HEAD(&tmplist);
- if (list_empty(page_list))
- break;
+ cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
+ mapping, num_pages);
+
+ /*
+ * Start with the page at end of list and move it to private
+ * list. Do the same with any following pages until we hit
+ * the rsize limit, hit an index discontinuity, or run out of
+ * pages. Issue the async read and then start the loop again
+ * until the list is empty.
+ *
+ * Note that list order is important. The page_list is in
+ * the order of declining indexes. When we put the pages in
+ * the rdata->pages, then we want them in increasing order.
+ */
+ while (!list_empty(page_list)) {
+ unsigned int bytes = PAGE_CACHE_SIZE;
+ unsigned int expected_index;
+ unsigned int nr_pages = 1;
+ loff_t offset;
+ struct page *page, *tpage;
+ struct cifs_readdata *rdata;
page = list_entry(page_list->prev, struct page, lru);
+
+ /*
+ * Lock the page and put it in the cache. Since no one else
+ * should have access to this page, we're safe to simply set
+ * PG_locked without checking it first.
+ */
+ __set_page_locked(page);
+ rc = add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL);
+
+ /* give up if we can't stick it in the cache */
+ if (rc) {
+ __clear_page_locked(page);
+ break;
+ }
+
+ /* move first page to the tmplist */
offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ list_move_tail(&page->lru, &tmplist);
- /* count adjacent pages that we will read into */
- contig_pages = 0;
- expected_index =
- list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(tmp_page, page_list, lru) {
- if (tmp_page->index == expected_index) {
- contig_pages++;
- expected_index++;
- } else
+ /* now try and add more pages onto the request */
+ expected_index = page->index + 1;
+ list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+ /* discontinuity ? */
+ if (page->index != expected_index)
break;
+
+ /* would this page push the read over the rsize? */
+ if (bytes + PAGE_CACHE_SIZE > rsize)
+ break;
+
+ __set_page_locked(page);
+ if (add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL)) {
+ __clear_page_locked(page);
+ break;
+ }
+ list_move_tail(&page->lru, &tmplist);
+ bytes += PAGE_CACHE_SIZE;
+ expected_index++;
+ nr_pages++;
}
- if (contig_pages + i > num_pages)
- contig_pages = num_pages - i;
-
- /* for reads over a certain size could initiate async
- read ahead */
-
- read_size = contig_pages * PAGE_CACHE_SIZE;
- /* Read size needs to be in multiples of one page */
- read_size = min_t(const unsigned int, read_size,
- cifs_sb->rsize & PAGE_CACHE_MASK);
- cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
- read_size, contig_pages);
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
+
+ rdata = cifs_readdata_alloc(nr_pages);
+ if (!rdata) {
+ /* best to give up if we're out of mem */
+ list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ rc = -ENOMEM;
+ break;
+ }
+
+ spin_lock(&cifs_file_list_lock);
+ cifsFileInfo_get(open_file);
+ spin_unlock(&cifs_file_list_lock);
+ rdata->cfile = open_file;
+ rdata->mapping = mapping;
+ rdata->offset = offset;
+ rdata->bytes = bytes;
+ rdata->pid = pid;
+ list_splice_init(&tmplist, &rdata->pages);
+
+ do {
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
if (rc != 0)
- break;
+ continue;
}
- io_parms.netfid = open_file->netfid;
- io_parms.pid = pid;
- io_parms.tcon = pTcon;
- io_parms.offset = offset;
- io_parms.length = read_size;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
- &smb_read_data, &buf_type);
- /* BB more RC checks ? */
- if (rc == -EAGAIN) {
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
- }
- }
- if ((rc < 0) || (smb_read_data == NULL)) {
- cFYI(1, "Read error in readpages: %d", rc);
- break;
- } else if (bytes_read > 0) {
- task_io_account_read(bytes_read);
- pSMBr = (struct smb_com_read_rsp *)smb_read_data;
- cifs_copy_cache_pages(mapping, page_list, bytes_read,
- smb_read_data + 4 /* RFC1001 hdr */ +
- le16_to_cpu(pSMBr->DataOffset));
-
- i += bytes_read >> PAGE_CACHE_SHIFT;
- cifs_stats_bytes_read(pTcon, bytes_read);
- if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) {
- i++; /* account for partial page */
-
- /* server copy of file can have smaller size
- than client */
- /* BB do we need to verify this common case ?
- this case is ok - if we are at server EOF
- we will hit it on next read */
+ rc = cifs_async_readv(rdata);
+ } while (rc == -EAGAIN);
- /* break; */
+ if (rc != 0) {
+ list_for_each_entry_safe(page, tpage, &rdata->pages,
+ lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- } else {
- cFYI(1, "No bytes read (%d) at offset %lld . "
- "Cleaning remaining pages from readahead list",
- bytes_read, offset);
- /* BB turn off caching and do new lookup on
- file size at server? */
+ cifs_readdata_free(rdata);
break;
}
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
- bytes_read = 0;
}
-/* need to free smb_read_data buf before exit */
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
-
-read_complete:
- FreeXid(xid);
return rc;
}
@@ -2408,6 +2841,10 @@ void cifs_oplock_break(struct work_struct *work)
cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
}
+ rc = cifs_push_locks(cfile);
+ if (rc)
+ cERROR(1, "Push locks rc = %d", rc);
+
/*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
@@ -2415,8 +2852,9 @@ void cifs_oplock_break(struct work_struct *work)
* disconnected since oplock already released by the server
*/
if (!cfile->oplock_break_cancelled) {
- rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
- 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+ rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid,
+ current->tgid, 0, 0, 0, 0,
+ LOCKING_ANDX_OPLOCK_RELEASE, false,
cinode->clientCanCacheRead ? 1 : 0);
cFYI(1, "Oplock release rc = %d", rc);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a7b2dcd4a53e..2c50bd2f65d1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp)
xid = GetXid();
rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
- if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+ switch (rc) {
+ case 0:
+ cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+ break;
+ case -EREMOTE:
+ cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ rc = 0;
+ break;
+ case -EOPNOTSUPP:
+ case -EINVAL:
/*
* FIXME: legacy server -- fall back to path-based call?
* for now, just skip revalidating and mark inode for
@@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp)
*/
rc = 0;
CIFS_I(inode)->time = 0;
+ default:
goto cgfi_exit;
- } else if (rc == -EREMOTE) {
- cifs_create_dfs_fattr(&fattr, inode->i_sb);
- rc = 0;
- } else if (rc)
- goto cgfi_exit;
+ }
/*
* don't bother with SFU junk here -- just mark inode as needing
* revalidation.
*/
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
cifs_fattr_to_inode(inode, &fattr);
@@ -2096,6 +2101,8 @@ static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
{
int xid;
+ uid_t uid = NO_CHANGE_32;
+ gid_t gid = NO_CHANGE_32;
struct inode *inode = direntry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
goto cifs_setattr_exit;
}
- /*
- * Without unix extensions we can't send ownership changes to the
- * server, so silently ignore them. This is consistent with how
- * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With
- * CIFSACL support + proper Windows to Unix idmapping, we may be
- * able to support this in the future.
- */
+ if (attrs->ia_valid & ATTR_UID)
+ uid = attrs->ia_uid;
+
+ if (attrs->ia_valid & ATTR_GID)
+ gid = attrs->ia_gid;
+
+#ifdef CONFIG_CIFS_ACL
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+ if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
+ rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
+ uid, gid);
+ if (rc) {
+ cFYI(1, "%s: Setting id failed with error: %d",
+ __func__, rc);
+ goto cifs_setattr_exit;
+ }
+ }
+ } else
+#endif /* CONFIG_CIFS_ACL */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
@@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
attrs->ia_valid &= ~ATTR_MODE;
if (attrs->ia_valid & ATTR_MODE) {
- cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
mode = attrs->ia_mode;
- }
-
- if (attrs->ia_valid & ATTR_MODE) {
rc = 0;
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- rc = mode_to_cifs_acl(inode, full_path, mode);
+ rc = id_mode_to_cifs_acl(inode, full_path, mode,
+ NO_CHANGE_32, NO_CHANGE_32);
if (rc) {
cFYI(1, "%s: Setting ACL failed with error: %d",
__func__, rc);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index db3f18cdf024..8693b5d0e180 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
static int
CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
const char *fromName, const char *toName,
- const struct nls_table *nls_codepage, int remap)
+ struct cifs_sb_info *cifs_sb)
{
int rc;
int oplock = 0;
+ int remap;
+ int create_options = CREATE_NOT_DIR;
__u16 netfid = 0;
u8 *buf;
unsigned int bytes_written = 0;
struct cifs_io_parms io_parms;
+ struct nls_table *nls_codepage;
+
+ nls_codepage = cifs_sb->local_nls;
+ remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
if (!buf)
@@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
return rc;
}
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
+ create_options, &netfid, &oplock, NULL,
nls_codepage, remap);
if (rc != 0) {
kfree(buf);
@@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb);
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 7c1693392598..703ef5c6fdb1 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
}
int
-checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read)
{
- __u32 len = be32_to_cpu(smb->smb_buf_length);
+ __u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
- cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
+ cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
+ total_read, rfclen);
- if (length < 2 + sizeof(struct smb_hdr)) {
- if ((length >= sizeof(struct smb_hdr) - 1)
+ /* is this frame too small to even get to a BCC? */
+ if (total_read < 2 + sizeof(struct smb_hdr)) {
+ if ((total_read >= sizeof(struct smb_hdr) - 1)
&& (smb->Status.CifsError != 0)) {
+ /* it's an error return */
smb->WordCount = 0;
/* some error cases do not return wct and bcc */
return 0;
- } else if ((length == sizeof(struct smb_hdr) + 1) &&
+ } else if ((total_read == sizeof(struct smb_hdr) + 1) &&
(smb->WordCount == 0)) {
char *tmp = (char *)smb;
/* Need to work around a bug in two servers here */
@@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
} else {
cERROR(1, "Length less than smb header size");
}
- return 1;
- }
- if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "smb length greater than MaxBufSize, mid=%d",
- smb->Mid);
- return 1;
+ return -EIO;
}
+ /* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb, mid))
- return 1;
+ return -EIO;
clc_len = smbCalcSize(smb);
- if (4 + len != length) {
+ if (4 + rfclen != total_read) {
cERROR(1, "Length read does not match RFC1001 length %d",
- len);
- return 1;
+ rfclen);
+ return -EIO;
}
- if (4 + len != clc_len) {
+ if (4 + rfclen != clc_len) {
/* check if bcc wrapped around for large read responses */
- if ((len > 64 * 1024) && (len > clc_len)) {
+ if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
/* check if lengths match mod 64K */
- if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
+ if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
return 0; /* bcc wrapped */
}
cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
- clc_len, 4 + len, smb->Mid);
+ clc_len, 4 + rfclen, smb->Mid);
- if (4 + len < clc_len) {
+ if (4 + rfclen < clc_len) {
cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
- len, smb->Mid);
- return 1;
- } else if (len > clc_len + 512) {
+ rfclen, smb->Mid);
+ return -EIO;
+ } else if (rfclen > clc_len + 512) {
/*
* Some servers (Windows XP in particular) send more
* data than the lengths in the SMB packet would
@@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
* data to 512 bytes.
*/
cERROR(1, "RFC1001 size %u more than 512 bytes larger "
- "than SMB for mid=%u", len, smb->Mid);
- return 1;
+ "than SMB for mid=%u", rfclen, smb->Mid);
+ return -EIO;
}
}
return 0;
@@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
cinode->clientCanCacheRead = false;
}
}
+
+bool
+backup_cred(struct cifs_sb_info *cifs_sb)
+{
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
+ if (cifs_sb->mnt_backupuid == current_fsuid())
+ return true;
+ }
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
+ if (in_group_p(cifs_sb->mnt_backupgid))
+ return true;
+ }
+
+ return false;
+}
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d3e619692ee0..c7d80e24f24e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
/* that we use in next few lines */
/* Note that header is initialized to zero in header_assemble */
pSMB->req.AndXCommand = 0xFF;
- pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
+ pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
+ CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
+ USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
pSMB->req.VcNumber = get_next_vcnum(ses);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 42b9fff48751..ac1221d969d6 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -265,91 +265,6 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
return rc;
}
-#if 0 /* currently unused */
-/* Does both the NT and LM owfs of a user's password */
-static void
-nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
-{
- char passwd[514];
-
- memset(passwd, '\0', 514);
- if (strlen(pwd) < 513)
- strcpy(passwd, pwd);
- else
- memcpy(passwd, pwd, 512);
- /* Calculate the MD4 hash (NT compatible) of the password */
- memset(nt_p16, '\0', 16);
- E_md4hash(passwd, nt_p16);
-
- /* Mangle the passwords into Lanman format */
- passwd[14] = '\0';
-/* strupper(passwd); */
-
- /* Calculate the SMB (lanman) hash functions of the password */
-
- memset(p16, '\0', 16);
- E_P16((unsigned char *) passwd, (unsigned char *) p16);
-
- /* clear out local copy of user's password (just being paranoid). */
- memset(passwd, '\0', sizeof(passwd));
-}
-#endif
-
-/* Does the NTLMv2 owfs of a user's password */
-#if 0 /* function not needed yet - but will be soon */
-static void
-ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
- const char *domain_n, unsigned char kr_buf[16],
- const struct nls_table *nls_codepage)
-{
- wchar_t *user_u;
- wchar_t *dom_u;
- int user_l, domain_l;
- struct HMACMD5Context ctx;
-
- /* might as well do one alloc to hold both (user_u and dom_u) */
- user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL);
- if (user_u == NULL)
- return;
- dom_u = user_u + 1024;
-
- /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
- STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
- push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
- STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
-
- /* BB user and domain may need to be uppercased */
- user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
- domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage);
-
- user_l++; /* trailing null */
- domain_l++;
-
- hmac_md5_init_limK_to_64(owf, 16, &ctx);
- hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx);
- hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx);
- hmac_md5_final(kr_buf, &ctx);
-
- kfree(user_u);
-}
-#endif
-
-/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-#if 0 /* currently unused */
-static void
-NTLMSSPOWFencrypt(unsigned char passwd[8],
- unsigned char *ntlmchalresp, unsigned char p24[24])
-{
- unsigned char p21[21];
-
- memset(p21, '\0', 21);
- memcpy(p21, passwd, 8);
- memset(p21 + 8, 0xbd, 8);
-
- E_P24(p21, ntlmchalresp, p24);
-}
-#endif
-
/* Does the NT MD4 hash then des encryption. */
int
SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
@@ -369,39 +284,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
rc = E_P24(p21, c8, p24);
return rc;
}
-
-
-/* Does the md5 encryption from the NT hash for NTLMv2. */
-/* These routines will be needed later */
-#if 0
-static void
-SMBOWFencrypt_ntv2(const unsigned char kr[16],
- const struct data_blob *srv_chal,
- const struct data_blob *cli_chal, unsigned char resp_buf[16])
-{
- struct HMACMD5Context ctx;
-
- hmac_md5_init_limK_to_64(kr, 16, &ctx);
- hmac_md5_update(srv_chal->data, srv_chal->length, &ctx);
- hmac_md5_update(cli_chal->data, cli_chal->length, &ctx);
- hmac_md5_final(resp_buf, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv2(const unsigned char kr[16],
- const unsigned char *nt_resp, __u8 sess_key[16])
-{
- struct HMACMD5Context ctx;
-
- hmac_md5_init_limK_to_64(kr, 16, &ctx);
- hmac_md5_update(nt_resp, 16, &ctx);
- hmac_md5_final((unsigned char *) sess_key, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv1(const unsigned char kr[16],
- const unsigned char *nt_resp, __u8 sess_key[16])
-{
- mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16);
-}
-#endif
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 10ca6b2c26b7..0cc9584f5889 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -26,6 +26,7 @@
#include <linux/wait.h>
#include <linux/net.h>
#include <linux/delay.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
@@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_killable(server->response_q,
+ error = wait_event_freezekillable(server->response_q,
midQ->midState != MID_REQUEST_SUBMITTED);
if (error < 0)
return -ERESTARTSYS;
@@ -339,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
*/
int
cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
- unsigned int nvec, mid_callback_t *callback, void *cbdata,
- bool ignore_pend)
+ unsigned int nvec, mid_receive_t *receive,
+ mid_callback_t *callback, void *cbdata, bool ignore_pend)
{
int rc;
struct mid_q_entry *mid;
@@ -374,6 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
goto out_err;
}
+ mid->receive = receive;
mid->callback = callback;
mid->callback_data = cbdata;
mid->midState = MID_REQUEST_SUBMITTED;
@@ -496,13 +498,18 @@ int
cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
bool log_error)
{
- dump_smb(mid->resp_buf,
- min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length)));
+ unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4;
+
+ dump_smb(mid->resp_buf, min_t(u32, 92, len));
/* convert the length into a more usable form */
if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ struct kvec iov;
+
+ iov.iov_base = mid->resp_buf;
+ iov.iov_len = len;
/* FIXME: add code to kill session */
- if (cifs_verify_signature(mid->resp_buf, server,
+ if (cifs_verify_signature(&iov, 1, server,
mid->sequence_number + 1) != 0)
cERROR(1, "Unexpected SMB signature");
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 2a22fb2989e4..45f07c46f3ed 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
#include "cifsfs.h"
#include "cifspdu.h"
#include "cifsglob.h"
@@ -31,16 +32,8 @@
#define MAX_EA_VALUE_SIZE 65535
#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
-#define CIFS_XATTR_USER_PREFIX "user."
-#define CIFS_XATTR_SYSTEM_PREFIX "system."
-#define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX "security."
-#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
-#define XATTR_TRUSTED_PREFIX_LEN 8
-#define XATTR_SECURITY_PREFIX_LEN 9
-/* BB need to add server (Samba e.g) support for security and trusted prefix */
-
+/* BB need to add server (Samba e.g) support for security and trusted prefix */
int cifs_removexattr(struct dentry *direntry, const char *ea_name)
{
@@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
}
if (ea_name == NULL) {
cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
- && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
cFYI(1,
"illegal xattr request %s (only user namespace supported)",
ea_name);
@@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto remove_ea_exit;
- ea_name += 5; /* skip past user. prefix */
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
(__u16)0, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
if (ea_name == NULL) {
cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
cFYI(1, "attempt to set cifs inode metadata");
- ea_name += 5; /* skip past user. prefix */
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
(__u16)value_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+ } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
- ea_name += 4; /* skip past os2. prefix */
+ ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
(__u16)value_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#ifdef CONFIG_CIFS_ACL
memcpy(pacl, ea_value, value_size);
rc = set_cifs_acl(pacl, value_size,
- direntry->d_inode, full_path);
+ direntry->d_inode, full_path, CIFS_ACL_DACL);
if (rc == 0) /* force revalidate of the inode */
CIFS_I(direntry->d_inode)->time = 0;
kfree(pacl);
@@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
/* return alt name if available as pseudo attr */
if (ea_name == NULL) {
cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
@@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cFYI(1, "attempt to query cifs inode metadata");
/* revalidate/getattr then populate from inode */
} /* BB add else when above is implemented */
- ea_name += 5; /* skip past user. prefix */
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
buf_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+ } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
- ea_name += 4; /* skip past os2. prefix */
+ ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
buf_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cFYI(1, "Query CIFS ACL not supported yet");
#endif /* CONFIG_CIFS_ACL */
} else if (strncmp(ea_name,
- CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
+ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
cFYI(1, "Trusted xattr namespace not supported yet");
} else if (strncmp(ea_name,
- CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
+ XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
cFYI(1, "Security xattr namespace not supported yet");
} else
cFYI(1,
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 44e17e9c21ae..cc0ea9fe5ecf 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -59,12 +59,11 @@ void coda_sysctl_clean(void);
#define CODA_ALLOC(ptr, cast, size) do { \
if (size < PAGE_SIZE) \
- ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
+ ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
else \
- ptr = (cast)vmalloc((unsigned long) size); \
+ ptr = (cast)vzalloc((unsigned long) size); \
if (!ptr) \
printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
- else memset( ptr, 0, size ); \
} while (0)
diff --git a/fs/compat.c b/fs/compat.c
index 58b1da459893..c98787536bb8 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,6 @@
#include <linux/dirent.h>
#include <linux/fsnotify.h>
#include <linux/highuid.h>
-#include <linux/nfsd/syscall.h>
#include <linux/personality.h>
#include <linux/rwsem.h>
#include <linux/tsacct_kern.h>
@@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
__put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
__put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
__put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(0, &ubuf->f_spare[0]) ||
- __put_user(0, &ubuf->f_spare[1]) ||
- __put_user(0, &ubuf->f_spare[2]) ||
- __put_user(0, &ubuf->f_spare[3]) ||
- __put_user(0, &ubuf->f_spare[4]))
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
return -EFAULT;
return 0;
}
@@ -550,7 +546,7 @@ out:
ssize_t compat_rw_copy_check_uvector(int type,
const struct compat_iovec __user *uvector, unsigned long nr_segs,
unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
+ struct iovec **ret_pointer, int check_access)
{
compat_ssize_t tot_len;
struct iovec *iov = *ret_pointer = fast_pointer;
@@ -597,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
}
if (len < 0) /* size_t not fitting in compat_ssize_t .. */
goto out;
- if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ if (check_access &&
+ !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
ret = -EFAULT;
goto out;
}
@@ -1111,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
goto out;
tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov);
+ UIO_FASTIOV, iovstack, &iov, 1);
if (tot_len == 0) {
ret = 0;
goto out;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c83f4768eeaa..ca418aaf6352 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -23,7 +23,8 @@
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
- * Please see Documentation/filesystems/configfs.txt for more information.
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
*/
#undef DEBUG
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 76dc4c3e5d51..50cee7f9110b 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -23,7 +23,7 @@
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
- * Please see the file Documentation/filesystems/configfs.txt for
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
* critical information about using the config_item interface.
*/
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e7a7a2f07324..f3a257d7a985 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,5 +1,5 @@
/*
- * file.c - part of debugfs, a tiny little debug file system
+ * inode.c - part of debugfs, a tiny little debug file system
*
* Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2004 IBM Inc.
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 44a360ca8046..d740ab67ff6e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,7 +39,7 @@
/*
* How many user pages to map in one call to get_user_pages(). This determines
- * the size of a structure on the stack.
+ * the size of a structure in the slab cache
*/
#define DIO_PAGES 64
@@ -55,13 +55,10 @@
* blocksize.
*/
-struct dio {
- /* BIO submission state */
+/* dio_state only used in the submission path */
+
+struct dio_submit {
struct bio *bio; /* bio under assembly */
- struct inode *inode;
- int rw;
- loff_t i_size; /* i_size when submitted */
- int flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -76,18 +73,17 @@ struct dio {
sector_t block_in_file; /* Current offset into the underlying
file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */
+ int reap_counter; /* rate limit reaping */
sector_t final_block_in_request;/* doesn't change */
unsigned first_block_in_page; /* doesn't change, Used only once */
int boundary; /* prev block is at a boundary */
- int reap_counter; /* rate limit reaping */
get_block_t *get_block; /* block mapping function */
- dio_iodone_t *end_io; /* IO completion function */
dio_submit_t *submit_io; /* IO submition function */
+
loff_t logical_offset_in_bio; /* current first logical block in bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
- struct buffer_head map_bh; /* last get_block() result */
/*
* Deferred addition of a page to the dio. These variables are
@@ -100,18 +96,6 @@ struct dio {
sector_t cur_page_block; /* Where it starts */
loff_t cur_page_fs_offset; /* Offset in file */
- /* BIO completion state */
- spinlock_t bio_lock; /* protects BIO fields below */
- unsigned long refcount; /* direct_io_worker() and bios */
- struct bio *bio_list; /* singly linked via bi_private */
- struct task_struct *waiter; /* waiting task (NULL if none) */
-
- /* AIO related stuff */
- struct kiocb *iocb; /* kiocb */
- int is_async; /* is IO async ? */
- int io_error; /* IO error in completion path */
- ssize_t result; /* IO result */
-
/*
* Page fetching state. These variables belong to dio_refill_pages().
*/
@@ -125,7 +109,30 @@ struct dio {
*/
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
+};
+
+/* dio_state communicated between submission path and end_io */
+struct dio {
+ int flags; /* doesn't change */
+ int rw;
+ struct inode *inode;
+ loff_t i_size; /* i_size when submitted */
+ dio_iodone_t *end_io; /* IO completion function */
+
+ void *private; /* copy from map_bh.b_private */
+
+ /* BIO completion state */
+ spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
+ int is_async; /* is IO async ? */
+ int io_error; /* IO error in completion path */
+ unsigned long refcount; /* direct_io_worker() and bios */
+ struct bio *bio_list; /* singly linked via bi_private */
+ struct task_struct *waiter; /* waiting task (NULL if none) */
+
+ /* AIO related stuff */
+ struct kiocb *iocb; /* kiocb */
+ ssize_t result; /* IO result */
/*
* pages[] (and any fields placed after it) are not zeroed out at
@@ -133,7 +140,9 @@ struct dio {
* wish that they not be zeroed.
*/
struct page *pages[DIO_PAGES]; /* page buffer */
-};
+} ____cacheline_aligned_in_smp;
+
+static struct kmem_cache *dio_cache __read_mostly;
static void __inode_dio_wait(struct inode *inode)
{
@@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
/*
* How many pages are in the queue?
*/
-static inline unsigned dio_pages_present(struct dio *dio)
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
{
- return dio->tail - dio->head;
+ return sdio->tail - sdio->head;
}
/*
* Go grab and pin some userspace pages. Typically we'll get 64 at a time.
*/
-static int dio_refill_pages(struct dio *dio)
+static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
int ret;
int nr_pages;
- nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+ nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
ret = get_user_pages_fast(
- dio->curr_user_address, /* Where from? */
+ sdio->curr_user_address, /* Where from? */
nr_pages, /* How many pages? */
dio->rw == READ, /* Write to memory? */
&dio->pages[0]); /* Put results here */
- if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
+ if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(0);
/*
* A memory fault, but the filesystem has some outstanding
@@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio)
dio->page_errors = ret;
page_cache_get(page);
dio->pages[0] = page;
- dio->head = 0;
- dio->tail = 1;
+ sdio->head = 0;
+ sdio->tail = 1;
ret = 0;
goto out;
}
if (ret >= 0) {
- dio->curr_user_address += ret * PAGE_SIZE;
- dio->curr_page += ret;
- dio->head = 0;
- dio->tail = ret;
+ sdio->curr_user_address += ret * PAGE_SIZE;
+ sdio->curr_page += ret;
+ sdio->head = 0;
+ sdio->tail = ret;
ret = 0;
}
out:
@@ -236,17 +245,18 @@ out:
* decent number of pages, less frequently. To provide nicer use of the
* L1 cache.
*/
-static struct page *dio_get_page(struct dio *dio)
+static inline struct page *dio_get_page(struct dio *dio,
+ struct dio_submit *sdio)
{
- if (dio_pages_present(dio) == 0) {
+ if (dio_pages_present(sdio) == 0) {
int ret;
- ret = dio_refill_pages(dio);
+ ret = dio_refill_pages(dio, sdio);
if (ret)
return ERR_PTR(ret);
- BUG_ON(dio_pages_present(dio) == 0);
+ BUG_ON(dio_pages_present(sdio) == 0);
}
- return dio->pages[dio->head++];
+ return dio->pages[sdio->head++];
}
/**
@@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (dio->end_io && dio->result) {
dio->end_io(dio->iocb, offset, transferred,
- dio->map_bh.b_private, ret, is_async);
+ dio->private, ret, is_async);
} else {
if (is_async)
aio_complete(dio->iocb, ret, 0);
@@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
if (remaining == 0) {
dio_complete(dio, dio->iocb->ki_pos, 0, true);
- kfree(dio);
+ kmem_cache_free(dio_cache, dio);
}
}
@@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error)
}
EXPORT_SYMBOL_GPL(dio_end_io);
-static void
-dio_bio_alloc(struct dio *dio, struct block_device *bdev,
- sector_t first_sector, int nr_vecs)
+static inline void
+dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
+ struct block_device *bdev,
+ sector_t first_sector, int nr_vecs)
{
struct bio *bio;
@@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
else
bio->bi_end_io = dio_bio_end_io;
- dio->bio = bio;
- dio->logical_offset_in_bio = dio->cur_page_fs_offset;
+ sdio->bio = bio;
+ sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
/*
@@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
*
* bios hold a dio reference between submit_bio and ->end_io.
*/
-static void dio_bio_submit(struct dio *dio)
+static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
- struct bio *bio = dio->bio;
+ struct bio *bio = sdio->bio;
unsigned long flags;
bio->bi_private = dio;
@@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio)
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);
- if (dio->submit_io)
- dio->submit_io(dio->rw, bio, dio->inode,
- dio->logical_offset_in_bio);
+ if (sdio->submit_io)
+ sdio->submit_io(dio->rw, bio, dio->inode,
+ sdio->logical_offset_in_bio);
else
submit_bio(dio->rw, bio);
- dio->bio = NULL;
- dio->boundary = 0;
- dio->logical_offset_in_bio = 0;
+ sdio->bio = NULL;
+ sdio->boundary = 0;
+ sdio->logical_offset_in_bio = 0;
}
/*
* Release any resources in case of a failure
*/
-static void dio_cleanup(struct dio *dio)
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
- while (dio_pages_present(dio))
- page_cache_release(dio_get_page(dio));
+ while (dio_pages_present(sdio))
+ page_cache_release(dio_get_page(dio, sdio));
}
/*
@@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio)
*
* This also helps to limit the peak amount of pinned userspace memory.
*/
-static int dio_bio_reap(struct dio *dio)
+static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
{
int ret = 0;
- if (dio->reap_counter++ >= 64) {
+ if (sdio->reap_counter++ >= 64) {
while (dio->bio_list) {
unsigned long flags;
struct bio *bio;
@@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio)
if (ret == 0)
ret = ret2;
}
- dio->reap_counter = 0;
+ sdio->reap_counter = 0;
}
return ret;
}
/*
* Call into the fs to map some more disk blocks. We record the current number
- * of available blocks at dio->blocks_available. These are in units of the
+ * of available blocks at sdio->blocks_available. These are in units of the
* fs blocksize, (1 << inode->i_blkbits).
*
* The fs is allowed to map lots of blocks at once. If it wants to do that,
@@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio)
* buffer_mapped(). However the direct-io code will only process holes one
* block at a time - it will repeatedly call get_block() as it walks the hole.
*/
-static int get_more_blocks(struct dio *dio)
+static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
+ struct buffer_head *map_bh)
{
int ret;
- struct buffer_head *map_bh = &dio->map_bh;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */
@@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio)
*/
ret = dio->page_errors;
if (ret == 0) {
- BUG_ON(dio->block_in_file >= dio->final_block_in_request);
- fs_startblk = dio->block_in_file >> dio->blkfactor;
- dio_count = dio->final_block_in_request - dio->block_in_file;
- fs_count = dio_count >> dio->blkfactor;
- blkmask = (1 << dio->blkfactor) - 1;
+ BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
+ fs_startblk = sdio->block_in_file >> sdio->blkfactor;
+ dio_count = sdio->final_block_in_request - sdio->block_in_file;
+ fs_count = dio_count >> sdio->blkfactor;
+ blkmask = (1 << sdio->blkfactor) - 1;
if (dio_count & blkmask)
fs_count++;
@@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio)
*/
create = dio->rw & WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
- if (dio->block_in_file < (i_size_read(dio->inode) >>
- dio->blkbits))
+ if (sdio->block_in_file < (i_size_read(dio->inode) >>
+ sdio->blkbits))
create = 0;
}
- ret = (*dio->get_block)(dio->inode, fs_startblk,
+ ret = (*sdio->get_block)(dio->inode, fs_startblk,
map_bh, create);
+
+ /* Store for completion */
+ dio->private = map_bh->b_private;
}
return ret;
}
@@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio)
/*
* There is no bio. Make one now.
*/
-static int dio_new_bio(struct dio *dio, sector_t start_sector)
+static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
+ sector_t start_sector, struct buffer_head *map_bh)
{
sector_t sector;
int ret, nr_pages;
- ret = dio_bio_reap(dio);
+ ret = dio_bio_reap(dio, sdio);
if (ret)
goto out;
- sector = start_sector << (dio->blkbits - 9);
- nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+ sector = start_sector << (sdio->blkbits - 9);
+ nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
nr_pages = min(nr_pages, BIO_MAX_PAGES);
BUG_ON(nr_pages <= 0);
- dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
- dio->boundary = 0;
+ dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
+ sdio->boundary = 0;
out:
return ret;
}
@@ -643,21 +658,21 @@ out:
*
* Return zero on success. Non-zero means the caller needs to start a new BIO.
*/
-static int dio_bio_add_page(struct dio *dio)
+static inline int dio_bio_add_page(struct dio_submit *sdio)
{
int ret;
- ret = bio_add_page(dio->bio, dio->cur_page,
- dio->cur_page_len, dio->cur_page_offset);
- if (ret == dio->cur_page_len) {
+ ret = bio_add_page(sdio->bio, sdio->cur_page,
+ sdio->cur_page_len, sdio->cur_page_offset);
+ if (ret == sdio->cur_page_len) {
/*
* Decrement count only, if we are done with this page
*/
- if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
- dio->pages_in_io--;
- page_cache_get(dio->cur_page);
- dio->final_block_in_bio = dio->cur_page_block +
- (dio->cur_page_len >> dio->blkbits);
+ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
+ sdio->pages_in_io--;
+ page_cache_get(sdio->cur_page);
+ sdio->final_block_in_bio = sdio->cur_page_block +
+ (sdio->cur_page_len >> sdio->blkbits);
ret = 0;
} else {
ret = 1;
@@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio)
* The caller of this function is responsible for removing cur_page from the
* dio, and for dropping the refcount which came from that presence.
*/
-static int dio_send_cur_page(struct dio *dio)
+static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
+ struct buffer_head *map_bh)
{
int ret = 0;
- if (dio->bio) {
- loff_t cur_offset = dio->cur_page_fs_offset;
- loff_t bio_next_offset = dio->logical_offset_in_bio +
- dio->bio->bi_size;
+ if (sdio->bio) {
+ loff_t cur_offset = sdio->cur_page_fs_offset;
+ loff_t bio_next_offset = sdio->logical_offset_in_bio +
+ sdio->bio->bi_size;
/*
* See whether this new request is contiguous with the old.
@@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio)
* be the next logical offset in the bio, submit the bio we
* have.
*/
- if (dio->final_block_in_bio != dio->cur_page_block ||
+ if (sdio->final_block_in_bio != sdio->cur_page_block ||
cur_offset != bio_next_offset)
- dio_bio_submit(dio);
+ dio_bio_submit(dio, sdio);
/*
* Submit now if the underlying fs is about to perform a
* metadata read
*/
- else if (dio->boundary)
- dio_bio_submit(dio);
+ else if (sdio->boundary)
+ dio_bio_submit(dio, sdio);
}
- if (dio->bio == NULL) {
- ret = dio_new_bio(dio, dio->cur_page_block);
+ if (sdio->bio == NULL) {
+ ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
if (ret)
goto out;
}
- if (dio_bio_add_page(dio) != 0) {
- dio_bio_submit(dio);
- ret = dio_new_bio(dio, dio->cur_page_block);
+ if (dio_bio_add_page(sdio) != 0) {
+ dio_bio_submit(dio, sdio);
+ ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
if (ret == 0) {
- ret = dio_bio_add_page(dio);
+ ret = dio_bio_add_page(sdio);
BUG_ON(ret != 0);
}
}
@@ -744,9 +760,10 @@ out:
* If that doesn't work out then we put the old page into the bio and add this
* page to the dio instead.
*/
-static int
-submit_page_section(struct dio *dio, struct page *page,
- unsigned offset, unsigned len, sector_t blocknr)
+static inline int
+submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
+ unsigned offset, unsigned len, sector_t blocknr,
+ struct buffer_head *map_bh)
{
int ret = 0;
@@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page,
/*
* Can we just grow the current page's presence in the dio?
*/
- if ( (dio->cur_page == page) &&
- (dio->cur_page_offset + dio->cur_page_len == offset) &&
- (dio->cur_page_block +
- (dio->cur_page_len >> dio->blkbits) == blocknr)) {
- dio->cur_page_len += len;
+ if (sdio->cur_page == page &&
+ sdio->cur_page_offset + sdio->cur_page_len == offset &&
+ sdio->cur_page_block +
+ (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
+ sdio->cur_page_len += len;
/*
- * If dio->boundary then we want to schedule the IO now to
+ * If sdio->boundary then we want to schedule the IO now to
* avoid metadata seeks.
*/
- if (dio->boundary) {
- ret = dio_send_cur_page(dio);
- page_cache_release(dio->cur_page);
- dio->cur_page = NULL;
+ if (sdio->boundary) {
+ ret = dio_send_cur_page(dio, sdio, map_bh);
+ page_cache_release(sdio->cur_page);
+ sdio->cur_page = NULL;
}
goto out;
}
@@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page,
/*
* If there's a deferred page already there then send it.
*/
- if (dio->cur_page) {
- ret = dio_send_cur_page(dio);
- page_cache_release(dio->cur_page);
- dio->cur_page = NULL;
+ if (sdio->cur_page) {
+ ret = dio_send_cur_page(dio, sdio, map_bh);
+ page_cache_release(sdio->cur_page);
+ sdio->cur_page = NULL;
if (ret)
goto out;
}
page_cache_get(page); /* It is in dio */
- dio->cur_page = page;
- dio->cur_page_offset = offset;
- dio->cur_page_len = len;
- dio->cur_page_block = blocknr;
- dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
+ sdio->cur_page = page;
+ sdio->cur_page_offset = offset;
+ sdio->cur_page_len = len;
+ sdio->cur_page_block = blocknr;
+ sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
return ret;
}
@@ -804,16 +821,16 @@ out:
* file blocks. Only called for S_ISREG files - blockdevs do not set
* buffer_new
*/
-static void clean_blockdev_aliases(struct dio *dio)
+static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
{
unsigned i;
unsigned nblocks;
- nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;
+ nblocks = map_bh->b_size >> dio->inode->i_blkbits;
for (i = 0; i < nblocks; i++) {
- unmap_underlying_metadata(dio->map_bh.b_bdev,
- dio->map_bh.b_blocknr + i);
+ unmap_underlying_metadata(map_bh->b_bdev,
+ map_bh->b_blocknr + i);
}
}
@@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio)
* `end' is zero if we're doing the start of the IO, 1 at the end of the
* IO.
*/
-static void dio_zero_block(struct dio *dio, int end)
+static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
+ int end, struct buffer_head *map_bh)
{
unsigned dio_blocks_per_fs_block;
unsigned this_chunk_blocks; /* In dio_blocks */
unsigned this_chunk_bytes;
struct page *page;
- dio->start_zero_done = 1;
- if (!dio->blkfactor || !buffer_new(&dio->map_bh))
+ sdio->start_zero_done = 1;
+ if (!sdio->blkfactor || !buffer_new(map_bh))
return;
- dio_blocks_per_fs_block = 1 << dio->blkfactor;
- this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
+ dio_blocks_per_fs_block = 1 << sdio->blkfactor;
+ this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
if (!this_chunk_blocks)
return;
@@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end)
if (end)
this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
- this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+ this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
page = ZERO_PAGE(0);
- if (submit_page_section(dio, page, 0, this_chunk_bytes,
- dio->next_block_for_io))
+ if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
+ sdio->next_block_for_io, map_bh))
return;
- dio->next_block_for_io += this_chunk_blocks;
+ sdio->next_block_for_io += this_chunk_blocks;
}
/*
@@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end)
* it should set b_size to PAGE_SIZE or more inside get_block(). This gives
* fine alignment but still allows this function to work in PAGE_SIZE units.
*/
-static int do_direct_IO(struct dio *dio)
+static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
+ struct buffer_head *map_bh)
{
- const unsigned blkbits = dio->blkbits;
+ const unsigned blkbits = sdio->blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
struct page *page;
unsigned block_in_page;
- struct buffer_head *map_bh = &dio->map_bh;
int ret = 0;
/* The I/O can start at any block offset within the first page */
- block_in_page = dio->first_block_in_page;
+ block_in_page = sdio->first_block_in_page;
- while (dio->block_in_file < dio->final_block_in_request) {
- page = dio_get_page(dio);
+ while (sdio->block_in_file < sdio->final_block_in_request) {
+ page = dio_get_page(dio, sdio);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
@@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio)
unsigned this_chunk_blocks; /* # of blocks */
unsigned u;
- if (dio->blocks_available == 0) {
+ if (sdio->blocks_available == 0) {
/*
* Need to go and map some more disk
*/
unsigned long blkmask;
unsigned long dio_remainder;
- ret = get_more_blocks(dio);
+ ret = get_more_blocks(dio, sdio, map_bh);
if (ret) {
page_cache_release(page);
goto out;
@@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio)
if (!buffer_mapped(map_bh))
goto do_holes;
- dio->blocks_available =
- map_bh->b_size >> dio->blkbits;
- dio->next_block_for_io =
- map_bh->b_blocknr << dio->blkfactor;
+ sdio->blocks_available =
+ map_bh->b_size >> sdio->blkbits;
+ sdio->next_block_for_io =
+ map_bh->b_blocknr << sdio->blkfactor;
if (buffer_new(map_bh))
- clean_blockdev_aliases(dio);
+ clean_blockdev_aliases(dio, map_bh);
- if (!dio->blkfactor)
+ if (!sdio->blkfactor)
goto do_holes;
- blkmask = (1 << dio->blkfactor) - 1;
- dio_remainder = (dio->block_in_file & blkmask);
+ blkmask = (1 << sdio->blkfactor) - 1;
+ dio_remainder = (sdio->block_in_file & blkmask);
/*
* If we are at the start of IO and that IO
@@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio)
* on-disk
*/
if (!buffer_new(map_bh))
- dio->next_block_for_io += dio_remainder;
- dio->blocks_available -= dio_remainder;
+ sdio->next_block_for_io += dio_remainder;
+ sdio->blocks_available -= dio_remainder;
}
do_holes:
/* Handle holes */
@@ -961,7 +979,7 @@ do_holes:
*/
i_size_aligned = ALIGN(i_size_read(dio->inode),
1 << blkbits);
- if (dio->block_in_file >=
+ if (sdio->block_in_file >=
i_size_aligned >> blkbits) {
/* We hit eof */
page_cache_release(page);
@@ -969,7 +987,7 @@ do_holes:
}
zero_user(page, block_in_page << blkbits,
1 << blkbits);
- dio->block_in_file++;
+ sdio->block_in_file++;
block_in_page++;
goto next_block;
}
@@ -979,38 +997,41 @@ do_holes:
* is finer than the underlying fs, go check to see if
* we must zero out the start of this block.
*/
- if (unlikely(dio->blkfactor && !dio->start_zero_done))
- dio_zero_block(dio, 0);
+ if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
+ dio_zero_block(dio, sdio, 0, map_bh);
/*
* Work out, in this_chunk_blocks, how much disk we
* can add to this page
*/
- this_chunk_blocks = dio->blocks_available;
+ this_chunk_blocks = sdio->blocks_available;
u = (PAGE_SIZE - offset_in_page) >> blkbits;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
- u = dio->final_block_in_request - dio->block_in_file;
+ u = sdio->final_block_in_request - sdio->block_in_file;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
this_chunk_bytes = this_chunk_blocks << blkbits;
BUG_ON(this_chunk_bytes == 0);
- dio->boundary = buffer_boundary(map_bh);
- ret = submit_page_section(dio, page, offset_in_page,
- this_chunk_bytes, dio->next_block_for_io);
+ sdio->boundary = buffer_boundary(map_bh);
+ ret = submit_page_section(dio, sdio, page,
+ offset_in_page,
+ this_chunk_bytes,
+ sdio->next_block_for_io,
+ map_bh);
if (ret) {
page_cache_release(page);
goto out;
}
- dio->next_block_for_io += this_chunk_blocks;
+ sdio->next_block_for_io += this_chunk_blocks;
- dio->block_in_file += this_chunk_blocks;
+ sdio->block_in_file += this_chunk_blocks;
block_in_page += this_chunk_blocks;
- dio->blocks_available -= this_chunk_blocks;
+ sdio->blocks_available -= this_chunk_blocks;
next_block:
- BUG_ON(dio->block_in_file > dio->final_block_in_request);
- if (dio->block_in_file == dio->final_block_in_request)
+ BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
+ if (sdio->block_in_file == sdio->final_block_in_request)
break;
}
@@ -1022,135 +1043,10 @@ out:
return ret;
}
-static ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs,
- unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, struct dio *dio)
+static inline int drop_refcount(struct dio *dio)
{
- unsigned long user_addr;
+ int ret2;
unsigned long flags;
- int seg;
- ssize_t ret = 0;
- ssize_t ret2;
- size_t bytes;
-
- dio->inode = inode;
- dio->rw = rw;
- dio->blkbits = blkbits;
- dio->blkfactor = inode->i_blkbits - blkbits;
- dio->block_in_file = offset >> blkbits;
-
- dio->get_block = get_block;
- dio->end_io = end_io;
- dio->submit_io = submit_io;
- dio->final_block_in_bio = -1;
- dio->next_block_for_io = -1;
-
- dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
-
- spin_lock_init(&dio->bio_lock);
- dio->refcount = 1;
-
- /*
- * In case of non-aligned buffers, we may need 2 more
- * pages since we need to zero out first and last block.
- */
- if (unlikely(dio->blkfactor))
- dio->pages_in_io = 2;
-
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- dio->pages_in_io +=
- ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
- - user_addr/PAGE_SIZE);
- }
-
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- dio->size += bytes = iov[seg].iov_len;
-
- /* Index into the first page of the first block */
- dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
- dio->final_block_in_request = dio->block_in_file +
- (bytes >> blkbits);
- /* Page fetching state */
- dio->head = 0;
- dio->tail = 0;
- dio->curr_page = 0;
-
- dio->total_pages = 0;
- if (user_addr & (PAGE_SIZE-1)) {
- dio->total_pages++;
- bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
- }
- dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- dio->curr_user_address = user_addr;
-
- ret = do_direct_IO(dio);
-
- dio->result += iov[seg].iov_len -
- ((dio->final_block_in_request - dio->block_in_file) <<
- blkbits);
-
- if (ret) {
- dio_cleanup(dio);
- break;
- }
- } /* end iovec loop */
-
- if (ret == -ENOTBLK) {
- /*
- * The remaining part of the request will be
- * be handled by buffered I/O when we return
- */
- ret = 0;
- }
- /*
- * There may be some unwritten disk at the end of a part-written
- * fs-block-sized block. Go zero that now.
- */
- dio_zero_block(dio, 1);
-
- if (dio->cur_page) {
- ret2 = dio_send_cur_page(dio);
- if (ret == 0)
- ret = ret2;
- page_cache_release(dio->cur_page);
- dio->cur_page = NULL;
- }
- if (dio->bio)
- dio_bio_submit(dio);
-
- /*
- * It is possible that, we return short IO due to end of file.
- * In that case, we need to release all the pages we got hold on.
- */
- dio_cleanup(dio);
-
- /*
- * All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
- * of protecting us from looking up uninitialized blocks.
- */
- if (rw == READ && (dio->flags & DIO_LOCKING))
- mutex_unlock(&dio->inode->i_mutex);
-
- /*
- * The only time we want to leave bios in flight is when a successful
- * partial aio read or full aio write have been setup. In that case
- * bio completion will call aio_complete. The only time it's safe to
- * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
- * This had *better* be the only place that raises -EIOCBQUEUED.
- */
- BUG_ON(ret == -EIOCBQUEUED);
- if (dio->is_async && ret == 0 && dio->result &&
- ((rw & READ) || (dio->result == dio->size)))
- ret = -EIOCBQUEUED;
-
- if (ret != -EIOCBQUEUED)
- dio_await_completion(dio);
/*
* Sync will always be dropping the final ref and completing the
@@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
spin_lock_irqsave(&dio->bio_lock, flags);
ret2 = --dio->refcount;
spin_unlock_irqrestore(&dio->bio_lock, flags);
-
- if (ret2 == 0) {
- ret = dio_complete(dio, offset, ret, false);
- kfree(dio);
- } else
- BUG_ON(ret != -EIOCBQUEUED);
-
- return ret;
+ return ret2;
}
/*
@@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
* expected that filesystem provide exclusion between new direct I/O
* and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
* but other filesystems need to take care of this on their own.
+ *
+ * NOTE: if you pass "sdio" to anything by pointer make sure that function
+ * is always inlined. Otherwise gcc is unable to split the structure into
+ * individual fields and will generate much worse code. This is important
+ * for the whole file.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
+ struct dio_submit sdio = { 0, };
+ unsigned long user_addr;
+ size_t bytes;
+ struct buffer_head map_bh = { 0, };
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw == READ && end == offset)
return 0;
- dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+ dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
@@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
end - 1);
if (retval) {
mutex_unlock(&inode->i_mutex);
- kfree(dio);
+ kmem_cache_free(dio_cache, dio);
goto out;
}
}
@@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
- retval = direct_io_worker(rw, iocb, inode, iov, offset,
- nr_segs, blkbits, get_block, end_io,
- submit_io, dio);
+ retval = 0;
+
+ dio->inode = inode;
+ dio->rw = rw;
+ sdio.blkbits = blkbits;
+ sdio.blkfactor = inode->i_blkbits - blkbits;
+ sdio.block_in_file = offset >> blkbits;
+
+ sdio.get_block = get_block;
+ dio->end_io = end_io;
+ sdio.submit_io = submit_io;
+ sdio.final_block_in_bio = -1;
+ sdio.next_block_for_io = -1;
+
+ dio->iocb = iocb;
+ dio->i_size = i_size_read(inode);
+
+ spin_lock_init(&dio->bio_lock);
+ dio->refcount = 1;
+
+ /*
+ * In case of non-aligned buffers, we may need 2 more
+ * pages since we need to zero out first and last block.
+ */
+ if (unlikely(sdio.blkfactor))
+ sdio.pages_in_io = 2;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio.pages_in_io +=
+ ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+ PAGE_SIZE - user_addr / PAGE_SIZE);
+ }
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio.size += bytes = iov[seg].iov_len;
+
+ /* Index into the first page of the first block */
+ sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+ sdio.final_block_in_request = sdio.block_in_file +
+ (bytes >> blkbits);
+ /* Page fetching state */
+ sdio.head = 0;
+ sdio.tail = 0;
+ sdio.curr_page = 0;
+
+ sdio.total_pages = 0;
+ if (user_addr & (PAGE_SIZE-1)) {
+ sdio.total_pages++;
+ bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+ }
+ sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ sdio.curr_user_address = user_addr;
+
+ retval = do_direct_IO(dio, &sdio, &map_bh);
+
+ dio->result += iov[seg].iov_len -
+ ((sdio.final_block_in_request - sdio.block_in_file) <<
+ blkbits);
+
+ if (retval) {
+ dio_cleanup(dio, &sdio);
+ break;
+ }
+ } /* end iovec loop */
+
+ if (retval == -ENOTBLK) {
+ /*
+ * The remaining part of the request will be
+ * be handled by buffered I/O when we return
+ */
+ retval = 0;
+ }
+ /*
+ * There may be some unwritten disk at the end of a part-written
+ * fs-block-sized block. Go zero that now.
+ */
+ dio_zero_block(dio, &sdio, 1, &map_bh);
+
+ if (sdio.cur_page) {
+ ssize_t ret2;
+
+ ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
+ if (retval == 0)
+ retval = ret2;
+ page_cache_release(sdio.cur_page);
+ sdio.cur_page = NULL;
+ }
+ if (sdio.bio)
+ dio_bio_submit(dio, &sdio);
+
+ /*
+ * It is possible that, we return short IO due to end of file.
+ * In that case, we need to release all the pages we got hold on.
+ */
+ dio_cleanup(dio, &sdio);
+
+ /*
+ * All block lookups have been performed. For READ requests
+ * we can let i_mutex go now that its achieved its purpose
+ * of protecting us from looking up uninitialized blocks.
+ */
+ if (rw == READ && (dio->flags & DIO_LOCKING))
+ mutex_unlock(&dio->inode->i_mutex);
+
+ /*
+ * The only time we want to leave bios in flight is when a successful
+ * partial aio read or full aio write have been setup. In that case
+ * bio completion will call aio_complete. The only time it's safe to
+ * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+ * This had *better* be the only place that raises -EIOCBQUEUED.
+ */
+ BUG_ON(retval == -EIOCBQUEUED);
+ if (dio->is_async && retval == 0 && dio->result &&
+ ((rw & READ) || (dio->result == sdio.size)))
+ retval = -EIOCBQUEUED;
+
+ if (retval != -EIOCBQUEUED)
+ dio_await_completion(dio);
+
+ if (drop_refcount(dio) == 0) {
+ retval = dio_complete(dio, offset, retval, false);
+ kmem_cache_free(dio_cache, dio);
+ } else
+ BUG_ON(retval != -EIOCBQUEUED);
out:
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
+
+static __init int dio_init(void)
+{
+ dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
+ return 0;
+}
+module_init(dio_init)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b36c5572b3f3..54481a3b2c79 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
#define ecryptfs_printk(type, fmt, arg...) \
__ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
-__attribute__ ((format(printf, 1, 2)))
+__printf(1, 2)
void __ecryptfs_printk(const char *fmt, ...);
extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index fe047d966dc5..828e750af23a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -70,6 +70,15 @@
* simultaneous inserts (A into B and B into A) from racing and
* constructing a cycle without either insert observing that it is
* going to.
+ * It is necessary to acquire multiple "ep->mtx"es at once in the
+ * case when one epoll fd is added to another. In this case, we
+ * always acquire the locks in the order of nesting (i.e. after
+ * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
+ * before e2->mtx). Since we disallow cycles of epoll file
+ * descriptors, this ensures that the mutexes are well-ordered. In
+ * order to communicate this nesting to lockdep, when walking a tree
+ * of epoll file descriptors, we use the current recursion depth as
+ * the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
* mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
* @ep: Pointer to the epoll private data structure.
* @sproc: Pointer to the scan callback.
* @priv: Private opaque data passed to the @sproc callback.
+ * @depth: The current depth of recursive f_op->poll calls.
*
* Returns: The same integer error code returned by the @sproc callback.
*/
static int ep_scan_ready_list(struct eventpoll *ep,
int (*sproc)(struct eventpoll *,
struct list_head *, void *),
- void *priv)
+ void *priv,
+ int depth)
{
int error, pwake = 0;
unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, depth);
/*
* Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
- return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+ return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
}
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -700,7 +711,7 @@ static const struct file_operations eventpoll_fops = {
.llseek = noop_llseek,
};
-/* Fast test to see if the file is an evenpoll file */
+/* Fast test to see if the file is an eventpoll file */
static inline int is_file_epoll(struct file *f)
{
return f->f_op == &eventpoll_fops;
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
ep = epi->ep;
list_del_init(&epi->fllink);
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
mutex_unlock(&ep->mtx);
}
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
esed.maxevents = maxevents;
esed.events = events;
- return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
+ return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
}
static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
struct rb_node *rbp;
struct epitem *epi;
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, call_nests + 1);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
}
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, 0);
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/exec.c b/fs/exec.c
index 25dcbe5fc356..36254645b7cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
- if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- atomic_dec(&old_mm->oom_disable_count);
- atomic_inc(&tsk->mm->oom_disable_count);
- }
task_unlock(tsk);
arch_pick_mmap_layout(mm);
if (old_mm) {
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
#
# ore module library
-obj-$(CONFIG_ORE) += ore.o
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae4149291..da42f32c49be 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
config ORE
tristate
+ depends on EXOFS_FS || PNFS_OBJLAYOUT
+ select ASYNC_XOR
+ default SCSI_OSD_ULD
config EXOFS_FS
tristate "exofs: OSD based file system support"
depends on SCSI_OSD_ULD
- select ORE
help
EXOFS is a file system that uses an OSD storage device,
as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec7445..51f4b4c40f09 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
/* u64 has problems with printk this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)
+struct exofs_dev {
+ struct ore_dev ored;
+ unsigned did;
+};
/*
* our extension to the in-memory superblock
*/
@@ -66,13 +70,9 @@ struct exofs_sb_info {
u32 s_next_generation; /* next gen # to use */
atomic_t s_curr_pending; /* number of pending commands */
- struct pnfs_osd_data_map data_map; /* Default raid to use
- * FIXME: Needed ?
- */
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
- struct ore_components comps; /* comps for the partition */
- struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
+ struct ore_components oc; /* comps for the partition */
};
/*
@@ -86,7 +86,7 @@ struct exofs_i_info {
uint32_t i_dir_start_lookup; /* which page to start lookup */
uint64_t i_commit_size; /* the object's written length */
struct ore_comp one_comp; /* same component for all devices */
- struct ore_components comps; /* inode view of the device table */
+ struct ore_components oc; /* inode view of the device table */
};
static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
* bigger and that the device table repeats twice.
* See: exofs_read_lookup_dev_table()
*/
-static inline void exofs_init_comps(struct ore_components *comps,
+static inline void exofs_init_comps(struct ore_components *oc,
struct ore_comp *one_comp,
struct exofs_sb_info *sbi, osd_id oid)
{
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps,
one_comp->obj.id = oid;
exofs_make_credential(one_comp->cred, &one_comp->obj);
- comps->numdevs = sbi->comps.numdevs;
- comps->single_comp = EC_SINGLE_COMP;
- comps->comps = one_comp;
+ oc->first_dev = 0;
+ oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+ sbi->layout.group_count;
+ oc->single_comp = EC_SINGLE_COMP;
+ oc->comps = one_comp;
/* Round robin device view of the table */
- first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
- comps->ods = sbi->comps.ods + first_dev;
+ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+ oc->ods = &sbi->oc.ods[first_dev];
}
#endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc2349..3e5f3a6be90a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
#define EXOFS_DBGMSG2(M...) do {} while (0)
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
- MAX_PAGES_KMALLOC =
- PAGE_SIZE / sizeof(struct page *),
-};
+enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
/* TODO: easily support bio chaining */
- pages = min_t(unsigned, pages,
- layout->group_width * BIO_MAX_PAGES_KMALLOC);
+ pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
return pages;
}
@@ -68,6 +63,7 @@ struct page_collect {
bool read_4_write; /* This means two things: that the read is sync
* And the pages should not be unlocked.
*/
+ struct page *that_locked_page;
};
static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
pcol->length = 0;
pcol->pg_first = -1;
pcol->read_4_write = false;
+ pcol->that_locked_page = NULL;
}
static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
pcol->length = 0;
pcol->pg_first = -1;
pcol->ios = NULL;
+ pcol->that_locked_page = NULL;
/* this is probably the end of the loop but in writes
* it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
return 0;
}
+enum {PAGE_WAS_NOT_IN_IO = 17};
static int update_read_page(struct page *page, int ret)
{
- if (ret == 0) {
+ switch (ret) {
+ case 0:
/* Everything is OK */
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- } else if (ret == -EFAULT) {
+ break;
+ case -EFAULT:
/* In this case we were trying to read something that wasn't on
* disk yet - return a page full of zeroes. This should be OK,
* because the object should be empty (if there was a write
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret)
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- ret = 0; /* recovered error */
EXOFS_DBGMSG("recovered read error\n");
- } else /* Error */
+ /* fall through */
+ case PAGE_WAS_NOT_IN_IO:
+ ret = 0; /* recovered error */
+ break;
+ default:
SetPageError(page);
-
+ }
return ret;
}
static void update_write_page(struct page *page, int ret)
{
+ if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+ return; /* don't pass start don't collect $200 */
+
if (ret) {
mapping_set_error(page->mapping, ret);
SetPageError(page);
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret)
static int __readpages_done(struct page_collect *pcol)
{
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = ore_check_io(pcol->ios, &resid);
+ int ret = ore_check_io(pcol->ios, NULL);
- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
- good_bytes = pcol->length - resid;
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
+ good_bytes = 0;
+ }
EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
}
}
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+ struct page_collect *pcol_src, struct page_collect *pcol)
+{
+ /* length was wrong or offset was not page aligned */
+ BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+ if (pcol_src->nr_pages > ios->nr_pages) {
+ struct page **src_page;
+ unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+ unsigned long len_less = pcol_src->length - ios->length;
+ unsigned i;
+ int ret;
+
+ /* This IO was trimmed */
+ pcol_src->nr_pages = ios->nr_pages;
+ pcol_src->length = ios->length;
+
+ /* Left over pages are passed to the next io */
+ pcol->expected_pages += pages_less;
+ pcol->nr_pages = pages_less;
+ pcol->length = len_less;
+ src_page = pcol_src->pages + pcol_src->nr_pages;
+ pcol->pg_first = (*src_page)->index;
+
+ ret = pcol_try_alloc(pcol);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < pages_less; ++i)
+ pcol->pages[i] = *src_page++;
+
+ EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+ "pages_less=0x%x expected_pages=0x%x "
+ "next_offset=0x%llx next_len=0x%lx\n",
+ pcol_src->nr_pages, pages_less, pcol->expected_pages,
+ pcol->pg_first * PAGE_SIZE, pcol->length);
+ }
+ return 0;
+}
+
static int read_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol)
return 0;
if (!pcol->ios) {
- int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
+ int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol)
ios = pcol->ios;
ios->pages = pcol->pages;
- ios->nr_pages = pcol->nr_pages;
if (pcol->read_4_write) {
ore_read(pcol->ios);
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol)
*pcol_copy = *pcol;
ios->done = readpages_done;
ios->private = pcol_copy;
+
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
ret = ore_read(ios);
if (unlikely(ret))
goto err;
atomic_inc(&pcol->sbi->s_curr_pending);
- EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
- oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;
err:
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
page->index);
+ pcol->that_locked_page = page;
+
if (page->index < end_index)
len = PAGE_CACHE_SIZE;
else if (page->index == end_index)
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
return ret;
}
+ ret = read_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
return read_exec(&pcol);
}
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p)
{
struct page_collect *pcol = p;
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = ore_check_io(ios, &resid);
+ int ret = ore_check_io(ios, NULL);
atomic_dec(&pcol->sbi->s_curr_pending);
- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
- good_bytes = pcol->length - resid;
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
+ good_bytes = 0;
+ }
EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
EXOFS_DBGMSG2("writepages_done END\n");
}
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+ struct page_collect *pcol = priv;
+ pgoff_t index = offset / PAGE_SIZE;
+
+ if (!pcol->that_locked_page ||
+ (pcol->that_locked_page->index != index)) {
+ struct page *page = find_get_page(pcol->inode->i_mapping, index);
+
+ if (!page) {
+ page = find_or_create_page(pcol->inode->i_mapping,
+ index, GFP_NOFS);
+ if (unlikely(!page)) {
+ EXOFS_DBGMSG("grab_cache_page Failed "
+ "index=0x%llx\n", _LLU(index));
+ return NULL;
+ }
+ unlock_page(page);
+ }
+ if (PageDirty(page) || PageWriteback(page))
+ *uptodate = true;
+ else
+ *uptodate = PageUptodate(page);
+ EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+ return page;
+ } else {
+ EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+ pcol->that_locked_page->index);
+ *uptodate = true;
+ return pcol->that_locked_page;
+ }
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+ struct page_collect *pcol = priv;
+
+ if (pcol->that_locked_page != page) {
+ EXOFS_DBGMSG("index=0x%lx\n", page->index);
+ page_cache_release(page);
+ return;
+ }
+ EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+ .get_page = &__r4w_get_page,
+ .put_page = &__r4w_put_page,
+};
+
static int write_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol)
return 0;
BUG_ON(pcol->ios);
- ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
+ ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);
-
if (unlikely(ret))
goto err;
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol)
ios = pcol->ios;
ios->pages = pcol_copy->pages;
- ios->nr_pages = pcol_copy->nr_pages;
ios->done = writepages_done;
+ ios->r4w = &_r4w_op;
ios->private = pcol_copy;
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
ret = ore_write(ios);
if (unlikely(ret)) {
EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol)
}
atomic_inc(&pcol->sbi->s_curr_pending);
- EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
- pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
- pcol->length);
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;
err:
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping,
_pcol_init(&pcol, expected_pages, mapping->host);
ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (ret) {
+ if (unlikely(ret)) {
EXOFS_ERR("write_cache_pages => %d\n", ret);
return ret;
}
- return write_exec(&pcol);
+ ret = write_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ return write_exec(&pcol); /* pump the last reminder */
+ } else if (pcol.nr_pages) {
+ /* not SYNC let the reminder join the next writeout */
+ unsigned i;
+
+ for (i = 0; i < pcol.nr_pages; i++) {
+ struct page *page = pcol.pages[i];
+
+ end_page_writeback(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+ }
+ return 0;
}
+/*
static int exofs_writepage(struct page *page, struct writeback_control *wbc)
{
struct page_collect pcol;
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
return write_exec(&pcol);
}
-
+*/
/* i_mutex held using inode->i_size directly */
static void _write_failed(struct inode *inode, loff_t to)
{
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
const struct address_space_operations exofs_aops = {
.readpage = exofs_readpage,
.readpages = exofs_readpages,
- .writepage = exofs_writepage,
+ .writepage = NULL,
.writepages = exofs_writepages,
.write_begin = exofs_write_begin_export,
.write_end = exofs_write_end,
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
+ ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
if (likely(!ret))
truncate_setsize(inode, newsize);
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
struct exofs_on_disk_inode_layout *layout;
int ret;
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
}
- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
+ attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+ attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
ios->in_attr = attrs;
ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
return inode;
oi = exofs_i(inode);
__oi_init(oi);
- exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
exofs_oi_objno(oi));
/* read the inode from the osd */
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
spin_unlock(&sbi->s_next_gen_lock);
insert_inode_hash(inode);
- exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
exofs_oi_objno(oi));
exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
mark_inode_dirty(inode);
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
} else
memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
goto free_args;
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode)
/* ignore the error, attempt a remove anyway */
/* Now Remove the OSD objects */
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af88198..fcfa86ae6faf 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,76 +24,287 @@
#include <linux/slab.h>
#include <asm/div64.h>
+#include <linux/lcm.h>
-#include <scsi/osd_ore.h>
+#include "ore_raid.h"
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
+MODULE_LICENSE("GPL");
+
+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ * members to be operatonals for the ore. The needed parameters are those
+ * that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ * for example stripe_unit must be a multple of the system PAGE_SIZE,
+ * and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+ u64 stripe_length;
+
+ switch (layout->raid_algorithm) {
+ case PNFS_OSD_RAID_0:
+ layout->parity = 0;
+ break;
+ case PNFS_OSD_RAID_5:
+ layout->parity = 1;
+ break;
+ case PNFS_OSD_RAID_PQ:
+ case PNFS_OSD_RAID_4:
+ default:
+ ORE_ERR("Only RAID_0/5 for now\n");
+ return -EINVAL;
+ }
+ if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+ ORE_ERR("Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(layout->stripe_unit), PAGE_SIZE);
+ return -EINVAL;
+ }
+ if (layout->group_width) {
+ if (!layout->group_depth) {
+ ORE_ERR("group_depth == 0 && group_width != 0\n");
+ return -EINVAL;
+ }
+ if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+ ORE_ERR("Data Map wrong, "
+ "numdevs=%d < group_width=%d * mirrors=%d\n",
+ total_comps, layout->group_width,
+ layout->mirrors_p1);
+ return -EINVAL;
+ }
+ layout->group_count = total_comps / layout->mirrors_p1 /
+ layout->group_width;
+ } else {
+ if (layout->group_depth) {
+ printk(KERN_NOTICE "Warning: group_depth ignored "
+ "group_width == 0 && group_depth == %lld\n",
+ _LLU(layout->group_depth));
+ }
+ layout->group_width = total_comps / layout->mirrors_p1;
+ layout->group_depth = -1;
+ layout->group_count = 1;
+ }
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
+ stripe_length = (u64)layout->group_width * layout->stripe_unit;
+ if (stripe_length >= (1ULL << 32)) {
+ ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+ _LLU(stripe_length));
+ return -EINVAL;
+ }
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
+ layout->max_io_length =
+ (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+ layout->group_width;
+ if (layout->parity) {
+ unsigned stripe_length =
+ (layout->group_width - layout->parity) *
+ layout->stripe_unit;
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
+ layout->max_io_length /= stripe_length;
+ layout->max_io_length *= stripe_length;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
- return ios->comps->comps[index & ios->comps->single_comp].cred;
+ return ios->oc->comps[index & ios->oc->single_comp].cred;
}
static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
- return &ios->comps->comps[index & ios->comps->single_comp].obj;
+ return &ios->oc->comps[index & ios->oc->single_comp].obj;
}
static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
- return ios->comps->ods[index];
+ ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+ ios->oc->first_dev, ios->oc->numdevs, index,
+ ios->oc->ods);
+
+ return ore_comp_dev(ios->oc, index);
}
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios)
+{
+ struct ore_io_state *ios;
+ struct page **pages;
+ struct osd_sg_entry *sgilist;
+ struct __alloc_all_io_state {
+ struct ore_io_state ios;
+ struct ore_per_dev_state per_dev[numdevs];
+ union {
+ struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+ struct page *pages[num_par_pages];
+ };
+ } *_aios;
+
+ if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+ _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+ if (unlikely(!_aios)) {
+ ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+ sizeof(*_aios));
+ *pios = NULL;
+ return -ENOMEM;
+ }
+ pages = num_par_pages ? _aios->pages : NULL;
+ sgilist = sgs_per_dev ? _aios->sglist : NULL;
+ ios = &_aios->ios;
+ } else {
+ struct __alloc_small_io_state {
+ struct ore_io_state ios;
+ struct ore_per_dev_state per_dev[numdevs];
+ } *_aio_small;
+ union __extra_part {
+ struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+ struct page *pages[num_par_pages];
+ } *extra_part;
+
+ _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+ if (unlikely(!_aio_small)) {
+ ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+ sizeof(*_aio_small));
+ *pios = NULL;
+ return -ENOMEM;
+ }
+ extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+ if (unlikely(!extra_part)) {
+ ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+ sizeof(*extra_part));
+ kfree(_aio_small);
+ *pios = NULL;
+ return -ENOMEM;
+ }
+
+ pages = num_par_pages ? extra_part->pages : NULL;
+ sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+ /* In this case the per_dev[0].sgilist holds the pointer to
+ * be freed
+ */
+ ios = &_aio_small->ios;
+ ios->extra_part_alloc = true;
+ }
+
+ if (pages) {
+ ios->parity_pages = pages;
+ ios->max_par_pages = num_par_pages;
+ }
+ if (sgilist) {
+ unsigned d;
+
+ for (d = 0; d < numdevs; ++d) {
+ ios->per_dev[d].sglist = sgilist;
+ sgilist += sgs_per_dev;
+ }
+ ios->sgs_per_dev = sgs_per_dev;
+ }
+
+ ios->layout = layout;
+ ios->oc = oc;
+ *pios = ios;
+ return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ * (@offset + @length) % strip_size == 0. Or the complete range is within a
+ * single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ * And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
bool is_reading, u64 offset, u64 length,
struct ore_io_state **pios)
{
struct ore_io_state *ios;
+ unsigned numdevs = layout->group_width * layout->mirrors_p1;
+ unsigned sgs_per_dev = 0, max_par_pages = 0;
+ int ret;
- /*TODO: Maybe use kmem_cach per sbi of size
- * exofs_io_state_size(layout->s_numdevs)
- */
- ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed kzalloc bytes=%d\n",
- ore_io_state_size(comps->numdevs));
- *pios = NULL;
- return -ENOMEM;
+ if (layout->parity && length) {
+ unsigned data_devs = layout->group_width - layout->parity;
+ unsigned stripe_size = layout->stripe_unit * data_devs;
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+ u32 remainder;
+ u64 num_stripes;
+ u64 num_raid_units;
+
+ num_stripes = div_u64_rem(length, stripe_size, &remainder);
+ if (remainder)
+ ++num_stripes;
+
+ num_raid_units = num_stripes * layout->parity;
+
+ if (is_reading) {
+ /* For reads add per_dev sglist array */
+ /* TODO: Raid 6 we need twice more. Actually:
+ * num_stripes / LCMdP(W,P);
+ * if (W%P != 0) num_stripes *= parity;
+ */
+
+ /* first/last seg is split */
+ num_raid_units += layout->group_width;
+ sgs_per_dev = div_u64(num_raid_units, data_devs);
+ } else {
+ /* For Writes add parity pages array. */
+ max_par_pages = num_raid_units * pages_in_unit *
+ sizeof(struct page *);
+ }
}
- ios->layout = layout;
- ios->comps = comps;
- ios->offset = offset;
- ios->length = length;
+ ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+ pios);
+ if (unlikely(ret))
+ return ret;
+
+ ios = *pios;
ios->reading = is_reading;
+ ios->offset = offset;
+
+ if (length) {
+ ore_calc_stripe_info(layout, offset, length, &ios->si);
+ ios->length = ios->si.length;
+ ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (layout->parity)
+ _ore_post_alloc_raid_stuff(ios);
+ }
- *pios = ios;
return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
- struct ore_io_state **ios)
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
+int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+ struct ore_io_state **pios)
{
- return ore_get_rw_state(layout, comps, true, 0, 0, ios);
+ return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
}
EXPORT_SYMBOL(ore_get_io_state);
@@ -111,6 +322,7 @@ void ore_put_io_state(struct ore_io_state *ios)
bio_put(per_dev->bio);
}
+ _ore_free_raid_stuff(ios);
kfree(ios);
}
}
@@ -138,7 +350,7 @@ static void _done_io(struct osd_request *or, void *p)
kref_put(&ios->kref, _last_io);
}
-static int ore_io_execute(struct ore_io_state *ios)
+int ore_io_execute(struct ore_io_state *ios)
{
DECLARE_COMPLETION_ONSTACK(wait);
bool sync = (ios->done == NULL);
@@ -198,7 +410,7 @@ static void _clear_bio(struct bio *bio)
}
}
-int ore_check_io(struct ore_io_state *ios, u64 *resid)
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
{
enum osd_err_priority acumulated_osd_err = 0;
int acumulated_lin_err = 0;
@@ -206,7 +418,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
for (i = 0; i < ios->numdevs; i++) {
struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+ struct osd_request *or = per_dev->or;
int ret;
if (unlikely(!or))
@@ -218,29 +431,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
/* start read offset passed endof file */
- _clear_bio(ios->per_dev[i].bio);
+ _clear_bio(per_dev->bio);
ORE_DBGMSG("start read offset passed end of file "
"offset=0x%llx, length=0x%llx\n",
- _LLU(ios->per_dev[i].offset),
- _LLU(ios->per_dev[i].length));
+ _LLU(per_dev->offset),
+ _LLU(per_dev->length));
continue; /* we recovered */
}
+ if (on_dev_error) {
+ u64 residual = ios->reading ?
+ or->in.residual : or->out.residual;
+ u64 offset = (ios->offset + ios->length) - residual;
+ struct ore_dev *od = ios->oc->ods[
+ per_dev->dev - ios->oc->first_dev];
+
+ on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+ offset, residual);
+ }
if (osi.osd_err_pri >= acumulated_osd_err) {
acumulated_osd_err = osi.osd_err_pri;
acumulated_lin_err = ret;
}
}
- /* TODO: raid specific residual calculations */
- if (resid) {
- if (likely(!acumulated_lin_err))
- *resid = 0;
- else
- *resid = ios->length;
- }
-
return acumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +463,65 @@ EXPORT_SYMBOL(ore_check_io);
/*
* L - logical offset into the file
*
- * U - The number of bytes in a stripe within a group
+ * D - number of Data devices
+ * D = group_width - parity
*
- * U = stripe_unit * group_width
+ * U - The number of bytes in a stripe within a group
+ * U = stripe_unit * D
*
* T - The number of bytes striped within a group of component objects
* (before advancing to the next group)
- *
- * T = stripe_unit * group_width * group_depth
+ * T = U * group_depth
*
* S - The number of bytes striped across all component objects
* before the pattern repeats
+ * S = T * group_count
*
- * S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
+ * M - The "major" (i.e., across all components) cycle number
* M = L / S
*
- * G - Counts the groups from the beginning of the major stripe
- *
+ * G - Counts the groups from the beginning of the major cycle
* G = (L - (M * S)) / T [or (L % S) / T]
*
* H - The byte offset within the group
- *
* H = (L - (M * S)) % T [or (L % S) % T]
*
* N - The "minor" (i.e., across the group) stripe number
- *
* N = H / U
*
* C - The component index coresponding to L
*
- * C = (H - (N * U)) / stripe_unit + G * group_width
- * [or (L % U) / stripe_unit + G * group_width]
+ * C = (H - (N * U)) / stripe_unit + G * D
+ * [or (L % U) / stripe_unit + G * D]
*
* O - The component offset coresponding to L
- *
* O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ * divide by parity
+ * LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ * (Note parity cycle always starts at a group's boundary)
+ * R = N % LCMdP
+ *
+ * I = the first parity device index
+ * I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ * Craid = (group_width + C - R*parity) % group_width
+ * (We add the group_width to avoid negative numbers modulo math)
*/
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- u64 M; /* for truncate */
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- struct _striping_info *si)
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ u64 length, struct ore_striping_info *si)
{
u32 stripe_unit = layout->stripe_unit;
u32 group_width = layout->group_width;
u64 group_depth = layout->group_depth;
+ u32 parity = layout->parity;
- u32 U = stripe_unit * group_width;
+ u32 D = group_width - parity;
+ u32 U = D * stripe_unit;
u64 T = U * group_depth;
u64 S = T * layout->group_count;
u64 M = div64_u64(file_offset, S);
@@ -318,39 +537,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
u32 N = div_u64(H, U);
/* "H - (N * U)" is just "H % U" so it's bound to u32 */
- si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- si->dev *= layout->mirrors_p1;
+ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
div_u64_rem(file_offset, stripe_unit, &si->unit_off);
si->obj_offset = si->unit_off + (N * stripe_unit) +
(M * group_depth * stripe_unit);
- si->group_length = T - H;
+ if (parity) {
+ u32 LCMdP = lcm(group_width, parity) / parity;
+ /* R = N % LCMdP; */
+ u32 RxP = (N % LCMdP) * parity;
+ u32 first_dev = C - C % group_width;
+
+ si->par_dev = (group_width + group_width - parity - RxP) %
+ group_width + first_dev;
+ si->dev = (group_width + C - RxP) % group_width + first_dev;
+ si->bytes_in_stripe = U;
+ si->first_stripe_start = M * S + G * T + N * U;
+ } else {
+ /* Make the math correct see _prepare_one_group */
+ si->par_dev = group_width;
+ si->dev = C;
+ }
+
+ si->dev *= layout->mirrors_p1;
+ si->par_dev *= layout->mirrors_p1;
+ si->offset = file_offset;
+ si->length = T - H;
+ if (si->length > length)
+ si->length = length;
si->M = M;
}
+EXPORT_SYMBOL(ore_calc_stripe_info);
-static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct ore_per_dev_state *per_dev,
- int cur_len)
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct page **pages,
+ struct ore_per_dev_state *per_dev, int cur_len)
{
unsigned pg = *cur_pg;
struct request_queue *q =
osd_request_queue(_ios_od(ios, per_dev->dev));
-
- per_dev->length += cur_len;
+ unsigned len = cur_len;
+ int ret;
if (per_dev->bio == NULL) {
unsigned pages_in_stripe = ios->layout->group_width *
(ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
- ios->layout->group_width;
+ unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
+ (ios->layout->group_width -
+ ios->layout->parity);
+ unsigned bio_size = (nr_pages + pages_in_stripe) /
+ ios->layout->group_width;
per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
if (unlikely(!per_dev->bio)) {
ORE_DBGMSG("Failed to allocate BIO size=%u\n",
bio_size);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
}
@@ -358,64 +603,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
unsigned added_len;
- BUG_ON(ios->nr_pages <= pg);
cur_len -= pglen;
- added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+ added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
+ if (unlikely(pglen != added_len)) {
+ ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
+ per_dev->bio->bi_vcnt);
+ ret = -ENOMEM;
+ goto out;
+ }
+ _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
pgbase = 0;
++pg;
}
BUG_ON(cur_len);
+ per_dev->length += len;
*cur_pg = pg;
- return 0;
+ ret = 0;
+out: /* we fail the complete unit on an error eg don't advance
+ * per_dev->length and cur_pg. This means that we might have a bigger
+ * bio than the CDB requested length (per_dev->length). That's fine
+ * only the oposite is fatal.
+ */
+ return ret;
}
-static int _prepare_one_group(struct ore_io_state *ios, u64 length,
- struct _striping_info *si)
+static int _prepare_for_striping(struct ore_io_state *ios)
{
+ struct ore_striping_info *si = &ios->si;
unsigned stripe_unit = ios->layout->stripe_unit;
unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+ unsigned group_width = ios->layout->group_width;
+ unsigned devs_in_group = group_width * mirrors_p1;
unsigned dev = si->dev;
unsigned first_dev = dev - (dev % devs_in_group);
- unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+ unsigned dev_order;
unsigned cur_pg = ios->pages_consumed;
+ u64 length = ios->length;
int ret = 0;
+ if (!ios->pages) {
+ ios->numdevs = ios->layout->mirrors_p1;
+ return 0;
+ }
+
+ BUG_ON(length > si->length);
+
+ dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
+ si->cur_comp = dev_order;
+ si->cur_pg = si->unit_off / PAGE_SIZE;
+
while (length) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+ unsigned comp = dev - first_dev;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
unsigned cur_len, page_off = 0;
if (!per_dev->length) {
per_dev->dev = dev;
- if (dev < si->dev) {
- per_dev->offset = si->obj_offset + stripe_unit -
- si->unit_off;
- cur_len = stripe_unit;
- } else if (dev == si->dev) {
+ if (dev == si->dev) {
+ WARN_ON(dev == si->par_dev);
per_dev->offset = si->obj_offset;
cur_len = stripe_unit - si->unit_off;
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off && (page_off != ios->pgbase));
- } else { /* dev > si->dev */
- per_dev->offset = si->obj_offset - si->unit_off;
+ } else {
+ if (si->cur_comp > dev_order)
+ per_dev->offset =
+ si->obj_offset - si->unit_off;
+ else /* si->cur_comp < dev_order */
+ per_dev->offset =
+ si->obj_offset + stripe_unit -
+ si->unit_off;
cur_len = stripe_unit;
}
-
- if (max_comp < dev)
- max_comp = dev;
} else {
cur_len = stripe_unit;
}
if (cur_len >= length)
cur_len = length;
- ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
- cur_len);
+ ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+ per_dev, cur_len);
if (unlikely(ret))
goto out;
@@ -423,60 +694,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
dev = (dev % devs_in_group) + first_dev;
length -= cur_len;
- }
-out:
- ios->numdevs = max_comp + mirrors_p1;
- ios->pages_consumed = cur_pg;
- return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
- u64 length = ios->length;
- u64 offset = ios->offset;
- struct _striping_info si;
- int ret = 0;
- if (!ios->pages) {
- if (ios->kern_buff) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[0];
+ si->cur_comp = (si->cur_comp + 1) % group_width;
+ if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+ if (!length && ios->sp2d) {
+ /* If we are writing and this is the very last
+ * stripe. then operate on parity dev.
+ */
+ dev = si->par_dev;
+ }
+ if (ios->sp2d)
+ /* In writes cur_len just means if it's the
+ * last one. See _ore_add_parity_unit.
+ */
+ cur_len = length;
+ per_dev = &ios->per_dev[dev - first_dev];
+ if (!per_dev->length) {
+ /* Only/always the parity unit of the first
+ * stripe will be empty. So this is a chance to
+ * initialize the per_dev info.
+ */
+ per_dev->dev = dev;
+ per_dev->offset = si->obj_offset - si->unit_off;
+ }
- _calc_stripe_info(ios->layout, ios->offset, &si);
- per_dev->offset = si.obj_offset;
- per_dev->dev = si.dev;
+ ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+ if (unlikely(ret))
+ goto out;
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (si.unit_off + ios->length >
- ios->layout->stripe_unit));
+ /* Rotate next par_dev backwards with wraping */
+ si->par_dev = (devs_in_group + si->par_dev -
+ ios->layout->parity * mirrors_p1) %
+ devs_in_group + first_dev;
+ /* Next stripe, start fresh */
+ si->cur_comp = 0;
+ si->cur_pg = 0;
}
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- while (length) {
- _calc_stripe_info(ios->layout, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
-
- ret = _prepare_one_group(ios, si.group_length, &si);
- if (unlikely(ret))
- goto out;
-
- offset += si.group_length;
- length -= si.group_length;
}
-
out:
- return ret;
+ ios->numdevs = devs_in_group;
+ ios->pages_consumed = cur_pg;
+ if (unlikely(ret)) {
+ if (length == ios->length)
+ return ret;
+ else
+ ios->length -= length;
+ }
+ return 0;
}
int ore_create(struct ore_io_state *ios)
{
int i, ret;
- for (i = 0; i < ios->comps->numdevs; i++) {
+ for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +772,7 @@ int ore_remove(struct ore_io_state *ios)
{
int i, ret;
- for (i = 0; i < ios->comps->numdevs; i++) {
+ for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -543,7 +814,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
goto out;
}
per_dev->or = or;
- per_dev->offset = master_dev->offset;
if (ios->pages) {
struct bio *bio;
@@ -562,6 +832,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
__bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
+ per_dev->offset = master_dev->offset;
per_dev->length = master_dev->length;
per_dev->bio = bio;
per_dev->dev = dev;
@@ -579,7 +850,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
_LLU(per_dev->offset),
_LLU(per_dev->length), dev);
} else if (ios->kern_buff) {
- ret = osd_req_write_kern(or, _ios_obj(ios, dev),
+ per_dev->offset = ios->si.obj_offset;
+ per_dev->dev = ios->si.dev + dev;
+
+ /* no cross device without page array */
+ BUG_ON((ios->layout->group_width > 1) &&
+ (ios->si.unit_off + ios->length >
+ ios->layout->stripe_unit));
+
+ ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
per_dev->offset,
ios->kern_buff, ios->length);
if (unlikely(ret))
@@ -588,7 +867,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
"length=0x%llx dev=%d\n",
_LLU(_ios_obj(ios, dev)->id),
_LLU(per_dev->offset),
- _LLU(ios->length), dev);
+ _LLU(ios->length), per_dev->dev);
} else {
osd_req_set_attributes(or, _ios_obj(ios, dev));
ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -614,6 +893,14 @@ int ore_write(struct ore_io_state *ios)
int i;
int ret;
+ if (unlikely(ios->sp2d && !ios->r4w)) {
+ /* A library is attempting a RAID-write without providing
+ * a pages lock interface.
+ */
+ WARN_ON_ONCE(1);
+ return -ENOTSUPP;
+ }
+
ret = _prepare_for_striping(ios);
if (unlikely(ret))
return ret;
@@ -629,7 +916,7 @@ int ore_write(struct ore_io_state *ios)
}
EXPORT_SYMBOL(ore_write);
-static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
{
struct osd_request *or;
struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -648,22 +935,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
per_dev->or = or;
if (ios->pages) {
- osd_req_read(or, obj, per_dev->offset,
- per_dev->bio, per_dev->length);
+ if (per_dev->cur_sg) {
+ /* finalize the last sg_entry */
+ _ore_add_sg_seg(per_dev, 0, false);
+ if (unlikely(!per_dev->cur_sg))
+ return 0; /* Skip parity only device */
+
+ osd_req_read_sg(or, obj, per_dev->bio,
+ per_dev->sglist, per_dev->cur_sg);
+ } else {
+ /* The no raid case */
+ osd_req_read(or, obj, per_dev->offset,
+ per_dev->bio, per_dev->length);
+ }
+
ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
- " dev=%d\n", _LLU(obj->id),
+ " dev=%d sg_len=%d\n", _LLU(obj->id),
_LLU(per_dev->offset), _LLU(per_dev->length),
- first_dev);
- } else if (ios->kern_buff) {
- int ret = osd_req_read_kern(or, obj, per_dev->offset,
- ios->kern_buff, ios->length);
- ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d ret=>%d\n",
- _LLU(obj->id), _LLU(per_dev->offset),
- _LLU(ios->length), first_dev, ret);
- if (unlikely(ret))
- return ret;
+ first_dev, per_dev->cur_sg);
} else {
+ BUG_ON(ios->kern_buff);
+
osd_req_get_attributes(or, obj);
ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
_LLU(obj->id),
@@ -688,7 +980,7 @@ int ore_read(struct ore_io_state *ios)
return ret;
for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _read_mirror(ios, i);
+ ret = _ore_read_mirror(ios, i);
if (unlikely(ret))
return ret;
}
@@ -744,31 +1036,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
}
struct _trunc_info {
- struct _striping_info si;
+ struct ore_striping_info si;
u64 prev_group_obj_off;
u64 next_group_obj_off;
unsigned first_group_dev;
unsigned nex_group_dev;
- unsigned max_devs;
};
-void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
- struct _trunc_info *ti)
+static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
+ struct _trunc_info *ti)
{
unsigned stripe_unit = layout->stripe_unit;
- _calc_stripe_info(layout, file_offset, &ti->si);
+ ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
ti->prev_group_obj_off = ti->si.M * stripe_unit;
ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
ti->nex_group_dev = ti->first_group_dev + layout->group_width;
- ti->max_devs = layout->group_width * layout->group_count;
}
-int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
u64 size)
{
struct ore_io_state *ios;
@@ -779,22 +1069,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
struct _trunc_info ti;
int i, ret;
- ret = ore_get_io_state(layout, comps, &ios);
+ ret = ore_get_io_state(layout, oc, &ios);
if (unlikely(ret))
return ret;
_calc_trunk_info(ios->layout, size, &ti);
- size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+ size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
GFP_KERNEL);
if (unlikely(!size_attrs)) {
ret = -ENOMEM;
goto out;
}
- ios->numdevs = ios->comps->numdevs;
+ ios->numdevs = ios->oc->numdevs;
- for (i = 0; i < ti.max_devs; ++i) {
+ for (i = 0; i < ios->numdevs; ++i) {
struct exofs_trunc_attr *size_attr = &size_attrs[i];
u64 obj_size;
@@ -815,7 +1105,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
size_attr->attr.val_ptr = &size_attr->newsize;
ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
- _LLU(comps->comps->obj.id), _LLU(obj_size), i);
+ _LLU(oc->comps->obj.id), _LLU(obj_size), i);
ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
&size_attr->attr);
if (unlikely(ret))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..29c47e5c4a86
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ * "Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+#include <linux/async_tx.h>
+
+#include "ore_raid.h"
+
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
+struct page *_raid_page_alloc(void)
+{
+ return alloc_page(GFP_KERNEL);
+}
+
+void _raid_page_free(struct page *p)
+{
+ __free_page(p);
+}
+
+/* This struct is forward declare in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+ /* Cache some hot path repeated calculations */
+ unsigned parity;
+ unsigned data_devs;
+ unsigned pages_in_unit;
+
+ bool needed ;
+
+ /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+ struct __1_page_stripe {
+ bool alloc;
+ unsigned write_count;
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
+
+ /* The size of this array is data_devs + parity */
+ struct page **pages;
+ struct page **scribble;
+ /* bool array, size of this array is data_devs */
+ char *page_is_read;
+ } _1p_stripes[];
+};
+
+/* This can get bigger then a page. So support multiple page allocations
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * none-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+ unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+ struct __stripe_pages_2d *sp2d;
+ unsigned data_devs = group_width - parity;
+ struct _alloc_all_bytes {
+ struct __alloc_stripe_pages_2d {
+ struct __stripe_pages_2d sp2d;
+ struct __1_page_stripe _1p_stripes[pages_in_unit];
+ } __asp2d;
+ struct __alloc_1p_arrays {
+ struct page *pages[group_width];
+ struct page *scribble[group_width];
+ char page_is_read[data_devs];
+ } __a1pa[pages_in_unit];
+ } *_aab;
+ struct __alloc_1p_arrays *__a1pa;
+ struct __alloc_1p_arrays *__a1pa_end;
+ const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+ unsigned num_a1pa, alloc_size, i;
+
+ /* FIXME: check these numbers in ore_verify_layout */
+ BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+ BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+ if (sizeof(*_aab) > PAGE_SIZE) {
+ num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+ alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+ } else {
+ num_a1pa = pages_in_unit;
+ alloc_size = sizeof(*_aab);
+ }
+
+ _aab = kzalloc(alloc_size, GFP_KERNEL);
+ if (unlikely(!_aab)) {
+ ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+ return -ENOMEM;
+ }
+
+ sp2d = &_aab->__asp2d.sp2d;
+ *psp2d = sp2d; /* From here Just call _sp2d_free */
+
+ __a1pa = _aab->__a1pa;
+ __a1pa_end = __a1pa + num_a1pa;
+
+ for (i = 0; i < pages_in_unit; ++i) {
+ if (unlikely(__a1pa >= __a1pa_end)) {
+ num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+ pages_in_unit - i);
+
+ __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+ if (unlikely(!__a1pa)) {
+ ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+ num_a1pa);
+ return -ENOMEM;
+ }
+ __a1pa_end = __a1pa + num_a1pa;
+ /* First *pages is marked for kfree of the buffer */
+ sp2d->_1p_stripes[i].alloc = true;
+ }
+
+ sp2d->_1p_stripes[i].pages = __a1pa->pages;
+ sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
+ sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+ ++__a1pa;
+ }
+
+ sp2d->parity = parity;
+ sp2d->data_devs = data_devs;
+ sp2d->pages_in_unit = pages_in_unit;
+ return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+ const struct _ore_r4w_op *r4w, void *priv)
+{
+ unsigned data_devs = sp2d->data_devs;
+ unsigned group_width = data_devs + sp2d->parity;
+ unsigned p;
+
+ if (!sp2d->needed)
+ return;
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count < group_width) {
+ unsigned c;
+
+ for (c = 0; c < data_devs; c++)
+ if (_1ps->page_is_read[c]) {
+ struct page *page = _1ps->pages[c];
+
+ r4w->put_page(priv, page);
+ _1ps->page_is_read[c] = false;
+ }
+ }
+
+ memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+ _1ps->write_count = 0;
+ _1ps->tx = NULL;
+ }
+
+ sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+ unsigned i;
+
+ if (!sp2d)
+ return;
+
+ for (i = 0; i < sp2d->pages_in_unit; ++i) {
+ if (sp2d->_1p_stripes[i].alloc)
+ kfree(sp2d->_1p_stripes[i].pages);
+ }
+
+ kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count)
+ return p;
+ }
+
+ return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+
+ for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count)
+ return p;
+ }
+
+ return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (!_1ps->write_count)
+ continue;
+
+ init_async_submit(&_1ps->submit,
+ ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
+ NULL,
+ NULL, NULL,
+ (addr_conv_t *)_1ps->scribble);
+
+ /* TODO: raid6 */
+ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
+ 0, sp2d->data_devs, PAGE_SIZE,
+ &_1ps->submit);
+ }
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ /* NOTE: We wait for HW synchronously (I don't have such HW
+ * to test with.) Is parallelism needed with today's multi
+ * cores?
+ */
+ async_tx_issue_pending(_1ps->tx);
+ }
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ struct __1_page_stripe *_1ps;
+
+ sp2d->needed = true;
+
+ _1ps = &sp2d->_1p_stripes[si->cur_pg];
+ _1ps->pages[si->cur_comp] = page;
+ ++_1ps->write_count;
+
+ si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+ /* si->cur_comp is advanced outside at main loop */
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool not_last)
+{
+ struct osd_sg_entry *sge;
+
+ ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+ "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+ per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+ _LLU(per_dev->offset), per_dev->length,
+ per_dev->last_sgs_total);
+
+ if (!per_dev->cur_sg) {
+ sge = per_dev->sglist;
+
+ /* First time we prepare two entries */
+ if (per_dev->length) {
+ ++per_dev->cur_sg;
+ sge->offset = per_dev->offset;
+ sge->len = per_dev->length;
+ } else {
+ /* Here the parity is the first unit of this object.
+ * This happens every time we reach a parity device on
+ * the same stripe as the per_dev->offset. We need to
+ * just skip this unit.
+ */
+ per_dev->offset += cur_len;
+ return;
+ }
+ } else {
+ /* finalize the last one */
+ sge = &per_dev->sglist[per_dev->cur_sg - 1];
+ sge->len = per_dev->length - per_dev->last_sgs_total;
+ }
+
+ if (not_last) {
+ /* Partly prepare the next one */
+ struct osd_sg_entry *next_sge = sge + 1;
+
+ ++per_dev->cur_sg;
+ next_sge->offset = sge->offset + sge->len + cur_len;
+ /* Save cur len so we know how mutch was added next time */
+ per_dev->last_sgs_total = per_dev->length;
+ next_sge->len = 0;
+ } else if (!sge->len) {
+ /* Optimize for when the last unit is a parity */
+ --per_dev->cur_sg;
+ }
+}
+
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+ struct ore_layout *layout = ios->layout;
+ int ret;
+ /* We want to only read those pages not in cache so worst case
+ * is a stripe populated with every other page
+ */
+ unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+ ret = _ore_get_io_state(layout, ios->oc,
+ layout->group_width * layout->mirrors_p1,
+ sgs_per_dev, 0, &ios->ios_read_4_write);
+ return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by caller. Specificaly si->dev, si->obj_offset, ...
+ */
+static int _add_to_read_4_write(struct ore_io_state *ios,
+ struct ore_striping_info *si, struct page *page)
+{
+ struct request_queue *q;
+ struct ore_per_dev_state *per_dev;
+ struct ore_io_state *read_ios;
+ unsigned first_dev = si->dev - (si->dev %
+ (ios->layout->group_width * ios->layout->mirrors_p1));
+ unsigned comp = si->dev - first_dev;
+ unsigned added_len;
+
+ if (!ios->ios_read_4_write) {
+ int ret = _alloc_read_4_write(ios);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ read_ios = ios->ios_read_4_write;
+ read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+ per_dev = &read_ios->per_dev[comp];
+ if (!per_dev->length) {
+ per_dev->bio = bio_kmalloc(GFP_KERNEL,
+ ios->sp2d->pages_in_unit);
+ if (unlikely(!per_dev->bio)) {
+ ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+ ios->sp2d->pages_in_unit);
+ return -ENOMEM;
+ }
+ per_dev->offset = si->obj_offset;
+ per_dev->dev = si->dev;
+ } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+ u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+ _ore_add_sg_seg(per_dev, gap, true);
+ }
+ q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+ added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
+ if (unlikely(added_len != PAGE_SIZE)) {
+ ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+ per_dev->bio->bi_vcnt);
+ return -ENOMEM;
+ }
+
+ per_dev->length += PAGE_SIZE;
+ return 0;
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+ struct bio_vec *bv;
+ unsigned i, d;
+
+ /* loop on all devices all pages */
+ for (d = 0; d < ios->numdevs; d++) {
+ struct bio *bio = ios->per_dev[d].bio;
+
+ if (!bio)
+ continue;
+
+ __bio_for_each_segment(bv, bio, i, 0) {
+ struct page *page = bv->bv_page;
+
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ }
+ }
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * are populating ios->sp2d[][]
+ *
+ * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
+ * These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
+ * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
+ * @uptodate=true, so we don't need to read it, only unlock, after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
+ * to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write(struct ore_io_state *ios)
+{
+ struct ore_io_state *ios_read;
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset = ios->si.first_stripe_start;
+ u64 last_stripe_end;
+ unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+ unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+ int ret;
+
+ if (offset == ios->offset) /* Go to start collect $200 */
+ goto read_last_stripe;
+
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+
+ for (c = 0; ; c++) {
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ read_si.obj_offset += min_p * PAGE_SIZE;
+ offset += min_p * PAGE_SIZE;
+ for (p = min_p; p <= max_p; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ struct page **pp = &_1ps->pages[c];
+ bool uptodate;
+
+ if (*pp)
+ /* to-be-written pages start here */
+ goto read_last_stripe;
+
+ *pp = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!*pp))
+ return -ENOMEM;
+
+ if (!uptodate)
+ _add_to_read_4_write(ios, &read_si, *pp);
+
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ read_si.obj_offset += PAGE_SIZE;
+ offset += PAGE_SIZE;
+ }
+ offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+ }
+
+read_last_stripe:
+ offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
+ PAGE_SIZE * PAGE_SIZE;
+ last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+ * bytes_in_stripe;
+ if (offset == last_stripe_end) /* Optimize for the aligned case */
+ goto read_it;
+
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ p = read_si.unit_off / PAGE_SIZE;
+ c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+ ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+
+ BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
+ /* unaligned IO must be within a single stripe */
+
+ if (min_p == sp2d->pages_in_unit) {
+ /* Didn't do it yet */
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+ }
+
+ while (offset < last_stripe_end) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if ((min_p <= p) && (p <= max_p)) {
+ struct page *page;
+ bool uptodate;
+
+ BUG_ON(_1ps->pages[c]);
+ page = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ _1ps->pages[c] = page;
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ if (!uptodate)
+ _add_to_read_4_write(ios, &read_si, page);
+ }
+
+ offset += PAGE_SIZE;
+ if (p == (sp2d->pages_in_unit - 1)) {
+ ++c;
+ p = 0;
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ } else {
+ read_si.obj_offset += PAGE_SIZE;
+ ++p;
+ }
+ }
+
+read_it:
+ ios_read = ios->ios_read_4_write;
+ if (!ios_read)
+ return 0;
+
+ /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+ * to check for per_dev->bio
+ */
+ ios_read->pages = ios->pages;
+
+ /* Now read these devices */
+ for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+ ret = _ore_read_mirror(ios_read, i);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ ret = ore_io_execute(ios_read); /* Synchronus execution */
+ if (unlikely(ret)) {
+ ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+ return ret;
+ }
+
+ _mark_read4write_pages_uptodate(ios_read, ret);
+ return 0;
+}
+
+/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ struct ore_per_dev_state *per_dev,
+ unsigned cur_len)
+{
+ if (ios->reading) {
+ BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
+ _ore_add_sg_seg(per_dev, cur_len, true);
+ } else {
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ struct page **pages = ios->parity_pages + ios->cur_par_page;
+ unsigned num_pages;
+ unsigned array_start = 0;
+ unsigned i;
+ int ret;
+
+ si->cur_pg = _sp2d_min_pg(sp2d);
+ num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+ if (!cur_len) /* If last stripe operate on parity comp */
+ si->cur_comp = sp2d->data_devs;
+
+ if (!per_dev->length) {
+ per_dev->offset += si->cur_pg * PAGE_SIZE;
+ /* If first stripe, Read in all read4write pages
+ * (if needed) before we calculate the first parity.
+ */
+ _read_4_write(ios);
+ }
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = _raid_page_alloc();
+ if (unlikely(!pages[i]))
+ return -ENOMEM;
+
+ ++(ios->cur_par_page);
+ }
+
+ BUG_ON(si->cur_comp != sp2d->data_devs);
+ BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
+
+ ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
+ per_dev, num_pages * PAGE_SIZE);
+ if (unlikely(ret))
+ return ret;
+
+ /* TODO: raid6 if (last_parity_dev) */
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
+ }
+ return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+ struct ore_layout *layout = ios->layout;
+
+ if (ios->parity_pages) {
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+ unsigned stripe_size = ios->si.bytes_in_stripe;
+ u64 last_stripe, first_stripe;
+
+ if (_sp2d_alloc(pages_in_unit, layout->group_width,
+ layout->parity, &ios->sp2d)) {
+ return -ENOMEM;
+ }
+
+ BUG_ON(ios->offset % PAGE_SIZE);
+
+ /* Round io down to last full strip */
+ first_stripe = div_u64(ios->offset, stripe_size);
+ last_stripe = div_u64(ios->offset + ios->length, stripe_size);
+
+ /* If an IO spans more then a single stripe it must end at
+ * a stripe boundary. The reminder at the end is pushed into the
+ * next IO.
+ */
+ if (last_stripe != first_stripe) {
+ ios->length = last_stripe * stripe_size - ios->offset;
+
+ BUG_ON(!ios->length);
+ ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ ios->si.length = ios->length; /*make it consistent */
+ }
+ }
+ return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+ if (ios->sp2d) { /* writing and raid */
+ unsigned i;
+
+ for (i = 0; i < ios->cur_par_page; i++) {
+ struct page *page = ios->parity_pages[i];
+
+ if (page)
+ _raid_page_free(page);
+ }
+ if (ios->extra_part_alloc)
+ kfree(ios->parity_pages);
+ /* If IO returned an error pages might need unlocking */
+ _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+ _sp2d_free(ios->sp2d);
+ } else {
+ /* Will only be set if raid reading && sglist is big */
+ if (ios->extra_part_alloc)
+ kfree(ios->per_dev[0].sglist);
+ }
+ if (ios->ios_read_4_write)
+ ore_put_io_state(ios->ios_read_4_write);
+}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 000000000000..2ffd2c3c6e46
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ * "Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+ printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+ do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* Calculate the component order in a stripe. eg the logical data unit
+ * address within the stripe of @dev given the @par_dev of this stripe.
+ */
+static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
+ unsigned par_dev, unsigned dev)
+{
+ unsigned first_dev = dev - dev % devs_in_group;
+
+ dev -= first_dev;
+ par_dev -= first_dev;
+
+ if (devs_in_group == par_dev) /* The raid 0 case */
+ return dev / mirrors_p1;
+ /* raid4/5/6 case */
+ return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
+ mirrors_p1;
+}
+
+/* ios_raid.c stuff needed by ios.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+ struct ore_per_dev_state *per_dev, unsigned cur_len);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ if (!sp2d) /* Inline the fast path */
+ return; /* Hay no raid stuff */
+ _ore_add_stripe_page(sp2d, si, page);
+}
+
+/* ios.c stuff needed by ios_raid.c */
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios);
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct page **pages,
+ struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 274894053b02..057b237b8b69 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
struct ore_io_state *ios;
int ret;
- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
@@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
struct ore_io_state *ios;
int ret;
- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
@@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops;
/*
* Write the superblock to the OSD
*/
-int exofs_sync_fs(struct super_block *sb, int wait)
+static int exofs_sync_fs(struct super_block *sb, int wait)
{
struct exofs_sb_info *sbi;
struct exofs_fscb *fscb;
struct ore_comp one_comp;
- struct ore_components comps;
+ struct ore_components oc;
struct ore_io_state *ios;
int ret = -ENOMEM;
@@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
* the writeable info is set in exofs_sbi_write_stats() above.
*/
- exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
+ exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
- ret = ore_get_io_state(&sbi->layout, &comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oc, &ios);
if (unlikely(ret))
goto out;
@@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
msg, dev_path ?: "", odi->osdname, _LLU(pid));
}
-void exofs_free_sbi(struct exofs_sb_info *sbi)
+static void exofs_free_sbi(struct exofs_sb_info *sbi)
{
- while (sbi->comps.numdevs) {
- int i = --sbi->comps.numdevs;
- struct osd_dev *od = sbi->comps.ods[i];
+ unsigned numdevs = sbi->oc.numdevs;
+
+ while (numdevs) {
+ unsigned i = --numdevs;
+ struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
if (od) {
- sbi->comps.ods[i] = NULL;
+ ore_comp_set_dev(&sbi->oc, i, NULL);
osduld_put_device(od);
}
}
- if (sbi->comps.ods != sbi->_min_one_dev)
- kfree(sbi->comps.ods);
+ kfree(sbi->oc.ods);
kfree(sbi);
}
@@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb)
msecs_to_jiffies(100));
}
- _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
+ _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
bdi_destroy(&sbi->bdi);
@@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb)
static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_device_table *dt)
{
- u64 stripe_length;
+ int ret;
- sbi->data_map.odm_num_comps =
- le32_to_cpu(dt->dt_data_map.cb_num_comps);
- sbi->data_map.odm_stripe_unit =
+ sbi->layout.stripe_unit =
le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->data_map.odm_group_width =
+ sbi->layout.group_width =
le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->data_map.odm_group_depth =
+ sbi->layout.group_depth =
le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->data_map.odm_mirror_cnt =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
- sbi->data_map.odm_raid_algorithm =
+ sbi->layout.mirrors_p1 =
+ le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+ sbi->layout.raid_algorithm =
le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-/* FIXME: Only raid0 for now. if not so, do not mount */
- if (sbi->data_map.odm_num_comps != numdevs) {
- EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
- sbi->data_map.odm_num_comps, numdevs);
- return -EINVAL;
- }
- if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
- EXOFS_ERR("Only RAID_0 for now\n");
- return -EINVAL;
- }
- if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
- EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
- numdevs, sbi->data_map.odm_mirror_cnt);
- return -EINVAL;
- }
-
- if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
- EXOFS_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
-
- sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
- sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-
- if (sbi->data_map.odm_group_width) {
- sbi->layout.group_width = sbi->data_map.odm_group_width;
- sbi->layout.group_depth = sbi->data_map.odm_group_depth;
- if (!sbi->layout.group_depth) {
- EXOFS_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- sbi->layout.group_count = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1 /
- sbi->data_map.odm_group_width;
- } else {
- if (sbi->data_map.odm_group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %d\n",
- sbi->data_map.odm_group_depth);
- sbi->data_map.odm_group_depth = 0;
- }
- sbi->layout.group_width = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1;
- sbi->layout.group_depth = -1;
- sbi->layout.group_count = 1;
- }
-
- stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- EXOFS_ERR("Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -EINVAL;
- }
+ ret = ore_verify_layout(numdevs, &sbi->layout);
EXOFS_DBGMSG("exofs: layout: "
"num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
sbi->layout.group_width,
_LLU(sbi->layout.group_depth),
sbi->layout.mirrors_p1,
- sbi->data_map.odm_raid_algorithm);
- return 0;
+ sbi->layout.raid_algorithm);
+ return ret;
}
static unsigned __ra_pages(struct ore_layout *layout)
@@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}
+int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+ struct exofs_dev **peds)
+{
+ struct __alloc_ore_devs_and_exofs_devs {
+ /* Twice bigger table: See exofs_init_comps() and comment at
+ * exofs_read_lookup_dev_table()
+ */
+ struct ore_dev *oreds[numdevs * 2 - 1];
+ struct exofs_dev eds[numdevs];
+ } *aoded;
+ struct exofs_dev *eds;
+ unsigned i;
+
+ aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+ if (unlikely(!aoded)) {
+ EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+ numdevs);
+ return -ENOMEM;
+ }
+
+ sbi->oc.ods = aoded->oreds;
+ *peds = eds = aoded->eds;
+ for (i = 0; i < numdevs; ++i)
+ aoded->oreds[i] = &eds[i].ored;
+ return 0;
+}
+
static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
struct osd_dev *fscb_od,
unsigned table_count)
{
struct ore_comp comp;
struct exofs_device_table *dt;
+ struct exofs_dev *eds;
unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
sizeof(*dt);
unsigned numdevs, i;
@@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
return -ENOMEM;
}
- sbi->comps.numdevs = 0;
+ sbi->oc.numdevs = 0;
comp.obj.partition = sbi->one_comp.obj.partition;
comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
if (unlikely(ret))
goto out;
- if (likely(numdevs > 1)) {
- unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
-
- /* Twice bigger table: See exofs_init_comps() and below
- * comment
- */
- sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
- if (unlikely(!sbi->comps.ods)) {
- EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
- numdevs);
- ret = -ENOMEM;
- goto out;
- }
- }
+ ret = __alloc_dev_table(sbi, numdevs, &eds);
+ if (unlikely(ret))
+ goto out;
+ /* exofs round-robins the device table view according to inode
+ * number. We hold a: twice bigger table hence inodes can point
+ * to any device and have a sequential view of the table
+ * starting at this device. See exofs_init_comps()
+ */
+ memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+ (numdevs - 1) * sizeof(sbi->oc.ods[0]));
for (i = 0; i < numdevs; i++) {
struct exofs_fscb fscb;
@@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
i, odi.osdname);
+ /* the exofs id is currently the table index */
+ eds[i].did = i;
+
/* On all devices the device table is identical. The user can
* specify any one of the participating devices on the command
* line. We always keep them in device-table order.
*/
if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- sbi->comps.ods[i] = fscb_od;
- ++sbi->comps.numdevs;
+ eds[i].ored.od = fscb_od;
+ ++sbi->oc.numdevs;
fscb_od = NULL;
continue;
}
@@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
goto out;
}
- sbi->comps.ods[i] = od;
- ++sbi->comps.numdevs;
+ eds[i].ored.od = od;
+ ++sbi->oc.numdevs;
/* Read the fscb of the other devices to make sure the FS
* partition is there.
@@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
out:
kfree(dt);
- if (likely(!ret)) {
- unsigned numdevs = sbi->comps.numdevs;
-
- if (unlikely(fscb_od)) {
+ if (unlikely(fscb_od && !ret)) {
EXOFS_ERR("ERROR: Bad device-table container device not present\n");
osduld_put_device(fscb_od);
return -EINVAL;
- }
- /* exofs round-robins the device table view according to inode
- * number. We hold a: twice bigger table hence inodes can point
- * to any device and have a sequential view of the table
- * starting at this device. See exofs_init_comps()
- */
- for (i = 0; i < numdevs - 1; ++i)
- sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
}
return ret;
}
@@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sbi->one_comp.obj.partition = opts->pid;
sbi->one_comp.obj.id = 0;
exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
- sbi->comps.numdevs = 1;
- sbi->comps.single_comp = EC_SINGLE_COMP;
- sbi->comps.comps = &sbi->one_comp;
- sbi->comps.ods = sbi->_min_one_dev;
+ sbi->oc.numdevs = 1;
+ sbi->oc.single_comp = EC_SINGLE_COMP;
+ sbi->oc.comps = &sbi->one_comp;
/* fill in some other data by hand */
memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
if (unlikely(ret))
goto free_sbi;
} else {
- sbi->comps.ods[0] = od;
+ struct exofs_dev *eds;
+
+ ret = __alloc_dev_table(sbi, 1, &eds);
+ if (unlikely(ret))
+ goto free_sbi;
+
+ ore_comp_set_dev(&sbi->oc, 0, od);
}
__sbi_read_stats(sbi);
@@ -875,7 +841,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
+ _exofs_print_device("Mounting", opts->dev_name,
+ ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
return 0;
@@ -924,7 +891,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
uint64_t used = ULLONG_MAX;
int ret;
- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (ret) {
EXOFS_DBGMSG("ore_get_io_state failed.\n");
return ret;
@@ -981,7 +948,7 @@ static const struct super_operations exofs_sops = {
* EXPORT OPERATIONS
*****************************************************************************/
-struct dentry *exofs_get_parent(struct dentry *child)
+static struct dentry *exofs_get_parent(struct dentry *child)
{
unsigned long ino = exofs_parent_ino(child);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index af9fc89b1b2d..9a4e5e206d08 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
struct dentry *ext2_get_parent(struct dentry *child);
/* super.c */
-extern void ext2_error (struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern void ext2_msg(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void ext2_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext2_msg(struct super_block *, const char *, const char *, ...);
extern void ext2_update_dynamic_rev (struct super_block *sb);
extern void ext2_write_super (struct super_block *);
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 5d979b4347b0..c922adc8ef41 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
value, size, flags);
}
-int
-ext2_init_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext2_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext2_initxattrs, NULL);
+}
+
const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext2_xattr_security_list,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index b8d9f83aa5c5..3c218b8a51d4 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext3_xattr_set_handle(handle, inode,
+ EXT3_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext3_initxattrs, handle);
+}
+
const struct xattr_handler ext3_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext3_xattr_security_list,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b7d7bd0f066e..cec3145e532c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1878,40 +1878,40 @@ extern int ext4_group_extend(struct super_block *sb,
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
-extern void __ext4_error(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ext4_error(struct super_block *, const char *, unsigned int,
+ const char *, ...);
#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
__LINE__, ## message)
-extern void ext4_error_inode(struct inode *, const char *, unsigned int,
- ext4_fsblk_t, const char *, ...)
- __attribute__ ((format (printf, 5, 6)));
-extern void ext4_error_file(struct file *, const char *, unsigned int,
- ext4_fsblk_t, const char *, ...)
- __attribute__ ((format (printf, 5, 6)));
+extern __printf(5, 6)
+void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern __printf(5, 6)
+void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
-extern void __ext4_abort(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ext4_abort(struct super_block *, const char *, unsigned int,
+ const char *, ...);
#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
__LINE__, ## message)
-extern void __ext4_warning(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ext4_warning(struct super_block *, const char *, unsigned int,
+ const char *, ...);
#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
__LINE__, ## message)
-extern void ext4_msg(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
const char *, unsigned int, const char *);
#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
__LINE__, msg)
-extern void __ext4_grp_locked_error(const char *, unsigned int, \
- struct super_block *, ext4_group_t, \
- unsigned long, ext4_fsblk_t, \
- const char *, ...)
- __attribute__ ((format (printf, 7, 8)));
+extern __printf(7, 8)
+void __ext4_grp_locked_error(const char *, unsigned int,
+ struct super_block *, ext4_group_t,
+ unsigned long, ext4_fsblk_t,
+ const char *, ...);
#define ext4_grp_locked_error(sb, grp, message...) \
__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
extern void ext4_update_dynamic_rev(struct super_block *sb);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e4095e988eba..b9548f477bb8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
else
maxbytes = inode->i_sb->s_maxbytes;
- mutex_lock(&inode->i_mutex);
- switch (origin) {
- case SEEK_END:
- offset += inode->i_size;
- break;
- case SEEK_CUR:
- if (offset == 0) {
- mutex_unlock(&inode->i_mutex);
- return file->f_pos;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- /*
- * In the generic case the entire file is data, so as long as
- * offset isn't at the end of the file then the offset is data.
- */
- if (offset >= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
- return -ENXIO;
- }
- break;
- case SEEK_HOLE:
- /*
- * There is a virtual hole at the end of the file, so as long as
- * offset isn't i_size or larger, return i_size.
- */
- if (offset >= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
- return -ENXIO;
- }
- offset = inode->i_size;
- break;
- }
-
- if (offset < 0 || offset > maxbytes) {
- mutex_unlock(&inode->i_mutex);
- return -EINVAL;
- }
-
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- mutex_unlock(&inode->i_mutex);
- return offset;
+ return generic_file_llseek_size(file, offset, origin, maxbytes);
}
const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986e2388f031..0defe0bfe019 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1811,8 +1811,12 @@ static int ext4_writepage(struct page *page,
* We don't want to do block allocation, so redirty
* the page and return. We may reach here when we do
* a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list
+ * We can also reach here via shrink_page_list but it
+ * should never be for direct reclaim so warn if that
+ * happens
*/
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC);
goto redirty_page;
}
if (commit_write)
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 007c3bfbf094..34e4350dd4d9 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext4_initxattrs, handle);
+}
+
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext4_xattr_security_list,
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 5efbd5d7701a..aca191bd5f8f 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
} else {
if (uni_xlate == 1) {
*op++ = ':';
- op = pack_hex_byte(op, ec >> 8);
- op = pack_hex_byte(op, ec);
+ op = hex_byte_pack(op, ec >> 8);
+ op = hex_byte_pack(op, ec);
len -= 5;
} else {
*op++ = '?';
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a5d3853822e0..1510a4d51990 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
/* fat/misc.c */
-extern void
-__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4))) __cold;
+extern __printf(3, 4) __cold
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
#define fat_fs_error(sb, fmt, args...) \
__fat_fs_error(sb, 1, fmt , ## args)
#define fat_fs_error_ratelimit(sb, fmt, args...) \
__fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
-void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4))) __cold;
+__printf(3, 4) __cold
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 34501b64bc47..65978d7885c8 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
iattr.ia_valid = ATTR_MODE;
iattr.ia_mode = mode;
- error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
+ error = gfs2_setattr_simple(inode, &iattr);
}
return error;
@@ -160,6 +160,7 @@ out:
int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
{
+ struct inode *inode = &ip->i_inode;
struct posix_acl *acl;
char *data;
unsigned int len;
@@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
if (IS_ERR(acl))
return PTR_ERR(acl);
if (!acl)
- return gfs2_setattr_simple(ip, attr);
+ return gfs2_setattr_simple(inode, attr);
error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
if (error)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f9fbbe96c222..4858e1fed8b1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (&ip->i_inode == sdp->sd_rindex)
rblocks += 2 * RES_STATFS;
if (alloc_required)
- rblocks += gfs2_rg_blocks(al);
+ rblocks += gfs2_rg_blocks(ip);
error = gfs2_trans_begin(sdp, rblocks,
PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
u64 to = pos + copied;
void *kaddr;
unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
- struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
kaddr = kmap_atomic(page, KM_USER0);
@@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
if (copied) {
if (inode->i_size < to)
i_size_write(inode, to);
- gfs2_dinode_out(ip, di);
mark_inode_dirty(inode);
}
@@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
gfs2_page_add_databufs(ip, page, from, to);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (ret > 0) {
- gfs2_dinode_out(ip, dibh->b_data);
- mark_inode_dirty(inode);
- }
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7878c473ae62..41d494d79709 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
@@ -36,11 +37,6 @@ struct metapath {
__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
-typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
- struct buffer_head *bh, __be64 *top,
- __be64 *bottom, unsigned int height,
- void *data);
-
struct strip_mine {
int sm_first;
unsigned int sm_height;
@@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
}
+static void gfs2_metapath_ra(struct gfs2_glock *gl,
+ const struct buffer_head *bh, const __be64 *pos)
+{
+ struct buffer_head *rabh;
+ const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
+ const __be64 *t;
+
+ for (t = pos; t < endp; t++) {
+ if (!*t)
+ continue;
+
+ rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
+ if (trylock_buffer(rabh)) {
+ if (!buffer_uptodate(rabh)) {
+ rabh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, rabh);
+ continue;
+ }
+ unlock_buffer(rabh);
+ }
+ brelse(rabh);
+ }
+}
+
/**
* lookup_metapath - Walk the metadata tree to a specific point
* @ip: The inode
@@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct super_block *sb = sdp->sd_vfs;
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn, dblock = 0;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = height - 1;
+ int ret;
int eob = 0;
enum alloc_state state;
__be64 *ptr;
@@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
dblock = bn;
while (n-- > 0)
*ptr++ = cpu_to_be64(bn++);
+ if (buffer_zeronew(bh_map)) {
+ ret = sb_issue_zeroout(sb, dblock, dblks,
+ GFP_NOFS);
+ if (ret) {
+ fs_err(sdp,
+ "Failed to zero data buffers\n");
+ clear_buffer_zeronew(bh_map);
+ }
+ }
break;
}
} while ((state != ALLOC_DATA) || !dblock);
@@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
}
/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @bc: the call to make for each piece of metadata
- * @data: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct metapath *mp, unsigned int height,
- u64 block, int first, block_call_t bc,
- void *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *bh = NULL;
- __be64 *top, *bottom;
- u64 bn;
- int error;
- int mh_size = sizeof(struct gfs2_meta_header);
-
- if (!height) {
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- return error;
- dibh = bh;
-
- top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
- bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
- } else {
- error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
- if (error)
- return error;
-
- top = (__be64 *)(bh->b_data + mh_size) +
- (first ? mp->mp_list[height] : 0);
-
- bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
- }
-
- error = bc(ip, dibh, bh, top, bottom, height, data);
- if (error)
- goto out;
-
- if (height < ip->i_height - 1)
- for (; top < bottom; top++, first = 0) {
- if (!*top)
- continue;
-
- bn = be64_to_cpu(*top);
-
- error = recursive_scan(ip, dibh, mp, height + 1, bn,
- first, bc, data);
- if (error)
- break;
- }
-
-out:
- brelse(bh);
- return error;
-}
-
-/**
* do_strip - Look for a layer a particular layer of the file and strip it off
* @ip: the inode
* @dibh: the dinode buffer
@@ -752,9 +713,8 @@ out:
static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
struct buffer_head *bh, __be64 *top, __be64 *bottom,
- unsigned int height, void *data)
+ unsigned int height, struct strip_mine *sm)
{
- struct strip_mine *sm = data;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrp_list rlist;
u64 bn, bstart;
@@ -783,11 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
else if (ip->i_depth)
revokes = sdp->sd_inptrs;
- if (ip != GFS2_I(sdp->sd_rindex))
- error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
- else if (!sdp->sd_rgrps)
- error = gfs2_ri_update(ip);
-
if (error)
return error;
@@ -805,7 +760,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
blen++;
else {
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
bstart = bn;
blen = 1;
@@ -813,7 +768,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
}
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
else
goto out; /* Nothing to do */
@@ -887,12 +842,82 @@ out_rg_gunlock:
out_rlist:
gfs2_rlist_free(&rlist);
out:
- if (ip != GFS2_I(sdp->sd_rindex))
- gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
return error;
}
/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @sm: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct metapath *mp, unsigned int height,
+ u64 block, int first, struct strip_mine *sm)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *bh = NULL;
+ __be64 *top, *bottom;
+ u64 bn;
+ int error;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ if (!height) {
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return error;
+ dibh = bh;
+
+ top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+ bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+ } else {
+ error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+ if (error)
+ return error;
+
+ top = (__be64 *)(bh->b_data + mh_size) +
+ (first ? mp->mp_list[height] : 0);
+
+ bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+ }
+
+ error = do_strip(ip, dibh, bh, top, bottom, height, sm);
+ if (error)
+ goto out;
+
+ if (height < ip->i_height - 1) {
+
+ gfs2_metapath_ra(ip->i_gl, bh, top);
+
+ for (; top < bottom; top++, first = 0) {
+ if (!*top)
+ continue;
+
+ bn = be64_to_cpu(*top);
+
+ error = recursive_scan(ip, dibh, mp, height + 1, bn,
+ first, sm);
+ if (error)
+ break;
+ }
+ }
+out:
+ brelse(bh);
+ return error;
+}
+
+
+/**
* gfs2_block_truncate_page - Deal with zeroing out data for truncate
*
* This is partly borrowed from ext3.
@@ -1031,7 +1056,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
sm.sm_first = !!size;
sm.sm_height = height;
- error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
+ error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
if (error)
break;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1cc2f8ec52a2..8ccad2467cb6 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -240,16 +240,15 @@ fail:
return error;
}
-static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
- u64 offset, unsigned int size)
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf,
+ unsigned int size)
{
struct buffer_head *dibh;
int error;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- offset += sizeof(struct gfs2_dinode);
- memcpy(buf, dibh->b_data + offset, size);
+ memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
brelse(dibh);
}
@@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
* gfs2_dir_read_data - Read a data from a directory inode
* @ip: The GFS2 Inode
* @buf: The buffer to place result into
- * @offset: File offset to begin jdata_readng from
* @size: Amount of data to transfer
*
* Returns: The amount of data actually copied or the error
*/
-static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
- unsigned int size, unsigned ra)
+static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
+ unsigned int size)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
u64 lblock, dblock;
@@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
unsigned int o;
int copied = 0;
int error = 0;
- u64 disksize = i_size_read(&ip->i_inode);
-
- if (offset >= disksize)
- return 0;
-
- if (offset + size > disksize)
- size = disksize - offset;
-
- if (!size)
- return 0;
if (gfs2_is_stuffed(ip))
- return gfs2_dir_read_stuffed(ip, buf, offset, size);
+ return gfs2_dir_read_stuffed(ip, buf, size);
if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
return -EINVAL;
- lblock = offset;
+ lblock = 0;
o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
while (copied < size) {
@@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
if (error || !dblock)
goto fail;
BUG_ON(extlen < 1);
- if (!ra)
- extlen = 1;
bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
} else {
error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
@@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
extlen--;
memcpy(buf, bh->b_data + o, amount);
brelse(bh);
- buf += amount;
+ buf += (amount/sizeof(__be64));
copied += amount;
lblock++;
o = sizeof(struct gfs2_meta_header);
@@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
if (hc == NULL)
return ERR_PTR(-ENOMEM);
- ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1);
+ ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
kfree(hc);
return ERR_PTR(ret);
@@ -1695,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
- int error;
/* Returns _either_ the entry (if its first in block) or the
previous entry otherwise */
@@ -1724,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
}
brelse(bh);
- error = gfs2_meta_inode_buffer(dip, &bh);
- if (error)
- return error;
-
if (!dip->i_entries)
gfs2_consist_inode(dip);
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
dip->i_entries--;
dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
if (S_ISDIR(dentry->d_inode->i_mode))
drop_nlink(&dip->i_inode);
- gfs2_dinode_out(dip, bh->b_data);
- brelse(bh);
mark_inode_dirty(&dip->i_inode);
- return error;
+ return 0;
}
/**
@@ -1829,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (error)
goto out_put;
- error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
- if (error)
- goto out_qs;
-
/* Count the number of leaves */
bh = leaf_bh;
@@ -1847,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (blk != leaf_no)
brelse(bh);
- gfs2_rlist_add(sdp, &rlist, blk);
+ gfs2_rlist_add(dip, &rlist, blk);
l_blocks++;
}
@@ -1911,8 +1885,6 @@ out_rg_gunlock:
gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
out_rlist:
gfs2_rlist_free(&rlist);
- gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
-out_qs:
gfs2_quota_unhold(dip);
out_put:
gfs2_alloc_put(dip);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index edeb9e802903..ce36a56dfeac 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
struct gfs2_holder i_gh;
loff_t error;
- if (origin == 2) {
+ switch (origin) {
+ case SEEK_END: /* These reference inode->i_size */
+ case SEEK_DATA:
+ case SEEK_HOLE:
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = generic_file_llseek_unlocked(file, offset, origin);
+ error = generic_file_llseek(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
- } else
- error = generic_file_llseek_unlocked(file, offset, origin);
+ break;
+ case SEEK_CUR:
+ case SEEK_SET:
+ error = generic_file_llseek(file, offset, origin);
+ break;
+ default:
+ error = -EINVAL;
+ }
return error;
}
@@ -357,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned int data_blocks, ind_blocks, rblocks;
struct gfs2_holder gh;
struct gfs2_alloc *al;
+ loff_t size;
int ret;
+ /* Wait if fs is frozen. This is racy so we check again later on
+ * and retry if the fs has been frozen after the page lock has
+ * been acquired
+ */
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
@@ -367,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
- if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
+ if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+ lock_page(page);
+ if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+ ret = -EAGAIN;
+ unlock_page(page);
+ }
goto out_unlock;
+ }
+
ret = -ENOMEM;
al = gfs2_alloc_get(ip);
if (al == NULL)
@@ -388,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
rblocks += data_blocks ? data_blocks : 1;
if (ind_blocks || data_blocks) {
rblocks += RES_STATFS + RES_QUOTA;
- rblocks += gfs2_rg_blocks(al);
+ rblocks += gfs2_rg_blocks(ip);
}
ret = gfs2_trans_begin(sdp, rblocks, 0);
if (ret)
@@ -396,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
ret = -EINVAL;
- last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
- if (page->index > last_index)
- goto out_unlock_page;
+ size = i_size_read(inode);
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ /* Check page index against inode size */
+ if (size == 0 || (page->index > last_index))
+ goto out_trans_end;
+
+ ret = -EAGAIN;
+ /* If truncated, we must retry the operation, we may have raced
+ * with the glock demotion code.
+ */
+ if (!PageUptodate(page) || page->mapping != inode->i_mapping)
+ goto out_trans_end;
+
+ /* Unstuff, if required, and allocate backing blocks for page */
ret = 0;
- if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
- goto out_unlock_page;
- if (gfs2_is_stuffed(ip)) {
+ if (gfs2_is_stuffed(ip))
ret = gfs2_unstuff_dinode(ip, page);
- if (ret)
- goto out_unlock_page;
- }
- ret = gfs2_allocate_page_backing(page);
+ if (ret == 0)
+ ret = gfs2_allocate_page_backing(page);
-out_unlock_page:
- unlock_page(page);
+out_trans_end:
+ if (ret)
+ unlock_page(page);
gfs2_trans_end(sdp);
out_trans_fail:
gfs2_inplace_release(ip);
@@ -422,11 +453,17 @@ out_unlock:
gfs2_glock_dq(&gh);
out:
gfs2_holder_uninit(&gh);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else if (ret)
- ret = VM_FAULT_SIGBUS;
- return ret;
+ if (ret == 0) {
+ set_page_dirty(page);
+ /* This check must be post dropping of transaction lock */
+ if (inode->i_sb->s_frozen == SB_UNFROZEN) {
+ wait_on_page_writeback(page);
+ } else {
+ ret = -EAGAIN;
+ unlock_page(page);
+ }
+ }
+ return block_page_mkwrite_return(ret);
}
static const struct vm_operations_struct gfs2_vm_ops = {
@@ -551,8 +588,16 @@ static int gfs2_close(struct inode *inode, struct file *file)
* @end: the end position in the file to sync
* @datasync: set if we can ignore timestamp changes
*
- * The VFS will flush data for us. We only need to worry
- * about metadata here.
+ * We split the data flushing here so that we don't wait for the data
+ * until after we've also sent the metadata to disk. Note that for
+ * data=ordered, we will write & wait for the data at the log flush
+ * stage anyway, so this is unlikely to make much of a difference
+ * except in the data=writeback case.
+ *
+ * If the fdatawrite fails due to any reason except -EIO, we will
+ * continue the remainder of the fsync, although we'll still report
+ * the error at the end. This is to match filemap_write_and_wait_range()
+ * behaviour.
*
* Returns: errno
*/
@@ -560,30 +605,34 @@ static int gfs2_close(struct inode *inode, struct file *file)
static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
struct gfs2_inode *ip = GFS2_I(inode);
- int ret;
+ int ret, ret1 = 0;
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
+ if (mapping->nrpages) {
+ ret1 = filemap_fdatawrite_range(mapping, start, end);
+ if (ret1 == -EIO)
+ return ret1;
+ }
if (datasync)
sync_state &= ~I_DIRTY_SYNC;
if (sync_state) {
ret = sync_inode_metadata(inode, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
+ if (ret)
return ret;
- }
- gfs2_ail_flush(ip->i_gl);
+ if (gfs2_is_jdata(ip))
+ filemap_write_and_wait(mapping);
+ gfs2_ail_flush(ip->i_gl, 1);
}
- mutex_unlock(&inode->i_mutex);
- return 0;
+ if (mapping->nrpages)
+ ret = filemap_fdatawait_range(mapping, start, end);
+
+ return ret ? ret : ret1;
}
/**
@@ -620,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
-static int empty_write_end(struct page *page, unsigned from,
- unsigned to, int mode)
-{
- struct inode *inode = page->mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct buffer_head *bh;
- unsigned offset, blksize = 1 << inode->i_blkbits;
- pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-
- zero_user(page, from, to-from);
- mark_page_accessed(page);
-
- if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
- if (!gfs2_is_writeback(ip))
- gfs2_page_add_databufs(ip, page, from, to);
-
- block_commit_write(page, from, to);
- return 0;
- }
-
- offset = 0;
- bh = page_buffers(page);
- while (offset < to) {
- if (offset >= from) {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- clear_buffer_new(bh);
- write_dirty_buffer(bh, WRITE);
- }
- offset += blksize;
- bh = bh->b_this_page;
- }
-
- offset = 0;
- bh = page_buffers(page);
- while (offset < to) {
- if (offset >= from) {
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- return -EIO;
- }
- offset += blksize;
- bh = bh->b_this_page;
- }
- return 0;
-}
-
-static int needs_empty_write(sector_t block, struct inode *inode)
-{
- int error;
- struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-
- bh_map.b_size = 1 << inode->i_blkbits;
- error = gfs2_block_map(inode, block, &bh_map, 0);
- if (unlikely(error))
- return error;
- return !buffer_mapped(&bh_map);
-}
-
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
- int mode)
-{
- struct inode *inode = page->mapping->host;
- unsigned start, end, next, blksize;
- sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- int ret;
-
- blksize = 1 << inode->i_blkbits;
- next = end = 0;
- while (next < from) {
- next += blksize;
- block++;
- }
- start = next;
- do {
- next += blksize;
- ret = needs_empty_write(block, inode);
- if (unlikely(ret < 0))
- return ret;
- if (ret == 0) {
- if (end) {
- ret = __block_write_begin(page, start, end - start,
- gfs2_block_map);
- if (unlikely(ret))
- return ret;
- ret = empty_write_end(page, start, end, mode);
- if (unlikely(ret))
- return ret;
- end = 0;
- }
- start = next;
- }
- else
- end = next;
- block++;
- } while (next < to);
-
- if (end) {
- ret = __block_write_begin(page, start, end - start, gfs2_block_map);
- if (unlikely(ret))
- return ret;
- ret = empty_write_end(page, start, end, mode);
- if (unlikely(ret))
- return ret;
- }
-
- return 0;
-}
-
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
int mode)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
int error;
- u64 start = offset >> PAGE_CACHE_SHIFT;
- unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
- u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
- pgoff_t curr;
- struct page *page;
- unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
- unsigned int from, to;
-
- if (!end_offset)
- end_offset = PAGE_CACHE_SIZE;
+ unsigned int nr_blks;
+ sector_t lblock = offset >> inode->i_blkbits;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(error))
- goto out;
+ return error;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -758,40 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
- curr = start;
- offset = start << PAGE_CACHE_SHIFT;
- from = start_offset;
- to = PAGE_CACHE_SIZE;
- while (curr <= end) {
- page = grab_cache_page_write_begin(inode->i_mapping, curr,
- AOP_FLAG_NOFS);
- if (unlikely(!page)) {
- error = -ENOMEM;
- goto out;
- }
+ while (len) {
+ struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+ bh_map.b_size = len;
+ set_buffer_zeronew(&bh_map);
- if (curr == end)
- to = end_offset;
- error = write_empty_blocks(page, from, to, mode);
- if (!error && offset + to > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- i_size_write(inode, offset + to);
- }
- unlock_page(page);
- page_cache_release(page);
- if (error)
+ error = gfs2_block_map(inode, lblock, &bh_map, 1);
+ if (unlikely(error))
goto out;
- curr++;
- offset += PAGE_CACHE_SIZE;
- from = 0;
+ len -= bh_map.b_size;
+ nr_blks = bh_map.b_size >> inode->i_blkbits;
+ lblock += nr_blks;
+ if (!buffer_new(&bh_map))
+ continue;
+ if (unlikely(!buffer_zeronew(&bh_map))) {
+ error = -EIO;
+ goto out;
+ }
}
+ if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
+ i_size_write(inode, offset + len);
- gfs2_dinode_out(ip, dibh->b_data);
mark_inode_dirty(inode);
- brelse(dibh);
-
out:
+ brelse(dibh);
return error;
}
@@ -799,7 +722,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
unsigned int *data_blocks, unsigned int *ind_blocks)
{
const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+ unsigned int max_blocks = ip->i_rgd->rd_free_clone;
unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
for (tmp = max_data; tmp > sdp->sd_diptrs;) {
@@ -831,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
int error;
loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+ loff_t max_chunk_size = UINT_MAX & bsize_mask;
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -884,11 +808,12 @@ retry:
goto out_qunlock;
}
max_bytes = bytes;
- calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+ calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
+ &max_bytes, &data_blocks, &ind_blocks);
al->al_requested = data_blocks + ind_blocks;
rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
- RES_RG_HDR + gfs2_rg_blocks(al);
+ RES_RG_HDR + gfs2_rg_blocks(ip);
if (gfs2_is_jdata(ip))
rblocks += data_blocks ? data_blocks : 1;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 66707118af25..2553b858a72e 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
-__attribute__ ((format(printf, 2, 3)))
+__printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index da21ecaafcc2..78418b4fa857 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,40 +28,55 @@
#include "trans.h"
#include "dir.h"
+static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
+{
+ fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+ bh, (unsigned long long)bh->b_blocknr, bh->b_state,
+ bh->b_page->mapping, bh->b_page->flags);
+ fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+ gl->gl_name.ln_type, gl->gl_name.ln_number,
+ gfs2_glock2aspace(gl));
+ gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+}
+
/**
* __gfs2_ail_flush - remove all buffers for a given lock from the AIL
* @gl: the glock
+ * @fsync: set when called from fsync (not all buffers will be clean)
*
* None of the buffers should be dirty, locked, or pinned.
*/
-static void __gfs2_ail_flush(struct gfs2_glock *gl)
+static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct list_head *head = &gl->gl_ail_list;
- struct gfs2_bufdata *bd;
+ struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
+ const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
+ sector_t blocknr;
+ gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
- while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata,
- bd_ail_gl_list);
+ list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) {
bh = bd->bd_bh;
- gfs2_remove_from_ail(bd);
- bd->bd_bh = NULL;
+ if (bh->b_state & b_state) {
+ if (fsync)
+ continue;
+ gfs2_ail_error(gl, bh);
+ }
+ blocknr = bh->b_blocknr;
bh->b_private = NULL;
- spin_unlock(&sdp->sd_ail_lock);
+ gfs2_remove_from_ail(bd); /* drops ref on bh */
- bd->bd_blkno = bh->b_blocknr;
- gfs2_log_lock(sdp);
- gfs2_assert_withdraw(sdp, !buffer_busy(bh));
- gfs2_trans_add_revoke(sdp, bd);
- gfs2_log_unlock(sdp);
+ bd->bd_bh = NULL;
+ bd->bd_blkno = blocknr;
- spin_lock(&sdp->sd_ail_lock);
+ gfs2_trans_add_revoke(sdp, bd);
}
- gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+ BUG_ON(!fsync && atomic_read(&gl->gl_ail_count));
spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
}
@@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
BUG_ON(current->journal_info);
current->journal_info = &tr;
- __gfs2_ail_flush(gl);
+ __gfs2_ail_flush(gl, 0);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
-void gfs2_ail_flush(struct gfs2_glock *gl)
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
@@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
ret = gfs2_trans_begin(sdp, 0, revokes);
if (ret)
return;
- __gfs2_ail_flush(gl);
+ __gfs2_ail_flush(gl, fsync);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
@@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
static void rgrp_go_sync(struct gfs2_glock *gl)
{
struct address_space *metamapping = gfs2_glock2aspace(gl);
+ struct gfs2_rgrpd *rgd;
int error;
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(metamapping);
mapping_set_error(metamapping, error);
gfs2_ail_empty_gl(gl);
+
+ spin_lock(&gl->gl_spin);
+ rgd = gl->gl_object;
+ if (rgd)
+ gfs2_free_clones(rgd);
+ spin_unlock(&gl->gl_spin);
}
/**
@@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
}
/**
- * rgrp_go_lock - operation done after an rgrp lock is locked by
- * a first holder on this node.
- * @gl: the glock
- * @flags:
- *
- * Returns: errno
- */
-
-static int rgrp_go_lock(struct gfs2_holder *gh)
-{
- return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
-}
-
-/**
- * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
- * a last holder on this node.
- * @gl: the glock
- * @flags:
- *
- */
-
-static void rgrp_go_unlock(struct gfs2_holder *gh)
-{
- gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
-}
-
-/**
* trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
@@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_xmote_th = rgrp_go_sync,
.go_inval = rgrp_go_inval,
- .go_lock = rgrp_go_lock,
- .go_unlock = rgrp_go_unlock,
+ .go_lock = gfs2_rgrp_go_lock,
+ .go_unlock = gfs2_rgrp_go_unlock,
.go_dump = gfs2_rgrp_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_ASPACE,
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 6fce409b5a50..bf95a2dc1662 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
-extern void gfs2_ail_flush(struct gfs2_glock *gl);
+extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37de8ae..7389dfdcc9ef 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -18,6 +18,7 @@
#include <linux/rcupdate.h>
#include <linux/rculist_bl.h>
#include <linux/completion.h>
+#include <linux/rbtree.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -78,8 +79,7 @@ struct gfs2_bitmap {
};
struct gfs2_rgrpd {
- struct list_head rd_list; /* Link with superblock */
- struct list_head rd_list_mru;
+ struct rb_node rd_node; /* Link with superblock */
struct gfs2_glock *rd_gl; /* Glock for this rgrp */
u64 rd_addr; /* grp block disk address */
u64 rd_data0; /* first data location */
@@ -91,10 +91,7 @@ struct gfs2_rgrpd {
u32 rd_dinodes;
u64 rd_igeneration;
struct gfs2_bitmap *rd_bits;
- struct mutex rd_mutex;
- struct gfs2_log_element rd_le;
struct gfs2_sbd *rd_sbd;
- unsigned int rd_bh_count;
u32 rd_last_alloc;
u32 rd_flags;
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
@@ -106,12 +103,15 @@ struct gfs2_rgrpd {
enum gfs2_state_bits {
BH_Pinned = BH_PrivateStart,
BH_Escaped = BH_PrivateStart + 1,
+ BH_Zeronew = BH_PrivateStart + 2,
};
BUFFER_FNS(Pinned, pinned)
TAS_BUFFER_FNS(Pinned, pinned)
BUFFER_FNS(Escaped, escaped)
TAS_BUFFER_FNS(Escaped, escaped)
+BUFFER_FNS(Zeronew, zeronew)
+TAS_BUFFER_FNS(Zeronew, zeronew)
struct gfs2_bufdata {
struct buffer_head *bd_bh;
@@ -246,7 +246,6 @@ struct gfs2_glock {
struct gfs2_alloc {
/* Quota stuff */
-
struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
unsigned int al_qd_num;
@@ -255,18 +254,13 @@ struct gfs2_alloc {
u32 al_alloced; /* Filled in by gfs2_alloc_*() */
/* Filled in by gfs2_inplace_reserve() */
-
- unsigned int al_line;
- char *al_file;
- struct gfs2_holder al_ri_gh;
struct gfs2_holder al_rgd_gh;
- struct gfs2_rgrpd *al_rgd;
-
};
enum {
GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
+ GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
};
@@ -282,6 +276,7 @@ struct gfs2_inode {
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_alloc *i_alloc;
+ struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
struct list_head i_trunc_list;
@@ -574,9 +569,7 @@ struct gfs2_sbd {
int sd_rindex_uptodate;
spinlock_t sd_rindex_spin;
struct mutex sd_rindex_mutex;
- struct list_head sd_rindex_list;
- struct list_head sd_rindex_mru_list;
- struct gfs2_rgrpd *sd_rindex_forward;
+ struct rb_root sd_rindex_tree;
unsigned int sd_rgrps;
unsigned int sd_max_rg_data;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 900cf986aadc..cfd4959b218c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
goto fail_quota_locks;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- al->al_rgd->rd_length +
+ dip->i_rgd->rd_length +
2 * RES_DINODE +
RES_STATFS + RES_QUOTA, 0);
if (error)
@@ -613,8 +613,7 @@ fail_end_trans:
gfs2_trans_end(sdp);
fail_ipreserv:
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
+ gfs2_inplace_release(dip);
fail_quota_locks:
gfs2_quota_unlock(dip);
@@ -624,31 +623,29 @@ fail:
return error;
}
-static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
- const struct qstr *qstr)
+int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
-
- err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
- &name, &value, &len);
-
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+ xattr->value_len, 0,
+ GFS2_EATYPE_SECURITY);
+ if (err < 0)
+ break;
}
-
- err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
- GFS2_EATYPE_SECURITY);
- kfree(value);
- kfree(name);
-
return err;
}
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
+ &gfs2_initxattrs, NULL);
+}
+
/**
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
@@ -663,7 +660,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
unsigned int mode, dev_t dev, const char *symname,
- unsigned int size)
+ unsigned int size, int excl)
{
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
@@ -683,6 +680,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail;
error = create_ok(dip, name, mode);
+ if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ gfs2_glock_dq_uninit(ghs);
+ d_instantiate(dentry, inode);
+ return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+ }
if (error)
goto fail_gunlock;
@@ -725,21 +728,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
brelse(bh);
gfs2_trans_end(sdp);
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
+ gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
gfs2_alloc_put(dip);
- gfs2_glock_dq_uninit_m(2, ghs);
mark_inode_dirty(inode);
+ gfs2_glock_dq_uninit_m(2, ghs);
d_instantiate(dentry, inode);
return 0;
fail_gunlock2:
gfs2_glock_dq_uninit(ghs + 1);
- if (inode && !IS_ERR(inode))
- iput(inode);
fail_gunlock:
gfs2_glock_dq_uninit(ghs);
+ if (inode && !IS_ERR(inode)) {
+ set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
+ iput(inode);
+ }
fail:
if (bh)
brelse(bh);
@@ -758,24 +762,10 @@ fail:
static int gfs2_create(struct inode *dir, struct dentry *dentry,
int mode, struct nameidata *nd)
{
- struct inode *inode;
- int ret;
-
- for (;;) {
- ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
- if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
- return ret;
-
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode) {
- if (!IS_ERR(inode))
- break;
- return PTR_ERR(inode);
- }
- }
-
- d_instantiate(dentry, inode);
- return 0;
+ int excl = 0;
+ if (nd && (nd->flags & LOOKUP_EXCL))
+ excl = 1;
+ return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
}
/**
@@ -902,7 +892,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
goto out_gunlock_q;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(al) +
+ gfs2_rg_blocks(dip) +
2 * RES_DINODE + RES_STATFS +
RES_QUOTA, 0);
if (error)
@@ -924,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
inc_nlink(&ip->i_inode);
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_dinode_out(ip, dibh->b_data);
- mark_inode_dirty(&ip->i_inode);
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
out_brelse:
brelse(dibh);
@@ -947,11 +938,6 @@ out_child:
out_parent:
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
- if (!error) {
- ihold(inode);
- d_instantiate(dentry, inode);
- mark_inode_dirty(inode);
- }
return error;
}
@@ -1024,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
clear_nlink(inode);
else
drop_nlink(inode);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_dinode_out(ip, bh->b_data);
mark_inode_dirty(inode);
if (inode->i_nlink == 0)
gfs2_unlink_di(inode);
@@ -1053,13 +1037,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct gfs2_holder ghs[3];
struct gfs2_rgrpd *rgd;
- struct gfs2_holder ri_gh;
int error;
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
-
gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
@@ -1116,7 +1095,6 @@ out_child:
gfs2_glock_dq(ghs);
out_parent:
gfs2_holder_uninit(ghs);
- gfs2_glock_dq_uninit(&ri_gh);
return error;
}
@@ -1139,7 +1117,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
return -ENAMETOOLONG;
- return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size);
+ return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
}
/**
@@ -1153,7 +1131,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
- return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0);
+ return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
}
/**
@@ -1168,7 +1146,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
dev_t dev)
{
- return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0);
+ return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
}
/*
@@ -1234,7 +1212,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
struct gfs2_inode *nip = NULL;
struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
+ struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
@@ -1248,10 +1226,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
return 0;
}
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
-
if (odip != ndip) {
error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
0, &r_gh);
@@ -1388,12 +1362,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
al->al_requested = sdp->sd_max_dirres;
- error = gfs2_inplace_reserve_ri(ndip);
+ error = gfs2_inplace_reserve(ndip);
if (error)
goto out_gunlock_q;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(al) +
+ gfs2_rg_blocks(ndip) +
4 * RES_DINODE + 4 * RES_LEAF +
RES_STATFS + RES_QUOTA + 4, 0);
if (error)
@@ -1459,7 +1433,6 @@ out_gunlock_r:
if (r_gh.gh_gl)
gfs2_glock_dq_uninit(&r_gh);
out:
- gfs2_glock_dq_uninit(&ri_gh);
return error;
}
@@ -1563,21 +1536,10 @@ int gfs2_permission(struct inode *inode, int mask)
return error;
}
-static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
{
- struct inode *inode = &ip->i_inode;
- struct buffer_head *dibh;
- int error;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
return 0;
}
@@ -1589,19 +1551,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
* Returns: errno
*/
-int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+int gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
{
int error;
if (current->journal_info)
- return __gfs2_setattr_simple(ip, attr);
+ return __gfs2_setattr_simple(inode, attr);
- error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
+ error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0);
if (error)
return error;
- error = __gfs2_setattr_simple(ip, attr);
- gfs2_trans_end(GFS2_SB(&ip->i_inode));
+ error = __gfs2_setattr_simple(inode, attr);
+ gfs2_trans_end(GFS2_SB(inode));
return error;
}
@@ -1639,7 +1601,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out_gunlock_q;
- error = gfs2_setattr_simple(ip, attr);
+ error = gfs2_setattr_simple(inode, attr);
if (error)
goto out_end_trans;
@@ -1695,12 +1657,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
error = gfs2_acl_chmod(ip, attr);
else
- error = gfs2_setattr_simple(ip, attr);
+ error = gfs2_setattr_simple(inode, attr);
out:
- gfs2_glock_dq_uninit(&i_gh);
if (!error)
mark_inode_dirty(inode);
+ gfs2_glock_dq_uninit(&i_gh);
return error;
}
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8d90e0c07672..276e7b52b658 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
int is_root);
extern int gfs2_permission(struct inode *inode, int mask);
-extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 05bbb124699f..0301be655b12 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
trace_gfs2_pin(bd, 1);
}
+static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
+{
+ return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
+}
+
+static void maybe_release_space(struct gfs2_bufdata *bd)
+{
+ struct gfs2_glock *gl = bd->bd_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
+ unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
+ struct gfs2_bitmap *bi = rgd->rd_bits + index;
+
+ if (bi->bi_clone == 0)
+ return;
+ if (sdp->sd_args.ar_discard)
+ gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi);
+ memcpy(bi->bi_clone + bi->bi_offset,
+ bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
+ clear_bit(GBF_FULL, &bi->bi_flags);
+ rgd->rd_free_clone = rgd->rd_free;
+}
+
/**
* gfs2_unpin - Unpin a buffer
* @sdp: the filesystem the buffer belongs to
@@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
mark_buffer_dirty(bh);
clear_buffer_pinned(bh);
+ if (buffer_is_rgrp(bd))
+ maybe_release_space(bd);
+
spin_lock(&sdp->sd_ail_lock);
if (bd->bd_ail) {
list_del(&bd->bd_ail_st_list);
@@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
gfs2_revoke_clean(sdp);
}
-static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
- struct gfs2_rgrpd *rgd;
- struct gfs2_trans *tr = current->journal_info;
-
- tr->tr_touched = 1;
-
- rgd = container_of(le, struct gfs2_rgrpd, rd_le);
-
- gfs2_log_lock(sdp);
- if (!list_empty(&le->le_list)){
- gfs2_log_unlock(sdp);
- return;
- }
- gfs2_rgrp_bh_hold(rgd);
- sdp->sd_log_num_rg++;
- list_add(&le->le_list, &sdp->sd_log_le_rg);
- gfs2_log_unlock(sdp);
-}
-
-static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-{
- struct list_head *head = &sdp->sd_log_le_rg;
- struct gfs2_rgrpd *rgd;
-
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
- list_del_init(&rgd->rd_le.le_list);
- sdp->sd_log_num_rg--;
-
- gfs2_rgrp_repolish_clones(rgd);
- gfs2_rgrp_bh_put(rgd);
- }
- gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
-}
-
/**
* databuf_lo_add - Add a databuf to the transaction.
*
@@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
brelse(bh_log);
brelse(bh_ip);
- if (error)
- break;
sdp->sd_replayed_blocks++;
}
@@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
};
const struct gfs2_log_operations gfs2_rg_lops = {
- .lo_add = rg_lo_add,
- .lo_after_commit = rg_lo_after_commit,
.lo_name = "rg",
};
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 079587e53849..7e823bbd2453 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -77,8 +77,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
spin_lock_init(&sdp->sd_rindex_spin);
mutex_init(&sdp->sd_rindex_mutex);
- INIT_LIST_HEAD(&sdp->sd_rindex_list);
- INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
+ sdp->sd_rindex_tree.rb_node = NULL;
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
@@ -652,7 +651,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
fs_err(sdp, "can't lookup journal index: %d\n", error);
return PTR_ERR(sdp->sd_jindex);
}
- ip = GFS2_I(sdp->sd_jindex);
/* Load in the journal index special file */
@@ -764,7 +762,6 @@ fail:
static int init_inodes(struct gfs2_sbd *sdp, int undo)
{
int error = 0;
- struct gfs2_inode *ip;
struct inode *master = sdp->sd_master_dir->d_inode;
if (undo)
@@ -789,7 +786,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
fs_err(sdp, "can't get resource index inode: %d\n", error);
goto fail_statfs;
}
- ip = GFS2_I(sdp->sd_rindex);
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 0e8bb13381e4..7e528dc14f85 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
unsigned long index = loc >> PAGE_CACHE_SHIFT;
unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, pos;
- struct buffer_head *bh, *dibh;
+ struct buffer_head *bh;
struct page *page;
void *kaddr, *ptr;
struct gfs2_quota q, *qp;
int err, nbytes;
u64 size;
- if (gfs2_is_stuffed(ip))
- gfs2_unstuff_dinode(ip, NULL);
+ if (gfs2_is_stuffed(ip)) {
+ err = gfs2_unstuff_dinode(ip, NULL);
+ if (err)
+ return err;
+ }
memset(&q, 0, sizeof(struct gfs2_quota));
err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
@@ -736,22 +739,13 @@ get_a_page:
goto get_a_page;
}
- /* Update the disk inode timestamp and size (if extended) */
- err = gfs2_meta_inode_buffer(ip, &dibh);
- if (err)
- goto out;
-
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
inode->i_mtime = inode->i_atime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
mark_inode_dirty(inode);
-
-out:
return err;
+
unlock_out:
unlock_page(page);
page_cache_release(page);
@@ -822,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
goto out_alloc;
if (nalloc)
- blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
+ blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS;
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
@@ -936,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
unsigned int x;
int error = 0;
- gfs2_quota_hold(ip, uid, gid);
+ error = gfs2_quota_hold(ip, uid, gid);
+ if (error)
+ return error;
if (capable(CAP_SYS_RESOURCE) ||
sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
@@ -1607,7 +1603,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
error = gfs2_inplace_reserve(ip);
if (error)
goto out_alloc;
- blocks += gfs2_rg_blocks(al);
+ blocks += gfs2_rg_blocks(ip);
}
/* Some quotas span block boundaries and can update two blocks,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7f8af1eb02de..96bd6d759f29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -15,6 +15,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/prefetch.h>
#include <linux/blkdev.h>
+#include <linux/rbtree.h>
#include "gfs2.h"
#include "incore.h"
@@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
{
- struct gfs2_rgrpd *rgd;
+ struct rb_node **newn;
+ struct gfs2_rgrpd *cur;
spin_lock(&sdp->sd_rindex_spin);
-
- list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
- if (rgrp_contains_block(rgd, blk)) {
- list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ newn = &sdp->sd_rindex_tree.rb_node;
+ while (*newn) {
+ cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node);
+ if (blk < cur->rd_addr)
+ newn = &((*newn)->rb_left);
+ else if (blk >= cur->rd_data0 + cur->rd_data)
+ newn = &((*newn)->rb_right);
+ else {
spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
+ return cur;
}
}
-
spin_unlock(&sdp->sd_rindex_spin);
return NULL;
@@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
{
- gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
- return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
+ const struct rb_node *n;
+ struct gfs2_rgrpd *rgd;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ n = rb_first(&sdp->sd_rindex_tree);
+ rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return rgd;
}
/**
@@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
{
- if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ const struct rb_node *n;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ n = rb_next(&rgd->rd_node);
+ if (n == NULL)
+ n = rb_first(&sdp->sd_rindex_tree);
+
+ if (unlikely(&rgd->rd_node == n)) {
+ spin_unlock(&sdp->sd_rindex_spin);
return NULL;
- return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
+ }
+ rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+ spin_unlock(&sdp->sd_rindex_spin);
+ return rgd;
}
-static void clear_rgrpdi(struct gfs2_sbd *sdp)
+void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
- struct list_head *head;
+ int x;
+
+ for (x = 0; x < rgd->rd_length; x++) {
+ struct gfs2_bitmap *bi = rgd->rd_bits + x;
+ kfree(bi->bi_clone);
+ bi->bi_clone = NULL;
+ }
+}
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{
+ struct rb_node *n;
struct gfs2_rgrpd *rgd;
struct gfs2_glock *gl;
- spin_lock(&sdp->sd_rindex_spin);
- sdp->sd_rindex_forward = NULL;
- spin_unlock(&sdp->sd_rindex_spin);
-
- head = &sdp->sd_rindex_list;
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
+ while ((n = rb_first(&sdp->sd_rindex_tree))) {
+ rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
gl = rgd->rd_gl;
- list_del(&rgd->rd_list);
- list_del(&rgd->rd_list_mru);
+ rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
+ spin_lock(&gl->gl_spin);
gl->gl_object = NULL;
+ spin_unlock(&gl->gl_spin);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
+ gfs2_free_clones(rgd);
kfree(rgd->rd_bits);
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
}
-void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
-{
- mutex_lock(&sdp->sd_rindex_mutex);
- clear_rgrpdi(sdp);
- mutex_unlock(&sdp->sd_rindex_mutex);
-}
-
static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
{
printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
@@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
return total_data;
}
-static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf)
+static void rgd_insert(struct gfs2_rgrpd *rgd)
{
- const struct gfs2_rindex *str = buf;
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*newn) {
+ struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
+ rd_node);
+
+ parent = *newn;
+ if (rgd->rd_addr < cur->rd_addr)
+ newn = &((*newn)->rb_left);
+ else if (rgd->rd_addr > cur->rd_addr)
+ newn = &((*newn)->rb_right);
+ else
+ return;
+ }
- rgd->rd_addr = be64_to_cpu(str->ri_addr);
- rgd->rd_length = be32_to_cpu(str->ri_length);
- rgd->rd_data0 = be64_to_cpu(str->ri_data0);
- rgd->rd_data = be32_to_cpu(str->ri_data);
- rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes);
+ rb_link_node(&rgd->rd_node, parent, newn);
+ rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
}
/**
* read_rindex_entry - Pull in a new resource index entry from the disk
* @gl: The glock covering the rindex inode
*
- * Returns: 0 on success, error code otherwise
+ * Returns: 0 on success, > 0 on EOF, error code otherwise
*/
static int read_rindex_entry(struct gfs2_inode *ip,
@@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip,
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
- char buf[sizeof(struct gfs2_rindex)];
+ struct gfs2_rindex buf;
int error;
struct gfs2_rgrpd *rgd;
- error = gfs2_internal_read(ip, ra_state, buf, &pos,
+ if (pos >= i_size_read(&ip->i_inode))
+ return 1;
+
+ error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos,
sizeof(struct gfs2_rindex));
- if (!error)
- return 0;
- if (error != sizeof(struct gfs2_rindex)) {
- if (error > 0)
- error = -EIO;
- return error;
- }
+
+ if (error != sizeof(struct gfs2_rindex))
+ return (error == 0) ? 1 : error;
rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
error = -ENOMEM;
if (!rgd)
return error;
- mutex_init(&rgd->rd_mutex);
- lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
rgd->rd_sbd = sdp;
+ rgd->rd_addr = be64_to_cpu(buf.ri_addr);
+ rgd->rd_length = be32_to_cpu(buf.ri_length);
+ rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
+ rgd->rd_data = be32_to_cpu(buf.ri_data);
+ rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
- list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
- list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
-
- gfs2_rindex_in(rgd, buf);
error = compute_bitstructs(rgd);
if (error)
- return error;
+ goto fail;
error = gfs2_glock_get(sdp, rgd->rd_addr,
&gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
if (error)
- return error;
+ goto fail;
rgd->rd_gl->gl_object = rgd;
rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ if (rgd->rd_data > sdp->sd_max_rg_data)
+ sdp->sd_max_rg_data = rgd->rd_data;
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd_insert(rgd);
+ sdp->sd_rgrps++;
+ spin_unlock(&sdp->sd_rindex_spin);
+ return error;
+
+fail:
+ kfree(rgd->rd_bits);
+ kmem_cache_free(gfs2_rgrpd_cachep, rgd);
return error;
}
@@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip,
* Returns: 0 on successful update, error code otherwise
*/
-int gfs2_ri_update(struct gfs2_inode *ip)
+static int gfs2_ri_update(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
- u64 rgrp_count = i_size_read(inode);
- struct gfs2_rgrpd *rgd;
- unsigned int max_data = 0;
int error;
- do_div(rgrp_count, sizeof(struct gfs2_rindex));
- clear_rgrpdi(sdp);
-
file_ra_state_init(&ra_state, inode->i_mapping);
- for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) {
+ do {
error = read_rindex_entry(ip, &ra_state);
- if (error) {
- clear_rgrpdi(sdp);
- return error;
- }
- }
+ } while (error == 0);
+
+ if (error < 0)
+ return error;
- list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
- if (rgd->rd_data > max_data)
- max_data = rgd->rd_data;
- sdp->sd_max_rg_data = max_data;
sdp->sd_rindex_uptodate = 1;
return 0;
}
/**
- * gfs2_rindex_hold - Grab a lock on the rindex
+ * gfs2_rindex_update - Update the rindex if required
* @sdp: The GFS2 superblock
- * @ri_gh: the glock holder
*
* We grab a lock on the rindex inode to make sure that it doesn't
* change whilst we are performing an operation. We keep this lock
@@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip)
* special file, which might have been updated if someone expanded the
* filesystem (via gfs2_grow utility), which adds new resource groups.
*
- * Returns: 0 on success, error code otherwise
+ * Returns: 0 on succeess, error code otherwise
*/
-int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
+int gfs2_rindex_update(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
struct gfs2_glock *gl = ip->i_gl;
- int error;
-
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
- if (error)
- return error;
+ struct gfs2_holder ri_gh;
+ int error = 0;
/* Read new copy from disk if we don't have the latest */
if (!sdp->sd_rindex_uptodate) {
mutex_lock(&sdp->sd_rindex_mutex);
- if (!sdp->sd_rindex_uptodate) {
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+ if (error)
+ return error;
+ if (!sdp->sd_rindex_uptodate)
error = gfs2_ri_update(ip);
- if (error)
- gfs2_glock_dq_uninit(ri_gh);
- }
+ gfs2_glock_dq_uninit(&ri_gh);
mutex_unlock(&sdp->sd_rindex_mutex);
}
+
return error;
}
@@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
}
/**
- * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps
* @rgd: the struct gfs2_rgrpd describing the RG to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
@@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
* Returns: errno
*/
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
{
+ struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
struct gfs2_glock *gl = rgd->rd_gl;
unsigned int length = rgd->rd_length;
@@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
unsigned int x, y;
int error;
- mutex_lock(&rgd->rd_mutex);
-
- spin_lock(&sdp->sd_rindex_spin);
- if (rgd->rd_bh_count) {
- rgd->rd_bh_count++;
- spin_unlock(&sdp->sd_rindex_spin);
- mutex_unlock(&rgd->rd_mutex);
- return 0;
- }
- spin_unlock(&sdp->sd_rindex_spin);
-
for (x = 0; x < length; x++) {
bi = rgd->rd_bits + x;
error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
@@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+ rgd->rd_free_clone = rgd->rd_free;
}
- spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone = rgd->rd_free;
- rgd->rd_bh_count++;
- spin_unlock(&sdp->sd_rindex_spin);
-
- mutex_unlock(&rgd->rd_mutex);
-
return 0;
fail:
@@ -765,52 +782,32 @@ fail:
bi->bi_bh = NULL;
gfs2_assert_warn(sdp, !bi->bi_clone);
}
- mutex_unlock(&rgd->rd_mutex);
return error;
}
-void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
-{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- spin_lock(&sdp->sd_rindex_spin);
- gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
- rgd->rd_bh_count++;
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
/**
- * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
* @rgd: the struct gfs2_rgrpd describing the RG to read in
*
*/
-void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
int x, length = rgd->rd_length;
- spin_lock(&sdp->sd_rindex_spin);
- gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
- if (--rgd->rd_bh_count) {
- spin_unlock(&sdp->sd_rindex_spin);
- return;
- }
-
for (x = 0; x < length; x++) {
struct gfs2_bitmap *bi = rgd->rd_bits + x;
- kfree(bi->bi_clone);
- bi->bi_clone = NULL;
brelse(bi->bi_bh);
bi->bi_bh = NULL;
}
- spin_unlock(&sdp->sd_rindex_spin);
}
-static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
- const struct gfs2_bitmap *bi)
+void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi)
{
struct super_block *sb = sdp->sd_vfs;
struct block_device *bdev = sb->s_bdev;
@@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
unsigned int x;
for (x = 0; x < bi->bi_len; x++) {
- const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
+ const u8 *orig = bh->b_data + bi->bi_offset + x;
const u8 *clone = bi->bi_clone + bi->bi_offset + x;
u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
diff &= 0x55;
@@ -862,28 +859,6 @@ fail:
sdp->sd_args.ar_discard = 0;
}
-void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
-{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
- unsigned int length = rgd->rd_length;
- unsigned int x;
-
- for (x = 0; x < length; x++) {
- struct gfs2_bitmap *bi = rgd->rd_bits + x;
- if (!bi->bi_clone)
- continue;
- if (sdp->sd_args.ar_discard)
- gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
- clear_bit(GBF_FULL, &bi->bi_flags);
- memcpy(bi->bi_clone + bi->bi_offset,
- bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
- }
-
- spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone = rgd->rd_free;
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
/**
* gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
* @ip: the incore GFS2 inode structure
@@ -893,38 +868,35 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ int error;
BUG_ON(ip->i_alloc != NULL);
ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ fs_warn(sdp, "rindex update returns %d\n", error);
return ip->i_alloc;
}
/**
* try_rgrp_fit - See if a given reservation will fit in a given RG
* @rgd: the RG data
- * @al: the struct gfs2_alloc structure describing the reservation
+ * @ip: the inode
*
* If there's room for the requested blocks to be allocated from the RG:
- * Sets the $al_rgd field in @al.
*
* Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
*/
-static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
+static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
- int ret = 0;
+ const struct gfs2_alloc *al = ip->i_alloc;
if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
return 0;
-
- spin_lock(&sdp->sd_rindex_spin);
- if (rgd->rd_free_clone >= al->al_requested) {
- al->al_rgd = rgd;
- ret = 1;
- }
- spin_unlock(&sdp->sd_rindex_spin);
-
- return ret;
+ if (rgd->rd_free_clone >= al->al_requested)
+ return 1;
+ return 0;
}
/**
@@ -992,76 +964,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
}
/**
- * recent_rgrp_next - get next RG from "recent" list
- * @cur_rgd: current rgrp
- *
- * Returns: The next rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
-{
- struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
- struct list_head *head;
- struct gfs2_rgrpd *rgd;
-
- spin_lock(&sdp->sd_rindex_spin);
- head = &sdp->sd_rindex_mru_list;
- if (unlikely(cur_rgd->rd_list_mru.next == head)) {
- spin_unlock(&sdp->sd_rindex_spin);
- return NULL;
- }
- rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
- spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
-}
-
-/**
- * forward_rgrp_get - get an rgrp to try next from full list
- * @sdp: The GFS2 superblock
- *
- * Returns: The rgrp to try next
- */
-
-static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
-{
- struct gfs2_rgrpd *rgd;
- unsigned int journals = gfs2_jindex_size(sdp);
- unsigned int rg = 0, x;
-
- spin_lock(&sdp->sd_rindex_spin);
-
- rgd = sdp->sd_rindex_forward;
- if (!rgd) {
- if (sdp->sd_rgrps >= journals)
- rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
-
- for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
- x++, rgd = gfs2_rgrpd_get_next(rgd))
- /* Do Nothing */;
-
- sdp->sd_rindex_forward = rgd;
- }
-
- spin_unlock(&sdp->sd_rindex_spin);
-
- return rgd;
-}
-
-/**
- * forward_rgrp_set - set the forward rgrp pointer
- * @sdp: the filesystem
- * @rgd: The new forward rgrp
- *
- */
-
-static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
-{
- spin_lock(&sdp->sd_rindex_spin);
- sdp->sd_rindex_forward = rgd;
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
* get_local_rgrp - Choose and lock a rgrp for allocation
* @ip: the inode to reserve space for
* @rgp: the chosen and locked rgrp
@@ -1076,14 +978,18 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
struct gfs2_alloc *al = ip->i_alloc;
- int flags = LM_FLAG_TRY;
- int skipped = 0;
- int loops = 0;
int error, rg_locked;
+ int loops = 0;
+
+ if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
+ rgd = begin = ip->i_rgd;
+ else
+ rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal);
- rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
+ if (rgd == NULL)
+ return -EBADSLT;
- while (rgd) {
+ while (loops < 3) {
rg_locked = 0;
if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
@@ -1095,92 +1001,36 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
}
switch (error) {
case 0:
- if (try_rgrp_fit(rgd, al))
- goto out;
+ if (try_rgrp_fit(rgd, ip)) {
+ ip->i_rgd = rgd;
+ return 0;
+ }
if (rgd->rd_flags & GFS2_RDF_CHECK)
try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
/* fall through */
case GLR_TRYFAILED:
- rgd = recent_rgrp_next(rgd);
- break;
-
- default:
- return error;
- }
- }
-
- /* Go through full list of rgrps */
-
- begin = rgd = forward_rgrp_get(sdp);
-
- for (;;) {
- rg_locked = 0;
-
- if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
- rg_locked = 1;
- error = 0;
- } else {
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
- &al->al_rgd_gh);
- }
- switch (error) {
- case 0:
- if (try_rgrp_fit(rgd, al))
- goto out;
- if (rgd->rd_flags & GFS2_RDF_CHECK)
- try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
- if (!rg_locked)
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
- break;
-
- case GLR_TRYFAILED:
- skipped++;
+ rgd = gfs2_rgrpd_get_next(rgd);
+ if (rgd == begin)
+ loops++;
break;
-
default:
return error;
}
-
- rgd = gfs2_rgrpd_get_next(rgd);
- if (!rgd)
- rgd = gfs2_rgrpd_get_first(sdp);
-
- if (rgd == begin) {
- if (++loops >= 3)
- return -ENOSPC;
- if (!skipped)
- loops++;
- flags = 0;
- if (loops == 2)
- gfs2_log_flush(sdp, NULL);
- }
}
-out:
- if (begin) {
- spin_lock(&sdp->sd_rindex_spin);
- list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
- spin_unlock(&sdp->sd_rindex_spin);
- rgd = gfs2_rgrpd_get_next(rgd);
- if (!rgd)
- rgd = gfs2_rgrpd_get_first(sdp);
- forward_rgrp_set(sdp, rgd);
- }
-
- return 0;
+ return -ENOSPC;
}
/**
- * gfs2_inplace_reserve_i - Reserve space in the filesystem
+ * gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
*
* Returns: errno
*/
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
- char *file, unsigned int line)
+int gfs2_inplace_reserve(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
@@ -1191,45 +1041,22 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
if (gfs2_assert_warn(sdp, al->al_requested))
return -EINVAL;
- if (hold_rindex) {
- /* We need to hold the rindex unless the inode we're using is
- the rindex itself, in which case it's already held. */
- if (ip != GFS2_I(sdp->sd_rindex))
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- else if (!sdp->sd_rgrps) /* We may not have the rindex read
- in, so: */
- error = gfs2_ri_update(ip);
- if (error)
- return error;
- }
-
-try_again:
do {
error = get_local_rgrp(ip, &last_unlinked);
- /* If there is no space, flushing the log may release some */
- if (error) {
- if (ip == GFS2_I(sdp->sd_rindex) &&
- !sdp->sd_rindex_uptodate) {
- error = gfs2_ri_update(ip);
- if (error)
- return error;
- goto try_again;
- }
- gfs2_log_flush(sdp, NULL);
+ if (error != -ENOSPC)
+ break;
+ /* Check that fs hasn't grown if writing to rindex */
+ if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ break;
+ continue;
}
- } while (error && tries++ < 3);
-
- if (error) {
- if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
- gfs2_glock_dq_uninit(&al->al_ri_gh);
- return error;
- }
-
- /* no error, so we have the rgrp set in the inode's allocation. */
- al->al_file = file;
- al->al_line = line;
+ /* Flushing the log may release space */
+ gfs2_log_flush(sdp, NULL);
+ } while (tries++ < 3);
- return 0;
+ return error;
}
/**
@@ -1241,20 +1068,10 @@ try_again:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
- if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
- fs_warn(sdp, "al_alloced = %u, al_requested = %u "
- "al_file = %s, al_line = %u\n",
- al->al_alloced, al->al_requested, al->al_file,
- al->al_line);
-
- al->al_rgd = NULL;
if (al->al_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
- gfs2_glock_dq_uninit(&al->al_ri_gh);
}
/**
@@ -1352,6 +1169,7 @@ do_search:
/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
bitmaps, so we must search the originals for that. */
buffer = bi->bi_bh->b_data + bi->bi_offset;
+ WARN_ON(!buffer_uptodate(bi->bi_bh));
if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
@@ -1371,6 +1189,7 @@ skip:
if (blk == BFITNOENT)
return blk;
+
*n = 1;
if (old_state == new_state)
goto out;
@@ -1503,7 +1322,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
if (al == NULL)
return -ECANCELED;
- rgd = al->al_rgd;
+ rgd = ip->i_rgd;
if (rgrp_contains_block(rgd, ip->i_goal))
goal = ip->i_goal - rgd->rd_data0;
@@ -1518,7 +1337,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
- ip->i_goal = block;
+ ip->i_goal = block + *n - 1;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error == 0) {
struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
@@ -1539,9 +1358,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
gfs2_statfs_change(sdp, 0, -(s64)*n, 0);
gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
- spin_lock(&sdp->sd_rindex_spin);
rgd->rd_free_clone -= *n;
- spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
*bn = block;
return 0;
@@ -1564,7 +1381,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_alloc *al = dip->i_alloc;
- struct gfs2_rgrpd *rgd = al->al_rgd;
+ struct gfs2_rgrpd *rgd = dip->i_rgd;
u32 blk;
u64 block;
unsigned int n = 1;
@@ -1594,9 +1411,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
gfs2_statfs_change(sdp, 0, -1, +1);
gfs2_trans_add_unrevoke(sdp, block, 1);
- spin_lock(&sdp->sd_rindex_spin);
rgd->rd_free_clone--;
- spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
*bn = block;
return 0;
@@ -1629,8 +1444,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
- gfs2_trans_add_rg(rgd);
-
/* Directories keep their data in the metadata address space */
if (meta || ip->i_depth)
gfs2_meta_wipe(ip, bstart, blen);
@@ -1666,7 +1479,6 @@ void gfs2_unlink_di(struct inode *inode)
trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
- gfs2_trans_add_rg(rgd);
}
static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
@@ -1688,7 +1500,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_statfs_change(sdp, 0, +1, -1);
- gfs2_trans_add_rg(rgd);
}
@@ -1714,41 +1525,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
{
struct gfs2_rgrpd *rgd;
- struct gfs2_holder ri_gh, rgd_gh;
- struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
- int ri_locked = 0;
+ struct gfs2_holder rgd_gh;
int error;
- if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- goto fail;
- ri_locked = 1;
- }
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
error = -EINVAL;
rgd = gfs2_blk2rgrpd(sdp, no_addr);
if (!rgd)
- goto fail_rindex;
+ goto fail;
error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
if (error)
- goto fail_rindex;
+ goto fail;
if (gfs2_get_block_type(rgd, no_addr) != type)
error = -ESTALE;
gfs2_glock_dq_uninit(&rgd_gh);
-fail_rindex:
- if (ri_locked)
- gfs2_glock_dq_uninit(&ri_gh);
fail:
return error;
}
/**
* gfs2_rlist_add - add a RG to a list of RGs
- * @sdp: the filesystem
+ * @ip: the inode
* @rlist: the list of resource groups
* @block: the block
*
@@ -1758,9 +1561,10 @@ fail:
*
*/
-void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block)
{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd;
struct gfs2_rgrpd **tmp;
unsigned int new_space;
@@ -1769,12 +1573,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
return;
- rgd = gfs2_blk2rgrpd(sdp, block);
+ if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
+ rgd = ip->i_rgd;
+ else
+ rgd = gfs2_blk2rgrpd(sdp, block);
if (!rgd) {
- if (gfs2_consist(sdp))
- fs_err(sdp, "block = %llu\n", (unsigned long long)block);
+ fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
return;
}
+ ip->i_rgd = rgd;
for (x = 0; x < rlist->rl_rgrps; x++)
if (rlist->rl_rgd[x] == rgd)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index d253f9a8c70e..cf5c50180192 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -18,18 +18,15 @@ struct gfs2_holder;
extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
-struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
-struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
-struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
-
-extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
-
-extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
+extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
static inline void gfs2_alloc_put(struct gfs2_inode *ip)
@@ -39,16 +36,9 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
ip->i_alloc = NULL;
}
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
- char *file, unsigned int line);
-#define gfs2_inplace_reserve(ip) \
- gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
-#define gfs2_inplace_reserve_ri(ip) \
- gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
-
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip);
extern void gfs2_inplace_release(struct gfs2_inode *ip);
-extern int gfs2_ri_update(struct gfs2_inode *ip);
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
@@ -66,11 +56,14 @@ struct gfs2_rgrp_list {
struct gfs2_holder *rl_ghs;
};
-extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block);
extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b7beadd9ba4c..71e420989f77 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
struct backing_dev_info *bdi = metamapping->backing_dev_info;
- struct gfs2_holder gh;
+ int ret = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ if (bdi->dirty_exceeded)
+ gfs2_ail1_flush(sdp, wbc);
+ else
+ filemap_fdatawrite(metamapping);
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = filemap_fdatawait(metamapping);
+ if (ret)
+ mark_inode_dirty_sync(inode);
+ return ret;
+}
+
+/**
+ * gfs2_dirty_inode - check for atime updates
+ * @inode: The inode in question
+ * @flags: The type of dirty
+ *
+ * Unfortunately it can be called under any combination of inode
+ * glock and transaction lock, so we have to check carefully.
+ *
+ * At the moment this deals only with atime - it should be possible
+ * to expand that role in future, once a review of the locking has
+ * been carried out.
+ */
+
+static void gfs2_dirty_inode(struct inode *inode, int flags)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *bh;
- struct timespec atime;
- struct gfs2_dinode *di;
- int ret = -EAGAIN;
- int unlock_required = 0;
-
- /* Skip timestamp update, if this is from a memalloc */
- if (current->flags & PF_MEMALLOC)
- goto do_flush;
+ struct gfs2_holder gh;
+ int need_unlock = 0;
+ int need_endtrans = 0;
+ int ret;
+
+ if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
+ return;
+
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (ret)
- goto do_flush;
- unlock_required = 1;
+ if (ret) {
+ fs_err(sdp, "dirty_inode: glock %d\n", ret);
+ return;
+ }
+ need_unlock = 1;
}
- ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (ret)
- goto do_unlock;
+
+ if (current->journal_info == NULL) {
+ ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (ret) {
+ fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
+ goto out;
+ }
+ need_endtrans = 1;
+ }
+
ret = gfs2_meta_inode_buffer(ip, &bh);
if (ret == 0) {
- di = (struct gfs2_dinode *)bh->b_data;
- atime.tv_sec = be64_to_cpu(di->di_atime);
- atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
- if (timespec_compare(&inode->i_atime, &atime) > 0) {
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_dinode_out(ip, bh->b_data);
- }
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
}
- gfs2_trans_end(sdp);
-do_unlock:
- if (unlock_required)
+
+ if (need_endtrans)
+ gfs2_trans_end(sdp);
+out:
+ if (need_unlock)
gfs2_glock_dq_uninit(&gh);
-do_flush:
- if (wbc->sync_mode == WB_SYNC_ALL)
- gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
- filemap_fdatawrite(metamapping);
- if (bdi->dirty_exceeded)
- gfs2_ail1_flush(sdp, wbc);
- if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
- ret = filemap_fdatawait(metamapping);
- if (ret)
- mark_inode_dirty_sync(inode);
- return ret;
}
/**
@@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
{
- struct gfs2_holder ri_gh;
struct gfs2_rgrpd *rgd_next;
struct gfs2_holder *gha, *gh;
unsigned int slots = 64;
@@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
if (!gha)
return -ENOMEM;
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- goto out;
-
rgd_next = gfs2_rgrpd_get_first(sdp);
for (;;) {
@@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
yield();
}
- gfs2_glock_dq_uninit(&ri_gh);
-
-out:
kfree(gha);
return error;
}
@@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct gfs2_statfs_change_host sc;
int error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
if (gfs2_tune_get(sdp, gt_statfs_slow))
error = gfs2_statfs_slow(sdp, &sc);
else
@@ -1394,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
if (error)
goto out;
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- if (error)
- goto out_qs;
-
rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
if (!rgd) {
gfs2_consist_inode(ip);
error = -EIO;
- goto out_rindex_relse;
+ goto out_qs;
}
error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
&al->al_rgd_gh);
if (error)
- goto out_rindex_relse;
+ goto out_qs;
error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
sdp->sd_jdesc->jd_blocks);
@@ -1423,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
out_rg_gunlock:
gfs2_glock_dq_uninit(&al->al_rgd_gh);
-out_rindex_relse:
- gfs2_glock_dq_uninit(&al->al_ri_gh);
out_qs:
gfs2_quota_unhold(ip);
out:
@@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode)
goto out;
}
- error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
- if (error)
- goto out_truncate;
+ if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
+ error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (error)
+ goto out_truncate;
+ }
if (test_bit(GIF_INVALID, &ip->i_flags)) {
error = gfs2_inode_refresh(ip);
@@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_unlock;
out_truncate:
+ gfs2_log_flush(sdp, ip->i_gl);
+ write_inode_now(inode, 1);
+ gfs2_ail_flush(ip->i_gl, 0);
+
/* Case 2 starts here */
error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
if (error)
@@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
if (ip) {
ip->i_flags = 0;
ip->i_gl = NULL;
+ ip->i_rgd = NULL;
}
return &ip->i_inode;
}
@@ -1572,6 +1595,7 @@ const struct super_operations gfs2_super_ops = {
.alloc_inode = gfs2_alloc_inode,
.destroy_inode = gfs2_destroy_inode,
.write_inode = gfs2_write_inode,
+ .dirty_inode = gfs2_dirty_inode,
.evict_inode = gfs2_evict_inode,
.put_super = gfs2_put_super,
.sync_fs = gfs2_sync_fs,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 9ec73a854111..86ac75d99d31 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
gfs2_log_unlock(sdp);
}
-void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
-{
- lops_add(rgd->rd_sbd, &rgd->rd_le);
-}
-
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index fb56b783e028..f8f101ef600c 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,20 +28,20 @@ struct gfs2_glock;
/* reserve either the number of blocks to be allocated plus the rg header
* block, or all of the blocks in the rg, whichever is smaller */
-static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip)
{
- return (al->al_requested < al->al_rgd->rd_length)?
- al->al_requested + 1 : al->al_rgd->rd_length;
+ const struct gfs2_alloc *al = ip->i_alloc;
+ if (al->al_requested < ip->i_rgd->rd_length)
+ return al->al_requested + 1;
+ return ip->i_rgd->rd_length;
}
-int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
- unsigned int revokes);
+extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes);
-void gfs2_trans_end(struct gfs2_sbd *sdp);
-
-void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
-void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
-void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
+extern void gfs2_trans_end(struct gfs2_sbd *sdp);
+extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 439b61c03262..71d7bf830c09 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
goto out_alloc;
- error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
- if (error)
- goto out_quota;
-
error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
- gfs2_glock_dq_uninit(&al->al_ri_gh);
-
-out_quota:
gfs2_quota_unhold(ip);
out_alloc:
gfs2_alloc_put(ip);
@@ -734,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out_gunlock_q;
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
- blks + gfs2_rg_blocks(al) +
+ blks + gfs2_rg_blocks(ip) +
RES_DINODE + RES_STATFS + RES_QUOTA, 0);
if (error)
goto out_ipres;
@@ -1296,7 +1289,8 @@ fail:
int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_ea_location el;
int error;
@@ -1319,7 +1313,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
if (error)
return error;
- error = gfs2_setattr_simple(ip, attr);
+ error = gfs2_setattr_simple(inode, attr);
gfs2_trans_end(sdp);
return error;
}
@@ -1362,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
blen++;
else {
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
bstart = bn;
blen = 1;
}
blks++;
}
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
else
goto out;
@@ -1501,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
goto out_alloc;
- error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
- if (error)
- goto out_quota;
-
error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
if (error)
- goto out_rindex;
+ goto out_quota;
if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
error = ea_dealloc_indirect(ip);
if (error)
- goto out_rindex;
+ goto out_quota;
}
error = ea_dealloc_block(ip);
-out_rindex:
- gfs2_glock_dq_uninit(&al->al_ri_gh);
out_quota:
gfs2_quota_unhold(ip);
out_alloc:
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 331b5e234ef3..de946170ebb1 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
/* super.c */
-void hpfs_error(struct super_block *, const char *, ...)
- __attribute__((format (printf, 2, 3)));
+__printf(2, 3)
+void hpfs_error(struct super_block *, const char *, ...);
int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
unsigned hpfs_count_one_bitmap(struct super_block *, secno);
diff --git a/fs/inode.c b/fs/inode.c
index ec7924696a13..ecbb68dc7e2a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
* inode to the back of the list so we don't spin on it.
*/
if (!spin_trylock(&inode->i_lock)) {
- list_move(&inode->i_lru, &sb->s_inode_lru);
+ list_move_tail(&inode->i_lru, &sb->s_inode_lru);
continue;
}
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index cfeb7164b085..0f20208df602 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -22,26 +22,29 @@
#include <linux/security.h>
#include "nodelist.h"
-/* ---- Initial Security Label Attachment -------------- */
-int jffs2_init_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+/* ---- Initial Security Label(s) Attachment callback --- */
+int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int rc;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ int err = 0;
- rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return 0;
- return rc;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+ return err;
+}
- kfree(name);
- kfree(value);
- return rc;
+/* ---- Initial Security Label(s) Attachment ----------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &jffs2_initxattrs, NULL);
}
/* ---- XATTR Handler for "security.*" ----------------- */
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index e87fedef23db..26683e15b3ac 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
}
#ifdef CONFIG_JFS_SECURITY
-int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int rc;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ tid_t *tid = fs_info;
char *name;
-
- rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return 0;
- return rc;
- }
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
- GFP_NOFS);
- if (!name) {
- rc = -ENOMEM;
- goto kmalloc_failed;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+ err = __jfs_setxattr(*tid, inode, name,
+ xattr->value, xattr->value_len, 0);
+ kfree(name);
+ if (err < 0)
+ break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-
- rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-
- kfree(name);
-kmalloc_failed:
- kfree(suffix);
- kfree(value);
+ return err;
+}
- return rc;
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &jfs_initxattrs, &tid);
}
#endif
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b7c99bfb3da6..6f29836ec0cb 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
struct hlist_node *pos;
struct nlm_host *host = NULL;
struct nsm_handle *nsm = NULL;
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- };
- struct sockaddr_in6 sin6 = {
- .sin6_family = AF_INET6,
- };
- struct sockaddr *src_sap;
- size_t src_len = rqstp->rq_addrlen;
+ struct sockaddr *src_sap = svc_daddr(rqstp);
+ size_t src_len = rqstp->rq_daddrlen;
struct nlm_lookup_host_info ni = {
.server = 1,
.sap = svc_addr(rqstp),
@@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
mutex_lock(&nlm_host_mutex);
- switch (ni.sap->sa_family) {
- case AF_INET:
- sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
- src_sap = (struct sockaddr *)&sin;
- break;
- case AF_INET6:
- ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
- src_sap = (struct sockaddr *)&sin6;
- break;
- default:
- dprintk("lockd: %s failed; unrecognized address family\n",
- __func__);
- goto out;
- }
-
if (time_after_eq(jiffies, next_gc))
nlm_gc_hosts();
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index abfff9d7979d..c061b9aa7ddb 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -282,7 +282,7 @@ int lockd_up(void)
/*
* Create the kernel thread and wait for it to start.
*/
- nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
if (IS_ERR(nlmsvc_rqst)) {
error = PTR_ERR(nlmsvc_rqst);
nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index 703f545097de..3b0d05dcd7c1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -60,7 +60,7 @@
*
* Initial implementation of mandatory locks. SunOS turned out to be
* a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/mandatory.txt' for details.
+ * See 'Documentation/filesystems/mandatory-locking.txt' for details.
* Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
*
* Don't allow mandatory locks on mmap()'ed files. Added simple functions to
@@ -133,6 +133,20 @@
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
+static bool lease_breaking(struct file_lock *fl)
+{
+ return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+}
+
+static int target_leasetype(struct file_lock *fl)
+{
+ if (fl->fl_flags & FL_UNLOCK_PENDING)
+ return F_UNLCK;
+ if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+ return F_RDLCK;
+ return fl->fl_type;
+}
+
int leases_enable = 1;
int lease_break_time = 45;
@@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode,
EXPORT_SYMBOL(locks_mandatory_area);
+static void lease_clear_pending(struct file_lock *fl, int arg)
+{
+ switch (arg) {
+ case F_UNLCK:
+ fl->fl_flags &= ~FL_UNLOCK_PENDING;
+ /* fall through: */
+ case F_RDLCK:
+ fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+ }
+}
+
/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lock **before, int arg)
{
@@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg)
if (error)
return error;
+ lease_clear_pending(fl, arg);
locks_wake_up_blocks(fl);
if (arg == F_UNLCK)
locks_delete_lock(before);
@@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg)
EXPORT_SYMBOL(lease_modify);
+static bool past_time(unsigned long then)
+{
+ if (!then)
+ /* 0 is a special value meaning "this never expires": */
+ return false;
+ return time_after(jiffies, then);
+}
+
static void time_out_leases(struct inode *inode)
{
struct file_lock **before;
struct file_lock *fl;
before = &inode->i_flock;
- while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) {
- if ((fl->fl_break_time == 0)
- || time_before(jiffies, fl->fl_break_time)) {
- before = &fl->fl_next;
- continue;
- }
- lease_modify(before, fl->fl_type & ~F_INPROGRESS);
+ while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+ if (past_time(fl->fl_downgrade_time))
+ lease_modify(before, F_RDLCK);
+ if (past_time(fl->fl_break_time))
+ lease_modify(before, F_UNLCK);
if (fl == *before) /* lease_modify may have freed fl */
before = &fl->fl_next;
}
@@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode)
*/
int __break_lease(struct inode *inode, unsigned int mode)
{
- int error = 0, future;
+ int error = 0;
struct file_lock *new_fl, *flock;
struct file_lock *fl;
unsigned long break_time;
@@ -1182,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode)
if ((flock == NULL) || !IS_LEASE(flock))
goto out;
+ if (!locks_conflict(flock, new_fl))
+ goto out;
+
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
if (fl->fl_owner == current->files)
i_have_this_lease = 1;
- if (want_write) {
- /* If we want write access, we have to revoke any lease. */
- future = F_UNLCK | F_INPROGRESS;
- } else if (flock->fl_type & F_INPROGRESS) {
- /* If the lease is already being broken, we just leave it */
- future = flock->fl_type;
- } else if (flock->fl_type & F_WRLCK) {
- /* Downgrade the exclusive lease to a read-only lease. */
- future = F_RDLCK | F_INPROGRESS;
- } else {
- /* the existing lease was read-only, so we can read too. */
- goto out;
- }
-
if (IS_ERR(new_fl) && !i_have_this_lease
&& ((mode & O_NONBLOCK) == 0)) {
error = PTR_ERR(new_fl);
@@ -1214,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode)
}
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
- if (fl->fl_type != future) {
- fl->fl_type = future;
+ if (want_write) {
+ if (fl->fl_flags & FL_UNLOCK_PENDING)
+ continue;
+ fl->fl_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
- /* lease must have lmops break callback */
- fl->fl_lmops->lm_break(fl);
+ } else {
+ if (lease_breaking(flock))
+ continue;
+ fl->fl_flags |= FL_DOWNGRADE_PENDING;
+ fl->fl_downgrade_time = break_time;
}
+ fl->fl_lmops->lm_break(fl);
}
if (i_have_this_lease || (mode & O_NONBLOCK)) {
@@ -1243,10 +1270,13 @@ restart:
if (error >= 0) {
if (error == 0)
time_out_leases(inode);
- /* Wait for the next lease that has not been broken yet */
+ /*
+ * Wait for the next conflicting lease that has not been
+ * broken yet
+ */
for (flock = inode->i_flock; flock && IS_LEASE(flock);
flock = flock->fl_next) {
- if (flock->fl_type & F_INPROGRESS)
+ if (locks_conflict(new_fl, flock))
goto restart;
}
error = 0;
@@ -1314,7 +1344,7 @@ int fcntl_getlease(struct file *filp)
for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
fl = fl->fl_next) {
if (fl->fl_file == filp) {
- type = fl->fl_type & ~F_INPROGRESS;
+ type = target_leasetype(fl);
break;
}
}
@@ -1322,50 +1352,23 @@ int fcntl_getlease(struct file *filp)
return type;
}
-/**
- * generic_setlease - sets a lease on an open file
- * @filp: file pointer
- * @arg: type of lease to obtain
- * @flp: input - file_lock to use, output - file_lock inserted
- *
- * The (input) flp->fl_lmops->lm_break function is required
- * by break_lease().
- *
- * Called with file_lock_lock held.
- */
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
{
struct file_lock *fl, **before, **my_before = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
- int error, rdlease_count = 0, wrlease_count = 0;
+ int error;
lease = *flp;
- error = -EACCES;
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
- goto out;
- error = -EINVAL;
- if (!S_ISREG(inode->i_mode))
+ error = -EAGAIN;
+ if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
goto out;
- error = security_file_lock(filp, arg);
- if (error)
+ if ((arg == F_WRLCK)
+ && ((dentry->d_count > 1)
+ || (atomic_read(&inode->i_count) > 1)))
goto out;
- time_out_leases(inode);
-
- BUG_ON(!(*flp)->fl_lmops->lm_break);
-
- if (arg != F_UNLCK) {
- error = -EAGAIN;
- if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
- goto out;
- if ((arg == F_WRLCK)
- && ((dentry->d_count > 1)
- || (atomic_read(&inode->i_count) > 1)))
- goto out;
- }
-
/*
* At this point, we know that if there is an exclusive
* lease on this file, then we hold it on this filp
@@ -1374,27 +1377,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
* then the file is not open by anyone (including us)
* except for this filp.
*/
+ error = -EAGAIN;
for (before = &inode->i_flock;
((fl = *before) != NULL) && IS_LEASE(fl);
before = &fl->fl_next) {
- if (fl->fl_file == filp)
+ if (fl->fl_file == filp) {
my_before = before;
- else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
- /*
- * Someone is in the process of opening this
- * file for writing so we may not take an
- * exclusive lease on it.
- */
- wrlease_count++;
- else
- rdlease_count++;
+ continue;
+ }
+ /*
+ * No exclusive leases if someone else has a lease on
+ * this file:
+ */
+ if (arg == F_WRLCK)
+ goto out;
+ /*
+ * Modifying our existing lease is OK, but no getting a
+ * new lease if someone else is opening for write:
+ */
+ if (fl->fl_flags & FL_UNLOCK_PENDING)
+ goto out;
}
- error = -EAGAIN;
- if ((arg == F_RDLCK && (wrlease_count > 0)) ||
- (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0)))
- goto out;
-
if (my_before != NULL) {
error = lease->fl_lmops->lm_change(my_before, arg);
if (!error)
@@ -1402,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
goto out;
}
- if (arg == F_UNLCK)
- goto out;
-
error = -EINVAL;
if (!leases_enable)
goto out;
@@ -1415,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
out:
return error;
}
+
+int generic_delete_lease(struct file *filp, struct file_lock **flp)
+{
+ struct file_lock *fl, **before;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ for (before = &inode->i_flock;
+ ((fl = *before) != NULL) && IS_LEASE(fl);
+ before = &fl->fl_next) {
+ if (fl->fl_file != filp)
+ continue;
+ return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
+ }
+ return -EAGAIN;
+}
+
+/**
+ * generic_setlease - sets a lease on an open file
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ * @flp: input - file_lock to use, output - file_lock inserted
+ *
+ * The (input) flp->fl_lmops->lm_break function is required
+ * by break_lease().
+ *
+ * Called with file_lock_lock held.
+ */
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
+ return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ error = security_file_lock(filp, arg);
+ if (error)
+ return error;
+
+ time_out_leases(inode);
+
+ BUG_ON(!(*flp)->fl_lmops->lm_break);
+
+ switch (arg) {
+ case F_UNLCK:
+ return generic_delete_lease(filp, flp);
+ case F_RDLCK:
+ case F_WRLCK:
+ return generic_add_lease(filp, arg, flp);
+ default:
+ BUG();
+ }
+}
EXPORT_SYMBOL(generic_setlease);
static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
@@ -2126,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
} else if (IS_LEASE(fl)) {
seq_printf(f, "LEASE ");
- if (fl->fl_type & F_INPROGRESS)
+ if (lease_breaking(fl))
seq_printf(f, "BREAKING ");
else if (fl->fl_file)
seq_printf(f, "ACTIVE ");
@@ -2142,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
: (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
} else {
seq_printf(f, "%s ",
- (fl->fl_type & F_INPROGRESS)
+ (lease_breaking(fl))
? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ "
: (fl->fl_type & F_WRLCK) ? "WRITE" : "READ ");
}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index f22d108bfa5d..398ecff6e548 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
void emergency_read_end(struct page *page);
void logfs_crash_dump(struct super_block *sb);
-void *memchr_inv(const void *s, int c, size_t n);
int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
int logfs_check_ds(struct logfs_disk_super *ds);
int logfs_write_sb(struct super_block *sb);
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index ce03a182c771..f2697e4df109 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -91,28 +91,6 @@ void logfs_crash_dump(struct super_block *sb)
}
/*
- * TODO: move to lib/string.c
- */
-/**
- * memchr_inv - Find a character in an area of memory.
- * @s: The memory area
- * @c: The byte to search for
- * @n: The size of the area.
- *
- * returns the address of the first character other than @c, or %NULL
- * if the whole buffer contains just @c.
- */
-void *memchr_inv(const void *s, int c, size_t n)
-{
- const unsigned char *p = s;
- while (n-- != 0)
- if ((unsigned char)c != *p++)
- return (void *)(p - 1);
-
- return NULL;
-}
-
-/*
* FIXME: There should be a reserve for root, similar to ext2.
*/
int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
diff --git a/fs/namei.c b/fs/namei.c
index 0b3138de2a3b..7657be4352bf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -221,14 +221,12 @@ static int check_acl(struct inode *inode, int mask)
}
/*
- * This does basic POSIX ACL permission checking
+ * This does the basic permission checking
*/
static int acl_permission_check(struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
- mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
-
if (current_user_ns() != inode_userns(inode))
goto other_perms;
@@ -257,7 +255,7 @@ other_perms:
/**
* generic_permission - check for access rights on a Posix-like filesystem
* @inode: inode to check access rights for
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
*
* Used to check for read/write/execute permissions on a file.
* We use "fsuid" for this, letting us set arbitrary permissions
@@ -273,7 +271,7 @@ int generic_permission(struct inode *inode, int mask)
int ret;
/*
- * Do the basic POSIX ACL permission checks.
+ * Do the basic permission checks.
*/
ret = acl_permission_check(inode, mask);
if (ret != -EACCES)
@@ -331,12 +329,14 @@ static inline int do_inode_permission(struct inode *inode, int mask)
/**
* inode_permission - check for access rights to a given inode
* @inode: inode to check permission on
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
*
* Used to check for read/write/execute permissions on an inode.
* We use "fsuid" for this, letting us set arbitrary permissions
* for filesystem access without changing the "normal" uids which
* are used for other things.
+ *
+ * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
int inode_permission(struct inode *inode, int mask)
{
@@ -2035,10 +2035,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
if (flag & O_NOATIME && !inode_owner_or_capable(inode))
return -EPERM;
- /*
- * Ensure there are no outstanding leases on the file.
- */
- return break_lease(inode, flag);
+ return 0;
}
static int handle_truncate(struct file *filp)
diff --git a/fs/namespace.c b/fs/namespace.c
index b4febb29d3bb..e5e1c7d1839b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1109,6 +1109,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
/* device */
if (mnt->mnt_sb->s_op->show_devname) {
+ seq_puts(m, "device ");
err = mnt->mnt_sb->s_op->show_devname(m, mnt);
} else {
if (mnt->mnt_devname) {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e3d294269058..516f3375e067 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv)
else
goto out_err;
- return svc_prepare_thread(serv, &serv->sv_pools[0]);
+ return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
out_err:
if (ret == 0)
@@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
INIT_LIST_HEAD(&serv->sv_cb_list);
spin_lock_init(&serv->sv_cb_lock);
init_waitqueue_head(&serv->sv_cb_waitq);
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
if (IS_ERR(rqstp)) {
svc_xprt_put(serv->sv_bc_xprt);
serv->sv_bc_xprt = NULL;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index bd7dff001106..0a1f8312b4dc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -178,8 +178,6 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
- loff_t loff;
-
dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
filp->f_path.dentry->d_parent->d_name.name,
filp->f_path.dentry->d_name.name,
@@ -195,13 +193,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
int retval = nfs_revalidate_file_size(inode, filp);
if (retval < 0)
return (loff_t)retval;
+ }
- spin_lock(&inode->i_lock);
- loff = generic_file_llseek_unlocked(filp, offset, origin);
- spin_unlock(&inode->i_lock);
- } else
- loff = generic_file_llseek_unlocked(filp, offset, origin);
- return loff;
+ return generic_file_llseek(filp, offset, origin);
}
/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3e93e9a1bee1..693ae22f8731 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,30 +13,6 @@
struct idmap;
-/*
- * In a seqid-mutating op, this macro controls which error return
- * values trigger incrementation of the seqid.
- *
- * from rfc 3010:
- * The client MUST monotonically increment the sequence number for the
- * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
- * operations. This is true even in the event that the previous
- * operation that used the sequence number received an error. The only
- * exception to this rule is if the previous operation received one of
- * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
- * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
- * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
- *
- */
-#define seqid_mutating_err(err) \
-(((err) != NFSERR_STALE_CLIENTID) && \
- ((err) != NFSERR_STALE_STATEID) && \
- ((err) != NFSERR_BAD_STATEID) && \
- ((err) != NFSERR_BAD_SEQID) && \
- ((err) != NFSERR_BAD_XDR) && \
- ((err) != NFSERR_RESOURCE) && \
- ((err) != NFSERR_NOFILEHANDLE))
-
enum nfs4_client_state {
NFS4CLNT_MANAGER_RUNNING = 0,
NFS4CLNT_CHECK_LEASE,
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc3..c807ab93140e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
*/
#include <linux/module.h>
-#include <scsi/osd_initiator.h>
+#include <scsi/osd_ore.h>
#include "objlayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-#define _LLU(x) ((unsigned long long)x)
-
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-};
-
struct objio_dev_ent {
struct nfs4_deviceid_node id_node;
- struct osd_dev *od;
+ struct ore_dev od;
};
static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
{
struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
- dprintk("%s: free od=%p\n", __func__, de->od);
- osduld_put_device(de->od);
+ dprintk("%s: free od=%p\n", __func__, de->od.od);
+ osduld_put_device(de->od.od);
kfree(de);
}
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
nfss->pnfs_curr_ld,
nfss->nfs_client,
d_id);
- de->od = od;
+ de->od.od = od;
d = nfs4_insert_deviceid_node(&de->id_node);
n = container_of(d, struct objio_dev_ent, id_node);
if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
+ dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
objio_free_deviceid_node(&de->id_node);
de = n;
}
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
return de;
}
-struct caps_buffers {
- u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
- u8 creds[OSD_CAP_LEN];
-};
-
struct objio_segment {
struct pnfs_layout_segment lseg;
- struct pnfs_osd_object_cred *comps;
-
- unsigned mirrors_p1;
- unsigned stripe_unit;
- unsigned group_width; /* Data stripe_units without integrity comps */
- u64 group_depth;
- unsigned group_count;
-
- unsigned max_io_size;
-
- unsigned comps_index;
- unsigned num_comps;
- /* variable length */
- struct objio_dev_ent *ods[];
+ struct ore_layout layout;
+ struct ore_components oc;
};
static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
return container_of(lseg, struct objio_segment, lseg);
}
-struct objio_state;
-typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
-
struct objio_state {
/* Generic layer */
- struct objlayout_io_state ol_state;
-
- struct objio_segment *layout;
-
- struct kref kref;
- objio_done_fn done;
- void *private;
-
- unsigned long length;
- unsigned numdevs; /* Actually used devs in this IO */
- /* A per-device variable array of size numdevs */
- struct _objio_per_comp {
- struct bio *bio;
- struct osd_request *or;
- unsigned long length;
- u64 offset;
- unsigned dev;
- } per_dev[];
+ struct objlayout_io_res oir;
+
+ bool sync;
+ /*FIXME: Support for extra_bytes at ore_get_rw_state() */
+ struct ore_io_state *ios;
};
/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned comp,
- gfp_t gfp_flags)
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct nfs4_deviceid *d_id;
struct objio_dev_ent *ode;
struct osd_dev *od;
struct osd_dev_info odi;
int err;
- d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
-
ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode)
- return ode;
+ if (ode) {
+ objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
+ return 0;
+ }
err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
if (unlikely(err)) {
dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return ERR_PTR(err);
+ return err;
}
odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
+ dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
+ __func__, sizeof(odi.systemid));
err = -EINVAL;
goto out;
} else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
gfp_flags);
-
+ objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
+ dprintk("Adding new dev_id(%llx:%llx)\n",
+ _DEVID_LO(d_id), _DEVID_HI(d_id));
out:
- dprintk("%s: return=%d\n", __func__, err);
objlayout_put_deviceinfo(deviceaddr);
- return err ? ERR_PTR(err) : ode;
+ return err;
}
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg,
- gfp_t gfp_flags)
+static void copy_single_comp(struct ore_components *oc, unsigned c,
+ struct pnfs_osd_object_cred *src_comp)
{
- unsigned i;
- int err;
+ struct ore_comp *ocomp = &oc->comps[c];
- /* lookup all devices */
- for (i = 0; i < objio_seg->num_comps; i++) {
- struct objio_dev_ent *ode;
+ WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
+ WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
- ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
- if (unlikely(IS_ERR(ode))) {
- err = PTR_ERR(ode);
- goto out;
- }
- objio_seg->ods[i] = ode;
- }
- err = 0;
+ ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
+ ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
-out:
- dprintk("%s: return=%d\n", __func__, err);
- return err;
+ memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
}
-static int _verify_data_map(struct pnfs_osd_layout *layout)
+int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
+ struct objio_segment **pseg)
{
- struct pnfs_osd_data_map *data_map = &layout->olo_map;
- u64 stripe_length;
- u32 group_width;
-
-/* FIXME: Only raid0 for now. if not go through MDS */
- if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
- printk(KERN_ERR "Only RAID_0 for now\n");
- return -ENOTSUPP;
- }
- if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
- printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
- data_map->odm_num_comps, data_map->odm_mirror_cnt);
- return -EINVAL;
- }
+ struct __alloc_objio_segment {
+ struct objio_segment olseg;
+ struct ore_dev *ods[numdevs];
+ struct ore_comp comps[numdevs];
+ } *aolseg;
- if (data_map->odm_group_width)
- group_width = data_map->odm_group_width;
- else
- group_width = data_map->odm_num_comps /
- (data_map->odm_mirror_cnt + 1);
-
- stripe_length = (u64)data_map->odm_stripe_unit * group_width;
- if (stripe_length >= (1ULL << 32)) {
- printk(KERN_ERR "Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -ENOTSUPP;
+ aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
+ if (unlikely(!aolseg)) {
+ dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
+ numdevs, sizeof(*aolseg));
+ return -ENOMEM;
}
- if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
- printk(KERN_ERR "Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
- return -ENOTSUPP;
- }
+ aolseg->olseg.oc.numdevs = numdevs;
+ aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
+ aolseg->olseg.oc.comps = aolseg->comps;
+ aolseg->olseg.oc.ods = aolseg->ods;
+ *pseg = &aolseg->olseg;
return 0;
}
-static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
- struct pnfs_osd_object_cred *src_comp,
- struct caps_buffers *caps_p)
-{
- WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
- WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
-
- *cur_comp = *src_comp;
-
- memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
- sizeof(caps_p->caps_key));
- cur_comp->oc_cap_key.cred = caps_p->caps_key;
-
- memcpy(caps_p->creds, src_comp->oc_cap.cred,
- sizeof(caps_p->creds));
- cur_comp->oc_cap.cred = caps_p->creds;
-}
-
int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct pnfs_layout_hdr *pnfslay,
struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
- struct pnfs_osd_object_cred *cur_comp, src_comp;
- struct caps_buffers *caps_p;
+ struct pnfs_osd_object_cred src_comp;
+ unsigned cur_comp;
int err;
err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
if (unlikely(err))
return err;
- err = _verify_data_map(&layout);
+ err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
if (unlikely(err))
return err;
- objio_seg = kzalloc(sizeof(*objio_seg) +
- sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
- sizeof(*objio_seg->comps) * layout.olo_num_comps +
- sizeof(struct caps_buffers) * layout.olo_num_comps,
- gfp_flags);
- if (!objio_seg)
- return -ENOMEM;
+ objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
+ objio_seg->layout.group_width = layout.olo_map.odm_group_width;
+ objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
+ objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+ objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
- objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
- cur_comp = objio_seg->comps;
- caps_p = (void *)(cur_comp + layout.olo_num_comps);
- while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
- copy_single_comp(cur_comp++, &src_comp, caps_p++);
+ err = ore_verify_layout(layout.olo_map.odm_num_comps,
+ &objio_seg->layout);
if (unlikely(err))
goto err;
- objio_seg->num_comps = layout.olo_num_comps;
- objio_seg->comps_index = layout.olo_comps_index;
- err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
- if (err)
- goto err;
-
- objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
- objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
- if (layout.olo_map.odm_group_width) {
- objio_seg->group_width = layout.olo_map.odm_group_width;
- objio_seg->group_depth = layout.olo_map.odm_group_depth;
- objio_seg->group_count = layout.olo_map.odm_num_comps /
- objio_seg->mirrors_p1 /
- objio_seg->group_width;
- } else {
- objio_seg->group_width = layout.olo_map.odm_num_comps /
- objio_seg->mirrors_p1;
- objio_seg->group_depth = -1;
- objio_seg->group_count = 1;
+ objio_seg->oc.first_dev = layout.olo_comps_index;
+ cur_comp = 0;
+ while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
+ err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
+ &src_comp.oc_object_id.oid_device_id,
+ gfp_flags);
+ if (err)
+ goto err;
+ ++cur_comp;
}
-
- /* Cache this calculation it will hit for every page */
- objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
- objio_seg->stripe_unit) *
- objio_seg->group_width;
+ /* pnfs_osd_xdr_decode_layout_comp returns false on error */
+ if (unlikely(err))
+ goto err;
*outp = &objio_seg->lseg;
return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
int i;
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- for (i = 0; i < objio_seg->num_comps; i++) {
- if (!objio_seg->ods[i])
+ for (i = 0; i < objio_seg->oc.numdevs; i++) {
+ struct ore_dev *od = objio_seg->oc.ods[i];
+ struct objio_dev_ent *ode;
+
+ if (!od)
break;
- nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+ ode = container_of(od, typeof(*ode), od);
+ nfs4_put_deviceid_node(&ode->id_node);
}
kfree(objio_seg);
}
-int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
- struct objlayout_io_state **outp,
- gfp_t gfp_flags)
+static int
+objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
+ struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
+ loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
+ struct objio_state **outp)
{
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- struct objio_state *ios;
- const unsigned first_size = sizeof(*ios) +
- objio_seg->num_comps * sizeof(ios->per_dev[0]);
- const unsigned sec_size = objio_seg->num_comps *
- sizeof(ios->ol_state.ioerrs[0]);
-
- ios = kzalloc(first_size + sec_size, gfp_flags);
- if (unlikely(!ios))
+ struct ore_io_state *ios;
+ int ret;
+ struct __alloc_objio_state {
+ struct objio_state objios;
+ struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
+ } *aos;
+
+ aos = kzalloc(sizeof(*aos), gfp_flags);
+ if (unlikely(!aos))
return -ENOMEM;
- ios->layout = objio_seg;
- ios->ol_state.ioerrs = ((void *)ios) + first_size;
- ios->ol_state.num_comps = objio_seg->num_comps;
+ objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
+ aos->ioerrs, rpcdata, pnfs_layout_type);
- *outp = &ios->ol_state;
+ ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
+ offset, count, &ios);
+ if (unlikely(ret)) {
+ kfree(aos);
+ return ret;
+ }
+
+ ios->pages = pages;
+ ios->pgbase = pgbase;
+ ios->private = aos;
+ BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
+
+ aos->objios.sync = 0;
+ aos->objios.ios = ios;
+ *outp = &aos->objios;
return 0;
}
-void objio_free_io_state(struct objlayout_io_state *ol_state)
+void objio_free_result(struct objlayout_io_res *oir)
{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
+ struct objio_state *objios = container_of(oir, struct objio_state, oir);
- kfree(ios);
+ ore_put_io_state(objios->ios);
+ kfree(objios);
}
enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
}
}
-static void _clear_bio(struct bio *bio)
+static void __on_dev_error(struct ore_io_state *ios,
+ struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
+ u64 dev_offset, u64 dev_len)
{
- struct bio_vec *bv;
- unsigned i;
-
- __bio_for_each_segment(bv, bio, i, 0) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-static int _io_check(struct objio_state *ios, bool is_write)
-{
- enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
- int lin_ret = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
- int ret;
-
- if (!or)
- continue;
+ struct objio_state *objios = ios->private;
+ struct pnfs_osd_objid pooid;
+ struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
+ /* FIXME: what to do with more-then-one-group layouts. We need to
+ * translate from ore_io_state index to oc->comps index
+ */
+ unsigned comp = dev_index;
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
+ pooid.oid_device_id = ode->id_node.deviceid;
+ pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
+ pooid.oid_object_id = ios->oc->comps[comp].obj.id;
- if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
- /* start read offset passed endof file */
- BUG_ON(is_write);
- _clear_bio(ios->per_dev[i].bio);
- dprintk("%s: start read offset passed end of file "
- "offset=0x%llx, length=0x%lx\n", __func__,
- _LLU(ios->per_dev[i].offset),
- ios->per_dev[i].length);
-
- continue; /* we recovered */
- }
- objlayout_io_set_result(&ios->ol_state, i,
- &ios->layout->comps[i].oc_object_id,
- osd_pri_2_pnfs_err(osi.osd_err_pri),
- ios->per_dev[i].offset,
- ios->per_dev[i].length,
- is_write);
-
- if (osi.osd_err_pri >= oep) {
- oep = osi.osd_err_pri;
- lin_ret = ret;
- }
- }
-
- return lin_ret;
-}
-
-/*
- * Common IO state helpers.
- */
-static void _io_free(struct objio_state *ios)
-{
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct _objio_per_comp *per_dev = &ios->per_dev[i];
-
- if (per_dev->or) {
- osd_end_request(per_dev->or);
- per_dev->or = NULL;
- }
-
- if (per_dev->bio) {
- bio_put(per_dev->bio);
- per_dev->bio = NULL;
- }
- }
-}
-
-struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
-{
- unsigned min_dev = ios->layout->comps_index;
- unsigned max_dev = min_dev + ios->layout->num_comps;
-
- BUG_ON(dev < min_dev || max_dev <= dev);
- return ios->layout->ods[dev - min_dev]->od;
-}
-
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
- struct _striping_info *si)
-{
- u32 stripe_unit = ios->layout->stripe_unit;
- u32 group_width = ios->layout->group_width;
- u64 group_depth = ios->layout->group_depth;
- u32 U = stripe_unit * group_width;
-
- u64 T = U * group_depth;
- u64 S = T * ios->layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodU = file_offset - M * S;
- u32 G = div64_u64(LmodU, T);
- u64 H = LmodU - G * T;
-
- u32 N = div_u64(H, U);
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- si->dev *= ios->layout->mirrors_p1;
-
- si->group_length = T - H;
-}
-
-static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct _objio_per_comp *per_dev, int len,
- gfp_t gfp_flags)
-{
- unsigned pg = *cur_pg;
- int cur_len = len;
- struct request_queue *q =
- osd_request_queue(_io_od(ios, per_dev->dev));
-
- if (per_dev->bio == NULL) {
- unsigned pages_in_stripe = ios->layout->group_width *
- (ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
- ios->layout->group_width;
-
- if (BIO_MAX_PAGES_KMALLOC < bio_size)
- bio_size = BIO_MAX_PAGES_KMALLOC;
-
- per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
- if (unlikely(!per_dev->bio)) {
- dprintk("Faild to allocate BIO size=%u\n", bio_size);
- return -ENOMEM;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- BUG_ON(ios->ol_state.nr_pages <= pg);
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio,
- ios->ol_state.pages[pg], pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- per_dev->length += len;
- *cur_pg = pg;
- return 0;
-}
-
-static int _prepare_one_group(struct objio_state *ios, u64 length,
- struct _striping_info *si, unsigned *last_pg,
- gfp_t gfp_flags)
-{
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
- unsigned cur_pg = *last_pg;
- int ret = 0;
-
- while (length) {
- struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length) {
- per_dev->dev = dev;
- if (dev < si->dev) {
- per_dev->offset = si->obj_offset + stripe_unit -
- si->unit_off;
- cur_len = stripe_unit;
- } else if (dev == si->dev) {
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off &&
- (page_off != ios->ol_state.pgbase));
- } else { /* dev > si->dev */
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
-
- if (max_comp < dev - first_dev)
- max_comp = dev - first_dev;
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
- cur_len, gfp_flags);
- if (unlikely(ret))
- goto out;
-
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
- length -= cur_len;
- ios->length += cur_len;
- }
-out:
- ios->numdevs = max_comp + mirrors_p1;
- *last_pg = cur_pg;
- return ret;
-}
-
-static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
-{
- u64 length = ios->ol_state.count;
- u64 offset = ios->ol_state.offset;
- struct _striping_info si;
- unsigned last_pg = 0;
- int ret = 0;
-
- while (length) {
- _calc_stripe_info(ios, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
-
- ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
- if (unlikely(ret))
- goto out;
-
- offset += si.group_length;
- length -= si.group_length;
- }
-
-out:
- if (!ios->length)
- return ret;
-
- return 0;
-}
-
-static ssize_t _sync_done(struct objio_state *ios)
-{
- struct completion *waiting = ios->private;
-
- complete(waiting);
- return 0;
-}
-
-static void _last_io(struct kref *kref)
-{
- struct objio_state *ios = container_of(kref, struct objio_state, kref);
-
- ios->done(ios);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct objio_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-static ssize_t _io_exec(struct objio_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- ssize_t status = 0; /* sync status */
- unsigned i;
- objio_done_fn saved_done_fn = ios->done;
- bool sync = ios->ol_state.sync;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
-
- if (!or)
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
-
- if (sync) {
- wait_for_completion(&wait);
- status = saved_done_fn(ios);
- }
-
- return status;
+ objlayout_io_set_result(&objios->oir, comp,
+ &pooid, osd_pri_2_pnfs_err(oep),
+ dev_offset, dev_len, !ios->reading);
}
/*
* read
*/
-static ssize_t _read_done(struct objio_state *ios)
+static void _read_done(struct ore_io_state *ios, void *private)
{
+ struct objio_state *objios = private;
ssize_t status;
- int ret = _io_check(ios, false);
+ int ret = ore_check_io(ios, &__on_dev_error);
- _io_free(ios);
+ /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
if (likely(!ret))
status = ios->length;
else
status = ret;
- objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
- return status;
+ objlayout_read_done(&objios->oir, status, objios->sync);
}
-static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+int objio_read_pagelist(struct nfs_read_data *rdata)
{
- struct osd_request *or = NULL;
- struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
- unsigned dev = per_dev->dev;
- struct pnfs_osd_object_cred *cred =
- &ios->layout->comps[cur_comp];
- struct osd_obj_id obj = {
- .partition = cred->oc_object_id.oid_partition_id,
- .id = cred->oc_object_id.oid_object_id,
- };
+ struct objio_state *objios;
int ret;
- or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
- if (unlikely(!or)) {
- ret = -ENOMEM;
- goto err;
- }
- per_dev->or = or;
-
- osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
-
- ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
- if (ret) {
- dprintk("%s: Faild to osd_finalize_request() => %d\n",
- __func__, ret);
- goto err;
- }
-
- dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
- per_dev->length);
-
-err:
- return ret;
-}
-
-static ssize_t _read_exec(struct objio_state *ios)
-{
- unsigned i;
- int ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- if (!ios->per_dev[i].length)
- continue;
- ret = _read_mirrors(ios, i);
- if (unlikely(ret))
- goto err;
- }
-
- ios->done = _read_done;
- return _io_exec(ios); /* In sync mode exec returns the io status */
-
-err:
- _io_free(ios);
- return ret;
-}
-
-ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
-{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
- int ret;
-
- ret = _io_rw_pagelist(ios, GFP_KERNEL);
+ ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
+ rdata->lseg, rdata->args.pages, rdata->args.pgbase,
+ rdata->args.offset, rdata->args.count, rdata,
+ GFP_KERNEL, &objios);
if (unlikely(ret))
return ret;
- return _read_exec(ios);
+ objios->ios->done = _read_done;
+ dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+ rdata->args.offset, rdata->args.count);
+ return ore_read(objios->ios);
}
/*
* write
*/
-static ssize_t _write_done(struct objio_state *ios)
+static void _write_done(struct ore_io_state *ios, void *private)
{
+ struct objio_state *objios = private;
ssize_t status;
- int ret = _io_check(ios, true);
+ int ret = ore_check_io(ios, &__on_dev_error);
- _io_free(ios);
+ /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
if (likely(!ret)) {
/* FIXME: should be based on the OSD's persistence model
* See OSD2r05 Section 4.13 Data persistence model */
- ios->ol_state.committed = NFS_FILE_SYNC;
+ objios->oir.committed = NFS_FILE_SYNC;
status = ios->length;
} else {
status = ret;
}
- objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
- return status;
+ objlayout_write_done(&objios->oir, status, objios->sync);
}
-static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
{
- struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret;
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct osd_request *or = NULL;
- struct pnfs_osd_object_cred *cred =
- &ios->layout->comps[cur_comp];
- struct osd_obj_id obj = {
- .partition = cred->oc_object_id.oid_partition_id,
- .id = cred->oc_object_id.oid_object_id,
- };
- struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
- struct bio *bio;
-
- or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
- if (unlikely(!or)) {
- ret = -ENOMEM;
- goto err;
- }
- per_dev->or = or;
-
- if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_NOFS,
- master_dev->bio->bi_max_vecs);
- if (unlikely(!bio)) {
- dprintk("Faild to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto err;
- }
-
- __bio_clone(bio, master_dev->bio);
- bio->bi_bdev = NULL;
- bio->bi_next = NULL;
- per_dev->bio = bio;
- per_dev->dev = dev;
- per_dev->length = master_dev->length;
- per_dev->offset = master_dev->offset;
- } else {
- bio = master_dev->bio;
- bio->bi_rw |= REQ_WRITE;
- }
-
- osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+ struct objio_state *objios = priv;
+ struct nfs_write_data *wdata = objios->oir.rpcdata;
+ pgoff_t index = offset / PAGE_SIZE;
+ struct page *page = find_get_page(wdata->inode->i_mapping, index);
- ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
- if (ret) {
- dprintk("%s: Faild to osd_finalize_request() => %d\n",
- __func__, ret);
- goto err;
+ if (!page) {
+ page = find_or_create_page(wdata->inode->i_mapping,
+ index, GFP_NOFS);
+ if (unlikely(!page)) {
+ dprintk("%s: grab_cache_page Failed index=0x%lx\n",
+ __func__, index);
+ return NULL;
}
-
- dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
- per_dev->length);
+ unlock_page(page);
}
+ if (PageDirty(page) || PageWriteback(page))
+ *uptodate = true;
+ else
+ *uptodate = PageUptodate(page);
+ dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
+ return page;
+}
-err:
- return ret;
+static void __r4w_put_page(void *priv, struct page *page)
+{
+ dprintk("%s: index=0x%lx\n", __func__, page->index);
+ page_cache_release(page);
+ return;
}
-static ssize_t _write_exec(struct objio_state *ios)
+static const struct _ore_r4w_op _r4w_op = {
+ .get_page = &__r4w_get_page,
+ .put_page = &__r4w_put_page,
+};
+
+int objio_write_pagelist(struct nfs_write_data *wdata, int how)
{
- unsigned i;
+ struct objio_state *objios;
int ret;
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- if (!ios->per_dev[i].length)
- continue;
- ret = _write_mirrors(ios, i);
- if (unlikely(ret))
- goto err;
- }
-
- ios->done = _write_done;
- return _io_exec(ios); /* In sync mode exec returns the io->status */
+ ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
+ wdata->lseg, wdata->args.pages, wdata->args.pgbase,
+ wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+ &objios);
+ if (unlikely(ret))
+ return ret;
-err:
- _io_free(ios);
- return ret;
-}
+ objios->sync = 0 != (how & FLUSH_SYNC);
+ objios->ios->r4w = &_r4w_op;
-ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
-{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
- int ret;
+ if (!objios->sync)
+ objios->ios->done = _write_done;
- /* TODO: ios->stable = stable; */
- ret = _io_rw_pagelist(ios, GFP_NOFS);
+ dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+ wdata->args.offset, wdata->args.count);
+ ret = ore_write(objios->ios);
if (unlikely(ret))
return ret;
- return _write_exec(ios);
+ if (objios->sync)
+ _write_done(objios->ios, objios);
+
+ return 0;
}
static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
return false;
return pgio->pg_count + req->wb_bytes <=
- OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+ OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
}
static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2adea..72074e3a04f9 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1 : NFS4_MAX_UINT64;
}
-static struct objlayout_io_state *
-objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
- struct page **pages,
- unsigned pgbase,
- loff_t offset,
- size_t count,
- struct pnfs_layout_segment *lseg,
- void *rpcdata,
- gfp_t gfp_flags)
+void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
+ struct page ***p_pages, unsigned *p_pgbase,
+ u64 offset, unsigned long count)
{
- struct objlayout_io_state *state;
u64 lseg_end_offset;
- dprintk("%s: allocating io_state\n", __func__);
- if (objio_alloc_io_state(lseg, &state, gfp_flags))
- return NULL;
-
BUG_ON(offset < lseg->pls_range.offset);
lseg_end_offset = end_offset(lseg->pls_range.offset,
lseg->pls_range.length);
BUG_ON(offset >= lseg_end_offset);
- if (offset + count > lseg_end_offset) {
- count = lseg->pls_range.length -
- (offset - lseg->pls_range.offset);
- dprintk("%s: truncated count %Zd\n", __func__, count);
- }
+ WARN_ON(offset + count > lseg_end_offset);
- if (pgbase > PAGE_SIZE) {
- pages += pgbase >> PAGE_SHIFT;
- pgbase &= ~PAGE_MASK;
+ if (*p_pgbase > PAGE_SIZE) {
+ dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
+ *p_pages += *p_pgbase >> PAGE_SHIFT;
+ *p_pgbase &= ~PAGE_MASK;
}
-
- INIT_LIST_HEAD(&state->err_list);
- state->lseg = lseg;
- state->rpcdata = rpcdata;
- state->pages = pages;
- state->pgbase = pgbase;
- state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
- state->offset = offset;
- state->count = count;
- state->sync = 0;
-
- return state;
-}
-
-static void
-objlayout_free_io_state(struct objlayout_io_state *state)
-{
- dprintk("%s: freeing io_state\n", __func__);
- if (unlikely(!state))
- return;
-
- objio_free_io_state(state);
}
/*
* I/O done common code
*/
static void
-objlayout_iodone(struct objlayout_io_state *state)
+objlayout_iodone(struct objlayout_io_res *oir)
{
- dprintk("%s: state %p status\n", __func__, state);
-
- if (likely(state->status >= 0)) {
- objlayout_free_io_state(state);
+ if (likely(oir->status >= 0)) {
+ objio_free_result(oir);
} else {
- struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+ struct objlayout *objlay = oir->objlay;
spin_lock(&objlay->lock);
objlay->delta_space_valid = OBJ_DSU_INVALID;
- list_add(&objlay->err_list, &state->err_list);
+ list_add(&objlay->err_list, &oir->err_list);
spin_unlock(&objlay->lock);
}
}
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
* the error for later reporting at layout-return.
*/
void
-objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
struct pnfs_osd_objid *pooid, int osd_error,
u64 offset, u64 length, bool is_write)
{
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
- BUG_ON(index >= state->num_comps);
+ BUG_ON(index >= oir->num_comps);
if (osd_error) {
ioerr->oer_component = *pooid;
ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
}
void
-objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- int eof = state->eof;
- struct nfs_read_data *rdata;
+ struct nfs_read_data *rdata = oir->rpcdata;
- state->status = status;
- dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof);
- rdata = state->rpcdata;
- rdata->task.tk_status = status;
- if (status >= 0) {
+ oir->status = rdata->task.tk_status = status;
+ if (status >= 0)
rdata->res.count = status;
- rdata->res.eof = eof;
- }
- objlayout_iodone(state);
- /* must not use state after this point */
+ objlayout_iodone(oir);
+ /* must not use oir after this point */
+
+ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
+ status, rdata->res.eof, sync);
if (sync)
pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
{
loff_t offset = rdata->args.offset;
size_t count = rdata->args.count;
- struct objlayout_io_state *state;
- ssize_t status = 0;
+ int err;
loff_t eof;
- dprintk("%s: Begin inode %p offset %llu count %d\n",
- __func__, rdata->inode, offset, (int)count);
-
eof = i_size_read(rdata->inode);
if (unlikely(offset + count > eof)) {
if (offset >= eof) {
- status = 0;
+ err = 0;
rdata->res.count = 0;
rdata->res.eof = 1;
+ /*FIXME: do we need to call pnfs_ld_read_done() */
goto out;
}
count = eof - offset;
}
- state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
- rdata->args.pages, rdata->args.pgbase,
- offset, count,
- rdata->lseg, rdata,
- GFP_KERNEL);
- if (unlikely(!state)) {
- status = -ENOMEM;
- goto out;
- }
+ rdata->res.eof = (offset + count) >= eof;
+ _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
+ &rdata->args.pgbase,
+ rdata->args.offset, rdata->args.count);
- state->eof = state->offset + state->count >= eof;
+ dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
+ __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
- status = objio_read_pagelist(state);
+ err = objio_read_pagelist(rdata);
out:
- dprintk("%s: Return status %Zd\n", __func__, status);
- rdata->pnfs_error = status;
+ if (unlikely(err)) {
+ rdata->pnfs_error = err;
+ dprintk("%s: Returned Error %d\n", __func__, err);
+ return PNFS_NOT_ATTEMPTED;
+ }
return PNFS_ATTEMPTED;
}
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
}
void
-objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
- bool sync)
+objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_write_data *wdata;
+ struct nfs_write_data *wdata = oir->rpcdata;
- dprintk("%s: Begin\n", __func__);
- wdata = state->rpcdata;
- state->status = status;
- wdata->task.tk_status = status;
+ oir->status = wdata->task.tk_status = status;
if (status >= 0) {
wdata->res.count = status;
- wdata->verf.committed = state->committed;
- dprintk("%s: Return status %d committed %d\n",
- __func__, wdata->task.tk_status,
- wdata->verf.committed);
- } else
- dprintk("%s: Return status %d\n",
- __func__, wdata->task.tk_status);
- objlayout_iodone(state);
- /* must not use state after this point */
+ wdata->verf.committed = oir->committed;
+ }
+ objlayout_iodone(oir);
+ /* must not use oir after this point */
+
+ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
+ status, wdata->verf.committed, sync);
if (sync)
pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
objlayout_write_pagelist(struct nfs_write_data *wdata,
int how)
{
- struct objlayout_io_state *state;
- ssize_t status;
-
- dprintk("%s: Begin inode %p offset %llu count %u\n",
- __func__, wdata->inode, wdata->args.offset, wdata->args.count);
-
- state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
- wdata->args.pages,
- wdata->args.pgbase,
- wdata->args.offset,
- wdata->args.count,
- wdata->lseg, wdata,
- GFP_NOFS);
- if (unlikely(!state)) {
- status = -ENOMEM;
- goto out;
- }
+ int err;
- state->sync = how & FLUSH_SYNC;
+ _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
+ &wdata->args.pgbase,
+ wdata->args.offset, wdata->args.count);
- status = objio_write_pagelist(state, how & FLUSH_STABLE);
- out:
- dprintk("%s: Return status %Zd\n", __func__, status);
- wdata->pnfs_error = status;
+ err = objio_write_pagelist(wdata, how);
+ if (unlikely(err)) {
+ wdata->pnfs_error = err;
+ dprintk("%s: Returned Error %d\n", __func__, err);
+ return PNFS_NOT_ATTEMPTED;
+ }
return PNFS_ATTEMPTED;
}
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
static void
encode_accumulated_error(struct objlayout *objlay, __be32 *p)
{
- struct objlayout_io_state *state, *tmp;
+ struct objlayout_io_res *oir, *tmp;
struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
- list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
unsigned i;
- for (i = 0; i < state->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+ for (i = 0; i < oir->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
if (!ioerr->oer_errno)
continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
merge_ioerr(&accumulated_err, ioerr);
}
- list_del(&state->err_list);
- objlayout_free_io_state(state);
+ list_del(&oir->err_list);
+ objio_free_result(oir);
}
pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
const struct nfs4_layoutreturn_args *args)
{
struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct objlayout_io_state *state, *tmp;
+ struct objlayout_io_res *oir, *tmp;
__be32 *start;
dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
spin_lock(&objlay->lock);
- list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
__be32 *last_xdr = NULL, *p;
unsigned i;
int res = 0;
- for (i = 0; i < state->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+ for (i = 0; i < oir->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
if (!ioerr->oer_errno)
continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
}
last_xdr = p;
- pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+ pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
}
/* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
encode_accumulated_error(objlay, last_xdr);
goto loop_done;
}
- list_del(&state->err_list);
- objlayout_free_io_state(state);
+ list_del(&oir->err_list);
+ objio_free_result(oir);
}
loop_done:
spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042d..8ec34727ed21 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
* per-I/O operation state
* embedded in objects provider io_state data structure
*/
-struct objlayout_io_state {
- struct pnfs_layout_segment *lseg;
-
- struct page **pages;
- unsigned pgbase;
- unsigned nr_pages;
- unsigned long count;
- loff_t offset;
- bool sync;
+struct objlayout_io_res {
+ struct objlayout *objlay;
void *rpcdata;
int status; /* res */
- int eof; /* res */
int committed; /* res */
/* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
struct pnfs_osd_ioerr *ioerrs;
};
+static inline
+void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
+ struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
+ struct pnfs_layout_hdr *pnfs_layout_type)
+{
+ oir->objlay = OBJLAYOUT(pnfs_layout_type);
+ oir->rpcdata = rpcdata;
+ INIT_LIST_HEAD(&oir->err_list);
+ oir->num_comps = num_comps;
+ oir->ioerrs = ioerrs;
+}
+
/*
* Raid engine I/O API
*/
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
-extern int objio_alloc_io_state(
- struct pnfs_layout_segment *lseg,
- struct objlayout_io_state **outp,
- gfp_t gfp_flags);
-extern void objio_free_io_state(struct objlayout_io_state *state);
+/* objio_free_result will free these @oir structs recieved from
+ * objlayout_{read,write}_done
+ */
+extern void objio_free_result(struct objlayout_io_res *oir);
-extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
-extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
- bool stable);
+extern int objio_read_pagelist(struct nfs_read_data *rdata);
+extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
/*
* callback API
*/
-extern void objlayout_io_set_result(struct objlayout_io_state *state,
+extern void objlayout_io_set_result(struct objlayout_io_res *oir,
unsigned index, struct pnfs_osd_objid *pooid,
int osd_error, u64 offset, u64 length, bool is_write);
static inline void
-objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
{
- struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
-
/* If one of the I/Os errored out and the delta_space_used was
* invalid we render the complete report as invalid. Protocol mandate
* the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
spin_unlock(&objlay->lock);
}
-extern void objlayout_read_done(struct objlayout_io_state *state,
+extern void objlayout_read_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
-extern void objlayout_write_done(struct objlayout_io_state *state,
+extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f4cc1e2bfc54..62f3b9074e84 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/exportfs.h>
-#include <linux/nfsd/syscall.h>
#include <net/ipv6.h>
#include "nfsd.h"
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref)
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
- kfree(exp->ex_pathname);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp);
}
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
- err = -ENOMEM;
- exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
- if (!exp.ex_pathname)
- goto out2;
-
/* expiry */
err = -EINVAL;
exp.h.expiry_time = get_expiry(&mesg);
@@ -613,8 +606,6 @@ out4:
nfsd4_fslocs_free(&exp.ex_fslocs);
kfree(exp.ex_uuid);
out3:
- kfree(exp.ex_pathname);
-out2:
path_put(&exp.ex_path);
out1:
auth_domain_put(dom);
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_client = item->ex_client;
new->ex_path.dentry = dget(item->ex_path.dentry);
new->ex_path.mnt = mntget(item->ex_path.mnt);
- new->ex_pathname = NULL;
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
new->ex_fsid = item->ex_fsid;
new->ex_uuid = item->ex_uuid;
item->ex_uuid = NULL;
- new->ex_pathname = item->ex_pathname;
- item->ex_pathname = NULL;
new->ex_fslocs.locations = item->ex_fslocs.locations;
item->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
@@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
return exp;
}
-static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
{
u32 fsidv[2];
@@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
struct svc_export *exp;
__be32 rv;
- exp = find_fsidzero_export(rqstp);
+ exp = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4edf0ece..7748d6a18d97 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,8 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
__be32 *p;
encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
- encode_stateid4(xdr, &dp->dl_stateid);
+ encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
p = xdr_reserve_space(xdr, 4);
*p++ = xdr_zero; /* truncate */
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
*/
status = 0;
out:
+ if (status)
+ nfsd4_mark_cb_fault(cb->cb_clp, status);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
warn_no_callback_path(clp, reason);
}
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+ clp->cl_cb_state = NFSD4_CB_FAULT;
+ warn_no_callback_path(clp, reason);
+}
+
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
@@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
dprintk("%s: minorversion=%d\n", __func__,
clp->cl_minorversion);
@@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
nfsd4_cb_done(task, calldata);
@@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
void nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfsd4_callback *cb = &dp->dl_recall;
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
dp->dl_retries = 1;
cb->cb_op = dp;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e80777666618..fa383361bc61 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@
#include <linux/file.h>
#include <linux/slab.h>
+#include "idmap.h"
#include "cache.h"
#include "xdr4.h"
#include "vfs.h"
@@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
return nfserr_inval;
+ accmode |= NFSD_MAY_READ_IF_EXEC;
+
if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
accmode |= NFSD_MAY_READ;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
@@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
return status;
}
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+ umode_t mode = fh->fh_dentry->d_inode->i_mode;
+
+ if (S_ISREG(mode))
+ return nfs_ok;
+ if (S_ISDIR(mode))
+ return nfserr_isdir;
+ /*
+ * Using err_symlink as our catch-all case may look odd; but
+ * there's no other obvious error for this case in 4.0, and we
+ * happen to know that it will cause the linux v4 client to do
+ * the right thing on attempts to open something other than a
+ * regular file.
+ */
+ return nfserr_symlink;
+}
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct svc_fh resfh;
__be32 status;
- int created = 0;
fh_init(&resfh, NFS4_FHSIZE);
open->op_truncate = 0;
@@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
open->op_fname.len, &open->op_iattr,
&resfh, open->op_createmode,
(u32 *)open->op_verf.data,
- &open->op_truncate, &created);
+ &open->op_truncate, &open->op_created);
/*
* Following rfc 3530 14.2.16, use the returned bitmask
@@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
status = nfsd_lookup(rqstp, current_fh,
open->op_fname.data, open->op_fname.len, &resfh);
fh_unlock(current_fh);
+ if (status)
+ goto out;
+ status = nfsd_check_obj_isreg(&resfh);
}
if (status)
goto out;
@@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
fh_dup2(current_fh, &resfh);
/* set reply cache */
- fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&resfh.fh_handle);
- if (!created)
+ if (!open->op_created)
status = do_open_permission(rqstp, current_fh, open,
NFSD_MAY_NOP);
@@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
/* set replay cache */
- fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&current_fh->fh_handle);
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
@@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_compoundres *resp;
- dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
+ dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
- open->op_stateowner);
+ open->op_openowner);
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
+ /* We don't yet support WANT bits: */
+ open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
+
+ open->op_created = 0;
/*
* RFC5661 18.51.3
* Before RECLAIM_COMPLETE done, server should deny new lock
@@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
resp = rqstp->rq_resp;
status = nfsd4_process_open1(&resp->cstate, open);
if (status == nfserr_replay_me) {
- struct nfs4_replay *rp = &open->op_stateowner->so_replay;
+ struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
fh_copy_shallow(&cstate->current_fh.fh_handle,
&rp->rp_openfh);
@@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
case NFS4_OPEN_CLAIM_NULL:
- /*
- * (1) set CURRENT_FH to the file being opened,
- * creating it if necessary, (2) set open->op_cinfo,
- * (3) set open->op_truncate if the file is to be
- * truncated after opening, (4) do permission checking.
- */
status = do_open_lookup(rqstp, &cstate->current_fh,
open);
if (status)
goto out;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- open->op_stateowner->so_confirmed = 1;
- /*
- * The CURRENT_FH is already set to the file being
- * opened. (1) set open->op_cinfo, (2) set
- * open->op_truncate if the file is to be truncated
- * after opening, (3) do permission checking.
- */
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, &cstate->current_fh,
open);
if (status)
goto out;
break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
dprintk("NFSD: unsupported OPEN claim type %d\n",
open->op_claim_type);
status = nfserr_notsupp;
@@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* set, (2) sets open->op_stateid, (3) sets open->op_delegation.
*/
status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+ WARN_ON(status && open->op_created);
out:
- if (open->op_stateowner) {
- nfs4_get_stateowner(open->op_stateowner);
- cstate->replay_owner = open->op_stateowner;
- }
- nfs4_unlock_state();
+ nfsd4_cleanup_open_state(open, status);
+ if (open->op_openowner)
+ cstate->replay_owner = &open->op_openowner->oo_owner;
+ else
+ nfs4_unlock_state();
return status;
}
@@ -467,17 +486,12 @@ static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
- __be32 status;
-
u32 *p = (u32 *)commit->co_verf.data;
*p++ = nfssvc_boot.tv_sec;
*p++ = nfssvc_boot.tv_usec;
- status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
- if (status == nfserr_symlink)
- status = nfserr_inval;
- return status;
}
static __be32
@@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
NFSD_MAY_CREATE);
- if (status == nfserr_symlink)
- status = nfserr_notdir;
if (status)
return status;
@@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
- if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
+ if ((cookie == 1) || (cookie == 2) ||
(cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
return nfserr_bad_cookie;
@@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
- if (status == nfserr_symlink)
- return nfserr_notdir;
if (!status) {
fh_unlock(&cstate->current_fh);
set_change_info(&remove->rm_cinfo, &cstate->current_fh);
@@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
(S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
status = nfserr_exist;
- else if (status == nfserr_symlink)
- status = nfserr_notdir;
if (!status) {
set_change_info(&rename->rn_sinfo, &cstate->current_fh);
@@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_bytes_written = cnt;
- if (status == nfserr_symlink)
- status = nfserr_inval;
return status;
}
@@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
count = 4 + (verify->ve_attrlen >> 2);
buf = kmalloc(count << 2, GFP_KERNEL);
if (!buf)
- return nfserr_resource;
+ return nfserr_jukebox;
status = nfsd4_encode_fattr(&cstate->current_fh,
cstate->current_fh.fh_export,
@@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
void *);
+typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+
enum nfsd4_op_flags {
ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
@@ -1001,13 +1009,15 @@ enum nfsd4_op_flags {
/* For rfc 5661 section 2.6.3.1.1: */
OP_HANDLES_WRONGSEC = 1 << 3,
OP_IS_PUTFH_LIKE = 1 << 4,
-};
-
-struct nfsd4_operation {
- nfsd4op_func op_func;
- u32 op_flags;
- char *op_name;
/*
+ * These are the ops whose result size we estimate before
+ * encoding, to avoid performing an op then not being able to
+ * respond or cache a response. This includes writes and setattrs
+ * as well as the operations usually called "nonidempotent":
+ */
+ OP_MODIFIES_SOMETHING = 1 << 5,
+ /*
+ * Cache compounds containing these ops in the xid-based drc:
* We use the DRC for compounds containing non-idempotent
* operations, *except* those that are 4.1-specific (since
* sessions provide their own EOS), and except for stateful
@@ -1015,7 +1025,15 @@ struct nfsd4_operation {
* (since sequence numbers provide EOS for open, lock, etc in
* the v4.0 case).
*/
- bool op_cacheresult;
+ OP_CACHEME = 1 << 6,
+};
+
+struct nfsd4_operation {
+ nfsd4op_func op_func;
+ u32 op_flags;
+ char *op_name;
+ /* Try to get response size before operation */
+ nfsd4op_rsize op_rsize_bop;
};
static struct nfsd4_operation nfsd4_ops[];
@@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
bool nfsd4_cache_this_op(struct nfsd4_op *op)
{
- return OPDESC(op)->op_cacheresult;
+ return OPDESC(op)->op_flags & OP_CACHEME;
}
static bool need_wrongsec_check(struct svc_rqst *rqstp)
@@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_operation *opdesc;
struct nfsd4_compound_state *cstate = &resp->cstate;
int slack_bytes;
+ u32 plen = 0;
__be32 status;
resp->xbuf = &rqstp->rq_res;
@@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
+ /* If op is non-idempotent */
+ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+ plen = opdesc->op_rsize_bop(rqstp, op);
+ op->status = nfsd4_check_resp_size(resp, plen);
+ }
+
+ if (op->status)
+ goto encode_op;
+
if (opdesc->op_func)
op->status = opdesc->op_func(rqstp, cstate, &op->u);
else
@@ -1217,7 +1245,7 @@ encode_op:
be32_to_cpu(status));
if (cstate->replay_owner) {
- nfs4_put_stateowner(cstate->replay_owner);
+ nfs4_unlock_state();
cstate->replay_owner = NULL;
}
/* XXX Ugh, we need to get rid of this kind of special case: */
@@ -1238,6 +1266,144 @@ out:
return status;
}
+#define op_encode_hdr_size (2)
+#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define op_encode_change_info_maxsz (5)
+#define nfs4_fattr_bitmap_maxsz (4)
+
+#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
+
+#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+
+#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz)
+#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \
+ op_encode_ace_maxsz)
+
+#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
+
+static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
+}
+
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_stateid_maxsz
+ + op_encode_change_info_maxsz + 1
+ + nfs4_fattr_bitmap_maxsz
+ + op_encode_delegation_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = 0, rlen = 0;
+
+ maxcount = svc_max_payload(rqstp);
+ rlen = op->u.read.rd_length;
+
+ if (rlen > maxcount)
+ rlen = maxcount;
+
+ return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 rlen = op->u.readdir.rd_maxcount;
+
+ if (rlen > PAGE_SIZE)
+ rlen = PAGE_SIZE;
+
+ return (op_encode_hdr_size + op_encode_verifier_maxsz)
+ * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + op_encode_change_info_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
+ 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\
+ 2 + /*eir_server_owner.so_minor_id */\
+ /* eir_server_owner.so_major_id<> */\
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+ /* eir_server_scope<> */\
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+ 1 + /* eir_server_impl_id array length */\
+ 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
+ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
+ 2 + /* csr_sequence, csr_flags */\
+ op_encode_channel_attrs_maxsz + \
+ op_encode_channel_attrs_maxsz) * sizeof(__be32);
+}
+
static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
@@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_CLOSE] = {
.op_func = (nfsd4op_func)nfsd4_close,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_CLOSE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_COMMIT] = {
.op_func = (nfsd4op_func)nfsd4_commit,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_COMMIT",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
},
[OP_CREATE] = {
.op_func = (nfsd4op_func)nfsd4_create,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_CREATE",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
},
[OP_DELEGRETURN] = {
.op_func = (nfsd4op_func)nfsd4_delegreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_DELEGRETURN",
+ .op_rsize_bop = nfsd4_only_status_rsize,
},
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LINK] = {
.op_func = (nfsd4op_func)nfsd4_link,
+ .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+ | OP_CACHEME,
.op_name = "OP_LINK",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
},
[OP_LOCK] = {
.op_func = (nfsd4op_func)nfsd4_lock,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCK",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
},
[OP_LOCKT] = {
.op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCKU] = {
.op_func = (nfsd4op_func)nfsd4_locku,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCKU",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
@@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
},
[OP_OPEN_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_open_confirm,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_CONFIRM",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_OPEN_DOWNGRADE] = {
.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_DOWNGRADE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTPUBFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTROOTFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_READ] = {
.op_func = (nfsd4op_func)nfsd4_read,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READ",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READDIR",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
},
[OP_READLINK] = {
.op_func = (nfsd4op_func)nfsd4_readlink,
@@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_REMOVE] = {
.op_func = (nfsd4op_func)nfsd4_remove,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_REMOVE",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
},
[OP_RENAME] = {
- .op_name = "OP_RENAME",
.op_func = (nfsd4op_func)nfsd4_rename,
- .op_cacheresult = true,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_RENAME",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
},
[OP_RENEW] = {
.op_func = (nfsd4op_func)nfsd4_renew,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_RENEW",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+
},
[OP_RESTOREFH] = {
.op_func = (nfsd4op_func)nfsd4_restorefh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_RESTOREFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SAVEFH] = {
.op_func = (nfsd4op_func)nfsd4_savefh,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_SAVEFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SECINFO] = {
.op_func = (nfsd4op_func)nfsd4_secinfo,
@@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_SETATTR] = {
.op_func = (nfsd4op_func)nfsd4_setattr,
.op_name = "OP_SETATTR",
- .op_cacheresult = true,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
},
[OP_SETCLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_setclientid,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_SETCLIENTID",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
},
[OP_SETCLIENTID_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_SETCLIENTID_CONFIRM",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_VERIFY] = {
.op_func = (nfsd4op_func)nfsd4_verify,
@@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_WRITE] = {
.op_func = (nfsd4op_func)nfsd4_write,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_WRITE",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
},
[OP_RELEASE_LOCKOWNER] = {
.op_func = (nfsd4op_func)nfsd4_release_lockowner,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_RELEASE_LOCKOWNER",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
/* NFSv4.1 operations */
[OP_EXCHANGE_ID] = {
.op_func = (nfsd4op_func)nfsd4_exchange_id,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_EXCHANGE_ID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
},
[OP_BIND_CONN_TO_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_BIND_CONN_TO_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
},
[OP_CREATE_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_create_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_CREATE_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
},
[OP_DESTROY_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_destroy_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_DESTROY_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SEQUENCE] = {
.op_func = (nfsd4op_func)nfsd4_sequence,
@@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_SEQUENCE",
},
[OP_DESTROY_CLIENTID] = {
- .op_func = NULL,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_func = (nfsd4op_func)nfsd4_destroy_clientid,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_DESTROY_CLIENTID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_RECLAIM_COMPLETE] = {
.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
- .op_flags = ALLOWED_WITHOUT_FH,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_RECLAIM_COMPLETE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SECINFO_NO_NAME] = {
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
@@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_FREE_STATEID] = {
.op_func = (nfsd4op_func)nfsd4_free_stateid,
- .op_flags = ALLOWED_WITHOUT_FH,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_FREE_STATEID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 29d77f60585b..ed083b9a731b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@
/* Globals */
static struct file *rec_file;
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
struct xdr_netobj cksum;
struct hash_desc desc;
struct scatterlist sg;
- __be32 status = nfserr_resource;
+ __be32 status = nfserr_jukebox;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
if (!rec_file || clp->cl_firststate)
return 0;
+ clp->cl_firststate = 1;
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
goto out_unlock;
}
status = -EEXIST;
- if (dentry->d_inode) {
- dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+ if (dentry->d_inode)
goto out_put;
- }
status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out_put;
@@ -156,12 +156,14 @@ out_put:
dput(dentry);
out_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
- if (status == 0) {
- clp->cl_firststate = 1;
+ if (status == 0)
vfs_fsync(rec_file, 0);
- }
+ else
+ printk(KERN_ERR "NFSD: failed to write recovery record"
+ " (err %d); please check that %s exists"
+ " and is writeable", status,
+ user_recovery_dirname);
nfs4_reset_creds(original_cred);
- dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
return status;
}
@@ -354,13 +356,13 @@ nfsd4_recdir_load(void) {
*/
void
-nfsd4_init_recdir(char *rec_dirname)
+nfsd4_init_recdir()
{
const struct cred *original_cred;
int status;
printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
- rec_dirname);
+ user_recovery_dirname);
BUG_ON(rec_file);
@@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname)
return;
}
- rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+ rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
if (IS_ERR(rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
- rec_dirname);
+ user_recovery_dirname);
rec_file = NULL;
}
@@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void)
fput(rec_file);
rec_file = NULL;
}
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+ int status;
+ struct path path;
+
+ status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+ if (status)
+ return status;
+ status = -ENOTDIR;
+ if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+ strcpy(user_recovery_dirname, recdir);
+ status = 0;
+ }
+ path_put(&path);
+ return status;
+}
+
+char *
+nfs4_recoverydir(void)
+{
+ return user_recovery_dirname;
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3787ec117400..47e94e33a975 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,9 +49,6 @@
time_t nfsd4_lease = 90; /* default lease time */
time_t nfsd4_grace = 90;
static time_t boot_time;
-static u32 current_ownerid = 1;
-static u32 current_fileid = 1;
-static u32 current_delegid = 1;
static stateid_t zerostateid; /* bits all 0 */
static stateid_t onestateid; /* bits all 1 */
static u64 current_sessionid = 1;
@@ -60,13 +57,7 @@ static u64 current_sessionid = 1;
#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
/* forward declarations */
-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
-static struct nfs4_stateid * search_for_stateid(stateid_t *stid);
-static struct nfs4_delegation * search_for_delegation(stateid_t *stid);
-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static void nfs4_set_recdir(char *recdir);
-static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner);
+static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
/* Locking: */
@@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex);
*/
static DEFINE_SPINLOCK(recall_lock);
-static struct kmem_cache *stateowner_slab = NULL;
+static struct kmem_cache *openowner_slab = NULL;
+static struct kmem_cache *lockowner_slab = NULL;
static struct kmem_cache *file_slab = NULL;
static struct kmem_cache *stateid_slab = NULL;
static struct kmem_cache *deleg_slab = NULL;
@@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes)
static struct list_head del_recall_lru;
+static void nfsd4_free_file(struct nfs4_file *f)
+{
+ kmem_cache_free(file_slab, f);
+}
+
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
@@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi)
list_del(&fi->fi_hash);
spin_unlock(&recall_lock);
iput(fi->fi_inode);
- kmem_cache_free(file_slab, fi);
+ nfsd4_free_file(fi);
}
}
@@ -136,35 +133,33 @@ unsigned int max_delegations;
* Open owner state (share locks)
*/
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS 8
-#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
+/* hash tables for open owners */
+#define OPEN_OWNER_HASH_BITS 8
+#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS)
+#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1)
-#define ownerid_hashval(id) \
- ((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
- (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+{
+ unsigned int ret;
-static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
+ ret = opaque_hashval(ownername->data, ownername->len);
+ ret += clientid;
+ return ret & OPEN_OWNER_HASH_MASK;
+}
+
+static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS 10
-#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
- hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id) \
- (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+static unsigned int file_hashval(struct inode *ino)
+{
+ /* XXX: why are we hashing on inode pointer, anyway? */
+ return hash_ptr(ino, FILE_HASH_BITS);
+}
static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
@@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
{
if (atomic_dec_and_test(&fp->fi_access[oflag])) {
- nfs4_file_put_fd(fp, O_RDWR);
nfs4_file_put_fd(fp, oflag);
+ /*
+ * It's also safe to get rid of the RDWR open *if*
+ * we no longer have need of the other kind of access
+ * or if we already have the other kind of open:
+ */
+ if (fp->fi_fds[1-oflag]
+ || atomic_read(&fp->fi_access[1 - oflag]) == 0)
+ nfs4_file_put_fd(fp, O_RDWR);
}
}
@@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
__nfs4_file_put_access(fp, oflag);
}
+static inline int get_new_stid(struct nfs4_stid *stid)
+{
+ static int min_stateid = 0;
+ struct idr *stateids = &stid->sc_client->cl_stateids;
+ int new_stid;
+ int error;
+
+ error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
+ /*
+ * Note: the necessary preallocation was done in
+ * nfs4_alloc_stateid(). The idr code caps the number of
+ * preallocations that can exist at a time, but the state lock
+ * prevents anyone from using ours before we get here:
+ */
+ BUG_ON(error);
+ /*
+ * It shouldn't be a problem to reuse an opaque stateid value.
+ * I don't think it is for 4.1. But with 4.0 I worry that, for
+ * example, a stray write retransmission could be accepted by
+ * the server when it should have been rejected. Therefore,
+ * adopt a trick from the sctp code to attempt to maximize the
+ * amount of time until an id is reused, by ensuring they always
+ * "increase" (mod INT_MAX):
+ */
+
+ min_stateid = new_stid+1;
+ if (min_stateid == INT_MAX)
+ min_stateid = 0;
+ return new_stid;
+}
+
+static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+{
+ stateid_t *s = &stid->sc_stateid;
+ int new_id;
+
+ stid->sc_type = type;
+ stid->sc_client = cl;
+ s->si_opaque.so_clid = cl->cl_clientid;
+ new_id = get_new_stid(stid);
+ s->si_opaque.so_id = (u32)new_id;
+ /* Will be incremented before return to client: */
+ s->si_generation = 0;
+}
+
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
+{
+ struct idr *stateids = &cl->cl_stateids;
+
+ if (!idr_pre_get(stateids, GFP_KERNEL))
+ return NULL;
+ /*
+ * Note: if we fail here (or any time between now and the time
+ * we actually get the new idr), we won't need to undo the idr
+ * preallocation, since the idr code caps the number of
+ * preallocated entries.
+ */
+ return kmem_cache_alloc(slab, GFP_KERNEL);
+}
+
+static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
+{
+ return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+}
+
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
{
struct nfs4_delegation *dp;
struct nfs4_file *fp = stp->st_file;
@@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
return NULL;
if (num_delegations > max_delegations)
return NULL;
- dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
+ dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
+ init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+ /*
+ * delegation seqid's are never incremented. The 4.1 special
+ * meaning of seqid 0 isn't meaningful, really, but let's avoid
+ * 0 anyway just for consistency and use 1:
+ */
+ dp->dl_stid.sc_stateid.si_generation = 1;
num_delegations++;
INIT_LIST_HEAD(&dp->dl_perfile);
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
- dp->dl_client = clp;
get_nfs4_file(fp);
dp->dl_file = fp;
dp->dl_type = type;
- dp->dl_stateid.si_boot = boot_time;
- dp->dl_stateid.si_stateownerid = current_delegid++;
- dp->dl_stateid.si_fileid = 0;
- dp->dl_stateid.si_generation = 0;
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
@@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
}
}
+static void unhash_stid(struct nfs4_stid *s)
+{
+ struct idr *stateids = &s->sc_client->cl_stateids;
+
+ idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+}
+
/* Called under the state lock. */
static void
unhash_delegation(struct nfs4_delegation *dp)
{
+ unhash_stid(&dp->dl_stid);
list_del_init(&dp->dl_perclnt);
spin_lock(&recall_lock);
list_del_init(&dp->dl_perfile);
@@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock);
#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-#define clientid_hashval(id) \
- ((id) & CLIENT_HASH_MASK)
-#define clientstr_hashval(name) \
- (opaque_hashval((name), 8) & CLIENT_HASH_MASK)
+static unsigned int clientid_hashval(u32 id)
+{
+ return id & CLIENT_HASH_MASK;
+}
+
+static unsigned int clientstr_hashval(const char *name)
+{
+ return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+}
+
/*
* reclaim_str_hashtbl[] holds known client info from previous reset/reboot
* used in reboot/reset lease grace period processing
@@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) {
}
static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
unsigned int access, deny;
set_access(&access, stp->st_access_bmap);
@@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access)
BUG();
}
-static void unhash_generic_stateid(struct nfs4_stateid *stp)
+static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
{
- list_del(&stp->st_hash);
list_del(&stp->st_perfile);
list_del(&stp->st_perstateowner);
}
-static void free_generic_stateid(struct nfs4_stateid *stp)
+static void close_generic_stateid(struct nfs4_ol_stateid *stp)
{
int i;
@@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
if (test_bit(i, &stp->st_access_bmap))
nfs4_file_put_access(stp->st_file,
nfs4_access_to_omode(i));
+ __clear_bit(i, &stp->st_access_bmap);
}
}
put_nfs4_file(stp->st_file);
+ stp->st_file = NULL;
+}
+
+static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+{
kmem_cache_free(stateid_slab, stp);
}
-static void release_lock_stateid(struct nfs4_stateid *stp)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct file *file;
unhash_generic_stateid(stp);
+ unhash_stid(&stp->st_stid);
file = find_any_file(stp->st_file);
if (file)
- locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
+ locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
+ close_generic_stateid(stp);
free_generic_stateid(stp);
}
-static void unhash_lockowner(struct nfs4_stateowner *sop)
+static void unhash_lockowner(struct nfs4_lockowner *lo)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- list_del(&sop->so_perstateid);
- while (!list_empty(&sop->so_stateids)) {
- stp = list_first_entry(&sop->so_stateids,
- struct nfs4_stateid, st_perstateowner);
+ list_del(&lo->lo_owner.so_strhash);
+ list_del(&lo->lo_perstateid);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
release_lock_stateid(stp);
}
}
-static void release_lockowner(struct nfs4_stateowner *sop)
+static void release_lockowner(struct nfs4_lockowner *lo)
{
- unhash_lockowner(sop);
- nfs4_put_stateowner(sop);
+ unhash_lockowner(lo);
+ nfs4_free_lockowner(lo);
}
static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
+release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
{
- struct nfs4_stateowner *lock_sop;
+ struct nfs4_lockowner *lo;
while (!list_empty(&open_stp->st_lockowners)) {
- lock_sop = list_entry(open_stp->st_lockowners.next,
- struct nfs4_stateowner, so_perstateid);
- /* list_del(&open_stp->st_lockowners); */
- BUG_ON(lock_sop->so_is_open_owner);
- release_lockowner(lock_sop);
+ lo = list_entry(open_stp->st_lockowners.next,
+ struct nfs4_lockowner, lo_perstateid);
+ release_lockowner(lo);
}
}
-static void release_open_stateid(struct nfs4_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
{
unhash_generic_stateid(stp);
release_stateid_lockowners(stp);
+ close_generic_stateid(stp);
+}
+
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+ unhash_open_stateid(stp);
+ unhash_stid(&stp->st_stid);
free_generic_stateid(stp);
}
-static void unhash_openowner(struct nfs4_stateowner *sop)
+static void unhash_openowner(struct nfs4_openowner *oo)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- list_del(&sop->so_perclient);
- list_del(&sop->so_perstateid); /* XXX: necessary? */
- while (!list_empty(&sop->so_stateids)) {
- stp = list_first_entry(&sop->so_stateids,
- struct nfs4_stateid, st_perstateowner);
+ list_del(&oo->oo_owner.so_strhash);
+ list_del(&oo->oo_perclient);
+ while (!list_empty(&oo->oo_owner.so_stateids)) {
+ stp = list_first_entry(&oo->oo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
release_open_stateid(stp);
}
}
-static void release_openowner(struct nfs4_stateowner *sop)
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
{
- unhash_openowner(sop);
- list_del(&sop->so_close_lru);
- nfs4_put_stateowner(sop);
+ struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+
+ if (s) {
+ unhash_stid(&s->st_stid);
+ free_generic_stateid(s);
+ oo->oo_last_closed_stid = NULL;
+ }
+}
+
+static void release_openowner(struct nfs4_openowner *oo)
+{
+ unhash_openowner(oo);
+ list_del(&oo->oo_close_lru);
+ release_last_closed_stateid(oo);
+ nfs4_free_openowner(oo);
}