diff options
Diffstat (limited to 'fs')
283 files changed, 6985 insertions, 5853 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 5b6a1743ea17..b3c2cc79c20d 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -276,32 +276,26 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - umode_t mode = inode->i_mode; - retval = posix_acl_equiv_mode(acl, &mode); - if (retval < 0) + struct iattr iattr; + + retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + if (retval) goto err_out; - else { - struct iattr iattr; - if (retval == 0) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - acl = NULL; - value = NULL; - size = 0; - } - /* Updte the mode bits */ - iattr.ia_mode = ((mode & S_IALLUGO) | - (inode->i_mode & ~S_IALLUGO)); - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? + if (!acl) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + value = NULL; + size = 0; } + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? + */ + v9fs_vfs_setattr_dotl(dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8b1999b528e9..aa639bb1f289 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1094,7 +1094,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(d_inode(dentry), iattr); + retval = setattr_prepare(dentry, iattr); if (retval) return retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index d8220efdd752..afaa4b6de801 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -558,7 +558,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(inode, iattr); + retval = setattr_prepare(dentry, iattr); if (retval) return retval; diff --git a/fs/Kconfig b/fs/Kconfig index 2bc7ad775842..4bd03a2b0518 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y + select PERCPU_RWSEM help This option enables standard file locking support, required for filesystems like NFS and for the flock() system @@ -199,6 +200,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config ARCH_HAS_GIGANTIC_PAGE + bool + source "fs/configfs/Kconfig" source "fs/efivarfs/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c7efddf6e038..4c09d93d9569 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,7 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU || M68K + depends on !MMU || ARM || M68K depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 335055d828e4..f57baaa511aa 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -303,7 +303,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid = attr->ia_valid; int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0fdb0f5b2239..1aa243502c7f 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -219,7 +219,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = inode_change_ok(inode,attr); + error = setattr_prepare(dentry, attr); if (error) goto out; diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 7ef637d7f3a5..1e9d2f84e5b5 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -461,8 +461,8 @@ static void afs_callback_updater(struct work_struct *work) */ int __init afs_callback_update_init(void) { - afs_callback_update_worker = - create_singlethread_workqueue("kafs_callbackd"); + afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd", + WQ_MEM_RECLAIM); return afs_callback_update_worker ? 0 : -ENOMEM; } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 85737e96ab8b..2037e7a77a37 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -17,19 +17,12 @@ #include "internal.h" #include "afs_cm.h" -#if 0 -struct workqueue_struct *afs_cm_workqueue; -#endif /* 0 */ - -static int afs_deliver_cb_init_call_back_state(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_init_call_back_state3(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, - struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state(struct afs_call *); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *); +static int afs_deliver_cb_probe(struct afs_call *); +static int afs_deliver_cb_callback(struct afs_call *); +static int afs_deliver_cb_probe_uuid(struct afs_call *); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); /* @@ -134,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call) * received. The step number here must match the final number in * afs_deliver_cb_callback(). */ - if (call->unmarshall == 6) { + if (call->unmarshall == 5) { ASSERT(call->server && call->count && call->request); afs_break_callbacks(call->server, call->count, call->request); } @@ -168,27 +161,27 @@ static void SRXAFSCB_CallBack(struct work_struct *work) /* * deliver request data to a CB.CallBack call */ -static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_callback(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_callback *cb; struct afs_server *server; - struct in_addr addr; __be32 *bp; u32 tmp; int ret, loop; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -205,8 +198,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 2: _debug("extract FID array"); - ret = afs_extract_data(call, skb, last, call->buffer, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, true); if (ret < 0) return ret; @@ -232,7 +225,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* extract the callback array and its count in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -242,13 +235,11 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, return -EBADMSG; call->offset = 0; call->unmarshall++; - if (tmp == 0) - goto empty_cb_array; case 4: _debug("extract CB array"); - ret = afs_extract_data(call, skb, last, call->request, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, false); if (ret < 0) return ret; @@ -261,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, cb->type = ntohl(*bp++); } - empty_cb_array: call->offset = 0; call->unmarshall++; - case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking * work, even if the final ACK isn't received. @@ -278,17 +263,15 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, * updated also. */ call->unmarshall++; - case 6: + case 5: break; } - call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -315,17 +298,17 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) /* * deliver request data to a CB.InitCallBackState call */ -static int afs_deliver_cb_init_call_back_state(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -334,8 +317,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -348,27 +330,68 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* * deliver request data to a CB.InitCallBackState3 call */ -static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + + _enter("{%u}", call->unmarshall); - _enter(",{%u},%d", skb->len, last); + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; - /* There are some arguments that we ignore */ - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + break; + } /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -393,14 +416,13 @@ static void SRXAFSCB_Probe(struct work_struct *work) /* * deliver request data to a CB.Probe call */ -static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -426,7 +448,6 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) _enter(""); - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) reply.match = htonl(0); else @@ -439,19 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) /* * deliver request data to a CB.ProbeUuid call */ -static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe_uuid(struct afs_call *call) { struct afs_uuid *r; unsigned loop; __be32 *b; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -463,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract UUID"); - ret = afs_extract_data(call, skb, last, call->buffer, - 11 * sizeof(__be32)); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -491,16 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 2: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); @@ -574,14 +583,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) /* * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index d91a9c9cfbd0..3191dff2c156 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -36,8 +36,8 @@ static int afs_init_lock_manager(void) if (!afs_lock_manager) { mutex_lock(&afs_lock_manager_mutex); if (!afs_lock_manager) { - afs_lock_manager = - create_singlethread_workqueue("kafs_lockd"); + afs_lock_manager = alloc_workqueue("kafs_lockd", + WQ_MEM_RECLAIM, 0); if (!afs_lock_manager) ret = -ENOMEM; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 9312b92e54be..96f4d764d1a6 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -235,16 +235,15 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -307,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, /* * deliver reply data to an FS.FetchData */ -static int afs_deliver_fs_fetch_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; @@ -316,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, void *buffer; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -332,7 +330,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, * client) */ case 1: _debug("extract data length (MSW)"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -347,7 +345,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the returned data length */ case 2: _debug("extract data length"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -363,10 +361,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page); - ret = afs_extract_data(call, skb, last, buffer, - call->count); - kunmap_atomic(buffer); + buffer = kmap(page); + ret = afs_extract_data(call, buffer, + call->count, true); + kunmap(buffer); if (ret < 0) return ret; } @@ -376,8 +374,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the metadata */ case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - (21 + 3 + 6) * 4); + ret = afs_extract_data(call, call->buffer, + (21 + 3 + 6) * 4, false); if (ret < 0) return ret; @@ -391,18 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page); + buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer); + kunmap(buffer); } _leave(" = 0 [done]"); @@ -515,13 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server, /* * deliver reply data to an FS.GiveUpCallBacks */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + _enter(""); /* shouldn't be any reply data */ - return afs_data_complete(call, skb, last); + return afs_extract_data(call, NULL, 0, false); } /* @@ -599,16 +593,15 @@ int afs_fs_give_up_callbacks(struct afs_server *server, /* * deliver reply data to an FS.CreateFile or an FS.MakeDir */ -static int afs_deliver_fs_create_vnode(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_create_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -696,16 +689,15 @@ int afs_fs_create(struct afs_server *server, /* * deliver reply data to an FS.RemoveFile or FS.RemoveDir */ -static int afs_deliver_fs_remove(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_remove(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -777,16 +769,15 @@ int afs_fs_remove(struct afs_server *server, /* * deliver reply data to an FS.Link */ -static int afs_deliver_fs_link(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_link(struct afs_call *call) { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -863,16 +854,15 @@ int afs_fs_link(struct afs_server *server, /* * deliver reply data to an FS.Symlink */ -static int afs_deliver_fs_symlink(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_symlink(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -968,16 +958,15 @@ int afs_fs_symlink(struct afs_server *server, /* * deliver reply data to an FS.Rename */ -static int afs_deliver_fs_rename(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_rename(struct afs_call *call) { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1072,16 +1061,15 @@ int afs_fs_rename(struct afs_server *server, /* * deliver reply data to an FS.StoreData */ -static int afs_deliver_fs_store_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1251,17 +1239,16 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, /* * deliver reply data to an FS.StoreStatus */ -static int afs_deliver_fs_store_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_status(struct afs_call *call) { afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1443,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, /* * deliver reply data to an FS.GetVolumeStatus */ -static int afs_deliver_fs_get_volume_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -1460,8 +1446,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, skb, last, call->buffer, - 12 * 4); + ret = afs_extract_data(call, call->buffer, + 12 * 4, true); if (ret < 0) return ret; @@ -1472,7 +1458,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1487,8 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 3: _debug("extract volname"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1508,8 +1494,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1519,7 +1505,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1534,8 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 6: _debug("extract offline"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1555,8 +1541,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 7: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1566,7 +1552,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1581,8 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 9: _debug("extract motd"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1595,26 +1581,17 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->unmarshall++; /* extract the message of the day padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_motd_padding; - } - call->count = 4 - (call->count & 3); + call->count = (4 - (call->count & 3)) & 3; case 10: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, false); if (ret < 0) return ret; call->offset = 0; call->unmarshall++; - no_motd_padding: - case 11: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } @@ -1685,15 +1662,14 @@ int afs_fs_get_volume_status(struct afs_server *server, /* * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock */ -static int afs_deliver_fs_xxxx_lock(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index df976b2a7f40..5497c8496055 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -13,13 +13,13 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/skbuff.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> +#include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" @@ -56,7 +56,7 @@ struct afs_mount_params { */ struct afs_wait_mode { /* RxRPC received message notification */ - void (*rx_wakeup)(struct afs_call *call); + rxrpc_notify_rx_t notify_rx; /* synchronous call waiter and call dispatched notification */ int (*wait)(struct afs_call *call); @@ -75,10 +75,8 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ - struct sk_buff_head rx_queue; /* received packets */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ struct afs_server *server; /* server affected by incoming CM call */ @@ -92,6 +90,7 @@ struct afs_call { void *reply4; /* reply buffer (fourth part) */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ + size_t offset; /* offset into received data store */ enum { /* call state */ AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ @@ -99,21 +98,18 @@ struct afs_call { AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ AFS_CALL_REPLYING, /* replying to incoming call */ AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* successfully completed */ - AFS_CALL_BUSY, /* server was busy */ - AFS_CALL_ABORTED, /* call was aborted */ - AFS_CALL_ERROR, /* call failed due to error */ + AFS_CALL_COMPLETE, /* Completed or failed */ } state; int error; /* error code */ + u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ + bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ __be32 operation_ID; /* operation ID for an incoming call */ @@ -128,8 +124,7 @@ struct afs_call_type { /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ - int (*deliver)(struct afs_call *call, struct sk_buff *skb, - bool last); + int (*deliver)(struct afs_call *call); /* map an abort code to an error number */ int (*abort_to_error)(u32 abort_code); @@ -607,29 +602,22 @@ extern void afs_proc_cell_remove(struct afs_cell *); /* * rxrpc.c */ +extern struct socket *afs_socket; + extern int afs_open_socket(void); extern void afs_close_socket(void); -extern void afs_data_consumed(struct afs_call *, struct sk_buff *); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, const struct afs_wait_mode *); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, - size_t); +extern int afs_extract_data(struct afs_call *, void *, size_t, bool); -static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb, - bool last) +static inline int afs_transfer_reply(struct afs_call *call) { - if (skb->len > 0) - return -EBADMSG; - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - return 0; + return afs_extract_data(call, call->buffer, call->reply_max, false); } /* @@ -654,7 +642,7 @@ do { \ extern struct afs_server *afs_lookup_server(struct afs_cell *, const struct in_addr *); -extern struct afs_server *afs_find_server(const struct in_addr *); +extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); extern void afs_put_server(struct afs_server *); extern void __exit afs_purge_servers(void); diff --git a/fs/afs/main.c b/fs/afs/main.c index 35de0c04729f..0b187ef3b5b7 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> +#include <linux/random.h> #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 14d04c848465..59bdaa7527b6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,34 +16,36 @@ #include "internal.h" #include "afs_cm.h" -static struct socket *afs_socket; /* my RxRPC socket */ +struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; -static atomic_t afs_outstanding_skbs; -static void afs_wake_up_call_waiter(struct afs_call *); +static void afs_free_call(struct afs_call *); +static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static int afs_wait_for_call_to_complete(struct afs_call *); -static void afs_wake_up_async_call(struct afs_call *); +static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct afs_call *); -static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); -static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ const struct afs_wait_mode afs_sync_call = { - .rx_wakeup = afs_wake_up_call_waiter, + .notify_rx = afs_wake_up_call_waiter, .wait = afs_wait_for_call_to_complete, }; /* asynchronous call management */ const struct afs_wait_mode afs_async_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, .wait = afs_dont_wait_for_call_to_complete, }; /* asynchronous incoming call management */ static const struct afs_wait_mode afs_async_incoming_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, }; /* asynchronous incoming call initial processing */ @@ -53,17 +55,9 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); -static struct sk_buff_head afs_incoming_calls; -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); - -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(call); -} +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -83,10 +77,8 @@ int afs_open_socket(void) _enter(""); - skb_queue_head_init(&afs_incoming_calls); - ret = -ENOMEM; - afs_async_calls = create_singlethread_workqueue("kafsd"); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); if (!afs_async_calls) goto error_0; @@ -110,13 +102,15 @@ int afs_open_socket(void) if (ret < 0) goto error_2; + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); + ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; - rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); - afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -136,52 +130,28 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); + flush_workqueue(afs_async_calls); sock_release(afs_socket); _debug("dework"); destroy_workqueue(afs_async_calls); - - ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); _leave(""); } /* - * Note that the data in a socket buffer is now consumed. - */ -void afs_data_consumed(struct afs_call *call, struct sk_buff *skb) -{ - if (!skb) { - _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("DLVR %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - rxrpc_kernel_data_consumed(call->rxcall, skb); - } -} - -/* - * free a socket buffer - */ -static void afs_free_skb(struct sk_buff *skb) -{ - if (!skb) { - _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("FREE %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_free_skb(skb); - } -} - -/* * free a call */ static void afs_free_call(struct afs_call *call) @@ -191,7 +161,6 @@ static void afs_free_call(struct afs_call *call) ASSERTCMP(call->rxcall, ==, NULL); ASSERT(!work_pending(&call->async_work)); - ASSERT(skb_queue_empty(&call->rx_queue)); ASSERT(call->type->name != NULL); kfree(call->request); @@ -207,7 +176,7 @@ static void afs_free_call(struct afs_call *call) static void afs_end_call_nofree(struct afs_call *call) { if (call->rxcall) { - rxrpc_kernel_end_call(call->rxcall); + rxrpc_kernel_end_call(afs_socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) @@ -227,7 +196,7 @@ static void afs_end_call(struct afs_call *call) * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, - size_t request_size, size_t reply_size) + size_t request_size, size_t reply_max) { struct afs_call *call; @@ -241,7 +210,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, call->type = type; call->request_size = request_size; - call->reply_max = reply_size; + call->reply_max = reply_max; if (request_size) { call->request = kmalloc(request_size, GFP_NOFS); @@ -249,14 +218,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, goto nomem_free; } - if (reply_size) { - call->buffer = kmalloc(reply_size, GFP_NOFS); + if (reply_max) { + call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); return call; nomem_free: @@ -325,8 +293,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, * returns from sending the request */ if (first + loop >= last) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(call->rxcall, msg, - to - offset); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, to - offset); kunmap(pages[loop]); if (ret < 0) break; @@ -354,7 +322,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; - struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -366,8 +333,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -380,7 +346,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp); + (unsigned long) call, gfp, + wait_mode->notify_rx); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -406,7 +373,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, * request */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + ret = rxrpc_kernel_send_data(afs_socket, rxcall, + &msg, call->request_size); if (ret < 0) goto error_do_abort; @@ -421,9 +389,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -431,140 +397,77 @@ error_kill_call: } /* - * Handles intercepted messages that were arriving in the socket's Rx queue. - * - * Called from the AF_RXRPC call processor in waitqueue process context. For - * each call, it is guaranteed this will be called in order of packet to be - * delivered. - */ -static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, - struct sk_buff *skb) -{ - struct afs_call *call = (struct afs_call *) user_call_ID; - - _enter("%p,,%u", call, skb->mark); - - _debug("ICPT %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - - ASSERTCMP(sk, ==, afs_socket->sk); - atomic_inc(&afs_outstanding_skbs); - - if (!call) { - /* its an incoming call for our callback service */ - skb_queue_tail(&afs_incoming_calls, skb); - queue_work(afs_wq, &afs_collect_incoming_call_work); - } else { - /* route the messages directly to the appropriate call */ - skb_queue_tail(&call->rx_queue, skb); - call->wait_mode->rx_wakeup(call); - } - - _leave(""); -} - -/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { - struct sk_buff *skb; - bool last; u32 abort_code; int ret; - _enter(""); - - while ((call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK) && - (skb = skb_dequeue(&call->rx_queue))) { - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - _debug("Rcv DATA"); - last = rxrpc_kernel_is_data_last(skb); - ret = call->type->deliver(call, skb, last); - switch (ret) { - case -EAGAIN: - if (last) { - _debug("short data"); - goto unmarshal_error; - } - break; - case 0: - ASSERT(last); - if (call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; - break; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - goto do_abort; - case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; - goto do_abort; - default: - unmarshal_error: - abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) - abort_code = RXGEN_SS_UNMARSHAL; - do_abort: - rxrpc_kernel_abort_call(call->rxcall, - abort_code); - call->error = ret; - call->state = AFS_CALL_ERROR; - break; + _enter("%s", call->type->name); + + while (call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK + ) { + if (call->state == AFS_CALL_AWAIT_ACK) { + size_t offset = 0; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + NULL, 0, &offset, false, + &call->abort_code); + if (ret == -EINPROGRESS || ret == -EAGAIN) + return; + if (ret == 1) { + call->state = AFS_CALL_COMPLETE; + goto done; } - break; - case RXRPC_SKB_MARK_FINAL_ACK: - _debug("Rcv ACK"); - call->state = AFS_CALL_COMPLETE; - break; - case RXRPC_SKB_MARK_BUSY: - _debug("Rcv BUSY"); - call->error = -EBUSY; - call->state = AFS_CALL_BUSY; - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Rcv ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Loc ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_NET_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv NET ERROR %d", call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv LOCAL ERROR %d", call->error); - break; - default: - BUG(); - break; + return; } - afs_free_skb(skb); - } - - /* make sure the queue is empty if the call is done with (we might have - * aborted the call early because of an unmarshalling error) */ - if (call->state >= AFS_CALL_COMPLETE) { - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - if (call->incoming) - afs_end_call(call); + ret = call->type->deliver(call); + switch (ret) { + case 0: + if (call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + goto done; + case -EINPROGRESS: + case -EAGAIN: + goto out; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KNC"); + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KIV"); + goto do_abort; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, EBADMSG, "KUM"); + goto do_abort; + } } +done: + if (call->state == AFS_CALL_COMPLETE && call->incoming) + afs_end_call(call); +out: _leave(""); + return; + +do_abort: + call->error = ret; + call->state = AFS_CALL_COMPLETE; + goto done; } /* @@ -572,7 +475,7 @@ static void afs_deliver_to_call(struct afs_call *call) */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - struct sk_buff *skb; + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -584,15 +487,18 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) set_current_state(TASK_INTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (!skb_queue_empty(&call->rx_queue)) { + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } + abort_why = "KWC"; ret = call->error; - if (call->state >= AFS_CALL_COMPLETE) + if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -605,9 +511,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* kill the call */ if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); - rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -619,17 +524,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* * wake up a waiting call */ -static void afs_wake_up_call_waiter(struct afs_call *call) +static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; wake_up(&call->waitq); } /* * wake up an asynchronous call */ -static void afs_wake_up_async_call(struct afs_call *call) +static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - _enter(""); + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; queue_work(afs_async_calls, &call->async_work); } @@ -647,8 +559,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct afs_call *call) +static void afs_delete_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); afs_free_call(call); @@ -658,17 +572,19 @@ static void afs_delete_async_call(struct afs_call *call) /* * perform processing on an asynchronous call - * - on a multiple-thread workqueue this work item may try to run on several - * CPUs at the same time */ -static void afs_process_async_call(struct afs_call *call) +static void afs_process_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); - if (!skb_queue_empty(&call->rx_queue)) + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; afs_deliver_to_call(call); + } - if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->state == AFS_CALL_COMPLETE && call->wait_mode) { if (call->wait_mode->async_complete) call->wait_mode->async_complete(call->reply, call->error); @@ -679,122 +595,93 @@ static void afs_process_async_call(struct afs_call *call) /* we can't just delete the call because the work item may be * queued */ - call->async_workfn = afs_delete_async_call; + call->async_work.func = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } _leave(""); } -/* - * Empty a socket buffer into a flat reply buffer. - */ -int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last) +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { - size_t len = skb->len; - - if (len > call->reply_max - call->reply_size) { - _leave(" = -EBADMSG [%zu > %u]", - len, call->reply_max - call->reply_size); - return -EBADMSG; - } + struct afs_call *call = (struct afs_call *)user_call_ID; - if (len > 0) { - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, - len) < 0) - BUG(); - call->reply_size += len; - } - - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } - return 0; + call->rxcall = rxcall; } /* - * accept the backlog of incoming calls + * Charge the incoming call preallocation. */ -static void afs_collect_incoming_call(struct work_struct *work) +static void afs_charge_preallocation(struct work_struct *work) { - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&afs_incoming_calls))) { - _debug("new call"); - - /* don't need the notification */ - afs_free_skb(skb); + struct afs_call *call = afs_spare_incoming_call; + for (;;) { if (!call) { call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } + if (!call) + break; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); } - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long) call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call = NULL; - } + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; } + afs_spare_incoming_call = call; +} + +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; - if (call) - afs_free_call(call); + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); +} + +/* + * Notification of an incoming call. + */ +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + atomic_inc(&afs_outstanding_calls); + queue_work(afs_wq, &afs_charge_preallocation_work); } /* * Grab the operation ID from an incoming cache manager call. The socket * buffer is discarded on error or if we don't yet have sufficient data. */ -static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cm_op_id(struct afs_call *call) { - size_t len = skb->len; - void *oibuf = (void *) &call->operation_ID; + int ret; - _enter("{%u},{%zu},%d", call->offset, len, last); + _enter("{%zu}", call->offset); ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - len = min_t(size_t, len, 4 - call->offset); - if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) - BUG(); - if (!pskb_pull(skb, len)) - BUG(); - call->offset += len; - - if (call->offset < 4) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; - } + ret = afs_extract_data(call, &call->operation_ID, 4, true); + if (ret < 0) + return ret; call->state = AFS_CALL_AWAIT_REQUEST; + call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -803,7 +690,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, /* pass responsibility for the remainer of this message off to the * cache manager op */ - return call->type->deliver(call, skb, last); + return call->type->deliver(call); } /* @@ -823,14 +710,15 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -859,7 +747,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); if (n >= 0) { /* Success */ _leave(" [replied]"); @@ -868,7 +756,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); @@ -877,25 +766,40 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) /* * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, struct sk_buff *skb, - bool last, void *buf, size_t count) +int afs_extract_data(struct afs_call *call, void *buf, size_t count, + bool want_more) { - size_t len = skb->len; + int ret; - _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + _enter("{%s,%zu},,%zu,%d", + call->type->name, call->offset, count, want_more); - ASSERTCMP(call->offset, <, count); + ASSERTCMP(call->offset, <=, count); - len = min_t(size_t, len, count - call->offset); - if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || - !pskb_pull(skb, len)) - BUG(); - call->offset += len; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + buf, count, &call->offset, + want_more, &call->abort_code); + if (ret == 0 || ret == -EAGAIN) + return ret; - if (call->offset < count) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; + if (ret == 1) { + switch (call->state) { + case AFS_CALL_AWAIT_REPLY: + call->state = AFS_CALL_COMPLETE; + break; + case AFS_CALL_AWAIT_REQUEST: + call->state = AFS_CALL_REPLYING; + break; + default: + break; + } + return 0; } - return 0; + + if (ret == -ECONNABORTED) + call->error = call->type->abort_to_error(call->abort_code); + else + call->error = ret; + call->state = AFS_CALL_COMPLETE; + return ret; } diff --git a/fs/afs/server.c b/fs/afs/server.c index f342acf3547d..d4066ab7dd55 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -178,13 +178,18 @@ server_in_two_cells: /* * look up a server by its IP address */ -struct afs_server *afs_find_server(const struct in_addr *_addr) +struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) { struct afs_server *server = NULL; struct rb_node *p; - struct in_addr addr = *_addr; + struct in_addr addr = srx->transport.sin.sin_addr; - _enter("%pI4", &addr.s_addr); + _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + + if (srx->transport.family != AF_INET) { + WARN(true, "AFS does not yes support non-IPv4 addresses\n"); + return NULL; + } read_lock(&afs_servers_lock); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index f94d1abdc3eb..94bcd97d22b8 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -58,17 +58,16 @@ static int afs_vl_abort_to_error(u32 abort_code) /* * deliver reply data to a VL.GetEntryByXXX call */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) { struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; int loop, ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 52976785a32c..45a86396fd2d 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -594,8 +594,8 @@ static void afs_vlocation_reaper(struct work_struct *work) */ int __init afs_vlocation_update_init(void) { - afs_vlocation_update_worker = - create_singlethread_workqueue("kafs_vlupdated"); + afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated", + WQ_MEM_RECLAIM, 0); return afs_vlocation_update_worker ? 0 : -ENOMEM; } @@ -274,14 +274,17 @@ __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; + struct address_space *i_mapping; + if (aio_ring_file) { truncate_setsize(aio_ring_file->f_inode, 0); /* Prevent further access to the kioctx from migratepages */ - spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock); - aio_ring_file->f_inode->i_mapping->private_data = NULL; + i_mapping = aio_ring_file->f_inode->i_mapping; + spin_lock(&i_mapping->private_lock); + i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock); + spin_unlock(&i_mapping->private_lock); fput(aio_ring_file); } diff --git a/fs/attr.c b/fs/attr.c index 42bb42bb3c72..a19a64d41e7e 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -17,19 +17,22 @@ #include <linux/ima.h> /** - * inode_change_ok - check if attribute changes to an inode are allowed - * @inode: inode to check + * setattr_prepare - check if attribute changes to a dentry are allowed + * @dentry: dentry to check * @attr: attributes to change * * Check if we are allowed to change the attributes contained in @attr - * in the given inode. This includes the normal unix access permission - * checks, as well as checks for rlimits and others. + * in the given dentry. This includes the normal unix access permission + * checks, as well as checks for rlimits and others. The function also clears + * SGID bit from mode if user is not allowed to set it. Also file capabilities + * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int inode_change_ok(const struct inode *inode, struct iattr *attr) +int setattr_prepare(struct dentry *dentry, struct iattr *attr) { + struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; /* @@ -44,7 +47,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) - return 0; + goto kill_priv; /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && @@ -77,9 +80,19 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) return -EPERM; } +kill_priv: + /* User has permission for the change */ + if (ia_valid & ATTR_KILL_PRIV) { + int error; + + error = security_inode_killpriv(dentry); + if (error) + return error; + } + return 0; } -EXPORT_SYMBOL(inode_change_ok); +EXPORT_SYMBOL(setattr_prepare); /** * inode_newsize_ok - may this inode be truncated to a given size @@ -202,6 +215,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + return error; + } + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ @@ -217,13 +245,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { - attr->ia_valid &= ~ATTR_KILL_PRIV; - ia_valid &= ~ATTR_KILL_PRIV; error = security_inode_need_killpriv(dentry); - if (error > 0) - error = security_inode_killpriv(dentry); - if (error) + if (error < 0) return error; + if (error == 0) + ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV; } /* diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 431fd7ee3488..e44271dfceb6 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_uid(); - wq->gid = current_gid(); + wq->uid = current_real_cred()->uid; + wq->gid = current_real_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7da05b159ade..bfe9f9994935 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -789,7 +789,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * Will be set to real fs blocksize later. * * Linux 2.4.10 and later refuse to read blocks smaller than - * the hardsect size for the device. But we also need to read at + * the logical block size for the device. But we also need to read at * least 1k to get the second 512 bytes of the volume. * -WD 10-26-01 */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e5495f37c6ed..2472af2798c7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } -#ifndef PR_REG_SIZE -#define PR_REG_SIZE(S) sizeof(S) -#endif - #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S) sizeof(S) -#endif - -#ifndef PR_REG_PTR -#define PR_REG_PTR(S) (&((S)->pr_reg)) +#define PRSTATUS_SIZE(S, R) sizeof(S) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; + unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, PR_REG_SIZE(t->prstatus.pr_reg), - PR_REG_PTR(&t->prstatus), NULL); + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - SET_PR_FPVALID(&t->prstatus, 1); + SET_PR_FPVALID(&t->prstatus, + 1, regset_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 08ae99343d92..376e4e426324 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -180,9 +180,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); - if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, blkdev_get_block, - NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -302,14 +299,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); - if (error) { + if (error) bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } out: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; + return error; } EXPORT_SYMBOL(thaw_bdev); @@ -1275,7 +1269,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; @@ -1303,11 +1296,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; - } /* * If the device is invalidated, rescan partition @@ -1342,8 +1332,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; } } else { if (bdev->bd_contains == bdev) { diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 53bb7af4e5f0..247b8dfaf6e5 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -79,11 +79,9 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) return ret; - if (ret == 0) - acl = NULL; } ret = 0; break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 33fe03551105..e62fd50237e4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3161,7 +3161,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fea31a4a6e36..4843cb994835 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2040,7 +2040,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * flags for any errors that might have happened while doing * writeback of file data. */ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); inode_unlock(inode); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d06c6a288512..994fe5af160b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5072,7 +5072,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (btrfs_root_readonly(root)) return -EROFS; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -8412,7 +8412,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8450,7 +8450,8 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), + bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8618,7 +8619,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, goto out; /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) == WRITE) + if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) return 0; /* * Check to make sure we don't have duplicate iov_base's in this @@ -10543,21 +10544,6 @@ out_inode: } -/* Inspired by filemap_check_errors() */ -int btrfs_inode_check_errors(struct inode *inode) -{ - int ret = 0; - - if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && - test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) - ret = -ENOSPC; - if (test_bit(AS_EIO, &inode->i_mapping->flags) && - test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) - ret = -EIO; - - return ret; -} - static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a87675ffd02b..1379e59277e2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4329,7 +4329,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - posix_acl_xattr_header dummy_acl; + struct posix_acl_xattr_header dummy_acl; p = fs_path_alloc(); if (!p) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ef9c55bc7907..8a84ebd8e7cc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3961,7 +3961,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, * i_mapping flags, so that the next fsync won't get * an outdated io error too. */ - btrfs_inode_check_errors(inode); + filemap_check_errors(inode->i_mapping); *ordered_io_error = true; break; } @@ -4198,7 +4198,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * without writing to the log tree and the fsync must report the * file data write error and not commit the current transaction. */ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); if (ret) ctx->io_err = ret; process: diff --git a/fs/buffer.c b/fs/buffer.c index 9c8eb9b6db6a..7dad8713fac8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1078,7 +1078,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -struct buffer_head * +static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { @@ -1109,7 +1109,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, free_more_memory(); } } -EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index ce5f345d70f5..e7f16a77a22a 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -253,6 +253,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) struct cachefiles_object *object; struct cachefiles_cache *cache; const struct cred *saved_cred; + struct inode *inode; + blkcnt_t i_blocks = 0; ASSERT(_object); @@ -279,6 +281,10 @@ static void cachefiles_drop_object(struct fscache_object *_object) _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); + inode = d_backing_inode(object->dentry); + if (inode) + i_blocks = inode->i_blocks; + cachefiles_begin_secure(cache, &saved_cred); cachefiles_delete_object(cache, object); cachefiles_end_secure(cache, saved_cred); @@ -292,7 +298,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive(cache, object, i_blocks); dput(object->dentry); object->dentry = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2fcde1a34b7c..cd1effee8a49 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -160,7 +160,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); * namei.c */ extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object); + struct cachefiles_object *object, + blkcnt_t i_blocks); extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 6eb3dec2adbb..339c910da916 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -262,10 +262,9 @@ requeue: * Mark an object as being inactive. */ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object) + struct cachefiles_object *object, + blkcnt_t i_blocks) { - blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks; - write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -708,7 +707,8 @@ mark_active_timed_out: check_error: _debug("check error %d", ret); - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive( + cache, object, d_backing_inode(object->dentry)->i_blocks); release_dentry: dput(object->dentry); object->dentry = NULL; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 4f67227f69a5..987044bca1c2 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -95,11 +95,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &new_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &new_mode, &acl); + if (ret) goto out; - if (ret == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: @@ -127,6 +125,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out_free; } + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto out_free; + } + if (new_mode != old_mode) { newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d5b6f959a3c3..ef3ebd780aff 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -175,9 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, static int ceph_releasepage(struct page *page, gfp_t g) { - dout("%p releasepage %p idx %lu\n", page->mapping->host, - page, page->index); - WARN_ON(PageDirty(page)); + dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, + page, page->index, PageDirty(page) ? "" : "not "); /* Can we release the page from the cache? */ if (!ceph_release_fscache_page(page, g)) @@ -298,14 +297,6 @@ unlock: kfree(osd_data->pages); } -static void ceph_unlock_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - unlock_page(pages[i]); -} - /* * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. @@ -370,6 +361,10 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) dout("start_read %p add_to_page_cache failed %p\n", inode, page); nr_pages = i; + if (nr_pages > 0) { + len = nr_pages << PAGE_SHIFT; + break; + } goto out_pages; } pages[i] = page; @@ -386,8 +381,11 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) return nr_pages; out_pages: - ceph_unlock_page_vector(pages, nr_pages); - ceph_release_page_vector(pages, nr_pages); + for (i = 0; i < nr_pages; ++i) { + ceph_fscache_readpage_cancel(inode, pages[i]); + unlock_page(pages[i]); + } + ceph_put_page_vector(pages, nr_pages, false); out: ceph_osdc_put_request(req); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0f5375d8e030..395c7fcb1cea 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -902,10 +902,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, return ret; if (write) { - ret = invalidate_inode_pages2_range(inode->i_mapping, + int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count) >> PAGE_SHIFT); - if (ret < 0) + if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e70b4f535c79..da00b11d4a7a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1899,13 +1899,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int inode_dirty_flags = 0; bool lock_snap_rwsem = false; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - - err = inode_change_ok(inode, attr); - if (err != 0) - return err; - prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2118,7 +2111,17 @@ out_put: */ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { - return __ceph_setattr(d_inode(dentry), attr); + struct inode *inode = d_inode(dentry); + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + err = setattr_prepare(dentry, attr); + if (err != 0) + return err; + + return __ceph_setattr(inode, attr); } /* diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index a2cb0c254060..6806dbeaee19 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -210,8 +210,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; /* No mandatory locks */ - if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) - return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; dout("ceph_flock, fl_file: %p", fl->fl_file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f72d4ae303b2..815acd1a56d4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -370,6 +370,7 @@ const char *ceph_session_state_name(int s) case CEPH_MDS_SESSION_CLOSING: return "closing"; case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + case CEPH_MDS_SESSION_REJECTED: return "rejected"; default: return "???"; } } @@ -1150,8 +1151,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - list_add(&cf->i_list, &to_remove); + list_move(&cf->i_list, &to_remove); } spin_lock(&mdsc->cap_dirty_lock); @@ -1378,7 +1378,7 @@ static int request_close_session(struct ceph_mds_client *mdsc, if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); - return 0; + return 1; } /* @@ -2131,6 +2131,10 @@ static int __do_request(struct ceph_mds_client *mdsc, ceph_session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_REJECTED) { + err = -EACCES; + goto out_session; + } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) __open_session(mdsc, session); @@ -2652,6 +2656,15 @@ static void handle_session(struct ceph_mds_session *session, wake_up_session_caps(session, 0); break; + case CEPH_SESSION_REJECT: + WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); + pr_info("mds%d rejected session\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_REJECTED; + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + wake = 2; /* for good measure */ + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -3557,11 +3570,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -static bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; - return atomic_read(&mdsc->num_sessions) == 0; + return atomic_read(&mdsc->num_sessions) <= skipped; } /* @@ -3572,6 +3585,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_session *session; int i; + int skipped = 0; dout("close_sessions\n"); @@ -3583,7 +3597,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - __close_session(mdsc, session); + if (__close_session(mdsc, session) <= 0) + skipped++; mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); @@ -3591,7 +3606,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); dout("waiting for sessions to close\n"); - wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + wait_event_timeout(mdsc->session_close_wq, + done_closing_sessions(mdsc, skipped), ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 6b3679737d4a..3c6f77b7bb02 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -121,6 +121,7 @@ enum { CEPH_MDS_SESSION_CLOSING = 5, CEPH_MDS_SESSION_RESTARTING = 6, CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_REJECTED = 8, }; struct ceph_mds_session { diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89e6bc321df3..913dea163d5c 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -43,6 +43,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RECALL_STATE: return "recall_state"; case CEPH_SESSION_FLUSHMSG: return "flushmsg"; case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; + case CEPH_SESSION_FORCE_RO: return "force_ro"; + case CEPH_SESSION_REJECT: return "reject"; } return "???"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e247f6f0feb7..a29ffce98187 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -396,10 +396,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); - if (!fsopt->server_path) { - err = -ENOMEM; - goto out; + if (strlen(dev_name_end) > 1) { + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } } else { dev_name_end = dev_name + strlen(dev_name); @@ -788,15 +790,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; dout("open_root_inode success\n"); - if (ceph_ino(inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) { - root = d_make_root(inode); - if (!root) { - root = ERR_PTR(-ENOMEM); - goto out; - } - } else { - root = d_obtain_root(inode); + root = d_make_root(inode); + if (!root) { + root = ERR_PTR(-ENOMEM); + goto out; } ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); @@ -825,17 +822,24 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { + const char *path; err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; - dout("mount opening root\n"); - root = open_root_dentry(fsc, "", started); + if (!fsc->mount_options->server_path) { + path = ""; + dout("mount opening path \\t\n"); + } else { + path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); + } + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } - fsc->sb->s_root = root; + fsc->sb->s_root = dget(root); first = 1; err = ceph_fs_debugfs_init(fsc); @@ -843,19 +847,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) goto fail; } - if (!fsc->mount_options->server_path) { - root = fsc->sb->s_root; - dget(root); - } else { - const char *path = fsc->mount_options->server_path + 1; - dout("mount opening path %s\n", path); - root = open_root_dentry(fsc, path, started); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto fail; - } - } - fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); mutex_unlock(&fsc->client->mount_mutex); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9dcf974acc47..c9c00a862036 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,16 @@ cifs_uniqueid_to_ino_t(u64 fileid) } +static inline void cifs_set_time(struct dentry *dentry, unsigned long time) +{ + dentry->d_fsdata = (void *) time; +} + +static inline unsigned long cifs_get_time(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 95dab43646f0..4ead72a001f9 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -392,8 +392,7 @@ extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *return_buf_type); extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, const char *buf, - const char __user *ubuf, const int long_op); + unsigned int *nbytes, const char *buf); extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, const int nvec); extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d47197ea4ab6..f82d2823622f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1228,7 +1228,6 @@ OldOpenRetry: inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); - /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); @@ -1768,8 +1767,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, const char *buf, - const char __user *ubuf, const int long_op) + unsigned int *nbytes, const char *buf) { int rc = -EACCES; WRITE_REQ *pSMB = NULL; @@ -1838,12 +1836,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); if (buf) memcpy(pSMB->Data, buf, bytes_sent); - else if (ubuf) { - if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) { - cifs_buf_release(pSMB); - return -EFAULT; - } - } else if (count != 0) { + else if (count != 0) { /* No buffer */ cifs_buf_release(pSMB); return -EINVAL; @@ -1867,7 +1860,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, } rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, long_op); + (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { cifs_dbg(FYI, "Send error in write = %d\n", rc); @@ -3334,7 +3327,7 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX /*Convert an Access Control Entry from wire format to local POSIX xattr format*/ -static void cifs_convert_ace(posix_acl_xattr_entry *ace, +static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, struct cifs_posix_ace *cifs_ace) { /* u8 cifs fields do not need le conversion */ @@ -3358,7 +3351,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, __u16 count; struct cifs_posix_ace *pACE; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt; + struct posix_acl_xattr_header *local_acl = (void *)trgt; if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) return -EOPNOTSUPP; @@ -3396,9 +3389,11 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, } else if (size > buflen) { return -ERANGE; } else /* buffer big enough */ { + struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); + local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (i = 0; i < count ; i++) { - cifs_convert_ace(&local_acl->a_entries[i], pACE); + cifs_convert_ace(&ace[i], pACE); pACE++; } } @@ -3406,7 +3401,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, } static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const posix_acl_xattr_entry *local_ace) + const struct posix_acl_xattr_entry *local_ace) { __u16 rc = 0; /* 0 = ACL converted ok */ @@ -3431,7 +3426,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, { __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL; + struct posix_acl_xattr_header *local_acl = (void *)pACL; int count; int i; @@ -3459,7 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, } for (i = 0; i < count; i++) { rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], - &local_acl->a_entries[i]); + (struct posix_acl_xattr_entry *)(local_acl + 1)); if (rc != 0) { /* ACE not converted */ break; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4716c54dbfc6..789ff1df2d8d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -40,7 +40,7 @@ renew_parental_timestamps(struct dentry *direntry) /* BB check if there is a way to get the kernel to do this or if we really need this */ do { - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); direntry = direntry->d_parent; } while (!IS_ROOT(direntry)); } @@ -802,7 +802,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } else if (rc == -ENOENT) { rc = 0; - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); d_add(direntry, NULL); /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ @@ -862,7 +862,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + if (time_after(jiffies, cifs_get_time(direntry) + HZ) || !lookupCacheEnabled) return 0; return 1; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 579e41b350a2..42b99af74e0a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2478,7 +2478,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, size_t cur_len; unsigned long nr_pages, num_pages, i; struct cifs_writedata *wdata; - struct iov_iter saved_from; + struct iov_iter saved_from = *from; loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; @@ -2489,7 +2489,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, pid = current->tgid; server = tlink_tcon(open_file->tlink)->ses->server; - memcpy(&saved_from, from, sizeof(struct iov_iter)); do { unsigned int wsize, credits; @@ -2551,8 +2550,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, kref_put(&wdata->refcount, cifs_uncached_writedata_release); if (rc == -EAGAIN) { - memcpy(from, &saved_from, - sizeof(struct iov_iter)); + *from = saved_from; iov_iter_advance(from, offset - saved_offset); continue; } @@ -2576,7 +2574,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) struct cifs_sb_info *cifs_sb; struct cifs_writedata *wdata, *tmp; struct list_head wdata_list; - struct iov_iter saved_from; + struct iov_iter saved_from = *from; int rc; /* @@ -2597,8 +2595,6 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) if (!tcon->ses->server->ops->async_writev) return -ENOSYS; - memcpy(&saved_from, from, sizeof(struct iov_iter)); - rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, open_file, cifs_sb, &wdata_list); @@ -2631,13 +2627,11 @@ restart_loop: /* resend call if it's a retryable error */ if (rc == -EAGAIN) { struct list_head tmp_list; - struct iov_iter tmp_from; + struct iov_iter tmp_from = saved_from; INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); - memcpy(&tmp_from, &saved_from, - sizeof(struct iov_iter)); iov_iter_advance(&tmp_from, wdata->offset - iocb->ki_pos); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b87efd0c92d6..7ab5be7944aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1951,7 +1951,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", full_path, inode, inode->i_count.counter, - dentry, dentry->d_time, jiffies); + dentry, cifs_get_time(dentry), jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); @@ -2154,7 +2154,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = inode_change_ok(inode, attrs); + rc = setattr_prepare(direntry, attrs); if (rc < 0) goto out; @@ -2294,7 +2294,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = inode_change_ok(inode, attrs); + rc = setattr_prepare(direntry, attrs); if (rc < 0) { free_xid(xid); return rc; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 062c2375549a..d031af8d3d4d 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -399,7 +399,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf); CIFSSMBClose(xid, tcon, fid.netfid); return rc; } diff --git a/fs/coda/file.c b/fs/coda/file.c index f47c7483863b..8415d4f8d1a1 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -38,27 +38,6 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } static ssize_t -coda_file_splice_read(struct file *coda_file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - ssize_t (*splice_read)(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); - struct coda_file_info *cfi; - struct file *host_file; - - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - - splice_read = host_file->f_op->splice_read; - if (!splice_read) - splice_read = default_file_splice_read; - - return splice_read(host_file, ppos, pipe, count, flags); -} - -static ssize_t coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; @@ -225,6 +204,6 @@ const struct file_operations coda_file_operations = { .open = coda_open, .release = coda_release, .fsync = coda_fsync, - .splice_read = coda_file_splice_read, + .splice_read = generic_file_splice_read, }; diff --git a/fs/compat.c b/fs/compat.c index be6e48b0a46c..bd064a2c3550 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -54,20 +54,6 @@ #include <asm/ioctls.h> #include "internal.h" -int compat_log = 1; - -int compat_printk(const char *fmt, ...) -{ - va_list ap; - int ret; - if (!compat_log) - return 0; - va_start(ap, fmt); - ret = vprintk(fmt, ap); - va_end(ap); - return ret; -} - /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -562,7 +548,7 @@ ssize_t compat_rw_copy_check_uvector(int type, goto out; ret = -EINVAL; - if (nr_segs > UIO_MAXIOV || nr_segs < 0) + if (nr_segs > UIO_MAXIOV) goto out; if (nr_segs > fast_segs) { ret = -ENOMEM; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c502c116924c..61057b7dbddb 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/fscrypto.h> -#include <linux/ecryptfs.h> static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -170,7 +169,7 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); memcpy(xts_tweak, &index, sizeof(index)); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d49113efa..9a28133ac3b8 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. */ -#include <keys/encrypted-type.h> -#include <keys/user-type.h> #include <linux/scatterlist.h> #include <linux/ratelimit.h> #include <linux/fscrypto.h> -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,11 +30,11 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) @@ -60,10 +55,9 @@ static int fname_encrypt(struct inode *inode, if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + ciphertext_len = max(iname->len, (u32)FS_CRYPTO_BLOCK_SIZE); + ciphertext_len = round_up(ciphertext_len, padding); + ciphertext_len = min(ciphertext_len, lim); if (ciphertext_len <= sizeof(buf)) { workbuf = buf; @@ -84,7 +78,7 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -105,20 +99,22 @@ static int fname_encrypt(struct inode *inode, } kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } oname->len = ciphertext_len; - return res; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +142,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +164,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +227,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +274,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +286,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +301,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +313,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +334,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +368,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263eddc4e..82f0285f5d08 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. */ -#include <keys/encrypted-type.h> #include <keys/user-type.h> -#include <linux/random.h> #include <linux/scatterlist.h> -#include <uapi/linux/keyctl.h> #include <linux/fscrypto.h> static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ out: return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; + int keysize; u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ retry: if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,11 @@ retry: crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,7 +272,7 @@ got_key: crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/iomap.h> +#include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two @@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct inode *inode, - struct buffer_head *bh, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, + struct page *to, unsigned long vaddr) { struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), - .size = bh->b_size, + .sector = sector, + .size = size, }; - struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) @@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct buffer_head *bh, void **entryp, - struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, sector_t sector, size_t size, + void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, mapping->host), - .size = bh->b_size, + .sector = sector, + .size = size, }; void *ret; void *entry = *entryp; @@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, inode, &bh, vaddr); + error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), + bh.b_size, new_page, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), + bh.b_size, &entry, vma, vmf); unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: @@ -1034,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; - struct page *zero_page = get_huge_zero_page(); + struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); @@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); + +#ifdef CONFIG_FS_IOMAP +static loff_t +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *iter = data; + loff_t end = pos + length, done = 0; + ssize_t ret = 0; + + if (iov_iter_rw(iter) == READ) { + end = min(end, i_size_read(inode)); + if (pos >= end) + return 0; + + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return iov_iter_zero(min(length, end - pos), iter); + } + + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + return -EIO; + + while (pos < end) { + unsigned offset = pos & (PAGE_SIZE - 1); + struct blk_dax_ctl dax = { 0 }; + ssize_t map_len; + + dax.sector = iomap->blkno + + (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; + map_len = dax_map_atomic(iomap->bdev, &dax); + if (map_len < 0) { + ret = map_len; + break; + } + + dax.addr += offset; + map_len -= offset; + if (map_len > end - pos) + map_len = end - pos; + + if (iov_iter_rw(iter) == WRITE) + map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + else + map_len = copy_to_iter(dax.addr, map_len, iter); + dax_unmap_atomic(iomap->bdev, &dax); + if (map_len <= 0) { + ret = map_len ? map_len : -EFAULT; + break; + } + + pos += map_len; + length -= map_len; + done += map_len; + } + + return done ? done : ret; +} + +/** + * iomap_dax_rw - Perform I/O to a DAX file + * @iocb: The control block for this I/O + * @iter: The addresses to do I/O from or to + * @ops: iomap ops passed from the file system + * + * This function performs read and write operations to directly mapped + * persistent memory. The callers needs to take care of read/write exclusion + * and evicting any page cache pages in the region under I/O. + */ +ssize_t +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, done = 0; + unsigned flags = 0; + + if (iov_iter_rw(iter) == WRITE) + flags |= IOMAP_WRITE; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + * + * XXX: This is racy against mmap, and there's nothing we can do about + * it. We'll eventually need to shift this down even further so that + * we can check if we allocated blocks over a hole first. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, + (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, + iter, iomap_dax_actor); + if (ret <= 0) + break; + pos += ret; + done += ret; + } + + iocb->ki_pos += done; + return done ? done : ret; +} +EXPORT_SYMBOL_GPL(iomap_dax_rw); + +/** + * iomap_dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in their fault + * or mkwrite handler for DAX files. Assumes the caller has done all the + * necessary locking for the page fault to proceed successfully. + */ +int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; + sector_t sector; + struct iomap iomap = { 0 }; + unsigned flags = 0; + int error, major = 0; + void *entry; + + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ + if (pos >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; + } + + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + flags |= IOMAP_WRITE; + + /* + * Note that we don't bother to use iomap_apply here: DAX required + * the file system block size to be equal the page size, which means + * that we never have to deal with more than a single extent here. + */ + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + if (error) + goto unlock_entry; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? */ + goto unlock_entry; + } + + sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + + if (vmf->cow_page) { + switch (iomap.type) { + case IOMAP_HOLE: + case IOMAP_UNWRITTEN: + clear_user_highpage(vmf->cow_page, vaddr); + break; + case IOMAP_MAPPED: + error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, + vmf->cow_page, vaddr); + break; + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + if (error) + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; + } + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (iomap.flags & IOMAP_F_NEW) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + } + error = dax_insert_mapping(mapping, iomap.bdev, sector, + PAGE_SIZE, &entry, vma, vmf); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return dax_load_hole(mapping, entry, vmf); + /*FALLTHRU*/ + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error < 0 && error != -EBUSY) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; +} +EXPORT_SYMBOL_GPL(iomap_dax_fault); +#endif /* CONFIG_FS_IOMAP */ diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 592059f88e04..354e2ab62031 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) -#define REAL_FOPS_DEREF(dentry) \ - ((const struct file_operations *)(dentry)->d_fsdata) - static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); @@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto) \ { \ const struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops = \ - REAL_FOPS_DEREF(dentry); \ + debugfs_real_fops(filp); \ int srcu_idx; \ ret_type r; \ \ @@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); int srcu_idx; unsigned int r = 0; @@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp, static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; @@ -209,7 +206,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp) replace_fops(filp, d_inode(dentry)->i_fop); kfree((void *)proxy_fops); fops_put(real_fops); - return 0; + return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, @@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index bba52634b995..b3e8443a1f47 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,8 +19,4 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; -struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); - #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 79a5941c2474..442d1a7e671b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - kuid_t root_uid; - kgid_t root_gid; - - root_uid = make_kuid(current_user_ns(), 0); - root_gid = make_kgid(current_user_ns(), 0); - if (!uid_valid(root_uid) || !gid_valid(root_gid)) - return -EINVAL; + kuid_t ptmx_uid = current_fsuid(); + kgid_t ptmx_gid = current_fsgid(); inode_lock(d_inode(root)); @@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb) mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); - inode->i_uid = root_uid; - inode->i_gid = root_gid; + inode->i_uid = ptmx_uid; + inode->i_gid = ptmx_gid; d_add(dentry, inode); @@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* @@ -395,6 +389,7 @@ static int devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -403,10 +398,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; + error = -ENOMEM; s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; + error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); + if (error) + goto fail; + + error = -ENOMEM; inode = new_inode(s); if (!inode) goto fail; @@ -418,13 +419,21 @@ devpts_fill_super(struct super_block *s, void *data, int silent) set_nlink(inode, 2); s->s_root = d_make_root(inode); - if (s->s_root) - return 0; + if (!s->s_root) { + pr_err("get root dentry failed\n"); + goto fail; + } - pr_err("get root dentry failed\n"); + error = mknod_ptmx(s); + if (error) + goto fail_dput; + return 0; +fail_dput: + dput(s->s_root); + s->s_root = NULL; fail: - return -ENOMEM; + return error; } /* @@ -436,43 +445,15 @@ fail: static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int error; - struct pts_mount_opts opts; - struct super_block *s; - - error = parse_mount_options(data, PARSE_MOUNT, &opts); - if (error) - return ERR_PTR(error); - - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (!s->s_root) { - error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) - goto out_undo_sget; - s->s_flags |= MS_ACTIVE; - } - - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); - - error = mknod_ptmx(s); - if (error) - goto out_undo_sget; - - return dget(s->s_root); - -out_undo_sget: - deactivate_locked_super(s); - return ERR_PTR(error); + return mount_nodev(fs_type, flags, data, devpts_fill_super); } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); - ida_destroy(&fsi->allocated_ptys); + if (fsi) + ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 7c3ce73cb617..fb9aa16a7727 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -246,6 +246,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if ((dio->op == REQ_OP_READ) && ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; + /* ignore EFAULT if some IO has been done */ + if (unlikely(ret == -EFAULT) && transferred) + ret = 0; } if (ret == 0) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 963016c8f3d1..609998de533e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1656,16 +1656,12 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; foreach_conn(stop_conn); + clean_writequeues(); + foreach_conn(free_conn); mutex_unlock(&connections_lock); work_stop(); - mutex_lock(&connections_lock); - clean_writequeues(); - - foreach_conn(free_conn); - - mutex_unlock(&connections_lock); kmem_cache_destroy(con_cache); } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3f2575ddd45e..ddccec3124d7 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -927,7 +927,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } mutex_unlock(&crypt_stat->cs_mutex); - rc = inode_change_ok(inode, ia); + rc = setattr_prepare(dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 1d73fc6dba13..cbb50cadcffc 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, inode->i_private = var; - efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &efivarfs_list); + if (err) + goto out; + d_instantiate(dentry, inode); dget(dentry); out: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 688ccc16b702..d7a7c53803c1 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -157,12 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } + efivar_entry_size(entry, &size); + err = efivar_entry_add(entry, &efivarfs_list); + if (err) + goto fail_inode; + /* copied by the above to local storage in the dentry. */ kfree(name); - efivar_entry_size(entry, &size); - efivar_entry_add(entry, &efivarfs_list); - inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); @@ -182,7 +184,10 @@ fail: static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - efivar_entry_remove(entry); + int err = efivar_entry_remove(entry); + + if (err) + return err; kfree(entry); return 0; } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 9dc4c6dbf3c9..5e68daee5fe4 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1034,7 +1034,7 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) if (unlikely(error)) return error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (unlikely(error)) return error; diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index c634874e12d9..36bea5adcaba 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,5 +1,6 @@ config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 42f1d1814083..e725aa0890e0 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -190,15 +190,11 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); } break; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 06af2f92226c..37e2be784ac7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; +extern struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 538f77616f3c..a0e1478dfd04 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -22,11 +22,59 @@ #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> +#include <linux/iomap.h> +#include <linux/uio.h> #include "ext2.h" #include "xattr.h" #include "acl.h" #ifdef CONFIG_FS_DAX +static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + + inode_lock_shared(inode); + ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} + +static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = file_update_time(file); + if (ret) + goto out_unlock; + + ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + mark_inode_dirty(inode); + } + +out_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + /* * The lock ordering for ext2 DAX fault paths is: * @@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_fault(vma, vmf, ext2_get_block); + ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } -/* - * We have mostly NULL's here: the current defaults are ok for - * the ext2 filesystem. - */ +static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + +static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_write_iter(iocb, from); +#endif + return generic_file_write_iter(iocb, from); +} + const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, + .read_iter = ext2_file_read_iter, + .write_iter = ext2_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, @@ -172,6 +234,7 @@ const struct file_operations ext2_file_operations = { .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index efe5fb21c533..04e73a99902b 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -465,6 +465,11 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode, for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext2_get_group_desc(sb, group, &bh2); + if (!gdp) { + if (++group == sbi->s_groups_count) + group = 0; + continue; + } brelse(bitmap_bh); bitmap_bh = read_inode_bitmap(sb, group); if (!bitmap_bh) { diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d5c7d09919f3..6831534924e1 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> #include "ext2.h" @@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode, */ static int ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, + u32 *bno, bool *new, bool *boundary, int create) { int err = -EIO; @@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); /* What's this do? */ count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { @@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; - clear_buffer_new(bh_result); goto got_it; } } @@ -733,6 +732,16 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { + int i; + + /* + * We must unmap blocks before zeroing so that writeback cannot + * overwrite zeros with stale data from block device page cache. + */ + for (i = 0; i < count; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) + i); + } /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's @@ -745,15 +754,16 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else - set_buffer_new(bh_result); + } else { + *new = true; + } ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); + *boundary = true; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ @@ -765,19 +775,82 @@ cleanup: return err; } -int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +int ext2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int ret = ext2_get_blocks(inode, iblock, max_blocks, - bh_result, create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary, + create); + if (ret <= 0) + return ret; + + map_bh(bh_result, inode->i_sb, bno); + bh_result->b_size = (ret << inode->i_blkbits); + if (new) + set_buffer_new(bh_result); + if (boundary) + set_buffer_boundary(bh_result); + return 0; + +} + +#ifdef CONFIG_FS_DAX +static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) +{ + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, first_block, max_blocks, + &bno, &new, &boundary, flags & IOMAP_WRITE); + if (ret < 0) + return ret; + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = (u64)first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = 1 << blkbits; + } else { + iomap->type = IOMAP_MAPPED; + iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->length = (u64)ret << blkbits; + iomap->flags |= IOMAP_F_MERGED; } - return ret; + if (new) + iomap->flags |= IOMAP_F_NEW; + return 0; } +static int +ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + if (iomap->type == IOMAP_MAPPED && + written < length && + (flags & IOMAP_WRITE)) + ext2_write_failed(inode->i_mapping, offset + length); + return 0; +} + +struct iomap_ops ext2_iomap_ops = { + .iomap_begin = ext2_iomap_begin, + .iomap_end = ext2_iomap_end, +}; +#endif /* CONFIG_FS_DAX */ + int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -863,11 +936,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, - DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); + if (WARN_ON_ONCE(IS_DAX(inode))) + return -EIO; + + ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; @@ -1580,7 +1652,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c6601a476c02..dfa519979038 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -193,15 +193,11 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 67415e0e6af0..e8b365000d73 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -260,11 +260,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &de_name, &fstr); + de_name = fstr; fstr.len = save_len; - if (err < 0) + if (err) goto errout; if (!dir_emit(ctx, - fstr.name, err, + de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; @@ -627,7 +628,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; - int nlen, rlen; + int rlen; unsigned int offset = 0; char *top; @@ -637,7 +638,6 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; - nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ea31931386ec..282a51b07c57 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -262,6 +262,9 @@ struct ext4_io_submit { (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -1117,9 +1120,15 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ @@ -1636,26 +1645,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * Feature set definitions */ -/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */ -#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) -#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d7ccb7f51dfc..c930a0110fb4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4679,6 +4679,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, unsigned int credits; loff_t epos; + BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* @@ -4693,13 +4694,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); - /* - * We can only call ext_depth() on extent based inodes - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - depth = ext_depth(inode); - else - depth = -1; + depth = ext_depth(inode); retry: while (ret >= 0 && len) { @@ -4966,13 +4961,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - lblk; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -5035,12 +5025,8 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, unsigned int credits, blkbits = inode->i_blkbits; map.m_lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk); + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several @@ -5734,6 +5720,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + } else { + ext4_ext_drop_refs(path); + kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 93072dbcb3ac..2a822d30e73f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -91,9 +91,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); - struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; int unaligned_aio = 0; int overwrite = 0; @@ -134,18 +132,16 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) { size_t length = iov_iter_count(from); loff_t pos = iocb->ki_pos; - blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, len; map.m_lblk = pos >> blkbits; - map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - - map.m_lblk; + map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); len = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); @@ -171,8 +167,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) ret = generic_write_sync(iocb, ret); - if (o_direct) - blk_finish_plug(&plug); return ret; @@ -703,6 +697,7 @@ const struct file_operations ext4_file_operations = { .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5c4372512ef7..88effb1053c7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -61,6 +61,13 @@ static int ext4_sync_parent(struct inode *inode) break; iput(inode); inode = next; + /* + * The directory inode may have gone through rmdir by now. But + * the inode itself and its blocks are still allocated (we hold + * a reference to the inode so it didn't go through + * ext4_evict_inode()) and so we are safe to flush metadata + * blocks and the inode. + */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -107,7 +114,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!journal) { ret = __generic_file_fsync(file, start, end, datasync); - if (!ret && !hlist_empty(&inode->i_dentry)) + if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) goto issue_flush; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9e66cd1d7b78..170421edfdfe 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -802,7 +802,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, } else inode_init_owner(inode, dir, mode); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6ea25a190f8..9c064727ed62 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -647,11 +647,19 @@ found: /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + ext4_lblk_t i; + + for (i = 0; i < map->m_len; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + map->m_pblk + i); + } ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1649,6 +1657,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } @@ -3526,35 +3536,31 @@ out: static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { - int unlocked = 0; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; ssize_t ret; - if (ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) - inode_dio_end(inode); - else - unlocked = 1; - } + /* + * Shared inode_lock is enough for us - it protects against concurrent + * writes & truncates and since we take care of writing back page cache, + * we are protected against page writeback as well. + */ + inode_lock_shared(inode); if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, - NULL, unlocked ? 0 : DIO_LOCKING); + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); } else { + size_t count = iov_iter_count(iter); + + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, ext4_dio_get_block, - NULL, NULL, - unlocked ? 0 : DIO_LOCKING); + NULL, NULL, 0); } - if (unlocked) - inode_dio_end(inode); +out_unlock: + inode_unlock_shared(inode); return ret; } @@ -3890,7 +3896,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, } /* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode @@ -3919,7 +3925,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Write out all dirty pages to avoid race conditions * Then release them. */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) @@ -4414,7 +4420,7 @@ static inline void ext4_iget_extra_inode(struct inode *inode, int ext4_get_projid(struct inode *inode, kprojid_t *projid) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; @@ -4481,7 +4487,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); @@ -4814,14 +4820,14 @@ static int ext4_do_update_inode(handle_t *handle, * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if (!ei->i_dtime) { + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); @@ -4885,8 +4891,7 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT) && + BUG_ON(!ext4_has_feature_project(inode->i_sb) && i_projid != EXT4_DEF_PROJID); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && @@ -5073,7 +5078,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) int orphan = 0; const unsigned int ia_valid = attr->ia_valid; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1bb7df5e4536..bf5ae8ebbc97 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,8 +19,6 @@ #include "ext4_jbd2.h" #include "ext4.h" -#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) - /** * Swap memory between @a and @b for @len bytes. * @@ -310,8 +308,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else @@ -772,6 +769,9 @@ resizefs_out: #ifdef CONFIG_EXT4_FS_ENCRYPTION struct fscrypt_policy policy; + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) @@ -842,8 +842,7 @@ resizefs_out: ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (ext4_has_feature_project(inode->i_sb)) { fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, EXT4_I(inode)->i_projid); } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a920c5d29fac..6fc14def0c70 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -598,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (ext4_encrypted_inode(orig_inode) || + ext4_encrypted_inode(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ddc309e8471e..a73a9196b929 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -639,7 +639,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_alloc_buffer( dir, len, &fname_crypto_str); - if (res < 0) + if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " @@ -647,7 +647,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); - if (res < 0) { + if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" @@ -1011,7 +1011,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); - if (err < 0) { + if (err) { count = err; goto errout; } @@ -2044,33 +2044,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, frame->entries = entries; frame->at = entries; frame->bh = bh; - bh = bh2; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; - retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &fname->hinfo); + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } - dx_release(frames); - retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); - brelse(bh); - return retval; + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ - ext4_mark_inode_dirty(handle, dir); + if (retval) + ext4_mark_inode_dirty(handle, dir); dx_release(frames); + brelse(bh2); return retval; } @@ -3144,7 +3142,7 @@ static int ext4_symlink(struct inode *dir, istr.name = (const unsigned char *) symname; istr.len = len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); disk_link.name = (char *) sd; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a6132a730967..b4cbee936cf8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -405,14 +405,12 @@ int ext4_bio_write_page(struct ext4_io_submit *io, { struct page *data_page = NULL; struct inode *inode = page->mapping->host; - unsigned block_start, blocksize; + unsigned block_start; struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; int nr_to_submit = 0; - blocksize = 1 << inode->i_blkbits; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ec8708989ca..6db81fbcbaa6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -78,6 +78,8 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum); /* * Lock ordering @@ -1267,7 +1269,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, @@ -1327,6 +1329,7 @@ static const match_table_t tokens = { {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, + {Opt_prjquota, "prjquota"}, {Opt_barrier, "barrier=%u"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, @@ -1546,8 +1549,11 @@ static const struct mount_opts { MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, + {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, + MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | - EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), + MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_offusrjquota, 0, MOPT_Q}, @@ -1836,13 +1842,17 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && - (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " - "mount options ignored."); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { + ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return 0; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -2741,7 +2751,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; - sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2768,8 +2777,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } - sb_end_write(sb); - return ret; } @@ -2834,19 +2841,43 @@ cont_thread: mutex_unlock(&eli->li_list_mtx); goto exit_thread; } - list_for_each_safe(pos, n, &eli->li_request_list) { + int err = 0; + int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) { - if (ext4_run_li_request(elr) != 0) { - /* error, remove the lazy_init job */ - ext4_remove_li_request(elr); - continue; + if (time_before(jiffies, elr->lr_next_sched)) { + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + continue; + } + if (down_read_trylock(&elr->lr_super->s_umount)) { + if (sb_start_write_trylock(elr->lr_super)) { + progress = 1; + /* + * We hold sb->s_umount, sb can not + * be removed from the list, it is + * now safe to drop li_list_mtx + */ + mutex_unlock(&eli->li_list_mtx); + err = ext4_run_li_request(elr); + sb_end_write(elr->lr_super); + mutex_lock(&eli->li_list_mtx); + n = pos->next; } + up_read((&elr->lr_super->s_umount)); + } + /* error, remove the lazy_init job */ + if (err) { + ext4_remove_li_request(elr); + continue; + } + if (!progress) { + elr->lr_next_sched = jiffies + + (prandom_u32() + % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); } - if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } @@ -3179,6 +3210,8 @@ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct inode *j_inode; + unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); @@ -3209,10 +3242,23 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the internal journal blocks as well */ + + /* + * Add the internal journal blocks whether the journal has been + * loaded or not + */ if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); - + else if (ext4_has_feature_journal(sb) && !sbi->s_journal) { + j_inode = ext4_get_journal_inode(sb, j_inum); + if (j_inode) { + j_blocks = j_inode->i_size >> sb->s_blocksize_bits; + overhead += EXT4_NUM_B2C(sbi, j_blocks); + iput(j_inode); + } else { + ext4_msg(sb, KERN_ERR, "can't get journal size"); + } + } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); @@ -4208,18 +4254,16 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) write_unlock(&journal->j_state_lock); } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; - journal_t *journal; - - BUG_ON(!ext4_has_feature_journal(sb)); - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. */ + /* + * Test for the existence of a valid inode on disk. Bad things + * happen if we iget() an unused inode, as the subsequent iput() + * will try to delete it. + */ journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); @@ -4239,6 +4283,20 @@ static journal_t *ext4_get_journal(struct super_block *sb, iput(journal_inode); return NULL; } + return journal_inode; +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!ext4_has_feature_journal(sb)); + + journal_inode = ext4_get_journal_inode(sb, journal_inum); + if (!journal_inode) + return NULL; journal = jbd2_journal_init_inode(journal_inode); if (!journal) { @@ -5250,12 +5308,18 @@ static int ext4_enable_quotas(struct super_block *sb) le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; + bool quota_mopt[EXT4_MAXQUOTAS] = { + test_opt(sb, USRQUOTA), + test_opt(sb, GRPQUOTA), + test_opt(sb, PRJQUOTA), + }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, - DQUOT_USAGE_ENABLED); + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 128ea78b8958..557b3b0d668c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -30,7 +30,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, char *caddr, *paddr = NULL; struct fscrypt_str cstr, pstr; struct fscrypt_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 max_size = inode->i_sb->s_blocksize; @@ -49,7 +48,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); - caddr[size] = 0; } /* Symlink is encrypted */ @@ -65,16 +63,14 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; + paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; - paddr = pstr.name; - /* Null-terminate the name */ - if (res <= pstr.len) - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; if (cpage) put_page(cpage); set_delayed_call(done, kfree_link, paddr); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2eb935ca5d9e..c15d63389957 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -199,6 +199,8 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, } while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_block != 0) + return -EFSCORRUPTED; if (entry->e_value_size != 0 && (value_start + le16_to_cpu(entry->e_value_offs) < (void *)e + sizeof(__u32) || @@ -641,7 +643,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -661,7 +663,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -669,7 +671,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) } free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { size_t size = le32_to_cpu(s->here->e_value_size); free += EXT4_XATTR_SIZE(size); } @@ -691,7 +693,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) s->here->e_name_len = name_len; memcpy(s->here->e_name, i->name, name_len); } else { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(s->here->e_value_offs); void *val = s->base + offs; @@ -725,8 +727,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) + if (last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + size); last = EXT4_XATTR_NEXT(last); @@ -1318,18 +1319,19 @@ retry: */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, - void *from, size_t n, int blocksize) + void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; + /* We always shift xattr headers further thus offsets get lower */ + BUG_ON(value_offs_shift > 0); + /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; - BUG_ON(new_offs + le32_to_cpu(last->e_value_size) - > blocksize); last->e_value_offs = cpu_to_le16(new_offs); } } @@ -1338,6 +1340,141 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, } /* + * Move xattr pointed to by 'entry' from inode into external xattr block + */ +static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t value_offs, value_size; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + .name_index = entry->e_name_index, + }; + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int error; + + value_offs = le16_to_cpu(entry->e_value_offs); + value_size = le32_to_cpu(entry->e_value_size); + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + buffer = kmalloc(value_size, GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!is || !bs || !buffer || !b_entry_name) { + error = -ENOMEM; + goto out; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto out; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto out; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto out; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = value_size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto out; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto out; + error = 0; +out: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + + return error; +} + +static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + int isize_diff, size_t ifree, + size_t bfree, int *total_ino) +{ + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + struct ext4_xattr_entry *small_entry; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *last; + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ + unsigned int min_total_size; + int error; + + while (isize_diff > ifree) { + entry = NULL; + small_entry = NULL; + min_total_size = ~0U; + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry == NULL) + return -ENOSPC; + entry = small_entry; + } + + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); + if (error) + return error; + + *total_ino -= entry_size; + ifree += total_size; + bfree -= total_size; + } + + return 0; +} + +/* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ @@ -1345,14 +1482,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry, *last, *first; struct buffer_head *bh = NULL; - struct ext4_xattr_ibody_find *is = NULL; - struct ext4_xattr_block_find *bs = NULL; - char *buffer = NULL, *b_entry_name = NULL; - size_t min_offs, free; + size_t min_offs; + size_t ifree, bfree; int total_ino; - void *base, *start, *end; + void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ @@ -1368,34 +1502,24 @@ retry: goto out; header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ - base = start = entry; + base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - last = entry; total_ino = sizeof(struct ext4_xattr_ibody_header); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= isize_diff) { - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - new_extra_isize, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, - inode->i_sb->s_blocksize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - goto out; - } + ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); + if (ifree >= isize_diff) + goto shift; /* * Enough free space isn't available in the inode, check if @@ -1413,146 +1537,44 @@ retry: goto cleanup; } base = BHDR(bh); - first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < isize_diff) { + bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, + NULL); + if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; brelse(bh); goto retry; } - error = -1; + error = -ENOSPC; goto cleanup; } } else { - free = inode->i_sb->s_blocksize; + bfree = inode->i_sb->s_blocksize; } - while (isize_diff > 0) { - size_t offs, size, entry_size; - struct ext4_xattr_entry *small_entry = NULL; - struct ext4_xattr_info i = { - .value = NULL, - .value_len = 0, - }; - unsigned int total_size; /* EA entry size + value size */ - unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ - unsigned int min_total_size = ~0U; - - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); - bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - if (!is || !bs) { - error = -ENOMEM; - goto cleanup; - } - - is->s.not_found = -ENODATA; - bs->s.not_found = -ENODATA; - is->iloc.bh = NULL; - bs->bh = NULL; - - last = IFIRST(header); - /* Find the entry best suited to be pushed into EA block */ - entry = NULL; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= free && total_size < min_total_size) { - if (total_size < isize_diff) { - small_entry = last; - } else { - entry = last; - min_total_size = total_size; - } - } - } - - if (entry == NULL) { - if (small_entry) { - entry = small_entry; - } else { - if (!tried_min_extra_isize && - s_min_extra_isize) { - tried_min_extra_isize++; - new_extra_isize = s_min_extra_isize; - kfree(is); is = NULL; - kfree(bs); bs = NULL; - brelse(bh); - goto retry; - } - error = -1; - goto cleanup; - } - } - offs = le16_to_cpu(entry->e_value_offs); - size = le32_to_cpu(entry->e_value_size); - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - i.name_index = entry->e_name_index, - buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); - b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!buffer || !b_entry_name) { - error = -ENOMEM; - goto cleanup; + error = ext4_xattr_make_inode_space(handle, inode, raw_inode, + isize_diff, ifree, bfree, + &total_ino); + if (error) { + if (error == -ENOSPC && !tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; } - /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + offs, - EXT4_XATTR_SIZE(size)); - memcpy(b_entry_name, entry->e_name, entry->e_name_len); - b_entry_name[entry->e_name_len] = '\0'; - i.name = b_entry_name; - - error = ext4_get_inode_loc(inode, &is->iloc); - if (error) - goto cleanup; - - error = ext4_xattr_ibody_find(inode, &i, is); - if (error) - goto cleanup; - - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto cleanup; - total_ino -= entry_size; - - entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) - shift_bytes = isize_diff; - else - shift_bytes = entry_size + EXT4_XATTR_SIZE(size); - /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, -shift_bytes, - (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + - EXT4_I(inode)->i_extra_isize + shift_bytes, - (void *)header, total_ino, inode->i_sb->s_blocksize); - - isize_diff -= shift_bytes; - EXT4_I(inode)->i_extra_isize += shift_bytes; - header = IHDR(inode, raw_inode); - - i.name = b_entry_name; - i.value = buffer; - i.value_len = size; - error = ext4_xattr_block_find(inode, &i, bs); - if (error) - goto cleanup; - - /* Add entry which was removed from the inode into the block */ - error = ext4_xattr_block_set(handle, inode, &i, bs); - if (error) - goto cleanup; - kfree(b_entry_name); - kfree(buffer); - b_entry_name = NULL; - buffer = NULL; - brelse(is->iloc.bh); - kfree(is); - kfree(bs); + goto cleanup; } +shift: + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino); + EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); @@ -1560,12 +1582,6 @@ out: return 0; cleanup: - kfree(b_entry_name); - kfree(buffer); - if (is) - brelse(is->iloc.bh); - kfree(is); - kfree(bs); brelse(bh); /* * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode @@ -1734,7 +1750,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, *name++; } - if (entry->e_value_block == 0 && entry->e_value_size != 0) { + if (entry->e_value_size != 0) { __le32 *value = (__le32 *)((char *)header + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4dcc9e28dc5c..6fe23af509e1 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ fail: return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = f2fs_kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -210,12 +212,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; @@ -230,7 +230,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index b2334d11dae8..2c685185c24d 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -41,7 +41,6 @@ extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f94d01e7d001..7e9b504bd8b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,7 +28,7 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_bios(sbi); @@ -267,7 +267,6 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); - struct blk_plug plug; long diff, written; /* collect a number of dirty meta pages and write together */ @@ -280,9 +279,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); - blk_start_plug(&plug); written = sync_meta_pages(sbi, META, wbc->nr_to_write); - blk_finish_plug(&plug); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -388,6 +385,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -491,7 +491,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) spin_lock(&im->ino_lock); #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_ORPHAN)) { + if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); return -ENOSPC; } @@ -531,8 +531,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); + + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } - inode = f2fs_iget(sbi->sb, ino); + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -546,6 +558,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + + get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -554,7 +578,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -578,7 +602,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -639,45 +663,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } -static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, - block_t cp_addr, unsigned long long *version) +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; - unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; + size_t crc_offset = 0; __u32 crc = 0; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) - goto invalid_cp1; + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset)) - goto invalid_cp1; + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } - pre_version = cur_cp_version(cp_block); + *version = cur_cp_version(*cp_block); + return 0; +} - /* Read the 2nd cp block in this CP pack */ - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); +static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, + block_t cp_addr, unsigned long long *version) +{ + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; + unsigned long long cur_version = 0, pre_version = 0; + int err; - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) - goto invalid_cp2; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) + goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset)) + cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -972,10 +1006,40 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -984,19 +1048,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. - */ - if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk)) - invalidate = true; - /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); @@ -1036,10 +1091,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1054,23 +1111,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); @@ -1137,14 +1179,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. - */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) @@ -1152,6 +1186,17 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); return 0; } @@ -1190,6 +1235,18 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + unblock_operations(sbi); + goto out; + } + /* * update checkpoint pack index * Increase the version number so that @@ -1205,6 +1262,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ccb401eebc11..0d0177c9149c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -34,6 +34,11 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { fscrypt_release_ctx(bio->bi_private); @@ -626,11 +631,13 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) ssize_t ret = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from)); - map.m_next_pgofs = NULL; + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - if (f2fs_encrypted_inode(inode)) - return 0; + map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { ret = f2fs_convert_inline_inode(inode); @@ -672,6 +679,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, bool allocated = false; block_t blkaddr; + if (!maxblocks) + return 0; + map->m_len = 0; map->m_flags = 0; @@ -783,6 +793,7 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; + allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -966,8 +977,8 @@ out: return ret; } -struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; @@ -1284,7 +1295,7 @@ write: if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1344,6 +1355,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; + int nwritten = 0; pagevec_init(&pvec, 0); @@ -1418,6 +1430,8 @@ continue_unlock: done_index = page->index + 1; done = 1; break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1439,6 +1453,10 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1480,7 +1498,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_inode(inode); return ret; @@ -1518,8 +1535,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. */ - if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) && - len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) return 0; if (f2fs_has_inline_data(inode) || @@ -1616,7 +1632,7 @@ repeat: if (err) goto fail; - if (need_balance && has_not_enough_free_secs(sbi, 0)) { + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { unlock_page(page); f2fs_balance_fs(sbi, true); lock_page(page); @@ -1633,22 +1649,12 @@ repeat: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; - - if ((pos & PAGE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_SIZE); - goto out_update; - } + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { struct bio *bio; @@ -1676,11 +1682,6 @@ repeat: goto fail; } } -out_update: - if (!PageUptodate(page)) - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; fail: @@ -1698,11 +1699,26 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. + */ + if (!PageUptodate(page)) { + if (unlikely(copied != PAGE_SIZE)) + copied = 0; + else + SetPageUptodate(page); + } + if (!copied) + goto unlock_out; + set_page_dirty(page); + clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); - +unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; @@ -1873,6 +1889,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include <linux/migrate.h> + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1885,4 +1953,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index badd407bb622..fb245bd302e4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,6 +45,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; @@ -54,6 +55,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); @@ -154,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -228,8 +232,13 @@ static int stat_show(struct seq_file *s, void *v) si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -311,6 +320,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9054aeac8015..12b5836a1033 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -172,7 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -212,31 +215,17 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } -/* - * Find an entry in the specified directory with the wanted name. - * It returns the page where the entry was found (as a parameter - res_page), - * and the entry itself. Page is returned mapped and unlocked. - * Entry is guaranteed to be valid. - */ -struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - const struct qstr *child, struct page **res_page) +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; - struct fscrypt_name fname; - int err; - - err = fscrypt_setup_filename(dir, child, 1, &fname); - if (err) { - *res_page = ERR_PTR(err); - return NULL; - } if (f2fs_has_inline_dentry(dir)) { *res_page = NULL; - de = find_in_inline_dir(dir, &fname, res_page); + de = find_in_inline_dir(dir, fname, res_page); goto out; } @@ -256,11 +245,35 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, for (level = 0; level < max_depth; level++) { *res_page = NULL; - de = find_in_level(dir, level, &fname, res_page); + de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; } out: + return de; +} + +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page) +{ + struct f2fs_dir_entry *de = NULL; + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); + return NULL; + } + + de = __f2fs_find_entry(dir, &fname, res_page); + fscrypt_free_filename(&fname); return de; } @@ -375,7 +388,8 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; @@ -400,7 +414,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; @@ -417,8 +431,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. @@ -496,7 +510,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, de->ino = cpu_to_le32(ino); set_de_type(de, mode); for (i = 0; i < slots; i++) { - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + __set_bit_le(bit_pos + i, (void *)d->bitmap); /* avoid wrong garbage data for readdir */ if (i) (de + i)->name_len = 0; @@ -504,6 +518,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -530,7 +545,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) return -ENOSPC; #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) @@ -569,7 +584,8 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -599,6 +615,26 @@ fail: return err; } +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + /* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). @@ -607,24 +643,15 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; - struct qstr new_name; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - err = -EAGAIN; - if (f2fs_has_inline_dentry(dir)) - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (err == -EAGAIN) - err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode); + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); fscrypt_free_filename(&fname); - f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } @@ -634,7 +661,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -786,19 +813,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - kfree(de_name.name); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 14f5fe2b841e..9e8de18a168a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -46,6 +46,8 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, FAULT_MAX, }; @@ -55,40 +57,8 @@ struct f2fs_fault_info { unsigned int inject_type; }; -extern struct f2fs_fault_info f2fs_fault; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type))) - -static inline bool time_to_inject(int type) -{ - if (!f2fs_fault.inject_rate) - return false; - if (type == FAULT_KMALLOC && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_BLOCK && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type)) - return false; - - atomic_inc(&f2fs_fault.inject_ops); - if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) { - atomic_set(&f2fs_fault.inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); - return true; - } - return false; -} +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -158,7 +128,7 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ @@ -211,6 +181,13 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -645,6 +622,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -748,6 +726,7 @@ enum { SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; enum { @@ -765,7 +744,7 @@ struct f2fs_sb_info { struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int valid_super_block; /* valid super block no */ - int s_flag; /* flags for sbi */ + unsigned long s_flag; /* flags for sbi */ #ifdef CONFIG_F2FS_FS_ENCRYPTION u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; @@ -785,6 +764,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ @@ -892,8 +872,37 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + /* For write statistics. Suppose sector size is 512 bytes, * and the return value is in kbytes. s is of struct f2fs_sb_info. */ @@ -1034,17 +1043,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -1052,26 +1061,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -1110,8 +1150,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1151,7 +1191,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t diff; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_BLOCK)) + if (time_to_inject(sbi, FAULT_BLOCK)) return false; #endif /* @@ -1193,6 +1233,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } @@ -1243,6 +1287,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1376,7 +1425,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (page) return page; - if (time_to_inject(FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) return NULL; #endif if (!for_write) @@ -1804,7 +1853,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1827,10 +1876,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } -static inline void *f2fs_kmalloc(size_t size, gfp_t flags) +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; #endif return kmalloc(size, flags); @@ -1885,6 +1935,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); int update_inode(struct inode *, struct page *); int update_inode_page(struct inode *); @@ -1900,7 +1951,6 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, @@ -1910,10 +1960,12 @@ bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); @@ -1924,7 +1976,9 @@ int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); int f2fs_add_regular_entry(struct inode *, const struct qstr *, - struct inode *, nid_t, umode_t); + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -2010,9 +2064,9 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -2095,6 +2149,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c @@ -2123,13 +2181,14 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; - unsigned int valid_count, valid_node_count, valid_inode_count; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; @@ -2294,8 +2353,8 @@ bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2ebc4c79562c..acdf4b929f97 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -135,7 +135,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -523,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -680,7 +680,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int err; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1451,7 +1451,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags; unsigned int oldflags; int ret; @@ -1951,7 +1951,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * avoid defragment running in SSR mode when free section are allocated * intensively */ - if (has_not_enough_free_secs(sbi, sec_num)) { + if (has_not_enough_free_secs(sbi, 0, sec_num)) { err = -EAGAIN; goto out; } @@ -2082,6 +2082,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) return -EOPNOTSUPP; + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + inode_lock(src); if (src != dst) { if (!inode_trylock(dst)) { @@ -2133,8 +2140,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - ret = __exchange_data_block(src, dst, pos_in, - pos_out, len >> F2FS_BLKSIZE_BITS, false); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); if (!ret) { if (dst_max_i_size) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8f7fa326ce95..93985c64d8a8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -47,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -96,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -270,7 +275,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost, last_victim; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); unsigned int nsearched = 0; @@ -280,7 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; @@ -423,10 +428,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -439,16 +444,24 @@ next_step: struct node_info ni; /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -469,10 +482,8 @@ next_step: stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } } /* @@ -706,16 +717,23 @@ next_step: struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -723,14 +741,14 @@ next_step: if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -756,7 +774,7 @@ next_step: continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -789,7 +807,7 @@ next_step: } } - if (++phase < 4) + if (++phase < 5) goto next_step; } @@ -815,7 +833,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int seg_freed = 0; + int sec_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -834,8 +852,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0) - continue; + if (get_valid_blocks(sbi, segno, 1) == 0 || + unlikely(f2fs_cp_error(sbi))) + goto next; /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), @@ -861,7 +880,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); - +next: f2fs_put_page(sum_page, 0); } @@ -871,22 +890,20 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); - if (gc_type == FG_GC) { - while (start_segno < end_segno) - if (get_valid_blocks(sbi, start_segno++, 1) == 0) - seg_freed++; - } + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; stat_inc_call_count(sbi->stat_info); - return seg_freed; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0, seg_freed; + int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; struct gc_inode_list gc_list = { @@ -905,7 +922,7 @@ gc_more: goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; /* * If there is no victim and no prefree segment but still not @@ -914,10 +931,14 @@ gc_more: */ if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) { - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0)) { - write_checkpoint(sbi, &cpc); + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } } @@ -925,20 +946,19 @@ gc_more: goto stop; ret = 0; - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); - - if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ccea8735de59..34234d84a38b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -424,7 +424,7 @@ static int f2fs_add_inline_entries(struct inode *dir, ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; - err = f2fs_add_regular_entry(dir, &new_name, NULL, + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, ino, fake_mode); if (err) goto punch_dentry_pages; @@ -445,8 +445,8 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *backup_dentry; int err; - backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), - GFP_F2FS_ZERO); + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -488,17 +488,17 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -519,18 +519,21 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -563,7 +566,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, inline_dentry = inline_data_addr(page); bit_pos = dentry - inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9ac5efc15347..d7369895a78a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> +#include <linux/backing-dev.h> #include <linux/writeback.h> #include "f2fs.h" @@ -234,6 +235,20 @@ bad_inode: return ERR_PTR(ret); } +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; @@ -354,7 +369,7 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_EVICT_INODE)) + if (time_to_inject(sbi, FAULT_EVICT_INODE)) goto no_delete; #endif diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1c481c9dc088..e80ed0302c22 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -91,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -449,7 +454,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1010,7 +1015,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, struct fscrypt_str cstr = FSTR_INIT(NULL, 0); struct fscrypt_str pstr = FSTR_INIT(NULL, 0); struct fscrypt_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; @@ -1025,7 +1029,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); - caddr[size] = 0; /* Symlink is encrypted */ sd = (struct fscrypt_symlink_data *)caddr; @@ -1048,7 +1051,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1060,7 +1063,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); set_delayed_call(done, kfree_link, paddr); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f75d197d5beb..01177ecdeab8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -54,8 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; - if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD) - res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -1314,6 +1312,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1387,7 +1386,10 @@ continue_unlock: unlock_page(page); f2fs_put_page(last_page, 0); break; + } else { + nwritten++; } + if (page == last_page) { f2fs_put_page(page, 0); marked = true; @@ -1409,6 +1411,9 @@ continue_unlock: unlock_page(last_page); goto retry; } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; } @@ -1418,6 +1423,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) struct pagevec pvec; int step = 0; int nwritten = 0; + int ret = 0; pagevec_init(&pvec, 0); @@ -1438,7 +1444,8 @@ next_step: if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } /* @@ -1489,6 +1496,8 @@ continue_unlock: if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); + else + nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1506,14 +1515,17 @@ continue_unlock: step++; goto next_step; } - return nwritten; +out: + if (nwritten) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; - int ret2 = 0, ret = 0; + int ret2, ret = 0; pagevec_init(&pvec, 0); @@ -1542,10 +1554,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) - ret2 = -ENOSPC; - if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) - ret2 = -EIO; + ret2 = filemap_check_errors(NODE_MAPPING(sbi)); if (!ret) ret = ret2; return ret; @@ -1672,6 +1681,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1838,7 +1850,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) @@ -2015,10 +2027,12 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - +retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); - if (!ipage) - return -ENOMEM; + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index fc7684554b1a..868bec65e51c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -229,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -259,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} - -static inline nid_t nid_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9e652d5a659b..2fc84a991325 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -68,15 +68,17 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static struct fsync_inode_entry *add_fsync_inode(struct list_head *head, - struct inode *inode) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) { + struct inode *inode; struct fsync_inode_entry *entry; - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) - return NULL; + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); @@ -96,48 +98,41 @@ static int recover_dentry(struct inode *inode, struct page *ipage, struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; struct fsync_inode_entry *entry; int err = 0; + char *name; entry = get_fsync_inode(dir_list, pino); if (!entry) { - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } - - entry = add_fsync_inode(dir_list, dir); - if (!entry) { - err = -ENOMEM; - iput(dir); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); goto out; } } dir = entry->inode; - if (file_enc_name(inode)) - return 0; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; - - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -156,18 +151,24 @@ retry: } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_add_link(dir, &name, inode, + err = __f2fs_do_add_link(dir, &fname, inode, inode->i_ino, inode->i_mode); } + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); out: + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -223,9 +224,7 @@ static bool is_same_inode(struct inode *inode, struct page *ipage) static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; - struct inode *inode; struct page *page = NULL; block_t blkaddr; int err = 0; @@ -242,7 +241,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) @@ -263,23 +262,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - - /* add this fsync inode to the list */ - entry = add_fsync_inode(head, inode); - if (!entry) { - err = -ENOMEM; - iput(inode); - break; - } } entry->blkaddr = blkaddr; @@ -363,7 +354,7 @@ got_it: if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { @@ -431,10 +422,15 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } f2fs_wait_on_page_writeback(dn.node_page, NODE, true); @@ -485,11 +481,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (err) goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, @@ -514,7 +515,6 @@ out: static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; @@ -534,7 +534,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } @@ -626,38 +626,20 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) { - bool invalidate = false; - - if (test_opt(sbi, LFS)) { - update_meta_page(sbi, NULL, blkaddr); - invalidate = true; - } else if (discard_next_dnode(sbi, blkaddr)) { - invalidate = true; - } - - /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) - sync_meta_pages(sbi, META, LONG_MAX); + if (err) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); - /* invalidate temporary meta page */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), - blkaddr, blkaddr); + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list); - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - mutex_unlock(&sbi->cp_mutex); - } else if (need_writecp) { + if (!err && need_writecp) { struct cp_control cpc = { .reason = CP_RECOVERY, }; - mutex_unlock(&sbi->cp_mutex); err = write_checkpoint(sbi, &cpc); - } else { - mutex_unlock(&sbi->cp_mutex); } - destroy_fsync_dnodes(&dir_list); kmem_cache_destroy(fsync_entry_slab); return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a46296f57b02..fc886f008449 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -344,6 +345,11 @@ int commit_inmem_pages(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + if (!need) return; @@ -355,7 +361,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -580,6 +586,74 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio = NULL; + int err; + + err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, + &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + } + + return err; +} + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -597,29 +671,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); -} - -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -EOPNOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; + return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -660,7 +712,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -719,11 +771,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -772,6 +827,8 @@ skip: SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -818,12 +875,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -1202,7 +1261,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1277,6 +1336,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : @@ -1301,6 +1366,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) mutex_lock(&sbi->gc_mutex); err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); @@ -1391,7 +1460,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1589,11 +1658,9 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { f2fs_wait_on_page_writeback(cpage, DATA, true); @@ -1739,7 +1806,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1836,7 +1903,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -2127,12 +2194,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2239,6 +2310,8 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; @@ -2251,41 +2324,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - down_read(&curseg->journal_rwsem); - for (i = 0; i < sits_in_cursum(journal); i++) { - if (le32_to_cpu(segno_in_journal(journal, i)) - == start) { - sit = sit_in_journal(journal, i); - up_read(&curseg->journal_rwsem); - goto got_it; - } - } - up_read(&curseg->journal_rwsem); - + se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2427,6 +2517,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2570,10 +2661,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2583,7 +2679,9 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; @@ -2592,6 +2690,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b33f73ec60a4..fecb856ad874 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -479,7 +479,8 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); @@ -489,8 +490,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -587,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7f863a645ab1..6132b4ce4e4c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -40,7 +40,6 @@ static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION -struct f2fs_fault_info f2fs_fault; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", @@ -50,16 +49,21 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", }; -static void f2fs_build_fault_attr(unsigned int rate) +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) { + struct f2fs_fault_info *ffi = &sbi->fault_info; + if (rate) { - atomic_set(&f2fs_fault.inject_ops, 0); - f2fs_fault.inject_rate = rate; - f2fs_fault.inject_type = (1 << FAULT_MAX) - 1; + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; } else { - memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info)); + memset(ffi, 0, sizeof(struct f2fs_fault_info)); } } #endif @@ -87,6 +91,7 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, Opt_nobarrier, @@ -118,6 +123,7 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, @@ -167,7 +173,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&f2fs_fault; + return (unsigned char *)&sbi->fault_info; #endif return NULL; } @@ -312,6 +318,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -327,22 +337,6 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; -#ifdef CONFIG_F2FS_FAULT_INJECTION -/* sysfs for f2fs fault injection */ -static struct kobject f2fs_fault_inject; - -static struct attribute *f2fs_fault_attrs[] = { - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), - NULL -}; - -static struct kobj_type f2fs_fault_ktype = { - .default_attrs = f2fs_fault_attrs, - .sysfs_ops = &f2fs_attr_ops, -}; -#endif - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -370,10 +364,6 @@ static int parse_options(struct super_block *sb, char *options) char *p, *name; int arg = 0; -#ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(0); -#endif - if (!options) return 0; @@ -488,6 +478,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; @@ -533,7 +526,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(arg); + f2fs_build_fault_attr(sbi, arg); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -730,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -878,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -946,7 +941,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%d|%-3u|", se->type, get_valid_blocks(sbi, i, 1)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, "%x ", se->cur_valid_map[j]); + seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_putc(seq, '\n'); } return 0; @@ -975,6 +970,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); @@ -991,6 +987,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -1001,6 +1001,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -1096,6 +1099,9 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -1469,6 +1475,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); #ifdef CONFIG_F2FS_FS_ENCRYPTION memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, @@ -1810,7 +1817,7 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1818,6 +1825,9 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); + if (!retry) + goto skip_recovery; + err = recover_fsync_data(sbi, false); if (err < 0) { need_fsck = true; @@ -1835,7 +1845,7 @@ try_onemore: goto free_kobj; } } - +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1879,7 +1889,9 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1978,16 +1990,6 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } -#ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_fault_inject.kset = f2fs_kset; - f2fs_build_fault_attr(0); - err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype, - NULL, "fault_injection"); - if (err) { - f2fs_fault_inject.kset = NULL; - goto free_kset; - } -#endif err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_kset; @@ -2006,10 +2008,6 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_kset: -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (f2fs_fault_inject.kset) - kobject_put(&f2fs_fault_inject); -#endif kset_unregister(f2fs_kset); free_extent_cache: destroy_extent_cache(); @@ -2031,9 +2029,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); -#ifdef CONFIG_F2FS_FAULT_INJECTION - kobject_put(&f2fs_fault_inject); -#endif kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c8898b5148eb..1f74876233b6 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -217,18 +217,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void *read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -239,8 +241,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -254,8 +258,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. */ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -269,10 +275,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -366,9 +373,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -402,9 +409,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -463,9 +470,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -548,6 +555,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; diff --git a/fs/fat/file.c b/fs/fat/file.c index f70185668832..c09ab4e108e5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -450,7 +450,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 92b7363dafa9..4afdc3f36470 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -21,6 +21,17 @@ #include <linux/namei.h> #include "fat.h" +static inline unsigned long vfat_d_version(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + +static inline void vfat_d_version_set(struct dentry *dentry, + unsigned long version) +{ + dentry->d_fsdata = (void *) version; +} + /* * If new entry was created in the parent, it could create the 8.3 * alias (the shortname of logname). So, the parent may have the @@ -33,7 +44,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != d_inode(dentry->d_parent)->i_version) + if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -759,7 +770,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: mutex_unlock(&MSDOS_SB(sb)->s_lock); if (!inode) - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); return d_splice_alias(inode, dentry); error: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -823,7 +834,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -849,7 +860,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); diff --git a/fs/file.c b/fs/file.c index 6b1acdfe59da..69d6990e3021 100644 --- a/fs/file.c +++ b/fs/file.c @@ -23,12 +23,12 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -int sysctl_nr_open __read_mostly = 1024*1024; -int sysctl_nr_open_min = BITS_PER_LONG; +unsigned int sysctl_nr_open __read_mostly = 1024*1024; +unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) -int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; +unsigned int sysctl_nr_open_max = + __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -163,7 +163,7 @@ out: * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_fdtable(struct files_struct *files, int nr) +static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -208,7 +208,7 @@ static int expand_fdtable(struct files_struct *files, int nr) * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_files(struct files_struct *files, int nr) +static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -243,12 +243,12 @@ repeat: return expanded; } -static inline void __set_close_on_exec(int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } -static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); @@ -268,10 +268,10 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } -static int count_open_files(struct fdtable *fdt) +static unsigned int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fds; - int i; + unsigned int size = fdt->max_fds; + unsigned int i; /* Find the last open fd */ for (i = size / BITS_PER_LONG; i > 0; ) { @@ -291,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, i; + unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files) * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); - int i, j = 0; + unsigned int i, j = 0; for (;;) { unsigned long set; @@ -477,11 +477,11 @@ struct files_struct init_files = { .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; -static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start) +static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { - unsigned long maxfd = fdt->max_fds; - unsigned long maxbit = maxfd / BITS_PER_LONG; - unsigned long bitbit = start / BITS_PER_LONG; + unsigned int maxfd = fdt->max_fds; + unsigned int maxbit = maxfd / BITS_PER_LONG; + unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit > maxfd) diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 1b2f6c2c3aaf..76f09ce7e5b2 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -1,5 +1,6 @@ config FUSE_FS tristate "FUSE (Filesystem in Userspace) support" + select FS_POSIX_ACL help With FUSE it is possible to implement a fully functional filesystem in a userspace program. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 448aa27ada00..60da84a86dab 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c new file mode 100644 index 000000000000..ec85765502f1 --- /dev/null +++ b/fs/fuse/acl.c @@ -0,0 +1,99 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> + +struct posix_acl *fuse_get_acl(struct inode *inode, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int size; + const char *name; + void *value = NULL; + struct posix_acl *acl; + + if (!fc->posix_acl || fc->no_getxattr) + return NULL; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return ERR_PTR(-EOPNOTSUPP); + + value = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = fuse_getxattr(inode, name, value, PAGE_SIZE); + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if ((size == 0) || (size == -ENODATA) || + (size == -EOPNOTSUPP && fc->no_getxattr)) + acl = NULL; + else if (size == -ERANGE) + acl = ERR_PTR(-E2BIG); + else + acl = ERR_PTR(size); + + kfree(value); + return acl; +} + +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + const char *name; + int ret; + + if (!fc->posix_acl || fc->no_setxattr) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return -EINVAL; + + if (acl) { + /* + * Fuse userspace is responsible for updating access + * permissions in the inode, if needed. fuse_setxattr + * invalidates the inode attributes, which will force + * them to be refreshed the next time they are used, + * and it also updates i_ctime. + */ + size_t size = posix_acl_xattr_size(acl->a_count); + void *value; + + if (size > PAGE_SIZE) + return -E2BIG; + + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) { + kfree(value); + return ret; + } + + ret = fuse_setxattr(inode, name, value, size, 0); + kfree(value); + } else { + ret = fuse_removexattr(inode, name); + } + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + + return ret; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a94d2ed81ab4..70ea57c7b6bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -728,7 +728,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->pipebufs; if (!cs->write) { - err = buf->ops->confirm(cs->pipe, buf); + err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; @@ -767,7 +767,6 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->len = err; cs->offset = off; cs->pg = page; - cs->offset = off; iov_iter_advance(cs->iter, err); } @@ -828,7 +827,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) fuse_copy_finish(cs); - err = buf->ops->confirm(cs->pipe, buf); + err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; @@ -841,7 +840,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (cs->len != PAGE_SIZE) goto out_fallback; - if (buf->ops->steal(cs->pipe, buf) != 0) + if (pipe_buf_steal(cs->pipe, buf) != 0) goto out_fallback; newpage = buf->page; @@ -1342,9 +1341,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - int ret; + int total, ret; int page_nr = 0; - int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); @@ -1363,52 +1361,23 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - ret = 0; - pipe_lock(pipe); - - if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); - if (!ret) - ret = -EPIPE; - goto out_unlock; - } - if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { ret = -EIO; - goto out_unlock; + goto out; } - while (page_nr < cs.nr_segs) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; - - buf->page = bufs[page_nr].page; - buf->offset = bufs[page_nr].offset; - buf->len = bufs[page_nr].len; + for (ret = total = 0; page_nr < cs.nr_segs; total += ret) { /* * Need to be careful about this. Having buf->ops in module * code can Oops if the buffer persists after module unload. */ - buf->ops = &nosteal_pipe_buf_ops; - - pipe->nrbufs++; - page_nr++; - ret += buf->len; - - if (pipe->files) - do_wakeup = 1; - } - -out_unlock: - pipe_unlock(pipe); - - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + bufs[page_nr].ops = &nosteal_pipe_buf_ops; + ret = add_to_pipe(pipe, &bufs[page_nr++]); + if (unlikely(ret < 0)) + break; } - + if (total) + ret = total; out: for (; page_nr < cs.nr_segs; page_nr++) put_page(bufs[page_nr].page); @@ -1993,7 +1962,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - ibuf->ops->get(pipe, ibuf); + pipe_buf_get(pipe, ibuf); *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2015,10 +1984,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); - for (idx = 0; idx < nbuf; idx++) { - struct pipe_buffer *buf = &bufs[idx]; - buf->ops->release(pipe, buf); - } + for (idx = 0; idx < nbuf; idx++) + pipe_buf_release(pipe, &bufs[idx]); + out: kfree(bufs); return ret; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dbf77fe1dc2e..572d12410c7c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,7 @@ #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { @@ -38,47 +39,39 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 +union fuse_dentry { + u64 time; + struct rcu_head rcu; +}; + static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { - entry->d_time = time; + ((union fuse_dentry *) entry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(struct dentry *entry) { - return entry->d_time; -} -#else -/* - * On 32 bit archs store the high 32 bits of time in d_fsdata - */ -static void fuse_dentry_settime(struct dentry *entry, u64 time) -{ - entry->d_time = time; - entry->d_fsdata = (void *) (unsigned long) (time >> 32); -} - -static u64 fuse_dentry_time(struct dentry *entry) -{ - return (u64) entry->d_time + - ((u64) (unsigned long) entry->d_fsdata << 32); + return ((union fuse_dentry *) entry->d_fsdata)->time; } -#endif /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in - * dentry->d_time and fuse_inode->i_time respectively. + * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +static u64 time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { - struct timespec ts = {sec, nsec}; - return get_jiffies_64() + timespec_to_jiffies(&ts); + struct timespec64 ts = { + sec, + max_t(u32, nsec, NSEC_PER_SEC - 1) + }; + + return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } @@ -244,6 +237,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; + forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), attr_version); @@ -273,8 +267,23 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static int fuse_dentry_init(struct dentry *dentry) +{ + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + + return dentry->d_fsdata ? 0 : -ENOMEM; +} +static void fuse_dentry_release(struct dentry *dentry) +{ + union fuse_dentry *fd = dentry->d_fsdata; + + kfree_rcu(fd, rcu); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, }; int fuse_valid_type(int m) @@ -918,6 +927,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, if (time_before64(fi->i_time, get_jiffies_64())) { r = true; + forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else { r = false; @@ -1018,7 +1028,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) return 1; cred = current_cred(); @@ -1065,6 +1075,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; + forget_all_cached_acls(inode); return fuse_do_getattr(inode, NULL, NULL); } @@ -1093,7 +1104,7 @@ static int fuse_permission(struct inode *inode, int mask) /* * If attributes are needed, refresh them before proceeding */ - if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -1106,7 +1117,7 @@ static int fuse_permission(struct inode *inode, int mask) } } - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + if (fc->default_permissions) { err = generic_permission(inode, mask); /* If permission is denied, try to refresh file @@ -1234,6 +1245,7 @@ retry: fi->nlookup++; spin_unlock(&fc->lock); + forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), attr_version); @@ -1592,9 +1604,10 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -int fuse_do_setattr(struct inode *inode, struct iattr *attr, +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); FUSE_ARGS(args); @@ -1606,10 +1619,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1703,14 +1716,63 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; - if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(inode, attr, attr->ia_file); - else - return fuse_do_setattr(inode, attr, NULL); + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); + + /* + * The only sane way to reliably kill suid/sgid is to do it in + * the userspace filesystem + * + * This should be done on write(), truncate() and chown(). + */ + if (!fc->handle_killpriv) { + int kill; + + /* + * ia_mode calculation may have used stale i_mode. + * Refresh and recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + kill = should_remove_suid(entry); + if (kill & ATTR_KILL_SUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if (kill & ATTR_KILL_SGID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } + } + if (!attr->ia_valid) + return 0; + + ret = fuse_do_setattr(entry, attr, file); + if (!ret) { + /* + * If filesystem supports acls it may have updated acl xattrs in + * the filesystem, so forget cached acls for the inode. + */ + if (fc->posix_acl) + forget_all_cached_acls(inode); + + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); + } + return ret; } static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, @@ -1740,6 +1802,8 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct file_operations fuse_dir_operations = { @@ -1758,6 +1822,8 @@ static const struct inode_operations fuse_common_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3988b43c2f5a..abc66a6237fd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2326,49 +2326,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) return retval; } -static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, - unsigned int nr_segs, size_t bytes, bool to_user) -{ - struct iov_iter ii; - int page_idx = 0; - - if (!bytes) - return 0; - - iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); - - while (iov_iter_count(&ii)) { - struct page *page = pages[page_idx++]; - size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); - void *kaddr; - - kaddr = kmap(page); - - while (todo) { - char __user *uaddr = ii.iov->iov_base + ii.iov_offset; - size_t iov_len = ii.iov->iov_len - ii.iov_offset; - size_t copy = min(todo, iov_len); - size_t left; - - if (!to_user) - left = copy_from_user(kaddr, uaddr, copy); - else - left = copy_to_user(uaddr, kaddr, copy); - - if (unlikely(left)) - return -EFAULT; - - iov_iter_advance(&ii, copy); - todo -= copy; - kaddr += copy; - } - - kunmap(page); - } - - return 0; -} - /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -2520,8 +2477,9 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred; - int err; + size_t in_size, out_size, transferred, c; + int err, i; + struct iov_iter ii; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2603,10 +2561,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, req->in.args[1].size = in_size; req->in.argpages = 1; - err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, - false); - if (err) - goto out; + err = -EFAULT; + iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } } req->out.numargs = 2; @@ -2672,7 +2633,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, if (transferred > inarg.out_size) goto out; - err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + err = -EFAULT; + iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; out: if (req) fuse_put_request(fc, req); @@ -2842,7 +2810,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(inode, &attr, file); + fuse_do_setattr(file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(loff_t off) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6db54d0bd81b..0dfbb136e59a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -23,6 +23,7 @@ #include <linux/poll.h> #include <linux/workqueue.h> #include <linux/kref.h> +#include <linux/xattr.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -36,15 +37,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem - module will check permissions based on the file mode. Otherwise no - permission checking is done in the kernel */ -#define FUSE_DEFAULT_PERMISSIONS (1 << 0) - -/** If the FUSE_ALLOW_OTHER flag is given, then not only the user - doing the mount will be allowed to access the filesystem */ -#define FUSE_ALLOW_OTHER (1 << 1) - /** Number of page pointers embedded in fuse_req */ #define FUSE_REQ_INLINE_PAGES 1 @@ -469,9 +461,6 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; - /** The fuse mount flags for this mount */ - unsigned flags; - /** Maximum read size */ unsigned max_read; @@ -547,6 +536,9 @@ struct fuse_conn { /** allow parallel lookups and readdir (default is serialized) */ unsigned parallel_dirops:1; + /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ + unsigned handle_killpriv:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -624,6 +616,15 @@ struct fuse_conn { /** Is lseek not implemented by fs? */ unsigned no_lseek:1; + /** Does the filesystem support posix acls? */ + unsigned posix_acl:1; + + /** Check permissions based on the file mode or not? */ + unsigned default_permissions:1; + + /** Allow other than the mounter user to access the filesystem ? */ + unsigned allow_other:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -960,7 +961,7 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); -int fuse_do_setattr(struct inode *inode, struct iattr *attr, +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file); void fuse_set_initialized(struct fuse_conn *fc); @@ -968,7 +969,17 @@ void fuse_set_initialized(struct fuse_conn *fc); void fuse_unlock_inode(struct inode *inode); void fuse_lock_inode(struct inode *inode); +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags); +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); +int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler *fuse_acl_xattr_handlers[]; + +struct posix_acl; +struct posix_acl *fuse_get_acl(struct inode *inode, int type); +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1e535f31fed0..17141099f2e7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -20,6 +20,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> +#include <linux/posix_acl.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -66,7 +67,8 @@ struct fuse_mount_data { unsigned rootmode_present:1; unsigned user_id_present:1; unsigned group_id_present:1; - unsigned flags; + unsigned default_permissions:1; + unsigned allow_other:1; unsigned max_read; unsigned blksize; }; @@ -192,7 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, * check in may_delete(). */ fi->orig_i_mode = inode->i_mode; - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; @@ -340,6 +342,7 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return -ENOENT; fuse_invalidate_attr(inode); + forget_all_cached_acls(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) @@ -532,11 +535,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_DEFAULT_PERMISSIONS: - d->flags |= FUSE_DEFAULT_PERMISSIONS; + d->default_permissions = 1; break; case OPT_ALLOW_OTHER: - d->flags |= FUSE_ALLOW_OTHER; + d->allow_other = 1; break; case OPT_MAX_READ: @@ -570,9 +573,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + if (fc->default_permissions) seq_puts(m, ",default_permissions"); - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); @@ -910,8 +913,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->writeback_cache = 1; if (arg->flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; + if (arg->flags & FUSE_HANDLE_KILLPRIV) + fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; + if ((arg->flags & FUSE_POSIX_ACL)) { + fc->default_permissions = 1; + fc->posix_acl = 1; + fc->sb->s_xattr = fuse_acl_xattr_handlers; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -941,7 +951,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1110,7 +1120,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->flags = d.flags; + fc->default_permissions = d.default_permissions; + fc->allow_other = d.allow_other; fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index e22980f0a9e2..3caac46b08b0 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -9,9 +9,10 @@ #include "fuse_i.h" #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> -static int fuse_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -45,8 +46,8 @@ static int fuse_setxattr(struct inode *inode, const char *name, return err; } -static ssize_t fuse_getxattr(struct inode *inode, const char *name, - void *value, size_t size) +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -78,7 +79,7 @@ static ssize_t fuse_getxattr(struct inode *inode, const char *name, } ret = fuse_simple_request(fc, &args); if (!ret && !size) - ret = outarg.size; + ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fc->no_getxattr = 1; ret = -EOPNOTSUPP; @@ -137,7 +138,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) } ret = fuse_simple_request(fc, &args); if (!ret && !size) - ret = outarg.size; + ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { @@ -147,7 +148,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) return ret; } -static int fuse_removexattr(struct inode *inode, const char *name) +int fuse_removexattr(struct inode *inode, const char *name) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -201,3 +202,10 @@ const struct xattr_handler *fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; + +const struct xattr_handler *fuse_acl_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 363ba9e9d8d0..2524807ee070 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -92,17 +92,11 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - - if (error == 0) - acl = NULL; - - if (mode != inode->i_mode) { - inode->i_mode = mode; + if (mode != inode->i_mode) mark_inode_dirty(inode); - } } if (acl) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 82df36886938..5a6f52ea2722 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -187,7 +187,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w ClearPageChecked(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + BIT(BH_Dirty)|BIT(BH_Uptodate)); } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); } @@ -1147,6 +1147,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (!page_has_buffers(page)) return 0; + /* + * From xfs_vm_releasepage: mm accommodates an old ext3 case where + * clean pages might not have had the dirty bit cleared. Thus, it can + * send actual dirty pages to ->releasepage() via shrink_active_list(). + * + * As a workaround, we skip pages that contain dirty buffers below. + * Once ->releasepage isn't called on dirty pages anymore, we can warn + * on dirty buffers like we used to here again. + */ + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); @@ -1156,8 +1166,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd && bd->bd_tr) goto cannot_release; - if (buffer_pinned(bh) || buffer_dirty(bh)) - goto not_possible; + if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh))) + goto cannot_release; bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); @@ -1180,9 +1190,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); -not_possible: /* Should never happen */ - WARN_ON(buffer_dirty(bh)); - WARN_ON(buffer_pinned(bh)); cannot_release: spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6e2bec1cd289..645721f3ff00 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -82,8 +82,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, } if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, - (1 << BH_Uptodate)); + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); bh = page_buffers(page); @@ -690,7 +690,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!dblock); BUG_ON(!new); - bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); + bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5)); ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index fcb59b23f1e3..db8fbeb62483 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -351,7 +351,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc) return hc; - hsize = 1 << ip->i_depth; + hsize = BIT(ip->i_depth); hsize *= sizeof(__be64); if (hsize != i_size_read(&ip->i_inode)) { gfs2_consist_inode(ip); @@ -819,8 +819,8 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; - unsigned hsize = 1 << ip->i_depth; - unsigned index; + unsigned int hsize = BIT(ip->i_depth); + unsigned int index; u64 ln; if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); @@ -932,7 +932,7 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_entries < (1 << 16)); + gfs2_assert(sdp, dip->i_entries < BIT(16)); leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -1041,7 +1041,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) bn = nbh->b_blocknr; /* Compute the start and len of leaf pointers in the hash table. */ - len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { pr_warn("i_depth %u lf_depth %u index %u\n", @@ -1163,7 +1163,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int x; int error = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hsize_bytes = hsize * sizeof(__be64); hc = gfs2_dir_get_hash_table(dip); @@ -1539,7 +1539,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, int error = 0; unsigned depth = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hash = gfs2_dir_offset2hash(ctx->pos); index = hash >> (32 - dip->i_depth); @@ -1558,7 +1558,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, if (error) break; - len = 1 << (dip->i_depth - depth); + len = BIT(dip->i_depth - depth); index = (index & ~(len - 1)) + len; } @@ -2113,7 +2113,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) u64 leaf_no; int error = 0, last; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); lp = gfs2_dir_get_hash_table(dip); if (IS_ERR(lp)) @@ -2126,7 +2126,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) if (error) goto out; leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth)); next_index = (index & ~(len - 1)) + len; last = ((next_index >= hsize) ? 1 : 0); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 320e65e61938..e23ff70b3435 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -395,9 +395,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); - /* Update file times before taking page lock */ - file_update_time(vma->vm_file); - ret = gfs2_rsqa_alloc(ip); if (ret) goto out; @@ -409,6 +406,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_uninit; + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -954,30 +954,6 @@ out_uninit: return ret; } -static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct inode *inode = in->f_mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - inode_lock(inode); - - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - if (ret) { - inode_unlock(inode); - return ret; - } - - gfs2_glock_dq_uninit(&gh); - inode_unlock(inode); - - return generic_file_splice_read(in, ppos, pipe, len, flags); -} - - static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) @@ -1140,7 +1116,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = gfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1168,7 +1144,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = gfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 3a90b2b5b9bb..14cbf60167a7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -69,7 +69,7 @@ static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 -#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, @@ -1781,7 +1781,13 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker); + if (ret) { + destroy_workqueue(gfs2_delete_workqueue); + destroy_workqueue(glock_workqueue); + rhashtable_destroy(&gl_hash_table); + return ret; + } return 0; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9cbd4b6ebff1..f6c4f0058899 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -187,6 +187,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } gfs2_set_iop(inode); + + inode->i_atime.tv_sec = 0; + inode->i_atime.tv_nsec = 0; + unlock_new_inode(inode); } @@ -1932,7 +1936,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) goto out; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 7710dfd3af35..aace8ce34a18 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -85,7 +85,7 @@ static inline int gfs2_check_internal_file_size(struct inode *inode, u64 size = i_size_read(inode); if (size < minsize || size > maxsize) goto err; - if (size & ((1 << inode->i_blkbits) - 1)) + if (size & (BIT(inode->i_blkbits) - 1)) goto err; return 0; err: diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 74fd0139e6c2..67d1fc4668f7 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -145,7 +145,9 @@ static int __init init_gfs2_fs(void) if (!gfs2_qadata_cachep) goto fail; - register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker); + if (error) + goto fail; error = register_filesystem(&gfs2_fs_type); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 950b8be68e41..373639a59782 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -216,23 +216,26 @@ static void gfs2_meta_read_endio(struct bio *bio) static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], int num) { - struct buffer_head *bh = bhs[0]; - struct bio *bio; - int i; - - if (!num) - return; - - bio = bio_alloc(GFP_NOIO, num); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - for (i = 0; i < num; i++) { - bh = bhs[i]; - bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + while (num > 0) { + struct buffer_head *bh = *bhs; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + while (num > 0) { + bh = *bhs; + if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + BUG_ON(bio->bi_iter.bi_size == 0); + break; + } + bhs++; + num--; + } + bio->bi_end_io = gfs2_meta_read_endio; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); } - bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); - submit_bio(bio); } /** diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ef1e1822977f..ff72ac6439c8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -58,7 +58,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; gt->gt_new_files_jdata = 0; - gt->gt_max_readahead = 1 << 18; + gt->gt_max_readahead = BIT(18); gt->gt_complain_secs = 10; } @@ -284,7 +284,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - @@ -302,7 +302,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) /* Compute maximum reservation required to add a entry to a directory */ - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH), sdp->sd_jbsize); ind_blocks = 0; @@ -1089,7 +1089,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 77930ca25303..8af2dfa09236 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,7 +75,7 @@ #include "util.h" #define GFS2_QD_HASH_SHIFT 12 -#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ @@ -384,7 +384,7 @@ static int bh_get(struct gfs2_quota_data *qd) block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; - bh_map.b_size = 1 << ip->i_inode.i_blkbits; + bh_map.b_size = BIT(ip->i_inode.i_blkbits); error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3a7e60bb39f8..e3ee387a6dfe 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -359,7 +359,7 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u64 size = i_size_read(jd->jd_inode); - if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30))) return -EIO; jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a181bd709b1e..ed373261f26d 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -606,7 +606,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = inode_change_ok(inode, attr); /* basic permission checks */ + error = setattr_prepare(dentry, attr); /* basic permission checks */ if (error) return error; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4a7c9241213d..10827c912c4d 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -245,7 +245,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index ab7ea2506b4d..9b92058a1240 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -65,8 +65,8 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, case ACL_TYPE_ACCESS: xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - if (err < 0) + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) return err; } err = 0; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 90e46cd752fe..44aa96ba1df8 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -812,7 +812,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) int fd = HOSTFS_I(inode)->fd; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index d3bcdd975700..b3be1b5a62e2 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -189,6 +189,11 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, hpfs_get_block); } +static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block); +} + const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, @@ -214,4 +219,5 @@ const struct file_operations hpfs_file_ops = const struct inode_operations hpfs_file_iops = { .setattr = hpfs_setattr, + .fiemap = hpfs_fiemap, }; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 1f3c6d76200b..b9c724ed1e7e 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -273,7 +273,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) goto out_unlock; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4ea71eba40a5..2c0c3a017a6a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -416,7 +416,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - bool rsv_on_error; u32 hash; /* @@ -458,18 +457,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * cache (remove_huge_page) BEFORE removing the * region/reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the - * region/reserve map could fail. Before free'ing - * the page, note PagePrivate which is used in case - * of error. + * region/reserve map could fail. Correspondingly, + * the subpool and global reserve usage count can need + * to be adjusted. */ - rsv_on_error = !PagePrivate(page); + VM_BUG_ON(PagePrivate(page)); remove_huge_page(page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, next, next + 1, 1))) - hugetlb_fix_reserve_counts(inode, - rsv_on_error); + hugetlb_fix_reserve_counts(inode); } unlock_page(page); @@ -672,7 +670,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) BUG_ON(!inode); - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/inode.c b/fs/inode.c index 705f8609fdb8..7d037591259d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1023,13 +1023,17 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data); spin_unlock(&inode_hash_lock); if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } return inode; } @@ -1066,6 +1070,10 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, destroy_inode(inode); inode = old; wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } } return inode; @@ -1093,12 +1101,16 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } return inode; } @@ -1133,6 +1145,10 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) destroy_inode(inode); inode = old; wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } } return inode; } @@ -1268,10 +1284,16 @@ EXPORT_SYMBOL(ilookup5_nowait); struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct inode *inode = ilookup5_nowait(sb, hashval, test, data); - - if (inode) + struct inode *inode; +again: + inode = ilookup5_nowait(sb, hashval, test, data); + if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } return inode; } EXPORT_SYMBOL(ilookup5); @@ -1288,13 +1310,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); - if (inode) + if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } return inode; } EXPORT_SYMBOL(ilookup); @@ -1538,16 +1565,36 @@ sector_t bmap(struct inode *inode, sector_t block) EXPORT_SYMBOL(bmap); /* + * Update times in overlayed inode from underlying real inode + */ +static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, + bool rcu) +{ + if (!rcu) { + struct inode *realinode = d_real_inode(dentry); + + if (unlikely(inode != realinode) && + (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + } + } +} + +/* * With relative atime, only update atime if the previous atime is * earlier than either the ctime or mtime or if at least a day has * passed since the last atime update. */ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, - struct timespec now) +static int relatime_need_update(const struct path *path, struct inode *inode, + struct timespec now, bool rcu) { - if (!(mnt->mnt_flags & MNT_RELATIME)) + if (!(path->mnt->mnt_flags & MNT_RELATIME)) return 1; + + update_ovl_inode_times(path->dentry, inode, rcu); /* * Is mtime younger than atime? If yes, update atime: */ @@ -1614,7 +1661,8 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -bool atime_needs_update(const struct path *path, struct inode *inode) +bool __atime_needs_update(const struct path *path, struct inode *inode, + bool rcu) { struct vfsmount *mnt = path->mnt; struct timespec now; @@ -1640,7 +1688,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) now = current_fs_time(inode->i_sb); - if (!relatime_need_update(mnt, inode, now)) + if (!relatime_need_update(path, inode, now, rcu)) return false; if (timespec_equal(&inode->i_atime, &now)) @@ -1655,7 +1703,7 @@ void touch_atime(const struct path *path) struct inode *inode = d_inode(path->dentry); struct timespec now; - if (!atime_needs_update(path, inode)) + if (!__atime_needs_update(path, inode, false)) return; if (!sb_start_write_trylock(inode->i_sb)) diff --git a/fs/internal.h b/fs/internal.h index ba0737649d4a..f4da3341b4a3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -12,6 +12,7 @@ struct super_block; struct file_system_type; struct iomap; +struct iomap_ops; struct linux_binprm; struct path; struct mount; @@ -120,6 +121,15 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); +extern bool __atime_needs_update(const struct path *, struct inode *, bool); +static inline bool atime_needs_update_rcu(const struct path *path, + struct inode *inode) +{ + return __atime_needs_update(path, inode, true); +} + +extern bool atime_needs_update_rcu(const struct path *, struct inode *); + /* * fs-writeback.c */ @@ -156,7 +166,7 @@ extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ -extern struct dentry_operations ns_dentry_operations; +extern const struct dentry_operations ns_dentry_operations; /* * fs/ioctl.c @@ -164,3 +174,13 @@ extern struct dentry_operations ns_dentry_operations; extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* + * iomap support: + */ +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap_ops *ops, void *data, + iomap_actor_t actor); diff --git a/fs/iomap.c b/fs/iomap.c index 706270f21b35..013d1d36fbbf 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -27,9 +27,6 @@ #include <linux/dax.h> #include "internal.h" -typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); - /* * Execute a iomap write on a segment of the mapping that spans a * contiguous range of pages that have identical block mapping state. @@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, * resources they require in the iomap_begin call, and release them in the * iomap_end call. */ -static loff_t +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor) { @@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static struct page * +__iomap_read_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static loff_t +iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + long status = 0; + ssize_t written = 0; + + do { + struct page *page, *rpage; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) + return PTR_ERR(rpage); + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, + &page, iomap); + put_page(rpage); + if (unlikely(status)) + return status; + + WARN_ON_ONCE(!PageUptodate(page)); + + status = iomap_write_end(inode, pos, bytes, bytes, page); + if (unlikely(status <= 0)) { + if (WARN_ON_ONCE(status == 0)) + return -EIO; + return status; + } + + cond_resched(); + + pos += status; + written += status; + length -= status; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (length); + + return written; +} + +int +iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len) { + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, + iomap_dirty_actor); + if (ret <= 0) + return ret; + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_file_dirty); + static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { @@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; + if (iomap->flags & IOMAP_F_SHARED) + flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 46261a6f902d..927da4956a89 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1090,11 +1090,15 @@ static void jbd2_stats_proc_exit(journal_t *journal) * very few fields yet: that has to wait until we have created the * journal structures from from scratch, or loaded them from disk. */ -static journal_t * journal_init_common (void) +static journal_t *journal_init_common(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) { static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; + struct buffer_head *bh; + int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) @@ -1131,6 +1135,32 @@ static journal_t * journal_init_common (void) lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", &jbd2_trans_commit_key, 0); + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!journal->j_wbuf) { + kfree(journal); + return NULL; + } + + bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + pr_err("%s: Cannot get buffer for journal superblock\n", + __func__); + kfree(journal->j_wbuf); + kfree(journal); + return NULL; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + return journal; } @@ -1157,51 +1187,21 @@ static journal_t * journal_init_common (void) * range of blocks on an arbitrary block device. * */ -journal_t * jbd2_journal_init_dev(struct block_device *bdev, +journal_t *jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { - journal_t *journal = journal_init_common(); - struct buffer_head *bh; - int n; + journal_t *journal; + journal = journal_init_common(bdev, fs_dev, start, len, blocksize); if (!journal) return NULL; - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; bdevname(journal->j_dev, journal->j_devname); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; -out_err: - kfree(journal->j_wbuf); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; } /** @@ -1212,67 +1212,36 @@ out_err: * the journal. The inode must exist already, must support bmap() and * must have all data blocks preallocated. */ -journal_t * jbd2_journal_init_inode (struct inode *inode) +journal_t *jbd2_journal_init_inode(struct inode *inode) { - struct buffer_head *bh; - journal_t *journal = journal_init_common(); + journal_t *journal; char *p; - int err; - int n; unsigned long long blocknr; + blocknr = bmap(inode, 0); + if (!blocknr) { + pr_err("%s: Cannot locate journal superblock\n", + __func__); + return NULL; + } + + jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, + blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, + inode->i_sb->s_blocksize); if (!journal) return NULL; - journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; bdevname(journal->j_dev, journal->j_devname); p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); - jbd_debug(1, - "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, - (long long) inode->i_size, - inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - - journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; - journal->j_blocksize = inode->i_sb->s_blocksize; jbd2_stats_proc_init(journal); - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - err = jbd2_journal_bmap(journal, 0, &blocknr); - /* If that failed, give up */ - if (err) { - printk(KERN_ERR "%s: Cannot locate journal superblock\n", - __func__); - goto out_err; - } - - bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - return journal; -out_err: - kfree(journal->j_wbuf); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b5bc3e249163..3d8246a9faa4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -159,6 +159,7 @@ static void wait_transaction_locked(journal_t *journal) read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); + jbd2_might_wait_for_commit(journal); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } @@ -182,8 +183,6 @@ static int add_transaction_credits(journal_t *journal, int blocks, int needed; int total = blocks + rsv_blocks; - jbd2_might_wait_for_commit(journal); - /* * If the current transaction is locked down for commit, wait * for the lock to be released. @@ -214,6 +213,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (atomic_read(&journal->j_reserved_credits) + total > journal->j_max_transaction_buffers) { read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); @@ -238,6 +238,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) __jbd2_log_wait_for_space(journal); @@ -255,6 +256,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index bc2693d56298..2a0f2a1044c1 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -233,9 +233,10 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); - if (rc < 0) + umode_t mode; + + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) return rc; if (inode->i_mode != mode) { struct iattr attr; @@ -247,8 +248,6 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (rc < 0) return rc; } - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index ae2ebb26b446..3773b24b4db0 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -193,7 +193,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int rc; - rc = inode_change_ok(inode, iattr); + rc = setattr_prepare(dentry, iattr); if (rc) return rc; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 21fa92ba2c19..3a1e1554a4e3 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -78,13 +78,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - if (rc < 0) + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) return rc; inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jfs/file.c b/fs/jfs/file.c index f6eb0417a909..739492c7a3fd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -103,7 +103,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int rc; - rc = inode_change_ok(inode, iattr); + rc = setattr_prepare(dentry, iattr); if (rc) return rc; diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 2e58978d6f45..4d973524c887 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2893,8 +2893,7 @@ restart: * on anon_list2. Let's check. */ if (!list_empty(&TxAnchor.anon_list2)) { - list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); - INIT_LIST_HEAD(&TxAnchor.anon_list2); + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); goto restart; } TXN_UNLOCK(); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 90b3bc21e9b0..bd9b641ada2c 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -379,8 +379,14 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * cached in meta-data cache, and not written out * by txCommit(); */ - filemap_fdatawait(ipbmap->i_mapping); - filemap_write_and_wait(ipbmap->i_mapping); + rc = filemap_fdatawait(ipbmap->i_mapping); + if (rc) + goto error_out; + + rc = filemap_write_and_wait(ipbmap->i_mapping); + if (rc) + goto error_out; + diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 2b735ce0268a..102b6f0bc7af 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) return -EINVAL; mutex_lock(&kernfs_mutex); - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) goto out; diff --git a/fs/libfs.c b/fs/libfs.c index b322d756b20d..a6d89f151771 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -395,7 +395,7 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) return error; diff --git a/fs/locks.c b/fs/locks.c index ee1b15f6fc13..8cc218d7a039 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -127,7 +127,6 @@ #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> -#include <linux/lglock.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> @@ -139,6 +138,11 @@ #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +static inline bool is_remote_lock(struct file *filp) +{ + return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); +} + static bool lease_breaking(struct file_lock *fl) { return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); @@ -158,12 +162,18 @@ int lease_break_time = 45; /* * The global file_lock_list is only used for displaying /proc/locks, so we - * keep a list on each CPU, with each list protected by its own spinlock via - * the file_lock_lglock. Note that alterations to the list also require that - * the relevant flc_lock is held. + * keep a list on each CPU, with each list protected by its own spinlock. + * Global serialization is done using file_rwsem. + * + * Note that alterations to the list also require that the relevant flc_lock is + * held. */ -DEFINE_STATIC_LGLOCK(file_lock_lglock); -static DEFINE_PER_CPU(struct hlist_head, file_lock_list); +struct file_lock_list_struct { + spinlock_t lock; + struct hlist_head hlist; +}; +static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); +DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. @@ -587,15 +597,23 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { - lg_local_lock(&file_lock_lglock); + struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); + + percpu_rwsem_assert_held(&file_rwsem); + + spin_lock(&fll->lock); fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); - lg_local_unlock(&file_lock_lglock); + hlist_add_head(&fl->fl_link, &fll->hlist); + spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock *fl) { + struct file_lock_list_struct *fll; + + percpu_rwsem_assert_held(&file_rwsem); + /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list @@ -603,9 +621,11 @@ static void locks_delete_global_locks(struct file_lock *fl) */ if (hlist_unhashed(&fl->fl_link)) return; - lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + + fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + spin_lock(&fll->lock); hlist_del_init(&fl->fl_link); - lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); + spin_unlock(&fll->lock); } static unsigned long @@ -791,7 +811,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -915,6 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -955,6 +976,7 @@ find_conflict: out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -991,6 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1162,6 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); /* * Free any unused locks. */ @@ -1192,7 +1216,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(file_inode(filp), fl, conflock); + return posix_lock_inode(locks_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1232,7 +1256,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) int locks_mandatory_locked(struct file *file) { int ret; - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct file_lock_context *ctx; struct file_lock *fl; @@ -1436,6 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1487,9 +1512,13 @@ restart: locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); + + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1506,6 +1535,7 @@ restart: } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1572,7 +1602,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1580,7 +1610,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - time_out_leases(file_inode(filp), &dispose); + time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file != filp) continue; @@ -1613,7 +1643,8 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + if ((arg == F_RDLCK) && + (atomic_read(&d_real_inode(dentry)->i_writecount) > 0)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || @@ -1628,7 +1659,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = file_inode(filp); + struct inode *inode = dentry->d_inode; struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1660,6 +1691,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1730,6 +1762,7 @@ out_setup: lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1742,7 +1775,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1752,6 +1785,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1764,6 +1798,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); return error; } @@ -1782,7 +1817,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1830,7 +1865,7 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { - if (filp->f_op->setlease) + if (filp->f_op->setlease && is_remote_lock(filp)) return filp->f_op->setlease(filp, arg, lease, priv); else return generic_setlease(filp, arg, lease, priv); @@ -1979,7 +2014,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (error) goto out_free; - if (f.file->f_op->flock) + if (f.file->f_op->flock && is_remote_lock(f.file)) error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, lock); @@ -2005,7 +2040,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; @@ -2129,7 +2164,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); @@ -2191,7 +2226,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - inode = file_inode(filp); + inode = locks_inode(filp); /* * This might block, so we do it before checking the inode. @@ -2343,7 +2378,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); + inode = locks_inode(filp); /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -2426,6 +2461,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; + struct inode *inode = locks_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2434,7 +2470,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2452,7 +2488,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); - trace_locks_remove_posix(file_inode(filp), &lock, error); + trace_locks_remove_posix(inode, &lock, error); } EXPORT_SYMBOL(locks_remove_posix); @@ -2469,12 +2505,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); if (list_empty(&flctx->flc_flock)) return; - if (filp->f_op->flock) + if (filp->f_op->flock && is_remote_lock(filp)) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); @@ -2508,7 +2544,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); if (!ctx) return; @@ -2552,7 +2588,7 @@ EXPORT_SYMBOL(posix_unblock_lock); */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } @@ -2574,13 +2610,24 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; - if (fl->fl_nspid) - fl_pid = pid_vnr(fl->fl_nspid); - else + if (fl->fl_nspid) { + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; + + /* Don't let fl_pid change based on who is reading the file */ + fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); + + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; + } else fl_pid = fl->fl_pid; if (fl->fl_file != NULL) - inode = file_inode(fl->fl_file); + inode = locks_inode(fl->fl_file); seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { @@ -2648,9 +2695,13 @@ static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; fl = hlist_entry(v, struct file_lock, fl_link); + if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + return 0; + lock_get_status(f, fl, iter->li_pos, ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) @@ -2682,7 +2733,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int id = 0; @@ -2703,9 +2754,9 @@ static void *locks_start(struct seq_file *f, loff_t *pos) struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; - lg_global_lock(&file_lock_lglock); + percpu_down_write(&file_rwsem); spin_lock(&blocked_lock_lock); - return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); + return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) @@ -2713,14 +2764,14 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) struct locks_iterator *iter = f->private; ++iter->li_pos; - return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); + return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); - lg_global_unlock(&file_lock_lglock); + percpu_up_write(&file_rwsem); } static const struct seq_operations locks_seq_operations = { @@ -2761,10 +2812,13 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - lg_lock_init(&file_lock_lglock, "file_lock_lglock"); - for_each_possible_cpu(i) - INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + for_each_possible_cpu(i) { + struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); + + spin_lock_init(&fll->lock); + INIT_HLIST_HEAD(&fll->hlist); + } return 0; } diff --git a/fs/logfs/file.c b/fs/logfs/file.c index f01ddfb1a03b..5d9fe466bbc9 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -244,7 +244,7 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int err = 0; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; diff --git a/fs/mbcache.c b/fs/mbcache.c index eccda3a02de6..c5bd19ffa326 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -366,7 +366,11 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; - register_shrinker(&cache->c_shrink); + if (register_shrinker(&cache->c_shrink)) { + kfree(cache->c_hash); + kfree(cache); + goto err_out; + } INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); diff --git a/fs/minix/file.c b/fs/minix/file.c index 94f0eb9a6e2c..a6a4797aa0d4 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -26,7 +26,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/mount.h b/fs/mount.h index 14db05d424f7..d2e25d7b64b3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,9 +10,12 @@ struct mnt_namespace { struct mount * root; struct list_head list; struct user_namespace *user_ns; + struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; + unsigned int mounts; /* # of mounts in the namespace */ + unsigned int pending_mounts; }; struct mnt_pcp { diff --git a/fs/namei.c b/fs/namei.c index adb04146df09..4bbcae1ba58e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1015,7 +1015,7 @@ const char *get_link(struct nameidata *nd) if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { + } else if (atime_needs_update_rcu(&last->link, inode)) { if (unlikely(unlazy_walk(nd, NULL, 0))) return ERR_PTR(-ECHILD); touch_atime(&last->link); diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..58aca9c931ac 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,9 @@ #include "pnode.h" #include "internal.h" +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; @@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_mnt_namespace(p->mnt_ns); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse) return 0; } +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct path *parent_path) { HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mount *child, *p; struct hlist_node *n; int err; + /* Is there space to add these mounts to the mount namespace? */ + if (!parent_path) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: + ns->pending_mounts = 0; return err; } @@ -2700,7 +2743,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME); + MS_STRICTATIME | MS_NOREMOTELOCK); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, @@ -2719,9 +2762,20 @@ dput_out: return retval; } +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); +} + +static void dec_mnt_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); +} + static void free_mnt_ns(struct mnt_namespace *ns) { ns_free_inum(&ns->ns); + dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } @@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + struct ucounts *ucounts; int ret; + ucounts = inc_mnt_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); - if (!new_ns) + if (!new_ns) { + dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); + dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } new_ns->ns.ops = &mntns_operations; @@ -2756,6 +2818,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->mounts = 0; + new_ns->pending_mounts = 0; return new_ns; } @@ -2805,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = new; while (p) { q->mnt_ns = new_ns; + new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -2843,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; + new_ns->mounts++; list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); @@ -3348,10 +3415,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *mntns_owner(struct ns_common *ns) +{ + return to_mnt_ns(ns)->user_ns; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .owner = mntns_owner, }; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 1af15fcbe57b..f6cf4c7e92b1 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -884,7 +884,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) /* ageing the dentry to force validation */ ncp_age_dentry(server, dentry); - result = inode_change_ok(inode, attr); + result = setattr_prepare(dentry, attr); if (result < 0) goto out; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index ca699ddc11c1..2efbdde36c3e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -182,29 +182,6 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(nfs_file_read); -ssize_t -nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - struct inode *inode = file_inode(filp); - ssize_t res; - - dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", - filp, (unsigned long) count, (unsigned long long) *ppos); - - nfs_start_io_read(inode); - res = nfs_revalidate_mapping(inode, filp->f_mapping); - if (!res) { - res = generic_file_splice_read(filp, ppos, pipe, count, flags); - if (res > 0) - nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); - } - nfs_end_io_read(inode); - return res; -} -EXPORT_SYMBOL_GPL(nfs_file_splice_read); - int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -871,7 +848,7 @@ const struct file_operations nfs_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = nfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74935a19e4bf..4b308a1487a5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -365,8 +365,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); -ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, - size_t, unsigned int); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); @@ -681,11 +679,11 @@ unsigned int nfs_page_length(struct page *page) loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { - pgoff_t page_index = page_file_index(page); + pgoff_t index = page_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; - if (page_index < end_index) + if (index < end_index) return PAGE_SIZE; - if (page_index == end_index) + if (index == end_index) return ((i_size - 1) & ~PAGE_MASK) + 1; } return 0; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d085ad794884..89a77950e0b0 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -248,7 +248,7 @@ const struct file_operations nfs4_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = nfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 174dd4cf5747..965db474f4b0 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * update_nfs_request below if the region is not locked. */ req->wb_page = page; if (page) { - req->wb_index = page_file_index(page); + req->wb_index = page_index(page); get_page(page); } req->wb_offset = offset; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 572e5b3b06f1..defc9233e985 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page) int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_SIZE, page_file_index(page)); + page, PAGE_SIZE, page_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3a6724c6eb5f..53211838f72a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -151,7 +151,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = (i_size - 1) >> PAGE_SHIFT; - if (i_size > 0 && page_file_index(page) < end_index) + if (i_size > 0 && page_index(page) < end_index) goto out; end = page_file_offset(page) + ((loff_t)offset+count); if (i_size >= end) @@ -603,7 +603,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, { int ret; - nfs_pageio_cond_complete(pgio, page_file_index(page)); + nfs_pageio_cond_complete(pgio, page_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, launder); if (ret == -EAGAIN) { diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 9d46a0bdd9f9..62469c60be23 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) goto oom; for (i = 0; i < rqgi->ngroups; i++) { - if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) - GROUP_AT(gi, i) = exp->ex_anon_gid; + if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i])) + gi->gid[i] = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + gi->gid[i] = rqgi->gid[i]; } } else { gi = get_group_info(rqgi); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a204d7e109d4..39bfaba9c99c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1903,7 +1903,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) if (g1->ngroups != g2->ngroups) return false; for (i=0; i<g1->ngroups; i++) - if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) + if (!gid_eq(g1->gid[i], g2->gid[i])) return false; return true; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index e9214768cde9..08188743db53 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -74,10 +74,10 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, * which only requires access, and "set-[ac]time-to-X" which * requires ownership. * So if it looks like it might be "set both to the same time which - * is close to now", and if inode_change_ok fails, then we + * is close to now", and if setattr_prepare fails, then we * convert to "set to now" instead of "set to explicit time" * - * We only call inode_change_ok as the last test as technically + * We only call setattr_prepare as the last test as technically * it is not an interface that we should be using. */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) @@ -92,17 +92,15 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, * request is. We require it be within 30 minutes of now. */ time_t delta = iap->ia_atime.tv_sec - get_seconds(); - struct inode *inode; nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (nfserr) goto done; - inode = d_inode(fhp->fh_dentry); if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - inode_change_ok(inode, iap) != 0) { + setattr_prepare(fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. * This will cause notify_change to set these times diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index af04f553d7c9..402c325e0467 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -829,7 +829,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) struct super_block *sb = inode->i_sb; int err; - err = inode_change_ok(inode, iattr); + err = setattr_prepare(dentry, iattr); if (err) return err; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a64313868d3a..7ebfca6a1427 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -49,12 +49,12 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; * enough to fit in "count". Return an error pointer if the count * is not large enough. * - * Called with the group->notification_mutex held. + * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -64,7 +64,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (FAN_EVENT_METADATA_LEN > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the + /* held the notification_lock the whole time, so this is the * same event we peeked above */ return fsnotify_remove_first_event(group); } @@ -147,7 +147,7 @@ static struct fanotify_perm_event_info *dequeue_event( { struct fanotify_perm_event_info *event, *return_e = NULL; - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_for_each_entry(event, &group->fanotify_data.access_list, fae.fse.list) { if (event->fd != fd) @@ -157,7 +157,7 @@ static struct fanotify_perm_event_info *dequeue_event( return_e = event; break; } - spin_unlock(&group->fanotify_data.access_lock); + spin_unlock(&group->notification_lock); pr_debug("%s: found return_re=%p\n", __func__, return_e); @@ -244,10 +244,10 @@ static unsigned int fanotify_poll(struct file *file, poll_table *wait) int ret = 0; poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } @@ -268,9 +268,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); @@ -309,10 +309,10 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, wake_up(&group->fanotify_data.access_waitq); break; } - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_add_tail(&kevent->list, &group->fanotify_data.access_list); - spin_unlock(&group->fanotify_data.access_lock); + spin_unlock(&group->notification_lock); #endif } buf += ret; @@ -371,7 +371,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) * Process all permission events on access_list and notification queue * and simulate reply from userspace. */ - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -380,22 +380,22 @@ static int fanotify_release(struct inode *ignored, struct file *file) list_del_init(&event->fae.fse.list); event->response = FAN_ALLOW; } - spin_unlock(&group->fanotify_data.access_lock); /* * Destroy all non-permission events. For permission events just * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. */ - mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { + spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); - else + spin_lock(&group->notification_lock); + } else FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); @@ -421,10 +421,10 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } @@ -765,7 +765,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); #endif diff --git a/fs/notify/group.c b/fs/notify/group.c index b47f7cfdcaa4..fbe3cbebec16 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -45,9 +45,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_group_stop_queueing(struct fsnotify_group *group) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); group->shutdown = true; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); } /* @@ -125,7 +125,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) atomic_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); - mutex_init(&group->notification_mutex); + spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); init_waitqueue_head(&group->notification_waitq); group->max_events = UINT_MAX; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b8d08d0d0a4d..69d1ea3d292a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -115,10 +115,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) int ret = 0; poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } @@ -138,7 +138,7 @@ static int round_event_name_len(struct fsnotify_event *fsn_event) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the group->notification_mutex held. + * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) @@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the + /* held the notification_lock the whole time, so this is the * same event we peeked above */ fsnotify_remove_first_event(group); @@ -234,9 +234,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); @@ -300,13 +300,13 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) { send_len += sizeof(struct inotify_event); send_len += round_event_name_len(fsn_event); } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index e455e83ceeeb..66f85c651c52 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list) ? true : false; } @@ -73,8 +73,17 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - /* If the event is still queued, we have a problem... */ - WARN_ON(!list_empty(&event->list)); + /* + * If the event is still queued, we have a problem... Do an unreliable + * lockless check first to avoid locking in the common case. The + * locking may be necessary for permission events which got removed + * from the list by a different CPU than the one freeing the event. + */ + if (!list_empty(&event->list)) { + spin_lock(&group->notification_lock); + WARN_ON(!list_empty(&event->list)); + spin_unlock(&group->notification_lock); + } group->ops->free_event(event); } @@ -95,10 +104,10 @@ int fsnotify_add_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (group->shutdown) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return 2; } @@ -106,7 +115,7 @@ int fsnotify_add_event(struct fsnotify_group *group, ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } event = group->overflow_event; @@ -116,7 +125,7 @@ int fsnotify_add_event(struct fsnotify_group *group, if (!list_empty(list) && merge) { ret = merge(list, event); if (ret) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } } @@ -124,7 +133,7 @@ int fsnotify_add_event(struct fsnotify_group *group, queue: group->q_len++; list_add_tail(&event->list, list); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); @@ -139,7 +148,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p\n", __func__, group); @@ -161,7 +170,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); return list_first_entry(&group->notification_list, struct fsnotify_event, list); @@ -175,12 +184,14 @@ void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); + spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, event); + spin_lock(&group->notification_lock); } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); } /* diff --git a/fs/nsfs.c b/fs/nsfs.c index 8f20d6016e20..30bb10034120 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -5,11 +5,16 @@ #include <linux/magic.h> #include <linux/ktime.h> #include <linux/seq_file.h> +#include <linux/user_namespace.h> +#include <linux/nsfs.h> static struct vfsmount *nsfs_mnt; +static long ns_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg); static const struct file_operations ns_file_operations = { .llseek = no_llseek, + .unlocked_ioctl = ns_ioctl, }; static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) @@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -void *ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops) +static void *__ns_get_path(struct path *path, struct ns_common *ns) { - struct vfsmount *mnt = mntget(nsfs_mnt); + struct vfsmount *mnt = nsfs_mnt; struct qstr qname = { .name = "", }; struct dentry *dentry; struct inode *inode; - struct ns_common *ns; unsigned long d; -again: - ns = ns_ops->get(task); - if (!ns) { - mntput(mnt); - return ERR_PTR(-ENOENT); - } rcu_read_lock(); d = atomic_long_read(&ns->stashed); if (!d) @@ -68,17 +65,16 @@ again: if (!lockref_get_not_dead(&dentry->d_lockref)) goto slow; rcu_read_unlock(); - ns_ops->put(ns); + ns->ops->put(ns); got_it: - path->mnt = mnt; + path->mnt = mntget(mnt); path->dentry = dentry; return NULL; slow: rcu_read_unlock(); inode = new_inode_pseudo(mnt->mnt_sb); if (!inode) { - ns_ops->put(ns); - mntput(mnt); + ns->ops->put(ns); return ERR_PTR(-ENOMEM); } inode->i_ino = ns->inum; @@ -91,21 +87,96 @@ slow: dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); if (!dentry) { iput(inode); - mntput(mnt); return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); - dentry->d_fsdata = (void *)ns_ops; + dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { d_delete(dentry); /* make sure ->d_prune() does nothing */ dput(dentry); cpu_relax(); - goto again; + return ERR_PTR(-EAGAIN); } goto got_it; } +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + void *ret; + +again: + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + ret = __ns_get_path(path, ns); + if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN) + goto again; + return ret; +} + +static int open_related_ns(struct ns_common *ns, + struct ns_common *(*get_ns)(struct ns_common *ns)) +{ + struct path path = {}; + struct file *f; + void *err; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + while (1) { + struct ns_common *relative; + + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } + + err = __ns_get_path(&path, relative); + if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN) + continue; + break; + } + if (IS_ERR(err)) { + put_unused_fd(fd); + return PTR_ERR(err); + } + + f = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else + fd_install(fd, f); + + return fd; +} + +static long ns_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct ns_common *ns = get_proc_ns(file_inode(filp)); + + switch (ioctl) { + case NS_GET_USERNS: + return open_related_ns(ns, ns_get_owner); + case NS_GET_PARENT: + if (!ns->ops->get_parent) + return -EINVAL; + return open_related_ns(ns, ns->ops->get_parent); + default: + return -ENOTTY; + } +} + int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops) { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f548629dfaac..bf72a2c58b75 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1850,7 +1850,7 @@ again: * pages being swapped out between us bringing them into memory * and doing the actual copying. */ - if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) { + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index e01287c964a8..9d7a44872df5 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2893,7 +2893,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) int err; unsigned int ia_valid = attr->ia_valid; - err = inode_change_ok(vi, attr); + err = setattr_prepare(dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 2162434728c0..164307b99405 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -241,13 +241,11 @@ int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; + umode_t mode; - if (ret == 0) - acl = NULL; + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + return ret; ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 98d36548153d..bbb4b3e5b4ff 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1842,6 +1842,16 @@ out_commit: ocfs2_commit_trans(osb, handle); out: + /* + * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(), + * even in case of error here like ENOSPC and ENOMEM. So, we need + * to unlock the target page manually to prevent deadlocks when + * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED + * to VM code. + */ + if (wc->w_target_locked) + unlock_page(mmap_page); + ocfs2_free_write_ctxt(inode, wc); if (data_ac) { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1d67fcbf7160..8abab16b4602 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2104,7 +2104,7 @@ int o2net_start_listening(struct o2nm_node *node) BUG_ON(o2net_listen_sock != NULL); mlog(ML_KTHREAD, "starting o2net thread...\n"); - o2net_wq = create_singlethread_workqueue("o2net"); + o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM); if (o2net_wq == NULL) { mlog(ML_ERROR, "unable to launch o2net thread\n"); return -ENOMEM; /* ? */ diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 533bd524e41e..733e4e79c8e2 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1904,7 +1904,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) } snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = create_singlethread_workqueue(wq_name); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index ef474cdd6404..a0c3e03f9cd3 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -211,7 +211,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -646,7 +646,7 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = create_singlethread_workqueue("user_dlm"); + user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 63316db763da..ba5c177d0ed6 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1155,7 +1155,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = inode_change_ok(inode, attr); + status = setattr_prepare(dentry, attr); if (status) return status; @@ -2321,36 +2321,6 @@ out_mutex: return ret; } -static ssize_t ocfs2_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - int ret = 0, lock_level = 0; - struct inode *inode = file_inode(in); - - trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - in->f_path.dentry->d_name.len, - in->f_path.dentry->d_name.name, len); - - /* - * See the comment in ocfs2_file_read_iter() - */ - ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); - if (ret < 0) { - mlog_errno(ret); - goto bail; - } - ocfs2_inode_unlock(inode, lock_level); - - ret = generic_file_splice_read(in, ppos, pipe, len, flags); - -bail: - return ret; -} - static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -2506,7 +2476,7 @@ const struct file_operations ocfs2_fops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; @@ -2551,7 +2521,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 50cc55047443..5af68fcdf9d3 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -123,8 +123,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) -extern struct kmem_cache *ocfs2_inode_cache; - extern const struct address_space_operations ocfs2_aops; extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index f8f5fc5e6c05..0b58abcf1c6d 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1314,8 +1314,6 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); - DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 603b28d6f008..f56fe39fab04 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2329,7 +2329,7 @@ static int ocfs2_initialize_super(struct super_block *sb, } cleancache_init_shared_fs(sb); - osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d9e26cfbb793..bf83e6644333 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -349,7 +349,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/open.c b/fs/open.c index 4fd6e256f4f4..8aeb08bb278b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,6 +68,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; + struct dentry *upperdentry; long error; inode = path->dentry->d_inode; @@ -90,7 +91,17 @@ long vfs_truncate(const struct path *path, loff_t length) if (IS_APPEND(inode)) goto mnt_drop_write_and_out; - error = get_write_access(inode); + /* + * If this is an overlayfs then do as if opening the file so we get + * write access on the upper inode, not on the overlay inode. For + * non-overlay filesystems d_real() is an identity function. + */ + upperdentry = d_real(path->dentry, NULL, O_WRONLY); + error = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto mnt_drop_write_and_out; + + error = get_write_access(upperdentry->d_inode); if (error) goto mnt_drop_write_and_out; @@ -109,7 +120,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: - put_write_access(inode); + put_write_access(upperdentry->d_inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: @@ -726,7 +737,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(inode, f->f_flags); + error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 28f2195cd798..7a3754488312 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -73,14 +73,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - /* - * can we represent this with the traditional file - * mode permission bits? - */ - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) { - gossip_err("%s: posix_acl_equiv_mode err: %d\n", + umode_t mode; + + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) { + gossip_err("%s: posix_acl_update_mode err: %d\n", __func__, error); return error; @@ -90,8 +87,6 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) SetModeFlag(orangefs_inode); inode->i_mode = mode; mark_inode_dirty_sync(inode); - if (error == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 00235bf644dc..1e8fe844e69f 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) } } - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ret = 1; out_release_op: op_release(new_op); diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index a287a66d94e3..516ffb4dc9a0 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -11,14 +11,19 @@ #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" #include "orangefs-bufmap.h" +#include "orangefs-debugfs.h" #include <linux/debugfs.h> #include <linux/slab.h> /* this file implements the /dev/pvfs2-req device node */ +uint32_t orangefs_userspace_version; + static int open_access_count; +static DEFINE_MUTEX(devreq_mutex); + #define DUMP_DEVICE_ERROR() \ do { \ gossip_err("*****************************************************\n");\ @@ -43,7 +48,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op) { int index = hash_func(op->tag, hash_table_size); - list_add_tail(&op->list, &htable_ops_in_progress[index]); + list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]); } /* @@ -57,20 +62,20 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag) index = hash_func(tag, hash_table_size); - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, - &htable_ops_in_progress[index], + &orangefs_htable_ops_in_progress[index], list) { if (op->tag == tag && !op_state_purged(op) && !op_state_given_up(op)) { list_del_init(&op->list); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); return op; } } - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); return NULL; } @@ -276,11 +281,11 @@ restart: if (ret != 0) goto error; - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); spin_lock(&cur_op->lock); if (unlikely(op_state_given_up(cur_op))) { spin_unlock(&cur_op->lock); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); complete(&cur_op->waitq); goto restart; } @@ -298,7 +303,7 @@ restart: current->comm); orangefs_devreq_add_op(cur_op); spin_unlock(&cur_op->lock); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); /* The client only asks to read one size buffer. */ return MAX_DEV_REQ_UPSIZE; @@ -387,6 +392,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EPROTO; } + if (!orangefs_userspace_version) { + orangefs_userspace_version = head.version; + } else if (orangefs_userspace_version != head.version) { + gossip_err("Error: userspace version changes\n"); + return -EPROTO; + } + /* remove the op from the in progress hash table */ op = orangefs_devreq_remove_op(head.tag); if (!op) { @@ -527,6 +539,7 @@ static int orangefs_devreq_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: device close complete\n"); open_access_count = 0; + orangefs_userspace_version = 0; mutex_unlock(&devreq_mutex); return 0; } @@ -576,8 +589,6 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE; struct ORANGEFS_dev_map_desc user_desc; int ret = 0; - struct dev_mask_info_s mask_info = { 0 }; - struct dev_mask2_info_s mask2_info = { 0, 0 }; int upstream_kmod = 1; struct orangefs_sb_info_s *orangefs_sb; @@ -619,7 +630,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) * all of the remounts are serviced (to avoid ops between * mounts to fail) */ - ret = mutex_lock_interruptible(&request_mutex); + ret = mutex_lock_interruptible(&orangefs_request_mutex); if (ret < 0) return ret; gossip_debug(GOSSIP_DEV_DEBUG, @@ -654,7 +665,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount complete\n", __func__); - mutex_unlock(&request_mutex); + mutex_unlock(&orangefs_request_mutex); return ret; case ORANGEFS_DEV_UPSTREAM: @@ -668,134 +679,11 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) return ret; case ORANGEFS_DEV_CLIENT_MASK: - ret = copy_from_user(&mask2_info, - (void __user *)arg, - sizeof(struct dev_mask2_info_s)); - - if (ret != 0) - return -EIO; - - client_debug_mask.mask1 = mask2_info.mask1_value; - client_debug_mask.mask2 = mask2_info.mask2_value; - - pr_info("%s: client debug mask has been been received " - ":%llx: :%llx:\n", - __func__, - (unsigned long long)client_debug_mask.mask1, - (unsigned long long)client_debug_mask.mask2); - - return ret; - + return orangefs_debugfs_new_client_mask((void __user *)arg); case ORANGEFS_DEV_CLIENT_STRING: - ret = copy_from_user(&client_debug_array_string, - (void __user *)arg, - ORANGEFS_MAX_DEBUG_STRING_LEN); - /* - * The real client-core makes an effort to ensure - * that actual strings that aren't too long to fit in - * this buffer is what we get here. We're going to use - * string functions on the stuff we got, so we'll make - * this extra effort to try and keep from - * flowing out of this buffer when we use the string - * functions, even if somehow the stuff we end up - * with here is garbage. - */ - client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = - '\0'; - - if (ret != 0) { - pr_info("%s: CLIENT_STRING: copy_from_user failed\n", - __func__); - return -EIO; - } - - pr_info("%s: client debug array string has been received.\n", - __func__); - - if (!help_string_initialized) { - - /* Free the "we don't know yet" default string... */ - kfree(debug_help_string); - - /* build a proper debug help string */ - if (orangefs_prepare_debugfs_help_string(0)) { - gossip_err("%s: no debug help string \n", - __func__); - return -EIO; - } - - /* Replace the boilerplate boot-time debug-help file. */ - debugfs_remove(help_file_dentry); - - help_file_dentry = - debugfs_create_file( - ORANGEFS_KMOD_DEBUG_HELP_FILE, - 0444, - debug_dir, - debug_help_string, - &debug_help_fops); - - if (!help_file_dentry) { - gossip_err("%s: debugfs_create_file failed for" - " :%s:!\n", - __func__, - ORANGEFS_KMOD_DEBUG_HELP_FILE); - return -EIO; - } - } - - debug_mask_to_string(&client_debug_mask, 1); - - debugfs_remove(client_debug_dentry); - - orangefs_client_debug_init(); - - help_string_initialized++; - - return ret; - + return orangefs_debugfs_new_client_string((void __user *)arg); case ORANGEFS_DEV_DEBUG: - ret = copy_from_user(&mask_info, - (void __user *)arg, - sizeof(mask_info)); - - if (ret != 0) - return -EIO; - - if (mask_info.mask_type == KERNEL_MASK) { - if ((mask_info.mask_value == 0) - && (kernel_mask_set_mod_init)) { - /* - * the kernel debug mask was set when the - * kernel module was loaded; don't override - * it if the client-core was started without - * a value for ORANGEFS_KMODMASK. - */ - return 0; - } - debug_mask_to_string(&mask_info.mask_value, - mask_info.mask_type); - gossip_debug_mask = mask_info.mask_value; - pr_info("%s: kernel debug mask has been modified to " - ":%s: :%llx:\n", - __func__, - kernel_debug_string, - (unsigned long long)gossip_debug_mask); - } else if (mask_info.mask_type == CLIENT_MASK) { - debug_mask_to_string(&mask_info.mask_value, - mask_info.mask_type); - pr_info("%s: client debug mask has been modified to" - ":%s: :%llx:\n", - __func__, - client_debug_string, - llu(mask_info.mask_value)); - } else { - gossip_lerr("Invalid mask type....\n"); - return -EINVAL; - } - - return ret; - + return orangefs_debugfs_new_debug((void __user *)arg); default: return -ENOIOCTLCMD; } diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 324f0af40d7b..284373a57a08 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -177,8 +177,8 @@ static int orangefs_readdir(struct file *file, struct dir_context *ctx) } gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %s (pos=%llu)\n", - dentry->d_name.name, llu(pos)); + "orangefs_readdir called on %pd (pos=%llu)\n", + dentry, llu(pos)); memset(&readdir_response, 0, sizeof(readdir_response)); diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 66b99210f1f9..3b8923f8bf21 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -83,7 +83,10 @@ struct orangefs_listxattr_response { }; struct orangefs_param_response { - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; }; #define PERF_COUNT_BUF_SIZE 4096 @@ -98,6 +101,11 @@ struct orangefs_fs_key_response { char fs_key[FS_KEY_BUF_SIZE]; }; +/* 2.9.6 */ +struct orangefs_features_response { + __u64 features; +}; + struct orangefs_downcall_s { __s32 type; __s32 status; @@ -119,6 +127,7 @@ struct orangefs_downcall_s { struct orangefs_param_response param; struct orangefs_perf_count_response perf_count; struct orangefs_fs_key_response fs_key; + struct orangefs_features_response features; } resp; }; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 526040e09f78..2aa088ab713b 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,32 @@ #include <linux/fs.h> #include <linux/pagemap.h> +static int flush_racache(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: %pU: Handle is %pU | fs_id %d\n", __func__, + get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, + orangefs_inode->refn.fs_id); + + new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; + + ret = service_operation(new_op, "orangefs_flush_racache", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", + __func__, ret); + + op_release(new_op); + return ret; +} + /* * Copy to client-core's address space from the buffers specified * by the iovec upto total_size bytes. @@ -386,7 +412,7 @@ ssize_t orangefs_inode_read(struct inode *inode, size_t bufmap_size; ssize_t ret = -EINVAL; - g_orangefs_stats.reads++; + orangefs_stats.reads++; bufmap_size = orangefs_bufmap_size_query(); if (count > bufmap_size) { @@ -427,7 +453,7 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); - g_orangefs_stats.reads++; + orangefs_stats.reads++; rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); iocb->ki_pos = pos; @@ -488,7 +514,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } iocb->ki_pos = pos; - g_orangefs_stats.writes++; + orangefs_stats.writes++; out: @@ -585,21 +611,30 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) static int orangefs_file_release(struct inode *inode, struct file *file) { gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_file_release: called on %s\n", - file->f_path.dentry->d_name.name); + "orangefs_file_release: called on %pD\n", + file); orangefs_flush_inode(inode); /* - * remove all associated inode pages from the page cache and mmap + * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ if (file->f_path.dentry->d_inode && file->f_path.dentry->d_inode->i_mapping && - mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) + mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) { + if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { + gossip_debug(GOSSIP_INODE_DEBUG, + "calling flush_racache on %pU\n", + get_khandle_from_ino(inode)); + flush_racache(inode); + gossip_debug(GOSSIP_INODE_DEBUG, + "flush_racache finished\n"); + } truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, 0); + } return 0; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a9407eeecb21..0e3bd7e07f88 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -129,8 +129,8 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_direct_IO: %s\n", - iocb->ki_filp->f_path.dentry->d_name.name); + "orangefs_direct_IO: %pD\n", + iocb->ki_filp); return -EINVAL; } @@ -216,10 +216,10 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: called on %s\n", - dentry->d_name.name); + "orangefs_setattr: called on %pd\n", + dentry); - ret = inode_change_ok(inode, iattr); + ret = setattr_prepare(dentry, iattr); if (ret) goto out; @@ -259,8 +259,8 @@ int orangefs_getattr(struct vfsmount *mnt, struct orangefs_inode_s *orangefs_inode = NULL; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_getattr: called on %s\n", - dentry->d_name.name); + "orangefs_getattr: called on %pd\n", + dentry); ret = orangefs_inode_getattr(inode, 0, 0); if (ret == 0) { diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index ccca5186d5cd..4d5576a21c82 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -24,9 +24,9 @@ static int orangefs_create(struct inode *dir, struct inode *inode; int ret; - gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n", + gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", __func__, - dentry->d_name.name); + dentry); new_op = op_alloc(ORANGEFS_VFS_OP_CREATE); if (!new_op) @@ -43,9 +43,9 @@ static int orangefs_create(struct inode *dir, ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); gossip_debug(GOSSIP_NAME_DEBUG, - "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", + "%s: %pd: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", __func__, - dentry->d_name.name, + dentry, &new_op->downcall.resp.create.refn.khandle, new_op->downcall.resp.create.refn.fs_id, new_op, @@ -57,28 +57,28 @@ static int orangefs_create(struct inode *dir, inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &new_op->downcall.resp.create.refn); if (IS_ERR(inode)) { - gossip_err("%s: Failed to allocate inode for file :%s:\n", + gossip_err("%s: Failed to allocate inode for file :%pd:\n", __func__, - dentry->d_name.name); + dentry); ret = PTR_ERR(inode); goto out; } gossip_debug(GOSSIP_NAME_DEBUG, - "%s: Assigned inode :%pU: for file :%s:\n", + "%s: Assigned inode :%pU: for file :%pd:\n", __func__, get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "%s: dentry instantiated for %s\n", + "%s: dentry instantiated for %pd\n", __func__, - dentry->d_name.name); + dentry); SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); @@ -87,9 +87,9 @@ static int orangefs_create(struct inode *dir, out: op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, - "%s: %s: returning %d\n", + "%s: %pd: returning %d\n", __func__, - dentry->d_name.name, + dentry, ret); return ret; } @@ -115,8 +115,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, * -EEXIST on O_EXCL opens, which is broken if we skip this lookup * in the create path) */ - gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n", - __func__, dentry->d_name.name); + gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %pd\n", + __func__, dentry); if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1)) return ERR_PTR(-ENAMETOOLONG); @@ -169,9 +169,9 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, gossip_debug(GOSSIP_NAME_DEBUG, "orangefs_lookup: Adding *negative* dentry " - "%p for %s\n", + "%p for %pd\n", dentry, - dentry->d_name.name); + dentry); d_add(dentry, NULL); res = NULL; @@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); if (IS_ERR(inode)) { @@ -224,10 +224,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) int ret; gossip_debug(GOSSIP_NAME_DEBUG, - "%s: called on %s\n" + "%s: called on %pd\n" " (inode %pU): Parent is %pU | fs_id %d\n", __func__, - dentry->d_name.name, + dentry, get_khandle_from_ino(inode), &parent->refn.khandle, parent->refn.fs_id); @@ -322,13 +322,13 @@ static int orangefs_symlink(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "Inode (Symlink) %pU -> %s\n", + "Inode (Symlink) %pU -> %pd\n", get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); @@ -386,13 +386,13 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "Inode (Directory) %pU -> %s\n", + "Inode (Directory) %pU -> %pd\n", get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); /* * NOTE: we have no good way to keep nlink consistent for directories diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index b6edbe9fb309..aa3830b741c7 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; - else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH) - return "OP_MMAP_RA_FLUSH"; + else if (type == ORANGEFS_VFS_OP_RA_FLUSH) + return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) @@ -97,6 +97,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_FSYNC"; else if (type == ORANGEFS_VFS_OP_FSKEY) return "OP_FSKEY"; + else if (type == ORANGEFS_VFS_OP_FEATURES) + return "OP_FEATURES"; } return "OP_UNKNOWN?"; } diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1714a737d556..eb09aa026723 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -43,36 +43,35 @@ #include "protocol.h" #include "orangefs-kernel.h" -static int orangefs_debug_disabled = 1; - -static int orangefs_debug_help_open(struct inode *, struct file *); +#define DEBUG_HELP_STRING_SIZE 4096 +#define HELP_STRING_UNINITIALIZED \ + "Client Debug Keywords are unknown until the first time\n" \ + "the client is started after boot.\n" +#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help" +#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug" +#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug" +#define ORANGEFS_VERBOSE "verbose" +#define ORANGEFS_ALL "all" -const struct file_operations debug_help_fops = { - .open = orangefs_debug_help_open, - .read = seq_read, - .release = seq_release, - .llseek = seq_lseek, +/* + * An array of client_debug_mask will be built to hold debug keyword/mask + * values fetched from userspace. + */ +struct client_debug_mask { + char *keyword; + __u64 mask1; + __u64 mask2; }; +static int orangefs_kernel_debug_init(void); + +static int orangefs_debug_help_open(struct inode *, struct file *); static void *help_start(struct seq_file *, loff_t *); static void *help_next(struct seq_file *, void *, loff_t *); static void help_stop(struct seq_file *, void *); static int help_show(struct seq_file *, void *); -static const struct seq_operations help_debug_ops = { - .start = help_start, - .next = help_next, - .stop = help_stop, - .show = help_show, -}; - -/* - * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and - * ORANGEFS_KMOD_DEBUG_FILE. - */ -static DEFINE_MUTEX(orangefs_debug_lock); - -int orangefs_debug_open(struct inode *, struct file *); +static int orangefs_debug_open(struct inode *, struct file *); static ssize_t orangefs_debug_read(struct file *, char __user *, @@ -84,6 +83,43 @@ static ssize_t orangefs_debug_write(struct file *, size_t, loff_t *); +static int orangefs_prepare_cdm_array(char *); +static void debug_mask_to_string(void *, int); +static void do_k_string(void *, int); +static void do_c_string(void *, int); +static int keyword_is_amalgam(char *); +static int check_amalgam_keyword(void *, int); +static void debug_string_to_mask(char *, void *, int); +static void do_c_mask(int, char *, struct client_debug_mask **); +static void do_k_mask(int, char *, __u64 **); + +static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; +static char *debug_help_string; +static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; +static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; + +static struct dentry *help_file_dentry; +static struct dentry *client_debug_dentry; +static struct dentry *debug_dir; + +static unsigned int kernel_mask_set_mod_init; +static int orangefs_debug_disabled = 1; +static int help_string_initialized; + +static const struct seq_operations help_debug_ops = { + .start = help_start, + .next = help_next, + .stop = help_stop, + .show = help_show, +}; + +const struct file_operations debug_help_fops = { + .open = orangefs_debug_help_open, + .read = seq_read, + .release = seq_release, + .llseek = seq_lseek, +}; + static const struct file_operations kernel_debug_fops = { .open = orangefs_debug_open, .read = orangefs_debug_read, @@ -91,15 +127,55 @@ static const struct file_operations kernel_debug_fops = { .llseek = generic_file_llseek, }; +static int client_all_index; +static int client_verbose_index; + +static struct client_debug_mask *cdm_array; +static int cdm_element_count; + +static struct client_debug_mask client_debug_mask; + +/* + * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and + * ORANGEFS_KMOD_DEBUG_FILE. + */ +static DEFINE_MUTEX(orangefs_debug_lock); + /* * initialize kmod debug operations, create orangefs debugfs dir and * ORANGEFS_KMOD_DEBUG_HELP_FILE. */ -int orangefs_debugfs_init(void) +int orangefs_debugfs_init(int debug_mask) { - int rc = -ENOMEM; + /* convert input debug mask to a 64-bit unsigned integer */ + orangefs_gossip_debug_mask = (unsigned long long)debug_mask; + + /* + * set the kernel's gossip debug string; invalid mask values will + * be ignored. + */ + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); + + /* remove any invalid values from the mask */ + debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask, + 0); + + /* + * if the mask has a non-zero value, then indicate that the mask + * was set when the kernel module was loaded. The orangefs dev ioctl + * command will look at this boolean to determine if the kernel's + * debug mask should be overwritten when the client-core is started. + */ + if (orangefs_gossip_debug_mask != 0) + kernel_mask_set_mod_init = true; + + pr_info("%s: called with debug mask: :%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + debug_dir = debugfs_create_dir("orangefs", NULL); if (!debug_dir) { pr_info("%s: debugfs_create_dir failed.\n", __func__); @@ -117,13 +193,58 @@ int orangefs_debugfs_init(void) } orangefs_debug_disabled = 0; + + rc = orangefs_kernel_debug_init(); + +out: + + return rc; +} + +/* + * initialize the kernel-debug file. + */ +static int orangefs_kernel_debug_init(void) +{ + int rc = -ENOMEM; + struct dentry *ret; + char *k_buffer = NULL; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); + + k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); + if (!k_buffer) + goto out; + + if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { + strcpy(k_buffer, kernel_debug_string); + strcat(k_buffer, "\n"); + } else { + strcpy(k_buffer, "none\n"); + pr_info("%s: overflow 1!\n", __func__); + } + + ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, + 0444, + debug_dir, + k_buffer, + &kernel_debug_fops); + if (!ret) { + pr_info("%s: failed to create %s.\n", + __func__, + ORANGEFS_KMOD_DEBUG_FILE); + goto out; + } + rc = 0; out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); return rc; } + void orangefs_debugfs_cleanup(void) { debugfs_remove_recursive(debug_dir); @@ -196,49 +317,6 @@ static int help_show(struct seq_file *m, void *v) } /* - * initialize the kernel-debug file. - */ -int orangefs_kernel_debug_init(void) -{ - int rc = -ENOMEM; - struct dentry *ret; - char *k_buffer = NULL; - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - - k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!k_buffer) - goto out; - - if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { - strcpy(k_buffer, kernel_debug_string); - strcat(k_buffer, "\n"); - } else { - strcpy(k_buffer, "none\n"); - pr_info("%s: overflow 1!\n", __func__); - } - - ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, - 0444, - debug_dir, - k_buffer, - &kernel_debug_fops); - if (!ret) { - pr_info("%s: failed to create %s.\n", - __func__, - ORANGEFS_KMOD_DEBUG_FILE); - goto out; - } - - rc = 0; - -out: - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); - return rc; -} - -/* * initialize the client-debug file. */ int orangefs_client_debug_init(void) @@ -282,7 +360,7 @@ out: } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ -int orangefs_debug_open(struct inode *inode, struct file *file) +static int orangefs_debug_open(struct inode *inode, struct file *file) { int rc = -ENODEV; @@ -350,8 +428,8 @@ static ssize_t orangefs_debug_write(struct file *file, struct client_debug_mask c_mask = { NULL, 0, 0 }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, - "orangefs_debug_write: %s\n", - file->f_path.dentry->d_name.name); + "orangefs_debug_write: %pD\n", + file); /* * Thwart users who try to jamb a ridiculous number @@ -384,8 +462,8 @@ static ssize_t orangefs_debug_write(struct file *file, */ if (!strcmp(file->f_path.dentry->d_name.name, ORANGEFS_KMOD_DEBUG_FILE)) { - debug_string_to_mask(buf, &gossip_debug_mask, 0); - debug_mask_to_string(&gossip_debug_mask, 0); + debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0); + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); debug_string = kernel_debug_string; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "New kernel debug string is %s\n", @@ -452,3 +530,546 @@ out: kfree(buf); return rc; } + +/* + * After obtaining a string representation of the client's debug + * keywords and their associated masks, this function is called to build an + * array of these values. + */ +static int orangefs_prepare_cdm_array(char *debug_array_string) +{ + int i; + int rc = -EINVAL; + char *cds_head = NULL; + char *cds_delimiter = NULL; + int keyword_len = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + /* + * figure out how many elements the cdm_array needs. + */ + for (i = 0; i < strlen(debug_array_string); i++) + if (debug_array_string[i] == '\n') + cdm_element_count++; + + if (!cdm_element_count) { + pr_info("No elements in client debug array string!\n"); + goto out; + } + + cdm_array = + kzalloc(cdm_element_count * sizeof(struct client_debug_mask), + GFP_KERNEL); + if (!cdm_array) { + pr_info("malloc failed for cdm_array!\n"); + rc = -ENOMEM; + goto out; + } + + cds_head = debug_array_string; + + for (i = 0; i < cdm_element_count; i++) { + cds_delimiter = strchr(cds_head, '\n'); + *cds_delimiter = '\0'; + + keyword_len = strcspn(cds_head, " "); + + cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL); + if (!cdm_array[i].keyword) { + rc = -ENOMEM; + goto out; + } + + sscanf(cds_head, + "%s %llx %llx", + cdm_array[i].keyword, + (unsigned long long *)&(cdm_array[i].mask1), + (unsigned long long *)&(cdm_array[i].mask2)); + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE)) + client_verbose_index = i; + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL)) + client_all_index = i; + + cds_head = cds_delimiter + 1; + } + + rc = cdm_element_count; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc); + +out: + + return rc; + +} + +/* + * /sys/kernel/debug/orangefs/debug-help can be catted to + * see all the available kernel and client debug keywords. + * + * When the kernel boots, we have no idea what keywords the + * client supports, nor their associated masks. + * + * We pass through this function once at boot and stamp a + * boilerplate "we don't know" message for the client in the + * debug-help file. We pass through here again when the client + * starts and then we can fill out the debug-help file fully. + * + * The client might be restarted any number of times between + * reboots, we only build the debug-help file the first time. + */ +int orangefs_prepare_debugfs_help_string(int at_boot) +{ + int rc = -EINVAL; + int i; + int byte_count = 0; + char *client_title = "Client Debug Keywords:\n"; + char *kernel_title = "Kernel Debug Keywords:\n"; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (at_boot) { + byte_count += strlen(HELP_STRING_UNINITIALIZED); + client_title = HELP_STRING_UNINITIALIZED; + } else { + /* + * fill the client keyword/mask array and remember + * how many elements there were. + */ + cdm_element_count = + orangefs_prepare_cdm_array(client_debug_array_string); + if (cdm_element_count <= 0) + goto out; + + /* Count the bytes destined for debug_help_string. */ + byte_count += strlen(client_title); + + for (i = 0; i < cdm_element_count; i++) { + byte_count += strlen(cdm_array[i].keyword + 2); + if (byte_count >= DEBUG_HELP_STRING_SIZE) { + pr_info("%s: overflow 1!\n", __func__); + goto out; + } + } + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: cdm_element_count:%d:\n", + __func__, + cdm_element_count); + } + + byte_count += strlen(kernel_title); + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + byte_count += + strlen(s_kmod_keyword_mask_map[i].keyword + 2); + if (byte_count >= DEBUG_HELP_STRING_SIZE) { + pr_info("%s: overflow 2!\n", __func__); + goto out; + } + } + + /* build debug_help_string. */ + debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL); + if (!debug_help_string) { + rc = -ENOMEM; + goto out; + } + + strcat(debug_help_string, client_title); + + if (!at_boot) { + for (i = 0; i < cdm_element_count; i++) { + strcat(debug_help_string, "\t"); + strcat(debug_help_string, cdm_array[i].keyword); + strcat(debug_help_string, "\n"); + } + } + + strcat(debug_help_string, "\n"); + strcat(debug_help_string, kernel_title); + + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + strcat(debug_help_string, "\t"); + strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword); + strcat(debug_help_string, "\n"); + } + + rc = 0; + +out: + + return rc; + +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_mask_to_string(void *mask, int type) +{ + int i; + int len = 0; + char *debug_string; + int element_count = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + debug_string = client_debug_string; + element_count = cdm_element_count; + } else { + debug_string = kernel_debug_string; + element_count = num_kmod_keyword_mask_map; + } + + memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + + /* + * Some keywords, like "all" or "verbose", are amalgams of + * numerous other keywords. Make a special check for those + * before grinding through the whole mask only to find out + * later... + */ + if (check_amalgam_keyword(mask, type)) + goto out; + + /* Build the debug string. */ + for (i = 0; i < element_count; i++) + if (type) + do_c_string(mask, i); + else + do_k_string(mask, i); + + len = strlen(debug_string); + + if ((len) && (type)) + client_debug_string[len - 1] = '\0'; + else if (len) + kernel_debug_string[len - 1] = '\0'; + else if (type) + strcpy(client_debug_string, "none"); + else + strcpy(kernel_debug_string, "none"); + +out: +gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string); + + return; + +} + +static void do_k_string(void *k_mask, int index) +{ + __u64 *mask = (__u64 *) k_mask; + + if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword)) + goto out; + + if (*mask & s_kmod_keyword_mask_map[index].mask_val) { + if ((strlen(kernel_debug_string) + + strlen(s_kmod_keyword_mask_map[index].keyword)) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { + strcat(kernel_debug_string, + s_kmod_keyword_mask_map[index].keyword); + strcat(kernel_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(kernel_debug_string, ORANGEFS_ALL); + goto out; + } + } + +out: + + return; +} + +static void do_c_string(void *c_mask, int index) +{ + struct client_debug_mask *mask = (struct client_debug_mask *) c_mask; + + if (keyword_is_amalgam(cdm_array[index].keyword)) + goto out; + + if ((mask->mask1 & cdm_array[index].mask1) || + (mask->mask2 & cdm_array[index].mask2)) { + if ((strlen(client_debug_string) + + strlen(cdm_array[index].keyword) + 1) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { + strcat(client_debug_string, + cdm_array[index].keyword); + strcat(client_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(client_debug_string, ORANGEFS_ALL); + goto out; + } + } +out: + return; +} + +static int keyword_is_amalgam(char *keyword) +{ + int rc = 0; + + if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE))) + rc = 1; + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + * + * return 1 if we found an amalgam. + */ +static int check_amalgam_keyword(void *mask, int type) +{ + __u64 *k_mask; + struct client_debug_mask *c_mask; + int k_all_index = num_kmod_keyword_mask_map - 1; + int rc = 0; + + if (type) { + c_mask = (struct client_debug_mask *) mask; + + if ((c_mask->mask1 == cdm_array[client_all_index].mask1) && + (c_mask->mask2 == cdm_array[client_all_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + + if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) && + (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_VERBOSE); + rc = 1; + goto out; + } + + } else { + k_mask = (__u64 *) mask; + + if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) { + strcpy(kernel_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + } + +out: + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_string_to_mask(char *debug_string, void *mask, int type) +{ + char *unchecked_keyword; + int i; + char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL); + char *original_pointer; + int element_count = 0; + struct client_debug_mask *c_mask = NULL; + __u64 *k_mask = NULL; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + c_mask = (struct client_debug_mask *)mask; + element_count = cdm_element_count; + } else { + k_mask = (__u64 *)mask; + *k_mask = 0; + element_count = num_kmod_keyword_mask_map; + } + + original_pointer = strsep_fodder; + while ((unchecked_keyword = strsep(&strsep_fodder, ","))) + if (strlen(unchecked_keyword)) { + for (i = 0; i < element_count; i++) + if (type) + do_c_mask(i, + unchecked_keyword, + &c_mask); + else + do_k_mask(i, + unchecked_keyword, + &k_mask); + } + + kfree(original_pointer); +} + +static void do_c_mask(int i, char *unchecked_keyword, + struct client_debug_mask **sane_mask) +{ + + if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) { + (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1; + (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2; + } +} + +static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask) +{ + + if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword)) + **sane_mask = (**sane_mask) | + s_kmod_keyword_mask_map[i].mask_val; +} + +int orangefs_debugfs_new_client_mask(void __user *arg) +{ + struct dev_mask2_info_s mask2_info = {0}; + int ret; + + ret = copy_from_user(&mask2_info, + (void __user *)arg, + sizeof(struct dev_mask2_info_s)); + + if (ret != 0) + return -EIO; + + client_debug_mask.mask1 = mask2_info.mask1_value; + client_debug_mask.mask2 = mask2_info.mask2_value; + + pr_info("%s: client debug mask has been been received " + ":%llx: :%llx:\n", + __func__, + (unsigned long long)client_debug_mask.mask1, + (unsigned long long)client_debug_mask.mask2); + + return ret; +} + +int orangefs_debugfs_new_client_string(void __user *arg) +{ + int ret; + + ret = copy_from_user(&client_debug_array_string, + (void __user *)arg, + ORANGEFS_MAX_DEBUG_STRING_LEN); + if (ret != 0) + return -EIO; + + /* + * The real client-core makes an effort to ensure + * that actual strings that aren't too long to fit in + * this buffer is what we get here. We're going to use + * string functions on the stuff we got, so we'll make + * this extra effort to try and keep from + * flowing out of this buffer when we use the string + * functions, even if somehow the stuff we end up + * with here is garbage. + */ + client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = + '\0'; + + if (ret != 0) { + pr_info("%s: CLIENT_STRING: copy_from_user failed\n", + __func__); + return -EIO; + } + + pr_info("%s: client debug array string has been received.\n", + __func__); + + if (!help_string_initialized) { + + /* Free the "we don't know yet" default string... */ + kfree(debug_help_string); + + /* build a proper debug help string */ + if (orangefs_prepare_debugfs_help_string(0)) { + gossip_err("%s: no debug help string \n", + __func__); + return -EIO; + } + + /* Replace the boilerplate boot-time debug-help file. */ + debugfs_remove(help_file_dentry); + + help_file_dentry = + debugfs_create_file( + ORANGEFS_KMOD_DEBUG_HELP_FILE, + 0444, + debug_dir, + debug_help_string, + &debug_help_fops); + + if (!help_file_dentry) { + gossip_err("%s: debugfs_create_file failed for" + " :%s:!\n", + __func__, + ORANGEFS_KMOD_DEBUG_HELP_FILE); + return -EIO; + } + } + + debug_mask_to_string(&client_debug_mask, 1); + + debugfs_remove(client_debug_dentry); + + orangefs_client_debug_init(); + + help_string_initialized++; + + return ret; +} + +int orangefs_debugfs_new_debug(void __user *arg) +{ + struct dev_mask_info_s mask_info = {0}; + int ret; + + ret = copy_from_user(&mask_info, + (void __user *)arg, + sizeof(mask_info)); + + if (ret != 0) + return -EIO; + + if (mask_info.mask_type == KERNEL_MASK) { + if ((mask_info.mask_value == 0) + && (kernel_mask_set_mod_init)) { + /* + * the kernel debug mask was set when the + * kernel module was loaded; don't override + * it if the client-core was started without + * a value for ORANGEFS_KMODMASK. + */ + return 0; + } + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + orangefs_gossip_debug_mask = mask_info.mask_value; + pr_info("%s: kernel debug mask has been modified to " + ":%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + } else if (mask_info.mask_type == CLIENT_MASK) { + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + pr_info("%s: client debug mask has been modified to" + ":%s: :%llx:\n", + __func__, + client_debug_string, + llu(mask_info.mask_value)); + } else { + gossip_lerr("Invalid mask type....\n"); + return -EINVAL; + } + + return ret; +} diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h index e4828c0e3ef9..803517269ba6 100644 --- a/fs/orangefs/orangefs-debugfs.h +++ b/fs/orangefs/orangefs-debugfs.h @@ -1,3 +1,7 @@ -int orangefs_debugfs_init(void); -int orangefs_kernel_debug_init(void); +int orangefs_debugfs_init(int); void orangefs_debugfs_cleanup(void); +int orangefs_client_debug_init(void); +int orangefs_prepare_debugfs_help_string(int); +int orangefs_debugfs_new_client_mask(void __user *); +int orangefs_debugfs_new_client_string(void __user *); +int orangefs_debugfs_new_debug(void __user *); diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index 9eac9d9a3f3a..a3d84ffee905 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -28,7 +28,7 @@ #define ORANGEFS_VFS_OP_RENAME 0xFF00000A #define ORANGEFS_VFS_OP_STATFS 0xFF00000B #define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C -#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D +#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D #define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E #define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F #define ORANGEFS_VFS_OP_GETXATTR 0xFF000010 @@ -41,6 +41,10 @@ #define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01 #define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02 #define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03 +#define ORANGEFS_VFS_OP_FEATURES 0xFF00EE05 /* 2.9.6 */ + +/* features is a 64-bit unsigned bitmask */ +#define ORANGEFS_FEATURE_READAHEAD 1 /* * Misc constants. Please retain them as multiples of 8! diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 633c07a6e3d8..0a82048f3aaf 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -100,16 +100,6 @@ enum orangefs_vfs_op_states { }; /* - * An array of client_debug_mask will be built to hold debug keyword/mask - * values fetched from userspace. - */ -struct client_debug_mask { - char *keyword; - __u64 mask1; - __u64 mask2; -}; - -/* * orangefs kernel memory related flags */ @@ -119,29 +109,6 @@ struct client_debug_mask { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */ -/* these functions are defined in orangefs-utils.c */ -int orangefs_prepare_cdm_array(char *debug_array_string); -int orangefs_prepare_debugfs_help_string(int); - -/* defined in orangefs-debugfs.c */ -int orangefs_client_debug_init(void); - -void debug_string_to_mask(char *, void *, int); -void do_c_mask(int, char *, struct client_debug_mask **); -void do_k_mask(int, char *, __u64 **); - -void debug_mask_to_string(void *, int); -void do_k_string(void *, int); -void do_c_string(void *, int); -int check_amalgam_keyword(void *, int); -int keyword_is_amalgam(char *); - -/*these variables are defined in orangefs-mod.c */ -extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern unsigned int kernel_mask_set_mod_init; - extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; @@ -331,7 +298,7 @@ struct orangefs_stats { unsigned long writes; }; -extern struct orangefs_stats g_orangefs_stats; +extern struct orangefs_stats orangefs_stats; /* * NOTE: See Documentation/filesystems/porting for information @@ -447,6 +414,8 @@ void purge_waiting_ops(void); /* * defined in super.c */ +extern uint64_t orangefs_features; + struct dentry *orangefs_mount(struct file_system_type *fst, int flags, const char *devname, @@ -506,6 +475,8 @@ ssize_t orangefs_inode_read(struct inode *inode, /* * defined in devorangefs-req.c */ +extern uint32_t orangefs_userspace_version; + int orangefs_dev_init(void); void orangefs_dev_cleanup(void); int is_daemon_in_service(void); @@ -543,20 +514,18 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op); int orangefs_normalize_to_errno(__s32 error_code); -extern struct mutex devreq_mutex; -extern struct mutex request_mutex; -extern int debug; +extern struct mutex orangefs_request_mutex; extern int op_timeout_secs; extern int slot_timeout_secs; -extern int dcache_timeout_msecs; -extern int getattr_timeout_msecs; +extern int orangefs_dcache_timeout_msecs; +extern int orangefs_getattr_timeout_msecs; extern struct list_head orangefs_superblocks; extern spinlock_t orangefs_superblocks_lock; extern struct list_head orangefs_request_list; extern spinlock_t orangefs_request_list_lock; extern wait_queue_head_t orangefs_request_list_waitq; -extern struct list_head *htable_ops_in_progress; -extern spinlock_t htable_ops_in_progress_lock; +extern struct list_head *orangefs_htable_ops_in_progress; +extern spinlock_t orangefs_htable_ops_in_progress_lock; extern int hash_table_size; extern const struct address_space_operations orangefs_address_operations; diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index e9fd5755c05f..2e5b03065f34 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -21,34 +21,17 @@ * global variables declared here */ -/* array of client debug keyword/mask values */ -struct client_debug_mask *cdm_array; -int cdm_element_count; - -char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; -char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; - -char *debug_help_string; -int help_string_initialized; -struct dentry *help_file_dentry; -struct dentry *client_debug_dentry; -struct dentry *debug_dir; -int client_verbose_index; -int client_all_index; -struct orangefs_stats g_orangefs_stats; +struct orangefs_stats orangefs_stats; /* the size of the hash tables for ops in progress */ int hash_table_size = 509; static ulong module_parm_debug_mask; -__u64 gossip_debug_mask; -struct client_debug_mask client_debug_mask = { NULL, 0, 0 }; -unsigned int kernel_mask_set_mod_init; /* implicitly false */ +__u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; -int dcache_timeout_msecs = 50; -int getattr_timeout_msecs = 50; +int orangefs_dcache_timeout_msecs = 50; +int orangefs_getattr_timeout_msecs = 50; MODULE_LICENSE("GPL"); MODULE_AUTHOR("ORANGEFS Development Team"); @@ -71,20 +54,17 @@ module_param(module_parm_debug_mask, ulong, 0644); module_param(op_timeout_secs, int, 0); module_param(slot_timeout_secs, int, 0); -/* synchronizes the request device file */ -DEFINE_MUTEX(devreq_mutex); - /* * Blocks non-priority requests from being queued for servicing. This * could be used for protecting the request list data structure, but * for now it's only being used to stall the op addition to the request * list */ -DEFINE_MUTEX(request_mutex); +DEFINE_MUTEX(orangefs_request_mutex); /* hash table for storing operations waiting for matching downcall */ -struct list_head *htable_ops_in_progress; -DEFINE_SPINLOCK(htable_ops_in_progress_lock); +struct list_head *orangefs_htable_ops_in_progress; +DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock); /* list for queueing upcall operations */ LIST_HEAD(orangefs_request_list); @@ -100,32 +80,6 @@ static int __init orangefs_init(void) int ret = -1; __u32 i = 0; - /* convert input debug mask to a 64-bit unsigned integer */ - gossip_debug_mask = (unsigned long long) module_parm_debug_mask; - - /* - * set the kernel's gossip debug string; invalid mask values will - * be ignored. - */ - debug_mask_to_string(&gossip_debug_mask, 0); - - /* remove any invalid values from the mask */ - debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0); - - /* - * if the mask has a non-zero value, then indicate that the mask - * was set when the kernel module was loaded. The orangefs dev ioctl - * command will look at this boolean to determine if the kernel's - * debug mask should be overwritten when the client-core is started. - */ - if (gossip_debug_mask != 0) - kernel_mask_set_mod_init = true; - - pr_info("%s: called with debug mask: :%s: :%llx:\n", - __func__, - kernel_debug_string, - (unsigned long long)gossip_debug_mask); - ret = bdi_init(&orangefs_backing_dev_info); if (ret) @@ -146,9 +100,9 @@ static int __init orangefs_init(void) if (ret < 0) goto cleanup_op; - htable_ops_in_progress = + orangefs_htable_ops_in_progress = kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL); - if (!htable_ops_in_progress) { + if (!orangefs_htable_ops_in_progress) { gossip_err("Failed to initialize op hashtable"); ret = -ENOMEM; goto cleanup_inode; @@ -156,7 +110,7 @@ static int __init orangefs_init(void) /* initialize a doubly linked at each hash table index */ for (i = 0; i < hash_table_size; i++) - INIT_LIST_HEAD(&htable_ops_in_progress[i]); + INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]); ret = fsid_key_table_initialize(); if (ret < 0) @@ -179,14 +133,10 @@ static int __init orangefs_init(void) if (ret) goto cleanup_key_table; - ret = orangefs_debugfs_init(); + ret = orangefs_debugfs_init(module_parm_debug_mask); if (ret) goto debugfs_init_failed; - ret = orangefs_kernel_debug_init(); - if (ret) - goto kernel_debug_init_failed; - ret = orangefs_sysfs_init(); if (ret) goto sysfs_init_failed; @@ -214,8 +164,6 @@ cleanup_device: sysfs_init_failed: -kernel_debug_init_failed: - debugfs_init_failed: orangefs_debugfs_cleanup(); @@ -223,7 +171,7 @@ cleanup_key_table: fsid_key_table_finalize(); cleanup_progress_table: - kfree(htable_ops_in_progress); + kfree(orangefs_htable_ops_in_progress); cleanup_inode: orangefs_inode_cache_finalize(); @@ -250,12 +198,12 @@ static void __exit orangefs_exit(void) orangefs_dev_cleanup(); BUG_ON(!list_empty(&orangefs_request_list)); for (i = 0; i < hash_table_size; i++) - BUG_ON(!list_empty(&htable_ops_in_progress[i])); + BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i])); orangefs_inode_cache_finalize(); op_cache_finalize(); - kfree(htable_ops_in_progress); + kfree(orangefs_htable_ops_in_progress); bdi_destroy(&orangefs_backing_dev_info); @@ -274,10 +222,10 @@ void purge_inprogress_ops(void) struct orangefs_kernel_op_s *op; struct orangefs_kernel_op_s *next; - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, - &htable_ops_in_progress[i], + &orangefs_htable_ops_in_progress[i], list) { set_op_state_purged(op); gossip_debug(GOSSIP_DEV_DEBUG, @@ -287,7 +235,7 @@ void purge_inprogress_ops(void) op->op_state, current->comm); } - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); } } diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 375708c2db87..a799546a67f7 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -73,6 +73,24 @@ * Description: * Time getattr is valid in milliseconds. * + * What: /sys/fs/orangefs/readahead_count + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count. + * + * What: /sys/fs/orangefs/readahead_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer size. + * + * What: /sys/fs/orangefs/readahead_count_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count and size. + * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 * Contact: Martin Brandenburg <martin@omnibond.com> @@ -121,159 +139,34 @@ #define PC_KOBJ_ID "pc" #define STATS_KOBJ_ID "stats" -struct orangefs_obj { - struct kobject kobj; - int op_timeout_secs; - int perf_counter_reset; - int perf_history_size; - int perf_time_interval_secs; - int slot_timeout_secs; - int dcache_timeout_msecs; - int getattr_timeout_msecs; -}; - -struct acache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_msecs; -}; - -struct capcache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_secs; -}; - -struct ccache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_secs; -}; - -struct ncache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_msecs; -}; - -struct pc_orangefs_obj { - struct kobject kobj; - char *acache; - char *capcache; - char *ncache; -}; - -struct stats_orangefs_obj { - struct kobject kobj; - int reads; - int writes; -}; +/* + * Every item calls orangefs_attr_show and orangefs_attr_store through + * orangefs_sysfs_ops. They look at the orangefs_attributes further below to + * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or + * sysfs_service_op_store. + */ struct orangefs_attribute { struct attribute attr; - ssize_t (*show)(struct orangefs_obj *orangefs_obj, + ssize_t (*show)(struct kobject *kobj, struct orangefs_attribute *attr, char *buf); - ssize_t (*store)(struct orangefs_obj *orangefs_obj, + ssize_t (*store)(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count); }; -struct acache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct capcache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct ccache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct ncache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct pc_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct stats_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - static ssize_t orangefs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct orangefs_attribute *attribute; - struct orangefs_obj *orangefs_obj; - int rc; attribute = container_of(attr, struct orangefs_attribute, attr); - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(orangefs_obj, attribute, buf); - -out: - return rc; + if (!attribute->show) + return -EIO; + return attribute->show(kobj, attribute, buf); } static ssize_t orangefs_attr_store(struct kobject *kobj, @@ -282,24 +175,15 @@ static ssize_t orangefs_attr_store(struct kobject *kobj, size_t len) { struct orangefs_attribute *attribute; - struct orangefs_obj *orangefs_obj; - int rc; - gossip_debug(GOSSIP_SYSFS_DEBUG, - "orangefs_attr_store: start\n"); + if (!strcmp(kobj->name, PC_KOBJ_ID) || + !strcmp(kobj->name, STATS_KOBJ_ID)) + return -EPERM; attribute = container_of(attr, struct orangefs_attribute, attr); - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(orangefs_obj, attribute, buf, len); - -out: - return rc; + if (!attribute->store) + return -EIO; + return attribute->store(kobj, attribute, buf, len); } static const struct sysfs_ops orangefs_sysfs_ops = { @@ -307,402 +191,58 @@ static const struct sysfs_ops orangefs_sysfs_ops = { .store = orangefs_attr_store, }; -static ssize_t acache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct acache_orangefs_attribute *attribute; - struct acache_orangefs_obj *acache_orangefs_obj; - int rc; - - attribute = container_of(attr, struct acache_orangefs_attribute, attr); - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(acache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t acache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct acache_orangefs_attribute *attribute; - struct acache_orangefs_obj *acache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "acache_orangefs_attr_store: start\n"); - - attribute = container_of(attr, struct acache_orangefs_attribute, attr); - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(acache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops acache_orangefs_sysfs_ops = { - .show = acache_orangefs_attr_show, - .store = acache_orangefs_attr_store, -}; - -static ssize_t capcache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct capcache_orangefs_attribute *attribute; - struct capcache_orangefs_obj *capcache_orangefs_obj; - int rc; - - attribute = - container_of(attr, struct capcache_orangefs_attribute, attr); - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(capcache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t capcache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct capcache_orangefs_attribute *attribute; - struct capcache_orangefs_obj *capcache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "capcache_orangefs_attr_store: start\n"); - - attribute = - container_of(attr, struct capcache_orangefs_attribute, attr); - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(capcache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops capcache_orangefs_sysfs_ops = { - .show = capcache_orangefs_attr_show, - .store = capcache_orangefs_attr_store, -}; - -static ssize_t ccache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ccache_orangefs_attribute *attribute; - struct ccache_orangefs_obj *ccache_orangefs_obj; - int rc; - - attribute = - container_of(attr, struct ccache_orangefs_attribute, attr); - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(ccache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t ccache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct ccache_orangefs_attribute *attribute; - struct ccache_orangefs_obj *ccache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "ccache_orangefs_attr_store: start\n"); - - attribute = - container_of(attr, struct ccache_orangefs_attribute, attr); - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(ccache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops ccache_orangefs_sysfs_ops = { - .show = ccache_orangefs_attr_show, - .store = ccache_orangefs_attr_store, -}; - -static ssize_t ncache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ncache_orangefs_attribute *attribute; - struct ncache_orangefs_obj *ncache_orangefs_obj; - int rc; - - attribute = container_of(attr, struct ncache_orangefs_attribute, attr); - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(ncache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t ncache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct ncache_orangefs_attribute *attribute; - struct ncache_orangefs_obj *ncache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "ncache_orangefs_attr_store: start\n"); - - attribute = container_of(attr, struct ncache_orangefs_attribute, attr); - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(ncache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops ncache_orangefs_sysfs_ops = { - .show = ncache_orangefs_attr_show, |