aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-05-12 18:44:24 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-12 18:44:24 -0700
commita2ccb68b1e6add42c0bf3ade73cd11c98d32b890 (patch)
tree03addb13529482fdc635ff7a916dd4e1c79ffbfb
parent02c9c0e9b9d9e26f56aa74245c1a5a843878f899 (diff)
parent6d0a07edd17cfc12fdc1f36de8072fa17cc3666f (diff)
downloadleg-kernel-a2ccb68b1e6add42c0bf3ade73cd11c98d32b890.tar.gz
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton: "4 fixes" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: mm: thp: calculate the mapcount correctly for THP pages during WP faults ksm: fix conflict between mmput and scan_get_next_rmap_item ocfs2: fix posix_acl_create deadlock ocfs2: revert using ocfs2_acl_chmod to avoid inode cluster lock hang
-rw-r--r--fs/ocfs2/acl.c87
-rw-r--r--fs/ocfs2/acl.h5
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/namei.c23
-rw-r--r--fs/ocfs2/refcounttree.c17
-rw-r--r--fs/ocfs2/xattr.c14
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--include/linux/mm.h9
-rw-r--r--include/linux/swap.h6
-rw-r--r--mm/huge_memory.c71
-rw-r--r--mm/ksm.c15
-rw-r--r--mm/memory.c22
-rw-r--r--mm/swapfile.c13
13 files changed, 209 insertions, 81 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0cdf497c91ef..2162434728c0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
brelse(di_bh);
return acl;
}
+
+int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (ret)
+ return ret;
+ ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+ acl, NULL, NULL);
+ posix_acl_release(acl);
+ return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl = NULL;
+ int ret = 0, ret2;
+ umode_t mode;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+ dir_bh);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl) {
+ mode = inode->i_mode & ~current_umask();
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = ocfs2_set_acl(handle, inode, di_bh,
+ ACL_TYPE_DEFAULT, acl,
+ meta_ac, data_ac);
+ if (ret)
+ goto cleanup;
+ }
+ mode = inode->i_mode;
+ ret = __posix_acl_create(&acl, GFP_NOFS, &mode);
+ if (ret < 0)
+ return ret;
+
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ acl, meta_ac, data_ac);
+ }
+ }
+cleanup:
+ posix_acl_release(acl);
+ return ret;
+}
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 3fce68d08625..2783a75b3999 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle,
struct posix_acl *acl,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac);
+extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+ struct buffer_head *, struct buffer_head *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5308841756be..59cce53c91d8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1268,20 +1268,20 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- brelse(bh);
/* Release quota pointers in case we acquired them */
for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
- status = posix_acl_chmod(inode, inode->i_mode);
+ status = ocfs2_acl_chmod(inode, bh);
if (status < 0)
mlog_errno(status);
}
if (inode_locked)
ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
return status;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6b3e87189a64..a8f1225e6d9b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
- struct posix_acl *default_acl = NULL, *acl = NULL;
struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
@@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (status) {
- mlog_errno(status);
- goto leave;
- }
-
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
- if (default_acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_DEFAULT, default_acl,
- meta_ac, data_ac);
- }
- if (!status && acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_ACCESS, acl,
- meta_ac, data_ac);
- }
+ status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+ meta_ac, data_ac);
if (status < 0) {
mlog_errno(status);
@@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 744d5d90c363..92bbe93bfe10 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
- struct posix_acl *default_acl, *acl;
- umode_t mode;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
- mode = inode->i_mode;
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error) {
- mlog_errno(error);
- return error;
- }
- error = ocfs2_create_inode_in_orphan(dir, mode,
+ error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
- &new_dentry->d_name,
- default_acl, acl);
+ &new_dentry->d_name);
if (error)
mlog_errno(error);
}
out:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7d3d979f57d9..f19b7381a998 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7216,12 +7216,10 @@ out:
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl)
+ const struct qstr *qstr)
{
- struct buffer_head *dir_bh = NULL;
int ret = 0;
+ struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
mlog_errno(ret);
goto leave;
}
-
- if (!ret && default_acl)
- ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
- if (!ret && acl)
- ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
+ if (ret)
+ mlog_errno(ret);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index f10d5b93c366..1633cc15ea1f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl);
+ const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 864d7221de84..8f468e0d2534 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
#else
static inline int total_mapcount(struct page *page)
{
return page_mapcount(page);
}
+static inline int page_trans_huge_mapcount(struct page *page,
+ int *total_mapcount)
+{
+ int mapcount = page_mapcount(page);
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+}
#endif
static inline struct page *virt_to_head_page(const void *x)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0a4cd4703f40..ad220359f1b0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
-extern int reuse_swap_page(struct page *);
+extern bool reuse_swap_page(struct page *, int *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
@@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-#define reuse_swap_page(page) \
- (!PageTransCompound(page) && page_mapcount(page) == 1)
+#define reuse_swap_page(page, total_mapcount) \
+ (page_trans_huge_mapcount(page, total_mapcount) == 1)
static inline int try_to_free_swap(struct page *page)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7daa7de8f48..b49ee126d4d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
/*
* We can only reuse the page if nobody else maps the huge page or it's
- * part. We can do it by checking page_mapcount() on each sub-page, but
- * it's expensive.
- * The cheaper way is to check page_count() to be equal 1: every
- * mapcount takes page reference reference, so this way we can
- * guarantee, that the PMD is the only mapping.
- * This can give false negative if somebody pinned the page, but that's
- * fine.
+ * part.
*/
- if (page_mapcount(page) == 1 && page_count(page) == 1) {
+ if (page_trans_huge_mapcount(page, NULL) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (pte_write(pteval)) {
writable = true;
} else {
- if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ if (PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
unlock_page(page);
result = SCAN_SWAP_CACHE_PAGE;
goto out;
@@ -3223,6 +3218,64 @@ int total_mapcount(struct page *page)
}
/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+ int i, ret, _total_mapcount, mapcount;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = ret = 0;
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ ret = max(ret, mapcount);
+ _total_mapcount += mapcount;
+ }
+ if (PageDoubleMap(page)) {
+ ret -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ ret += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ return ret;
+}
+
+/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
*
diff --git a/mm/ksm.c b/mm/ksm.c
index b99e828172f6..4786b4150f62 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void)
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ up_read(&mm->mmap_sem);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void)
free_mm_slot(mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
mmdrop(mm);
- } else {
+ } else
spin_unlock(&ksm_mmlist_lock);
- up_read(&mm->mmap_sem);
- }
}
/* Clean up stable nodes, but don't worry if some are still busy */
@@ -1663,8 +1661,15 @@ next_mm:
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
- spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
+ /*
+ * up_read(&mm->mmap_sem) first because after
+ * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
+ * already have been freed under us by __ksm_exit()
+ * because the "mm_slot" is still hashed and
+ * ksm_scan.mm_slot doesn't point to it anymore.
+ */
+ spin_unlock(&ksm_mmlist_lock);
}
/* Repeat until we've completed scanning the whole list */
diff --git a/mm/memory.c b/mm/memory.c
index 52c218e2b724..07493e34ab7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* not dirty accountable.
*/
if (PageAnon(old_page) && !PageKsm(old_page)) {
+ int total_mapcount;
if (!trylock_page(old_page)) {
get_page(old_page);
pte_unmap_unlock(page_table, ptl);
@@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
put_page(old_page);
}
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
+ if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (total_mapcount == 1) {
+ /*
+ * The page is all ours. Move it to
+ * our anon_vma so the rmap code will
+ * not search our parent or siblings.
+ * Protected against the rmap code by
+ * the page lock.
+ */
+ page_move_anon_rmap(compound_head(old_page),
+ vma, address);
+ }
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);
@@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+ if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 83874eced5bf..031713ab40ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -922,18 +922,19 @@ out:
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
*/
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
{
int count;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
- return 0;
- /* The page is part of THP and cannot be reused */
- if (PageTransCompound(page))
- return 0;
- count = page_mapcount(page);
+ return false;
+ count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
if (count == 1 && !PageWriteback(page)) {