aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-13 16:29:35 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-13 16:29:35 -0700
commit203b4fc903b644223a27ad3f25f3a0f3a3911d1d (patch)
tree8c210b67a17b74b2a39c500891a20e23c2390cf6 /arch/x86
parent7edcf0d314f69e506ddd9562062b2a79fa965bb9 (diff)
parent765d28f136291f9639e3c031a1070fb76d6625c7 (diff)
download96b-common-203b4fc903b644223a27ad3f25f3a0f3a3911d1d.tar.gz
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Thomas Gleixner: - Make lazy TLB mode even lazier to avoid pointless switch_mm() operations, which reduces CPU load by 1-2% for memcache workloads - Small cleanups and improvements all over the place * 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mm: Remove redundant check for kmem_cache_create() arm/asm/tlb.h: Fix build error implicit func declaration x86/mm/tlb: Make clear_asid_other() static x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off() x86/mm/tlb: Always use lazy TLB mode x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs x86/mm/tlb: Make lazy TLB mode lazier x86/mm/tlb: Restructure switch_mm_irqs_off() x86/mm/tlb: Leave lazy TLB mode at page table free time mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids x86/mm: Add TLB purge to free pmd/pte page interfaces ioremap: Update pgtable free interfaces with addr x86/mm: Disable ioremap free page handling on x86-PAE
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/tlbflush.h21
-rw-r--r--arch/x86/mm/pgtable.c64
-rw-r--r--arch/x86/mm/tlb.c224
3 files changed, 223 insertions, 86 deletions
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..511bf5fae8b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
- /*
- * If we have PCID, then switching to init_mm is reasonably
- * fast. If we don't have PCID, then switching to init_mm is
- * quite slow, so we try to defer it in the hopes that we can
- * avoid it entirely. The latter approach runs the risk of
- * receiving otherwise unnecessary IPIs.
- *
- * This choice is just a heuristic. The tlb code can handle this
- * function returning true or false regardless of whether we have
- * PCID.
- */
- return !static_cpu_has(X86_FEATURE_PCID);
-}
-
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
native_flush_tlb_others(mask, info)
#endif
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 47b5951e592b..0f1683fcb196 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
*/
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL);
- if (!pgd_cache)
- return -ENOMEM;
-
return 0;
}
core_initcall(pgd_cache_init);
@@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i]))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
@@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
}
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..752dbf4e0e50 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
+#include <linux/gfp.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
-void clear_asid_other(void)
+static void clear_asid_other(void)
{
u16 asid;
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+ bool need_flush;
+ u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
- * We don't currently support having a real mm loaded without
- * our cpu set in mm_cpumask(). We have all the bookkeeping
- * in place to figure out whether we would need to flush
- * if our cpu were cleared in mm_cpumask(), but we don't
- * currently use it.
+ * Even in lazy TLB mode, the CPU should stay set in the
+ * mm_cpumask. The TLB shootdown code can figure out from
+ * from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
+ /*
+ * If the CPU is not in lazy TLB mode, we are just switching
+ * from one thread in a process to another thread in the same
+ * process. No TLB flush required.
+ */
+ if (!was_lazy)
+ return;
+
+ /*
+ * Read the tlb_gen to check whether a flush is needed.
+ * If the TLB is up to date, just use it.
+ * The barrier synchronizes with the tlb_gen increment in
+ * the TLB shootdown code.
+ */
+ smp_mb();
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+ next_tlb_gen)
+ return;
+
+ /*
+ * TLB contents went out of date while we were in lazy
+ * mode. Fall through to the TLB switching code below.
+ */
+ new_asid = prev_asid;
+ need_flush = true;
} else {
- u16 new_asid;
- bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
sync_current_stack_to_mm(next);
}
- /* Stop remote flushes for the previous mm */
- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ /*
+ * Stop remote flushes for the previous mm.
+ * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+ * but the bitmap manipulation can cause cache line contention.
+ */
+ if (real_prev != &init_mm) {
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+ mm_cpumask(real_prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ }
/*
* Start remote flushes and then read tlb_gen.
*/
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (next != &init_mm)
+ cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ }
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
-
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
-
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
- }
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
*/
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
+ /*
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
+ */
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- if (tlb_defer_switch_to_init_mm()) {
- /*
- * There's a significant optimization that may be possible
- * here. We have accurate enough TLB flush tracking that we
- * don't need to maintain coherence of TLB per se when we're
- * lazy. We do, however, need to maintain coherence of
- * paging-structure caches. We could, in principle, leave our
- * old mm loaded and only switch to init_mm when
- * tlb_remove_page() happens.
- */
- this_cpu_write(cpu_tlbstate.is_lazy, true);
- } else {
- switch_mm(NULL, &init_mm, NULL);
- }
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
+ *
+ * This should be rare, with native_flush_tlb_others skipping
+ * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
+ cpumask_var_t lazymask;
+ unsigned int cpu;
+
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
- unsigned int cpu;
-
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+ /*
+ * A temporary cpumask is used in order to skip sending IPIs
+ * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+ * If the allocation fails, simply IPI every CPU in mm_cpumask.
+ */
+ if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
+ return;
+ }
+
+ cpumask_copy(lazymask, cpumask);
+
+ for_each_cpu(cpu, lazymask) {
+ if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+ cpumask_clear_cpu(cpu, lazymask);
+ }
+
+ smp_call_function_many(lazymask, flush_tlb_func_remote,
+ (void *)info, 1);
+
+ free_cpumask_var(lazymask);
}
/*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_cpu();
}
+void tlb_flush_remove_tables_local(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+ this_cpu_read(cpu_tlbstate.is_lazy)) {
+ /*
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
+ */
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
+ }
+}
+
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+ struct cpumask *lazy_cpus)
+{
+ int cpu;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
+ cpumask_set_cpu(cpu, lazy_cpus);
+ }
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+ int cpu = get_cpu();
+ cpumask_var_t lazy_cpus;
+
+ if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+ put_cpu();
+ return;
+ }
+
+ if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+ /*
+ * If the cpumask allocation fails, do a brute force flush
+ * on all the CPUs that have this mm loaded.
+ */
+ smp_call_function_many(mm_cpumask(mm),
+ tlb_flush_remove_tables_local, (void *)mm, 1);
+ put_cpu();
+ return;
+ }
+
+ /*
+ * CPUs with !is_lazy either received a TLB flush IPI while the user
+ * pages in this address range were unmapped, or have context switched
+ * and reloaded %CR3 since then.
+ *
+ * Shootdown IPIs at page table freeing time only need to be sent to
+ * CPUs that may have out of date TLB contents.
+ */
+ mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+ smp_call_function_many(lazy_cpus,
+ tlb_flush_remove_tables_local, (void *)mm, 1);
+ free_cpumask_var(lazy_cpus);
+ put_cpu();
+}
static void do_flush_tlb_all(void *info)
{