aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/dump_pagetables.c27
-rw-r--r--arch/x86/mm/fault.c7
-rw-r--r--arch/x86/mm/init.c79
-rw-r--r--arch/x86/mm/init_64.c14
-rw-r--r--arch/x86/mm/kmmio.c25
-rw-r--r--arch/x86/mm/mmap.c21
-rw-r--r--arch/x86/mm/numa_emulation.c107
-rw-r--r--arch/x86/mm/pageattr.c27
-rw-r--r--arch/x86/mm/pgtable.c177
-rw-r--r--arch/x86/mm/pti.c262
-rw-r--r--arch/x86/mm/tlb.c19
11 files changed, 599 insertions, 166 deletions
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2f3c9196b834..a12afff146d1 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = {
[END_OF_SPACE_NR] = { -1, NULL }
};
+#define INIT_PGD ((pgd_t *) &init_top_pgt)
+
#else /* CONFIG_X86_64 */
enum address_markers_idx {
@@ -121,6 +123,9 @@ enum address_markers_idx {
#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ LDT_NR,
+#endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
END_OF_SPACE_NR,
@@ -134,11 +139,16 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_HIGHMEM
[PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { 0UL, "LDT remap" },
+#endif
[CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
[FIXADDR_START_NR] = { 0UL, "Fixmap area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
+#define INIT_PGD (swapper_pg_dir)
+
#endif /* !CONFIG_X86_64 */
/* Multipliers for offsets within the PTEs */
@@ -496,11 +506,7 @@ static inline bool is_hypervisor_range(int idx)
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx, bool dmesg)
{
-#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_top_pgt;
-#else
- pgd_t *start = swapper_pg_dir;
-#endif
+ pgd_t *start = INIT_PGD;
pgprotval_t prot, eff;
int i;
struct pg_state st = {};
@@ -563,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
-static void ptdump_walk_user_pgd_level_checkwx(void)
+void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pgd_t *pgd = (pgd_t *) &init_top_pgt;
+ pgd_t *pgd = INIT_PGD;
- if (!static_cpu_has(X86_FEATURE_PTI))
+ if (!(__supported_pte_mask & _PAGE_NX) ||
+ !static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
@@ -580,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void)
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
- ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
@@ -609,6 +615,9 @@ static int __init pt_dump_init(void)
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
+# ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+# endif
#endif
return 0;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2aafa6ab6103..b9123c497e0a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,6 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -317,8 +318,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
- WARN_ON_ONCE(in_nmi());
-
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
@@ -1001,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, unsigned int fault)
+ unsigned long address, u32 *pkey, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -1215,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cee58a972cb2..5c32a7665492 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -4,6 +4,8 @@
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -97,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num)
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
- unsigned long ret;
- if (min_pfn_mapped >= max_pfn_mapped)
- panic("alloc_low_pages: ran out of memory");
- ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ unsigned long ret = 0;
+
+ if (min_pfn_mapped < max_pfn_mapped) {
+ ret = memblock_find_in_range(
+ min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
+ }
+ if (ret)
+ memblock_reserve(ret, PAGE_SIZE * num);
+ else if (can_use_brk_pgt)
+ ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
+
if (!ret)
panic("alloc_low_pages: can not alloc memory");
- memblock_reserve(ret, PAGE_SIZE * num);
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
@@ -773,13 +782,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
}
}
+/*
+ * begin/end can be in the direct map or the "high kernel mapping"
+ * used for the kernel image only. free_init_pages() will do the
+ * right thing for either kind of address.
+ */
+void free_kernel_image_pages(void *begin, void *end)
+{
+ unsigned long begin_ul = (unsigned long)begin;
+ unsigned long end_ul = (unsigned long)end;
+ unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
+
+
+ free_init_pages("unused kernel image", begin_ul, end_ul);
+
+ /*
+ * PTI maps some of the kernel into userspace. For performance,
+ * this includes some kernel areas that do not contain secrets.
+ * Those areas might be adjacent to the parts of the kernel image
+ * being freed, which may contain secrets. Remove the "high kernel
+ * image mapping" for these freed areas, ensuring they are not even
+ * potentially vulnerable to Meltdown regardless of the specific
+ * optimizations PTI is currently using.
+ *
+ * The "noalias" prevents unmapping the direct map alias which is
+ * needed to access the freed pages.
+ *
+ * This is only valid for 64bit kernels. 32bit has only one mapping
+ * which can't be treated in this way for obvious reasons.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
+ set_memory_np_noalias(begin_ul, len_pages);
+}
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
- free_init_pages("unused kernel",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
+ free_kernel_image_pages(&__init_begin, &__init_end);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -880,3 +920,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
__pte2cachemode_tbl[entry] = cache;
}
+
+#ifdef CONFIG_SWAP
+unsigned long max_swapfile_size(void)
+{
+ unsigned long pages;
+
+ pages = generic_max_swapfile_size();
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+ unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
+ /*
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+#endif
+ pages = min_t(unsigned long, l1tf_limit, pages);
+ }
+ return pages;
+}
+#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..dd519f372169 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1283,20 +1283,10 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
- free_init_pages("unused kernel",
- (unsigned long) __va(__pa_symbol(text_end)),
- (unsigned long) __va(__pa_symbol(rodata_start)));
- free_init_pages("unused kernel",
- (unsigned long) __va(__pa_symbol(rodata_end)),
- (unsigned long) __va(__pa_symbol(_sdata)));
+ free_kernel_image_pages((void *)text_end, (void *)rodata_start);
+ free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
debug_checkwx();
-
- /*
- * Do this after all of the manipulation of the
- * kernel text page tables are complete.
- */
- pti_clone_kernel_text();
}
int kern_addr_valid(unsigned long addr)
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 7c8686709636..79eb55ce69a9 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
+ pmd_t new_pmd;
pmdval_t v = pmd_val(*pmd);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pmd(pmd, __pmd(v));
+ *old = v;
+ new_pmd = pmd_mknotpresent(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ set_pmd(pmd, new_pmd);
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pte_atomic(pte, __pte(v));
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, 0, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_atomic(pte, __pte(*old));
+ }
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 48c591251600..f40ab8185d94 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
return phys_addr_valid(addr + count - 1);
}
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unpriv. user to set them to PROT_NONE and invert
+ * them, then pointing to valid memory for L1TF speculation.
+ *
+ * Note: for locked down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return true;
+ if (!__pte_needs_invert(pgprot_val(prot)))
+ return true;
+ /* If it's real memory always allow */
+ if (pfn_valid(pfn))
+ return true;
+ if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+ return false;
+ return true;
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 34a2a3bfde9c..b54d52a2d00a 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -61,7 +61,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
eb->nid = nid;
if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
- emu_nid_to_phys[nid] = nid;
+ emu_nid_to_phys[nid] = pb->nid;
pb->start += size;
if (pb->start >= pb->end) {
@@ -198,40 +198,73 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
return end;
}
+static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
+{
+ unsigned long max_pfn = PHYS_PFN(max_addr);
+ unsigned long base_pfn = PHYS_PFN(base);
+ unsigned long hole_pfns = PHYS_PFN(hole);
+
+ return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
+}
+
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
* `addr' to `max_addr'.
*
* Returns zero on success or negative on error.
*/
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size)
+ u64 addr, u64 max_addr, u64 size,
+ int nr_nodes, struct numa_memblk *pblk,
+ int nid)
{
nodemask_t physnode_mask = numa_nodes_parsed;
+ int i, ret, uniform = 0;
u64 min_size;
- int nid = 0;
- int i, ret;
- if (!size)
+ if ((!size && !nr_nodes) || (nr_nodes && !pblk))
return -1;
+
/*
- * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
- * increased accordingly if the requested size is too small. This
- * creates a uniform distribution of node sizes across the entire
- * machine (but not necessarily over physical nodes).
+ * In the 'uniform' case split the passed in physical node by
+ * nr_nodes, in the non-uniform case, ignore the passed in
+ * physical block and try to create nodes of at least size
+ * @size.
+ *
+ * In the uniform case, split the nodes strictly by physical
+ * capacity, i.e. ignore holes. In the non-uniform case account
+ * for holes and treat @size as a minimum floor.
*/
- min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
- min_size = max(min_size, FAKE_NODE_MIN_SIZE);
- if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
- min_size = (min_size + FAKE_NODE_MIN_SIZE) &
- FAKE_NODE_MIN_HASH_MASK;
+ if (!nr_nodes)
+ nr_nodes = MAX_NUMNODES;
+ else {
+ nodes_clear(physnode_mask);
+ node_set(pblk->nid, physnode_mask);
+ uniform = 1;
+ }
+
+ if (uniform) {
+ min_size = uniform_size(max_addr, addr, 0, nr_nodes);
+ size = min_size;
+ } else {
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the
+ * size per node is increased accordingly if the
+ * requested size is too small. This creates a uniform
+ * distribution of node sizes across the entire machine
+ * (but not necessarily over physical nodes).
+ */
+ min_size = uniform_size(max_addr, addr,
+ mem_hole_size(addr, max_addr), nr_nodes);
+ }
+ min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
size >> 20, min_size >> 20);
size = min_size;
}
- size &= FAKE_NODE_MIN_HASH_MASK;
+ size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
/*
* Fill physical nodes with fake nodes of size until there is no memory
@@ -248,10 +281,14 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
node_clear(i, physnode_mask);
continue;
}
+
start = pi->blk[phys_blk].start;
limit = pi->blk[phys_blk].end;
- end = find_end_of_node(start, limit, size);
+ if (uniform)
+ end = start + size;
+ else
+ end = find_end_of_node(start, limit, size);
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
@@ -266,7 +303,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
* next node, this one must extend to the end of the
* physical node.
*/
- if (limit - end - mem_hole_size(end, limit) < size)
+ if ((limit - end - mem_hole_size(end, limit) < size)
+ && !uniform)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
@@ -276,7 +314,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
return ret;
}
}
- return 0;
+ return nid;
+}
+
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+ 0, NULL, NUMA_NO_NODE);
}
int __init setup_emu2phys_nid(int *dfl_phys_nid)
@@ -346,7 +392,28 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* the fixed node size. Otherwise, if it is just a single number N,
* split the system RAM into N fake nodes.
*/
- if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ if (strchr(emu_cmdline, 'U')) {
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ unsigned long n;
+ int nid = 0;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = -1;
+ for_each_node_mask(i, physnode_mask) {
+ ret = split_nodes_size_interleave_uniform(&ei, &pi,
+ pi.blk[i].start, pi.blk[i].end, 0,
+ n, &pi.blk[i], nid);
+ if (ret < 0)
+ break;
+ if (ret < n) {
+ pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
+ __func__, i, ret, n);
+ ret = -1;
+ break;
+ }
+ nid = ret;
+ }
+ } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
u64 size;
size = memparse(emu_cmdline, &emu_cmdline);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3bded76e8d5c..8d6c34fe49be 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -1014,8 +1015,8 @@ static long populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pmd_pgprot)));
+ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
+ canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
@@ -1087,8 +1088,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pud_pgprot)));
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
@@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+ /* Has caller explicitly disabled alias checking? */
+ if (in_flag & CPA_NO_CHECK_ALIAS)
+ checkalias = 0;
ret = __change_page_attr_set_clr(&cpa, checkalias);
@@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
+int set_memory_np_noalias(unsigned long addr, int numpages)
+{
+ int cpa_flags = CPA_NO_CHECK_ALIAS;
+
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(_PAGE_PRESENT), 0,
+ cpa_flags, NULL);
+}
+
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
@@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages)
__pgprot(_PAGE_GLOBAL), 0);
}
+int set_memory_global(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 47b5951e592b..e848a4811785 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_table(tlb, pte);
+ paravirt_tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
- tlb_remove_table(tlb, page);
+ paravirt_tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(pud));
+ paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(p4d));
+ paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd)
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+/*
+ * We allocate separate PMDs for the kernel part of the user page-table
+ * when PTI is enabled. We need them to map the per-process LDT into the
+ * user-space page-table.
+ */
+#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
+ KERNEL_PGD_PTRS : 0)
+
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
@@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
-
+#define PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */
-static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
- for(i = 0; i < PREALLOCATED_PMDS; i++)
+ for (i = 0; i < count; i++)
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
@@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
}
-static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
bool failed = false;
@@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ for (i = 0; i < count; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
if (!pmd)
failed = true;
@@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
if (failed) {
- free_pmds(mm, pmds);
+ free_pmds(mm, pmds, count);
return -ENOMEM;
}
@@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
+static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = *pgdp;
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ *pgdp = native_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ mm_dec_nr_pmds(mm);
+ }
+}
+
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
- for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pgd_t pgd = pgdp[i];
+ for (i = 0; i < PREALLOCATED_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i]);
- if (pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pgdp[i] = native_make_pgd(0);
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- mm_dec_nr_pmds(mm);
- }
- }
+ pgdp = kernel_to_user_pgdp(pgdp);
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
+#endif
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
@@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+ pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ p4d_t *u_p4d;
+ pud_t *u_pud;
+ int i;
+
+ u_p4d = p4d_offset(u_pgd, 0);
+ u_pud = pud_offset(u_p4d, 0);
+
+ s_pgd += KERNEL_PGD_BOUNDARY;
+ u_pud += KERNEL_PGD_BOUNDARY;
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
+ pmd_t *pmd = pmds[i];
+
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, u_pud, pmd);
+ }
+
+}
+#else
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+}
+#endif
/*
* Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
* assumes that pgd should be in one page.
@@ -329,9 +384,6 @@ static int __init pgd_cache_init(void)
*/
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL);
- if (!pgd_cache)
- return -ENOMEM;
-
return 0;
}
core_initcall(pgd_cache_init);
@@ -343,7 +395,8 @@ static inline pgd_t *_pgd_alloc(void)
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP,
+ PGD_ALLOCATION_ORDER);
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
@@ -355,7 +408,7 @@ static inline pgd_t *_pgd_alloc(void)
static inline void _pgd_free(pgd_t *pgd)
{
if (!SHARED_KERNEL_PMD)
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
else
kmem_cache_free(pgd_cache, pgd);
}
@@ -375,6 +428,7 @@ static inline void _pgd_free(pgd_t *pgd)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
+ pmd_t *u_pmds[PREALLOCATED_USER_PMDS];
pmd_t *pmds[PREALLOCATED_PMDS];
pgd = _pgd_alloc();
@@ -384,12 +438,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
- if (preallocate_pmds(mm, pmds) != 0)
+ if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
- if (paravirt_pgd_alloc(mm) != 0)
+ if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_user_pmds;
+
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
@@ -399,13 +456,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
+ pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
+out_free_user_pmds:
+ free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
- free_pmds(mm, pmds);
+ free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
@@ -719,28 +779,50 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i]))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -749,11 +831,12 @@ int pud_free_pmd_page(pud_t *pud)
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
@@ -762,8 +845,30 @@ int pmd_free_pte_page(pmd_t *pmd)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
}
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e705878..31341ae7309f 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -45,6 +45,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/sections.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -54,6 +55,16 @@
#define __GFP_NOTRACK 0
#endif
+/*
+ * Define the page-table levels we clone for user-space on 32
+ * and 64 bit.
+ */
+#ifdef CONFIG_X86_64
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD
+#else
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE
+#endif
+
static void __init pti_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
@@ -117,7 +128,7 @@ enable:
setup_force_cpu_cap(X86_FEATURE_PTI);
}
-pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
/*
* Changes to the high (kernel) portion of the kernelmode page
@@ -176,7 +187,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
- if (!new_p4d_page)
+ if (WARN_ON_ONCE(!new_p4d_page))
return NULL;
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,13 +206,17 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+ p4d_t *p4d;
pud_t *pud;
+ p4d = pti_user_pagetable_walk_p4d(address);
+ if (!p4d)
+ return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
- if (!new_pud_page)
+ if (WARN_ON_ONCE(!new_pud_page))
return NULL;
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -215,7 +230,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
- if (!new_pmd_page)
+ if (WARN_ON_ONCE(!new_pmd_page))
return NULL;
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -224,7 +239,6 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
return pmd_offset(pud, address);
}
-#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
* Walk the shadow copy of the page tables (optionally) trying to allocate
* page table pages on the way down. Does not support large pages.
@@ -237,9 +251,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+ pmd_t *pmd;
pte_t *pte;
+ pmd = pti_user_pagetable_walk_pmd(address);
+ if (!pmd)
+ return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
@@ -262,6 +280,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
return pte;
}
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
static void __init pti_setup_vsyscall(void)
{
pte_t *pte, *target_pte;
@@ -282,8 +301,14 @@ static void __init pti_setup_vsyscall(void)
static void __init pti_setup_vsyscall(void) { }
#endif
+enum pti_clone_level {
+ PTI_CLONE_PMD,
+ PTI_CLONE_PTE,
+};
+
static void
-pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+pti_clone_pgtable(unsigned long start, unsigned long end,
+ enum pti_clone_level level)
{
unsigned long addr;
@@ -291,59 +316,105 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
* Clone the populated PMDs which cover start to end. These PMD areas
* can have holes.
*/
- for (addr = start; addr < end; addr += PMD_SIZE) {
+ for (addr = start; addr < end;) {
+ pte_t *pte, *target_pte;
pmd_t *pmd, *target_pmd;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
+ /* Overflow check */
+ if (addr < start)
+ break;
+
pgd = pgd_offset_k(addr);
if (WARN_ON(pgd_none(*pgd)))
return;
p4d = p4d_offset(pgd, addr);
if (WARN_ON(p4d_none(*p4d)))
return;
+
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+ if (pud_none(*pud)) {
+ addr += PUD_SIZE;
continue;
+ }
+
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ if (pmd_none(*pmd)) {
+ addr += PMD_SIZE;
continue;
+ }
- target_pmd = pti_user_pagetable_walk_pmd(addr);
- if (WARN_ON(!target_pmd))
- return;
-
- /*
- * Only clone present PMDs. This ensures only setting
- * _PAGE_GLOBAL on present PMDs. This should only be
- * called on well-known addresses anyway, so a non-
- * present PMD would be a surprise.
- */
- if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
- return;
-
- /*
- * Setting 'target_pmd' below creates a mapping in both
- * the user and kernel page tables. It is effectively
- * global, so set it as global in both copies. Note:
- * the X86_FEATURE_PGE check is not _required_ because
- * the CPU ignores _PAGE_GLOBAL when PGE is not
- * supported. The check keeps consistentency with
- * code that only set this bit when supported.
- */
- if (boot_cpu_has(X86_FEATURE_PGE))
- *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
-
- /*
- * Copy the PMD. That is, the kernelmode and usermode
- * tables will share the last-level page tables of this
- * address range
- */
- *target_pmd = pmd_clear_flags(*pmd, clear);
+ if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Only clone present PMDs. This ensures only setting
+ * _PAGE_GLOBAL on present PMDs. This should only be
+ * called on well-known addresses anyway, so a non-
+ * present PMD would be a surprise.
+ */
+ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
+ return;
+
+ /*
+ * Setting 'target_pmd' below creates a mapping in both
+ * the user and kernel page tables. It is effectively
+ * global, so set it as global in both copies. Note:
+ * the X86_FEATURE_PGE check is not _required_ because
+ * the CPU ignores _PAGE_GLOBAL when PGE is not
+ * supported. The check keeps consistentency with
+ * code that only set this bit when supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = *pmd;
+
+ addr += PMD_SIZE;
+
+ } else if (level == PTI_CLONE_PTE) {
+
+ /* Walk the page-table down to the pte level */
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte)) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ /* Only clone present PTEs */
+ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
+ return;
+
+ /* Allocate PTE in the user page-table */
+ target_pte = pti_user_pagetable_walk_pte(addr);
+ if (WARN_ON(!target_pte))
+ return;
+
+ /* Set GLOBAL bit in both PTEs */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pte = pte_set_flags(*pte, _PAGE_GLOBAL);
+
+ /* Clone the PTE */
+ *target_pte = *pte;
+
+ addr += PAGE_SIZE;
+
+ } else {
+ BUG();
+ }
}
}
+#ifdef CONFIG_X86_64
/*
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
* next-level entry on 5-level systems.
@@ -354,6 +425,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
user_p4d = pti_user_pagetable_walk_p4d(addr);
+ if (!user_p4d)
+ return;
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
@@ -367,6 +441,25 @@ static void __init pti_clone_user_shared(void)
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}
+#else /* CONFIG_X86_64 */
+
+/*
+ * On 32 bit PAE systems with 1GB of Kernel address space there is only
+ * one pgd/p4d for the whole kernel. Cloning that would map the whole
+ * address space into the user page-tables, making PTI useless. So clone
+ * the page-table on the PMD level to prevent that.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ unsigned long start, end;
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
+
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+}
+#endif /* CONFIG_X86_64 */
+
/*
* Clone the ESPFIX P4D into the user space visible page table
*/
@@ -380,11 +473,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry and irqentry text and force it RO.
*/
-static void __init pti_clone_entry_text(void)
+static void pti_clone_entry_text(void)
{
- pti_clone_pmds((unsigned long) __entry_text_start,
- (unsigned long) __irqentry_text_end,
- _PAGE_RW);
+ pti_clone_pgtable((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end,
+ PTI_CLONE_PMD);
}
/*
@@ -435,10 +528,17 @@ static inline bool pti_kernel_image_global_ok(void)
}
/*
+ * This is the only user for these and it is not arch-generic
+ * like the other set_memory.h functions. Just extern them.
+ */
+extern int set_memory_nonglobal(unsigned long addr, int numpages);
+extern int set_memory_global(unsigned long addr, int numpages);
+
+/*
* For some configurations, map all of kernel text into the user page
* tables. This reduces TLB misses, especially on non-PCID systems.
*/
-void pti_clone_kernel_text(void)
+static void pti_clone_kernel_text(void)
{
/*
* rodata is part of the kernel image and is normally
@@ -446,7 +546,8 @@ void pti_clone_kernel_text(void)
* clone the areas past rodata, they might contain secrets.
*/
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = (unsigned long)__end_rodata_hpage_align;
+ unsigned long end_clone = (unsigned long)__end_rodata_aligned;
+ unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
if (!pti_kernel_image_global_ok())
return;
@@ -458,14 +559,18 @@ void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pmds(start, end, _PAGE_RW);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+
+ /*
+ * pti_clone_pgtable() will set the global bit in any PMDs
+ * that it clones, but we also need to get any PTEs in
+ * the last level for areas that are not huge-page-aligned.
+ */
+
+ /* Set the global bit for normal non-__init kernel text: */
+ set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
}
-/*
- * This is the only user for it and it is not arch-generic like
- * the other set_memory.h functions. Just extern it.
- */
-extern int set_memory_nonglobal(unsigned long addr, int numpages);
void pti_set_kernel_image_nonglobal(void)
{
/*
@@ -477,9 +582,11 @@ void pti_set_kernel_image_nonglobal(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
- if (pti_kernel_image_global_ok())
- return;
-
+ /*
+ * This clears _PAGE_GLOBAL from the entire kernel image.
+ * pti_clone_kernel_text() map put _PAGE_GLOBAL back for
+ * areas that are mapped to userspace.
+ */
set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
@@ -493,6 +600,28 @@ void __init pti_init(void)
pr_info("enabled\n");
+#ifdef CONFIG_X86_32
+ /*
+ * We check for X86_FEATURE_PCID here. But the init-code will
+ * clear the feature flag on 32 bit because the feature is not
+ * supported on 32 bit anyway. To print the warning we need to
+ * check with cpuid directly again.
+ */
+ if (cpuid_ecx(0x1) & BIT(17)) {
+ /* Use printk to work around pr_fmt() */
+ printk(KERN_WARNING "\n");
+ printk(KERN_WARNING "************************************************************\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
+ printk(KERN_WARNING "** Your performance will increase dramatically if you **\n");
+ printk(KERN_WARNING "** switch to a 64-bit kernel! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "************************************************************\n");
+ }
+#endif
+
pti_clone_user_shared();
/* Undo all global bits from the init pagetables in head_64.S: */
@@ -502,3 +631,22 @@ void __init pti_init(void)
pti_setup_espfix64();
pti_setup_vsyscall();
}
+
+/*
+ * Finalize the kernel mappings in the userspace page-table. Some of the
+ * mappings for the kernel image might have changed since pti_init()
+ * cloned them. This is because parts of the kernel image have been
+ * mapped RO and/or NX. These changes need to be cloned again to the
+ * userspace page-table.
+ */
+void pti_finalize(void)
+{
+ /*
+ * We need to clone everything (again) that maps parts of the
+ * kernel image.
+ */
+ pti_clone_entry_text();
+ pti_clone_kernel_text();
+
+ debug_checkwx_user();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..9517d1b2a281 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -35,7 +35,7 @@
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
-void clear_asid_other(void)
+static void clear_asid_other(void)
{
u16 asid;
@@ -285,15 +285,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
sync_current_stack_to_mm(next);
}
- /* Stop remote flushes for the previous mm */
- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ /*
+ * Stop remote flushes for the previous mm.
+ * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+ * but the bitmap manipulation can cause cache line contention.
+ */
+ if (real_prev != &init_mm) {
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+ mm_cpumask(real_prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ }
/*
* Start remote flushes and then read tlb_gen.
*/
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (next != &init_mm)
+ cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);