aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2019-01-18 14:29:42 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2019-01-18 14:29:42 +1100
commit02bf9bc148ee2a1a32dd4716fbe7e521a51a5ad2 (patch)
treee2eb64c21a30e21f51442d1cc2faeba912adc756
parent3c1c6ab5e402361bf244fa598a20759d7782bb00 (diff)
parentf531bf6f8e5e0b928981d5d056d9aa15cdbe1605 (diff)
download96b-common-02bf9bc148ee2a1a32dd4716fbe7e521a51a5ad2.tar.gz
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt3
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst9
-rw-r--r--Documentation/core-api/flexible-arrays.rst130
-rw-r--r--Documentation/core-api/genalloc.rst2
-rw-r--r--Documentation/core-api/generic-radix-tree.rst12
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/filesystems/proc.txt7
-rw-r--r--Documentation/flexible-arrays.txt123
-rw-r--r--arch/alpha/include/asm/topology.h3
-rw-r--r--arch/arm/mm/dma-mapping.c2
-rw-r--r--arch/arm/mm/mmu.c13
-rw-r--r--arch/arm64/Kconfig4
-rw-r--r--arch/arm64/include/asm/hugetlb.h5
-rw-r--r--arch/arm64/mm/hugetlbpage.c20
-rw-r--r--arch/c6x/include/asm/Kbuild1
-rw-r--r--arch/c6x/include/uapi/asm/Kbuild1
-rw-r--r--arch/c6x/mm/dma-coherent.c9
-rw-r--r--arch/h8300/include/asm/Kbuild1
-rw-r--r--arch/h8300/include/uapi/asm/Kbuild1
-rw-r--r--arch/hexagon/include/asm/Kbuild1
-rw-r--r--arch/hexagon/include/uapi/asm/Kbuild1
-rw-r--r--arch/ia64/kernel/numa.c2
-rw-r--r--arch/ia64/mm/discontig.c6
-rw-r--r--arch/m68k/include/asm/Kbuild1
-rw-r--r--arch/m68k/include/uapi/asm/Kbuild1
-rw-r--r--arch/microblaze/include/asm/Kbuild1
-rw-r--r--arch/microblaze/include/uapi/asm/Kbuild1
-rw-r--r--arch/microblaze/mm/init.c5
-rw-r--r--arch/nds32/mm/init.c12
-rw-r--r--arch/openrisc/include/asm/Kbuild1
-rw-r--r--arch/openrisc/include/uapi/asm/Kbuild1
-rw-r--r--arch/openrisc/mm/ioremap.c11
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h3
-rw-r--r--arch/powerpc/kernel/paca.c19
-rw-r--r--arch/powerpc/kernel/pci-common.c3
-rw-r--r--arch/powerpc/kernel/setup-common.c4
-rw-r--r--arch/powerpc/kernel/setup_64.c24
-rw-r--r--arch/powerpc/mm/hash_utils_64.c6
-rw-r--r--arch/powerpc/mm/numa.c14
-rw-r--r--arch/powerpc/mm/pgtable-book3e.c8
-rw-r--r--arch/powerpc/mm/pgtable-book3s64.c5
-rw-r--r--arch/powerpc/mm/pgtable-radix.c25
-rw-r--r--arch/powerpc/mm/ppc_mmu_32.c3
-rw-r--r--arch/powerpc/platforms/pasemi/iommu.c5
-rw-r--r--arch/powerpc/platforms/powernv/memtrace.c5
-rw-r--r--arch/powerpc/platforms/powernv/opal.c3
-rw-r--r--arch/powerpc/platforms/pseries/setup.c18
-rw-r--r--arch/powerpc/sysdev/dart_iommu.c7
-rw-r--r--arch/s390/numa/numa.c14
-rw-r--r--arch/sh/kernel/syscalls/syscalltbl.sh4
-rw-r--r--arch/sh/kernel/syscalls_32.S2
-rw-r--r--arch/sh/mm/init.c18
-rw-r--r--arch/sh/mm/numa.c5
-rw-r--r--arch/sparc/include/asm/pgtable_64.h30
-rw-r--r--arch/sparc/kernel/pci_fire.c3
-rw-r--r--arch/sparc/kernel/pci_schizo.c3
-rw-r--r--arch/sparc/kernel/prom_64.c7
-rw-r--r--arch/sparc/kernel/psycho_common.c3
-rw-r--r--arch/sparc/kernel/sbus.c3
-rw-r--r--arch/sparc/mm/init_64.c15
-rw-r--r--arch/unicore32/include/asm/Kbuild1
-rw-r--r--arch/unicore32/include/uapi/asm/Kbuild1
-rw-r--r--arch/unicore32/mm/mmu.c14
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c7
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--block/blk-core.c3
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c5
-rw-r--r--drivers/dma/dmaengine.c4
-rw-r--r--drivers/hv/hv_balloon.c20
-rw-r--r--drivers/infiniband/hw/hfi1/affinity.c3
-rw-r--r--drivers/infiniband/hw/hfi1/init.c3
-rw-r--r--drivers/iommu/dmar.c5
-rw-r--r--drivers/iommu/intel-iommu.c3
-rw-r--r--drivers/md/raid5-ppl.c6
-rw-r--r--drivers/md/raid5.c87
-rw-r--r--drivers/md/raid5.h9
-rw-r--r--drivers/misc/sgi-xp/xpc_uv.c3
-rw-r--r--drivers/misc/sram-exec.c2
-rw-r--r--drivers/misc/vmw_balloon.c32
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c5
-rw-r--r--drivers/xen/balloon.c24
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/dax.c7
-rw-r--r--fs/eventpoll.c173
-rw-r--r--fs/file.c1
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/ocfs2/aops.c22
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c6
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/proc/base.c43
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/proc_net.c20
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--include/linux/balloon_compaction.h34
-rw-r--r--include/linux/compaction.h5
-rw-r--r--include/linux/device.h2
-rw-r--r--include/linux/flex_array.h149
-rw-r--r--include/linux/genalloc.h2
-rw-r--r--include/linux/generic-radix-tree.h231
-rw-r--r--include/linux/gfp.h7
-rw-r--r--include/linux/hugetlb.h48
-rw-r--r--include/linux/ipc_namespace.h1
-rw-r--r--include/linux/kernel.h2
-rw-r--r--include/linux/ksm.h7
-rw-r--r--include/linux/memblock.h41
-rw-r--r--include/linux/memcontrol.h37
-rw-r--r--include/linux/memory_hotplug.h2
-rw-r--r--include/linux/mm.h100
-rw-r--r--include/linux/mm_types.h73
-rw-r--r--include/linux/mmu_notifier.h35
-rw-r--r--include/linux/mmzone.h2
-rw-r--r--include/linux/nodemask.h15
-rw-r--r--include/linux/page-flags.h11
-rw-r--r--include/linux/pagemap.h1
-rw-r--r--include/linux/poison.h3
-rw-r--r--include/linux/sched.h14
-rw-r--r--include/linux/sched/signal.h18
-rw-r--r--include/linux/slub_def.h2
-rw-r--r--include/linux/swap.h13
-rw-r--r--include/net/sctp/structs.h15
-rw-r--r--include/uapi/linux/binfmts.h2
-rw-r--r--include/uapi/linux/kernel-page-flags.h2
-rw-r--r--init/init_task.c3
-rw-r--r--init/main.c5
-rw-r--r--ipc/ipc_sysctl.c14
-rw-r--r--ipc/util.c29
-rw-r--r--ipc/util.h46
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/events/uprobes.c3
-rw-r--r--kernel/exit.c15
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/locking/lockdep.c19
-rw-r--r--kernel/panic.c10
-rw-r--r--kernel/power/snapshot.c17
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/fair.c15
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sysctl.c49
-rw-r--r--lib/Kconfig.debug18
-rw-r--r--lib/Makefile6
-rw-r--r--lib/cpumask.c3
-rw-r--r--lib/debugobjects.c66
-rw-r--r--lib/flex_array.c398
-rw-r--r--lib/genalloc.c5
-rw-r--r--lib/generic-radix-tree.c217
-rw-r--r--lib/test_vmalloc.c551
-rw-r--r--mm/compaction.c1038
-rw-r--r--mm/filemap.c223
-rw-r--r--mm/huge_memory.c25
-rw-r--r--mm/hugetlb.c18
-rw-r--r--mm/internal.h24
-rw-r--r--mm/khugepaged.c3
-rw-r--r--mm/ksm.c38
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/memblock.c64
-rw-r--r--mm/memcontrol.c51
-rw-r--r--mm/memory.c48
-rw-r--r--mm/memory_hotplug.c82
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/migrate.c9
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/oom_kill.c8
-rw-r--r--mm/page_alloc.c709
-rw-r--r--mm/page_ext.c5
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/page_poison.c2
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/slab.h4
-rw-r--r--mm/slub.c7
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/swapfile.c156
-rw-r--r--mm/vmalloc.c59
-rw-r--r--net/core/pktgen.c3
-rw-r--r--net/openvswitch/flow.h1
-rw-r--r--net/openvswitch/flow_netlink.h1
-rw-r--r--net/openvswitch/flow_table.c51
-rw-r--r--net/openvswitch/flow_table.h3
-rw-r--r--net/qrtr/qrtr.c3
-rw-r--r--net/sctp/stream.c105
-rw-r--r--net/sctp/stream_interleave.c2
-rwxr-xr-xscripts/checkpatch.pl6
-rwxr-xr-xscripts/decode_stacktrace.sh9
-rw-r--r--security/selinux/ss/avtab.c40
-rw-r--r--security/selinux/ss/avtab.h4
-rw-r--r--security/selinux/ss/conditional.c6
-rw-r--r--security/selinux/ss/policydb.c122
-rw-r--r--security/selinux/ss/policydb.h12
-rw-r--r--security/selinux/ss/services.c22
-rw-r--r--tools/include/linux/numa.h16
-rw-r--r--tools/include/linux/poison.h3
-rw-r--r--tools/perf/bench/numa.c7
-rw-r--r--tools/testing/selftests/proc/.gitignore1
-rw-r--r--tools/testing/selftests/proc/Makefile1
-rw-r--r--tools/testing/selftests/proc/proc-loadavg-001.c2
-rw-r--r--tools/testing/selftests/proc/proc-self-map-files-002.c2
-rw-r--r--tools/testing/selftests/proc/proc-self-syscall.c2
-rw-r--r--tools/testing/selftests/proc/proc-self-wchan.c2
-rw-r--r--tools/testing/selftests/proc/read.c2
-rw-r--r--tools/testing/selftests/proc/setns-dcache.c129
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests16
-rw-r--r--tools/testing/selftests/vm/test_vmalloc.sh176
-rw-r--r--tools/vm/page-types.c2
214 files changed, 4562 insertions, 2453 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 84f687236a50..17ce83cc48f4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1813,6 +1813,9 @@
ip= [IP_PNP]
See Documentation/filesystems/nfs/nfsroot.txt.
+ ipcmni_extend [KNL] Extend the maximum number of unique System V
+ IPC identifiers from 32,768 to 8,388,608.
+
irqaffinity= [SMP] Set the default irq affinity mask
The argument is a cpu list, as described above.
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 3f7bade2c231..340a5aee9b80 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -75,9 +75,10 @@ number of times a page is mapped.
20. NOPAGE
21. KSM
22. THP
- 23. BALLOON
+ 23. OFFLINE
24. ZERO_PAGE
25. IDLE
+ 26. PGTABLE
* ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
@@ -118,8 +119,8 @@ Short descriptions to the page flags
identical memory pages dynamically shared between one or more processes
22 - THP
contiguous pages which construct transparent hugepages
-23 - BALLOON
- balloon compaction page
+23 - OFFLINE
+ page is logically offline
24 - ZERO_PAGE
zero page for pfn_zero or huge_zero page
25 - IDLE
@@ -128,6 +129,8 @@ Short descriptions to the page flags
Note that this flag may be stale in case the page was accessed via
a PTE. To make sure the flag is up-to-date one has to read
``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+ page is in use as a page table
IO related page flags
---------------------
diff --git a/Documentation/core-api/flexible-arrays.rst b/Documentation/core-api/flexible-arrays.rst
deleted file mode 100644
index b6b85a1b518e..000000000000
--- a/Documentation/core-api/flexible-arrays.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-
-===================================
-Using flexible arrays in the kernel
-===================================
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with :c:func:`vmalloc()`. This solution not ideal, though. On 32-bit
-systems, memory from vmalloc() must be mapped into a relatively small address
-space; it's easy to run out. On SMP systems, the page table changes required
-by vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs. And, on all systems, use of space in the vmalloc() range increases
-pressure on the translation lookaside buffer (TLB), reducing the performance
-of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by piecing
-together an array from smaller parts; the flexible array library exists to make
-this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index. Sparse arrays are handled
-reasonably well. Only single-page allocations are made, so memory
-allocation failures should be relatively rare. The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation. It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
-
- #include <linux/flex_array.h>
-
- struct flex_array *flex_array_alloc(int element_size,
- unsigned int total,
- gfp_t flags);
-
-The individual object size is provided by ``element_size``, while total is the
-maximum number of objects which can be stored in the array. The flags
-argument is passed directly to the internal memory allocation calls. With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
- DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to
-:c:func:`flex_array_put()`::
-
- int flex_array_put(struct flex_array *array, unsigned int element_nr,
- void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by ``element_nr`` (which must be less than the maximum specified when
-the array was created). If any memory allocations must be performed, flags
-will be used. The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing. That can be avoided by using
-``GFP_ATOMIC`` for the flags value, but, often, there is a better way. The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using :c:func:`flex_array_prealloc()`::
-
- int flex_array_prealloc(struct flex_array *array, unsigned int start,
- unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by ``start`` and ``nr_elements`` has been allocated. Thereafter, a
-``flex_array_put()`` call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with :c:func:`flex_array_get()`::
-
- void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-
-The return value is a pointer to the data element, or NULL if that
-particular element has never been allocated.
-
-Note that it is possible to get back a valid pointer for an element which
-has never been stored in the array. Memory for array elements is allocated
-one page at a time; a single allocation could provide memory for several
-adjacent elements. Flexible array elements are normally initialized to the
-value ``FLEX_ARRAY_FREE`` (defined as 0x6c in <linux/poison.h>), so errors
-involving that number probably result from use of unstored array entries.
-Note that, if array elements are allocated with ``__GFP_ZERO``, they will be
-initialized to zero and this poisoning will not happen.
-
-Individual elements in the array can be cleared with
-:c:func:`flex_array_clear()`::
-
- int flex_array_clear(struct flex_array *array, unsigned int element_nr);
-
-This function will set the given element to ``FLEX_ARRAY_FREE`` and return
-zero. If storage for the indicated element is not allocated for the array,
-``flex_array_clear()`` will return ``-EINVAL`` instead. Note that clearing an
-element does not release the storage associated with it; to reduce the
-allocated size of an array, call :c:func:`flex_array_shrink()`::
-
- int flex_array_shrink(struct flex_array *array);
-
-The return value will be the number of pages of memory actually freed.
-This function works by scanning the array for pages containing nothing but
-``FLEX_ARRAY_FREE`` bytes, so (1) it can be expensive, and (2) it will not work
-if the array's pages are allocated with ``__GFP_ZERO``.
-
-It is possible to remove all elements of an array with a call to
-:c:func:`flex_array_free_parts()`::
-
- void flex_array_free_parts(struct flex_array *array);
-
-This call frees all elements, but leaves the array itself in place.
-Freeing the entire array is done with :c:func:`flex_array_free()`::
-
- void flex_array_free(struct flex_array *array);
-
-As of this writing, there are no users of flexible arrays in the mainline
-kernel. The functions described here are also not exported to modules;
-that will probably be fixed when somebody comes up with a need for it.
-
-
-Flexible array functions
-------------------------
-
-.. kernel-doc:: include/linux/flex_array.h
diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst
index 6b38a39fab24..a534cc7ebd05 100644
--- a/Documentation/core-api/genalloc.rst
+++ b/Documentation/core-api/genalloc.rst
@@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future.
:functions: gen_pool_for_each_chunk
.. kernel-doc:: lib/genalloc.c
- :functions: addr_in_gen_pool
+ :functions: gen_pool_has_addr
.. kernel-doc:: lib/genalloc.c
:functions: gen_pool_avail
diff --git a/Documentation/core-api/generic-radix-tree.rst b/Documentation/core-api/generic-radix-tree.rst
new file mode 100644
index 000000000000..ed42839ae42f
--- /dev/null
+++ b/Documentation/core-api/generic-radix-tree.rst
@@ -0,0 +1,12 @@
+=================================
+Generic radix trees/sparse arrays
+=================================
+
+.. kernel-doc:: include/linux/generic-radix-tree.h
+ :doc: Generic radix trees/sparse arrays
+
+generic radix tree functions
+----------------------------
+
+.. kernel-doc:: include/linux/generic-radix-tree.h
+ :functions:
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 3adee82be311..6870baffef82 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -28,6 +28,7 @@ Core utilities
errseq
printk-formats
circular-buffers
+ generic-radix-tree
memory-allocation
mm-api
gfp_mask-from-fs-io
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 66cad5c86171..c6639c514a65 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -508,9 +508,14 @@ manner. The codes are the following:
sd - soft-dirty flag
mm - mixed map area
hg - huge page advise flag
- nh - no-huge page advise flag
+ nh - no-huge page advise flag [*]
mg - mergable advise flag
+ [*] A process mapping may be advised to not be backed by transparent hugepages
+ by either madvise(MADV_NOHUGEPAGE) or prctl(PR_SET_THP_DISABLE). See
+ Documentation/admin-guide/mm/transhuge.rst for system-wide and process
+ mapping policies.
+
Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may
be vanished or the reverse -- new added. Interpretation of their meaning
diff --git a/Documentation/flexible-arrays.txt b/Documentation/flexible-arrays.txt
deleted file mode 100644
index a0f2989dd804..000000000000
--- a/Documentation/flexible-arrays.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-===================================
-Using flexible arrays in the kernel
-===================================
-
-:Updated: Last updated for 2.6.32
-:Author: Jonathan Corbet <corbet@lwn.net>
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with vmalloc(). This solution not ideal, though. On 32-bit systems,
-memory from vmalloc() must be mapped into a relatively small address space;
-it's easy to run out. On SMP systems, the page table changes required by
-vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs. And, on all systems, use of space in the vmalloc() range
-increases pressure on the translation lookaside buffer (TLB), reducing the
-performance of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by
-piecing together an array from smaller parts; the flexible array library
-exists to make this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index. Sparse arrays are handled
-reasonably well. Only single-page allocations are made, so memory
-allocation failures should be relatively rare. The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation. It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with::
-
- #include <linux/flex_array.h>
-
- struct flex_array *flex_array_alloc(int element_size,
- unsigned int total,
- gfp_t flags);
-
-The individual object size is provided by element_size, while total is the
-maximum number of objects which can be stored in the array. The flags
-argument is passed directly to the internal memory allocation calls. With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
- DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to::
-
- int flex_array_put(struct flex_array *array, unsigned int element_nr,
- void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by element_nr (which must be less than the maximum specified when
-the array was created). If any memory allocations must be performed, flags
-will be used. The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing. That can be avoided by using
-GFP_ATOMIC for the flags value, but, often, there is a better way. The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using::
-
- int flex_array_prealloc(struct flex_array *array, unsigned int start,
- unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by start and nr_elements has been allocated. Thereafter, a
-flex_array_put() call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with::
-
- void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-
-The return value is a pointer to the data element, or NULL if that
-particular element has never been allocated.
-
-Note that it is possible to get back a valid pointer for an element which
-has never been stored in the array. Memory for array elements is allocated
-one page at a time; a single allocation could provide memory for several
-adjacent elements. Flexible array elements are normally initialized to the
-value FLEX_ARRAY_FREE (defined as 0x6c in <linux/poison.h>), so errors
-involving that number probably result from use of unstored array entries.
-Note that, if array elements are allocated with __GFP_ZERO, they will be
-initialized to zero and this poisoning will not happen.
-
-Individual elements in the array can be cleared with::
-
- int flex_array_clear(struct flex_array *array, unsigned int element_nr);
-
-This function will set the given element to FLEX_ARRAY_FREE and return
-zero. If storage for the indicated element is not allocated for the array,
-flex_array_clear() will return -EINVAL instead. Note that clearing an
-element does not release the storage associated with it; to reduce the
-allocated size of an array, call::
-
- int flex_array_shrink(struct flex_array *array);
-
-The return value will be the number of pages of memory actually freed.
-This function works by scanning the array for pages containing nothing but
-FLEX_ARRAY_FREE bytes, so (1) it can be expensive, and (2) it will not work
-if the array's pages are allocated with __GFP_ZERO.
-
-It is possible to remove all elements of an array with a call to::
-
- void flex_array_free_parts(struct flex_array *array);
-
-This call frees all elements, but leaves the array itself in place.
-Freeing the entire array is done with::
-
- void flex_array_free(struct flex_array *array);
-
-As of this writing, there are no users of flexible arrays in the mainline
-kernel. The functions described here are also not exported to modules;
-that will probably be fixed when somebody comes up with a need for it.
diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h
index e6e13a85796a..5a77a40567fa 100644
--- a/arch/alpha/include/asm/topology.h
+++ b/arch/alpha/include/asm/topology.h
@@ -4,6 +4,7 @@
#include <linux/smp.h>
#include <linux/threads.h>
+#include <linux/numa.h>
#include <asm/machvec.h>
#ifdef CONFIG_NUMA
@@ -29,7 +30,7 @@ static const struct cpumask *cpumask_of_node(int node)
{
int cpu;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return cpu_all_mask;
cpumask_clear(&node_to_cpumask_map[node]);
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index f1e2922e447c..a876101a0f82 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -563,7 +563,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page)
static bool __in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
static int __free_from_pool(void *start, size_t size)
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index f5cc1ccfea3d..57de0dde3ae0 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -719,16 +719,9 @@ EXPORT_SYMBOL(phys_mem_access_prot);
#define vectors_base() (vectors_high() ? 0xffff0000 : 0)
-static void __init *early_alloc_aligned(unsigned long sz, unsigned long align)
-{
- void *ptr = __va(memblock_phys_alloc(sz, align));
- memset(ptr, 0, sz);
- return ptr;
-}
-
static void __init *early_alloc(unsigned long sz)
{
- return early_alloc_aligned(sz, sz);
+ return memblock_alloc(sz, sz);
}
static void *__init late_alloc(unsigned long sz)
@@ -1000,7 +993,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
if (!nr)
return;
- svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));
+ svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
create_mapping(md);
@@ -1022,7 +1015,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size,
struct vm_struct *vm;
struct static_vm *svm;
- svm = early_alloc_aligned(sizeof(*svm), __alignof__(*svm));
+ svm = memblock_alloc(sizeof(*svm), __alignof__(*svm));
vm = &svm->vm;
vm->addr = (void *)addr;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d366127..cfbf307d6dc4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1467,6 +1467,10 @@ config SYSVIPC_COMPAT
def_bool y
depends on COMPAT && SYSVIPC
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ def_bool y
+ depends on HUGETLB_PAGE && MIGRATION
+
menu "Power management options"
source "kernel/power/Kconfig"
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index fb6609875455..c6a07a3b433e 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,6 +20,11 @@
#include <asm/page.h>
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported
+extern bool arch_hugetlb_migration_supported(struct hstate *h);
+#endif
+
#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 28cbc22d7e30..6b4a47b3adf4 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -27,6 +27,26 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+bool arch_hugetlb_migration_supported(struct hstate *h)
+{
+ size_t pagesize = huge_page_size(h);
+
+ switch (pagesize) {
+#ifdef CONFIG_ARM64_4K_PAGES
+ case PUD_SIZE:
+#endif
+ case PMD_SIZE:
+ case CONT_PMD_SIZE:
+ case CONT_PTE_SIZE:
+ return true;
+ }
+ pr_warn("%s: unrecognized huge page size 0x%lx\n",
+ __func__, pagesize);
+ return false;
+}
+#endif
+
int pmd_huge(pmd_t pmd)
{
return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 33a2c94fed0d..63b4a1705182 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -30,6 +30,7 @@ generic-y += pgalloc.h
generic-y += preempt.h
generic-y += segment.h
generic-y += serial.h
+generic-y += shmparam.h
generic-y += tlbflush.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 6c6f6301012e..0febf1a07c30 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
include include/uapi/asm-generic/Kbuild.asm
generic-y += kvm_para.h
-generic-y += shmparam.h
generic-y += ucontext.h
diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c
index 75b79571732c..0be289839ce0 100644
--- a/arch/c6x/mm/dma-coherent.c
+++ b/arch/c6x/mm/dma-coherent.c
@@ -121,8 +121,6 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr,
*/
void __init coherent_mem_init(phys_addr_t start, u32 size)
{
- phys_addr_t bitmap_phys;
-
if (!size)
return;
@@ -138,11 +136,8 @@ void __init coherent_mem_init(phys_addr_t start, u32 size)
if (dma_size & (PAGE_SIZE - 1))
++dma_pages;
- bitmap_phys = memblock_phys_alloc(BITS_TO_LONGS(dma_pages) * sizeof(long),
- sizeof(long));
-
- dma_bitmap = phys_to_virt(bitmap_phys);
- memset(dma_bitmap, 0, dma_pages * PAGE_SIZE);
+ dma_bitmap = memblock_alloc(BITS_TO_LONGS(dma_pages) * sizeof(long),
+ sizeof(long));
}
static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size,
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index cd400d353d18..961c1dc064e1 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -40,6 +40,7 @@ generic-y += preempt.h
generic-y += scatterlist.h
generic-y += sections.h
generic-y += serial.h
+generic-y += shmparam.h
generic-y += sizes.h
generic-y += spinlock.h
generic-y += timex.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 6c6f6301012e..0febf1a07c30 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
include include/uapi/asm-generic/Kbuild.asm
generic-y += kvm_para.h
-generic-y += shmparam.h
generic-y += ucontext.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 47c4da3d64a4..b25fd42aa0f4 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -30,6 +30,7 @@ generic-y += rwsem.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += shmparam.h
generic-y += sizes.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild
index 61d955c1747a..c1b06dcf6cf8 100644
--- a/arch/hexagon/include/uapi/asm/Kbuild
+++ b/arch/hexagon/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
include include/uapi/asm-generic/Kbuild.asm
-generic-y += shmparam.h
generic-y += ucontext.h
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
index 92c376279c6d..1315da6c7aeb 100644
--- a/arch/ia64/kernel/numa.c
+++ b/arch/ia64/kernel/numa.c
@@ -74,7 +74,7 @@ void __init build_cpu_to_node_map(void)
cpumask_clear(&node_to_cpu_mask[node]);
for_each_possible_early_cpu(cpu) {
- node = -1;
+ node = NUMA_NO_NODE;
for (i = 0; i < NR_CPUS; ++i)
if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
node = node_cpuid[i].nid;
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 8a965784340c..f9c36750c6a4 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -227,7 +227,7 @@ void __init setup_per_cpu_areas(void)
* CPUs are put into groups according to node. Walk cpu_map
* and create new groups at node boundaries.
*/
- prev_node = -1;
+ prev_node = NUMA_NO_NODE;
ai->nr_groups = 0;
for (unit = 0; unit < nr_units; unit++) {
cpu = cpu_map[unit];
@@ -435,7 +435,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
{
void *ptr = NULL;
u8 best = 0xff;
- int bestnode = -1, node, anynode = 0;
+ int bestnode = NUMA_NO_NODE, node, anynode = 0;
for_each_online_node(node) {
if (node_isset(node, memory_less_mask))
@@ -447,7 +447,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
anynode = node;
}
- if (bestnode == -1)
+ if (bestnode == NUMA_NO_NODE)
bestnode = anynode;
ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 9f1dd26903e3..95f8f631c4df 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -20,6 +20,7 @@ generic-y += mm-arch-hooks.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += sections.h
+generic-y += shmparam.h
generic-y += spinlock.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index b8b3525271fa..960bf1e4be53 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -2,4 +2,3 @@ include include/uapi/asm-generic/Kbuild.asm
generated-y += unistd_32.h
generic-y += kvm_para.h
-generic-y += shmparam.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 9c7d1d25bf3d..791cc8d54d0a 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -26,6 +26,7 @@ generic-y += parport.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += serial.h
+generic-y += shmparam.h
generic-y += syscalls.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 28823e3db825..97823ec46e97 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -2,5 +2,4 @@ include include/uapi/asm-generic/Kbuild.asm
generated-y += unistd_32.h
generic-y += kvm_para.h
-generic-y += shmparam.h
generic-y += ucontext.h
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index b17fd8aafd64..44f4b8910c21 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -363,8 +363,9 @@ void __init *early_get_page(void)
* Mem start + kernel_tlb -> here is limit
* because of mem mapping from head.S
*/
- return __va(memblock_alloc_base(PAGE_SIZE, PAGE_SIZE,
- memory_start + kernel_tlb));
+ return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
+ MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb,
+ NUMA_NO_NODE);
}
#endif /* CONFIG_MMU */
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index 253f79fc7196..d1e521cce317 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -78,8 +78,7 @@ static void __init map_ram(void)
}
/* Alloc one page for holding PTE's... */
- pte = (pte_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(pte, 0, PAGE_SIZE);
+ pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pmd(pme, __pmd(__pa(pte) + _PAGE_KERNEL_TABLE));
/* Fill the newly allocated page with PTE'S */
@@ -111,8 +110,7 @@ static void __init fixedrange_init(void)
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
- fixmap_pmd_p = (pmd_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(fixmap_pmd_p, 0, PAGE_SIZE);
+ fixmap_pmd_p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pmd(pmd, __pmd(__pa(fixmap_pmd_p) + _PAGE_KERNEL_TABLE));
#ifdef CONFIG_HIGHMEM
@@ -124,8 +122,7 @@ static void __init fixedrange_init(void)
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
- pte = (pte_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(pte, 0, PAGE_SIZE);
+ pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pmd(pmd, __pmd(__pa(pte) + _PAGE_KERNEL_TABLE));
pkmap_page_table = pte;
#endif /* CONFIG_HIGHMEM */
@@ -150,8 +147,7 @@ void __init paging_init(void)
fixedrange_init();
/* allocate space for empty_zero_page */
- zero_page = __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(zero_page, 0, PAGE_SIZE);
+ zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
zone_sizes_init();
empty_zero_page = virt_to_page(zero_page);
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index eb87cd8327c8..1f04844b6b82 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -34,6 +34,7 @@ generic-y += qrwlock_types.h
generic-y += qrwlock.h
generic-y += sections.h
generic-y += segment.h
+generic-y += shmparam.h
generic-y += string.h
generic-y += switch_to.h
generic-y += topology.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 6c6f6301012e..0febf1a07c30 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
include include/uapi/asm-generic/Kbuild.asm
generic-y += kvm_para.h
-generic-y += shmparam.h
generic-y += ucontext.h
diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c
index 270d1c9bc0d6..051bcb4fefd3 100644
--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@@ -122,13 +122,10 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm)
{
pte_t *pte;
- if (likely(mem_init_done)) {
- pte = (pte_t *) __get_free_page(GFP_KERNEL);
- } else {
- pte = (pte_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- }
+ if (likely(mem_init_done))
+ pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ else
+ pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
- if (pte)
- clear_page(pte);
return pte;
}
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index aee4fcc24990..77fc21278fa2 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -10,6 +10,7 @@
#include <linux/pci.h>
#include <linux/list.h>
#include <linux/ioport.h>
+#include <linux/numa.h>
struct device_node;
@@ -265,7 +266,7 @@ extern int pcibios_map_io_space(struct pci_bus *bus);
#ifdef CONFIG_NUMA
#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE))
#else
-#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1)
+#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE)
#endif
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 913bfca09c4f..8c890c6557ed 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/sched/task.h>
+#include <linux/numa.h>
#include <asm/lppaca.h>
#include <asm/paca.h>
@@ -27,7 +28,7 @@
static void *__init alloc_paca_data(unsigned long size, unsigned long align,
unsigned long limit, int cpu)
{
- unsigned long pa;
+ void *ptr;
int nid;
/*
@@ -36,23 +37,21 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
* which will put its paca in the right place.
*/
if (cpu == boot_cpuid) {
- nid = -1;
+ nid = NUMA_NO_NODE;
memblock_set_bottom_up(true);
} else {
nid = early_cpu_to_node(cpu);
}
- pa = memblock_alloc_base_nid(size, align, limit, nid, MEMBLOCK_NONE);
- if (!pa) {
- pa = memblock_alloc_base(size, align, limit);
- if (!pa)
- panic("cannot allocate paca data");
- }
+ ptr = memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
+ limit, nid);
+ if (!ptr)
+ panic("cannot allocate paca data");
if (cpu == boot_cpuid)
memblock_set_bottom_up(false);
- return __va(pa);
+ return ptr;
}
#ifdef CONFIG_PPC_PSERIES
@@ -118,7 +117,6 @@ static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit)
}
s = alloc_paca_data(sizeof(*s), L1_CACHE_BYTES, limit, cpu);
- memset(s, 0, sizeof(*s));
s->persistent = cpu_to_be32(SLB_NUM_BOLTED);
s->buffer_length = cpu_to_be32(sizeof(*s));
@@ -222,7 +220,6 @@ void __init allocate_paca(int cpu)
paca = alloc_paca_data(sizeof(struct paca_struct), L1_CACHE_BYTES,
limit, cpu);
paca_ptrs[cpu] = paca;
- memset(paca, 0, sizeof(struct paca_struct));
initialise_paca(paca, cpu);
#ifdef CONFIG_PPC_PSERIES
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 88e4f69a09e5..4538e8ddde80 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -32,6 +32,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/vgaarb.h>
+#include <linux/numa.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -132,7 +133,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
int nid = of_node_to_nid(dev);
if (nid < 0 || !node_online(nid))
- nid = -1;
+ nid = NUMA_NO_NODE;
PHB_SET_NODE(phb, nid);
}
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index ca00fbb97cf8..82be48c123cf 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -459,8 +459,8 @@ void __init smp_setup_cpu_maps(void)
DBG("smp_setup_cpu_maps()\n");
- cpu_to_phys_id = __va(memblock_phys_alloc(nr_cpu_ids * sizeof(u32), __alignof__(u32)));
- memset(cpu_to_phys_id, 0, nr_cpu_ids * sizeof(u32));
+ cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
+ __alignof__(u32));
for_each_node_by_type(dn, "cpu") {
const __be32 *intserv;
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 236c1151a3a7..3dcd779aef44 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void)
static void *__init alloc_stack(unsigned long limit, int cpu)
{
- unsigned long pa;
+ void *ptr;
BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);
- pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
- early_cpu_to_node(cpu), MEMBLOCK_NONE);
- if (!pa) {
- pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
- if (!pa)
- panic("cannot allocate stacks");
- }
+ ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE,
+ MEMBLOCK_LOW_LIMIT, limit,
+ early_cpu_to_node(cpu));
+ if (!ptr)
+ panic("cannot allocate stacks");
- return __va(pa);
+ return ptr;
}
void __init irqstack_early_init(void)
@@ -739,20 +737,17 @@ void __init emergency_stack_init(void)
struct thread_info *ti;
ti = alloc_stack(limit, i);
- memset(ti, 0, THREAD_SIZE);
emerg_stack_init_thread_info(ti, i);
paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
#ifdef CONFIG_PPC_BOOK3S_64
/* emergency stack for NMI exception handling. */
ti = alloc_stack(limit, i);
- memset(ti, 0, THREAD_SIZE);
emerg_stack_init_thread_info(ti, i);
paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
/* emergency stack for machine check exception handling. */
ti = alloc_stack(limit, i);
- memset(ti, 0, THREAD_SIZE);
emerg_stack_init_thread_info(ti, i);
paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
#endif
@@ -933,8 +928,9 @@ static void __ref init_fallback_flush(void)
* hardware prefetch runoff. We don't have a recipe for load patterns to
* reliably avoid the prefetcher.
*/
- l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
- memset(l1d_flush_fallback_area, 0, l1d_size * 2);
+ l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2,
+ l1d_size, MEMBLOCK_LOW_LIMIT,
+ limit, NUMA_NO_NODE);
for_each_possible_cpu(cpu) {
struct paca_struct *paca = paca_ptrs[cpu];
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0cc7fbc3bd1c..bc6be44913d4 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -908,9 +908,9 @@ static void __init htab_initialize(void)
#ifdef CONFIG_DEBUG_PAGEALLOC
if (debug_pagealloc_enabled()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = __va(memblock_alloc_base(
- linear_map_hash_count, 1, ppc64_rma_size));
- memset(linear_map_hash_slots, 0, linear_map_hash_count);
+ linear_map_hash_slots = memblock_alloc_try_nid(
+ linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 87f0dd004295..270cefb75cca 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -215,7 +215,7 @@ static void initialize_distance_lookup_table(int nid,
*/
static int associativity_to_nid(const __be32 *associativity)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
if (min_common_depth == -1)
goto out;
@@ -225,7 +225,7 @@ static int associativity_to_nid(const __be32 *associativity)
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = -1;
+ nid = NUMA_NO_NODE;
if (nid > 0 &&
of_read_number(associativity, 1) >= distance_ref_points_depth) {
@@ -244,7 +244,7 @@ out:
*/
static int of_node_to_nid_single(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
const __be32 *tmp;
tmp = of_get_associativity(device);
@@ -256,7 +256,7 @@ static int of_node_to_nid_single(struct device_node *device)
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
of_node_get(device);
while (device) {
@@ -454,7 +454,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
*/
static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
struct device_node *cpu;
/*
@@ -930,7 +930,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
struct drmem_lmb *lmb;
unsigned long lmb_size;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
lmb_size = drmem_lmb_size();
@@ -960,7 +960,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c
index e0ccf36714b2..53cbc7dc2df2 100644
--- a/arch/powerpc/mm/pgtable-book3e.c
+++ b/arch/powerpc/mm/pgtable-book3e.c
@@ -57,12 +57,8 @@ void vmemmap_remove_mapping(unsigned long start,
static __ref void *early_alloc_pgtable(unsigned long size)
{
- void *pt;
-
- pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
- memset(pt, 0, size);
-
- return pt;
+ return memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
+ __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
}
/*
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index f3c31f5e1026..55876b7e3813 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -195,11 +195,8 @@ void __init mmu_partition_table_init(void)
unsigned long ptcr;
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
- partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
- MEMBLOCK_ALLOC_ANYWHERE));
-
/* Initialize the Partition Table with no entries */
- memset((void *)partition_tb, 0, patb_size);
+ partition_tb = memblock_alloc(patb_size, patb_size);
/*
* update partition table control register,
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 931156069a81..29bcea57dd08 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -51,26 +51,15 @@ static int native_register_process_table(unsigned long base, unsigned long pg_sz
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
{
- unsigned long pa = 0;
- void *pt;
+ phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+ phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
- if (region_start || region_end) /* has region hint */
- pa = memblock_alloc_range(size, size, region_start, region_end,
- MEMBLOCK_NONE);
- else if (nid != -1) /* has node hint */
- pa = memblock_alloc_base_nid(size, size,
- MEMBLOCK_ALLOC_ANYWHERE,
- nid, MEMBLOCK_NONE);
+ if (region_start)
+ min_addr = region_start;
+ if (region_end)
+ max_addr = region_end;
- if (!pa)
- pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);
-
- BUG_ON(!pa);
-
- pt = __va(pa);
- memset(pt, 0, size);
-
- return pt;
+ return memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
}
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 3f4193201ee7..36a664f06c65 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -211,8 +211,7 @@ void __init MMU_init_hw(void)
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = __va(memblock_phys_alloc(Hash_size, Hash_size));
- memset(Hash, 0, Hash_size);
+ Hash = memblock_alloc(Hash_size, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index f2971522fb4a..f62930f839ca 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -208,7 +208,9 @@ static int __init iob_init(struct device_node *dn)
pr_debug(" -> %s\n", __func__);
/* For 2G space, 8x64 pages (2^21 bytes) is max total l2 size */
- iob_l2_base = (u32 *)__va(memblock_alloc_base(1UL<<21, 1UL<<21, 0x80000000));
+ iob_l2_base = memblock_alloc_try_nid_raw(1UL << 21, 1UL << 21,
+ MEMBLOCK_LOW_LIMIT, 0x80000000,
+ NUMA_NO_NODE);
pr_info("IOBMAP L2 allocated at: %p\n", iob_l2_base);
@@ -269,4 +271,3 @@ void __init iommu_init_early_pasemi(void)
pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi;
set_pci_dma_ops(&dma_iommu_ops);
}
-
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 84d038ed3882..248a38ad25c7 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/numa.h>
#include <asm/machdep.h>
#include <asm/debugfs.h>
@@ -223,7 +224,7 @@ static int memtrace_online(void)
ent = &memtrace_array[i];
/* We have onlined this chunk previously */
- if (ent->nid == -1)
+ if (ent->nid == NUMA_NO_NODE)
continue;
/* Remove from io mappings */
@@ -257,7 +258,7 @@ static int memtrace_online(void)
*/
debugfs_remove_recursive(ent->dir);
pr_info("Added trace memory back to node %d\n", ent->nid);
- ent->size = ent->start = ent->nid = -1;
+ ent->size = ent->start = ent->nid = NUMA_NO_NODE;
}
if (ret)
return ret;
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 79586f127521..8e157f9f1ff2 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -171,8 +171,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
/*
* Allocate a buffer to hold the MC recoverable ranges.
*/
- mc_recoverable_range =__va(memblock_phys_alloc(size, __alignof__(u64)));
- memset(mc_recoverable_range, 0, size);
+ mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
for (i = 0; i < mc_recoverable_range_len; i++) {
mc_recoverable_range[i].start_addr =
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 41f62ca27c63..e4f0dfd4ae33 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -130,8 +130,13 @@ static void __init fwnmi_init(void)
* It will be used in real mode mce handler, hence it needs to be
* below RMA.
*/
- mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus,
- RTAS_ERROR_LOG_MAX, ppc64_rma_size));
+ mce_data_buf = memblock_alloc_try_nid_raw(RTAS_ERROR_LOG_MAX * nr_cpus,
+ RTAS_ERROR_LOG_MAX, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!mce_data_buf)
+ panic("Failed to allocate %d bytes below %pa for MCE buffer\n",
+ RTAS_ERROR_LOG_MAX * nr_cpus, &ppc64_rma_size);
+
for_each_possible_cpu(i) {
paca_ptrs[i]->mce_data_buf = mce_data_buf +
(RTAS_ERROR_LOG_MAX * i);
@@ -140,8 +145,13 @@ static void __init fwnmi_init(void)
#ifdef CONFIG_PPC_BOOK3S_64
/* Allocate per cpu slb area to save old slb contents during MCE */
size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
- slb_ptr = __va(memblock_alloc_base(size, sizeof(struct slb_entry),
- ppc64_rma_size));
+ slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry),
+ MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
+ NUMA_NO_NODE);
+ if (!slb_ptr)
+ panic("Failed to allocate %zu bytes below %pa for slb area\n",
+ size, &ppc64_rma_size);
+
for_each_possible_cpu(i)
paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
#endif
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index a5b40d1460f1..25bc25fe0d93 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -251,8 +251,11 @@ static void allocate_dart(void)
* 16MB (1 << 24) alignment. We allocate a full 16Mb chuck since we
* will blow up an entire large page anyway in the kernel mapping.
*/
- dart_tablebase = __va(memblock_alloc_base(1UL<<24,
- 1UL<<24, 0x80000000L));
+ dart_tablebase = memblock_alloc_try_nid_raw(SZ_16M, SZ_16M,
+ MEMBLOCK_LOW_LIMIT, SZ_2G,
+ NUMA_NO_NODE);
+ if (!dart_tablebase)
+ panic("Failed to allocate 16MB below 2GB for DART table\n");
/* There is no point scanning the DART space for leaks*/
kmemleak_no_scan((void *)dart_tablebase);
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
index d31bde0870d8..2d1271e2a70d 100644
--- a/arch/s390/numa/numa.c
+++ b/arch/s390/numa/numa.c
@@ -58,18 +58,6 @@ EXPORT_SYMBOL(__node_distance);
int numa_debug_enabled;
/*
- * alloc_node_data() - Allocate node data
- */
-static __init pg_data_t *alloc_node_data(void)
-{
- pg_data_t *res;
-
- res = (pg_data_t *) memblock_phys_alloc(sizeof(pg_data_t), 8);
- memset(res, 0, sizeof(pg_data_t));
- return res;
-}
-
-/*
* numa_setup_memory() - Assign bootmem to nodes
*
* The memory is first added to memblock without any respect to nodes.
@@ -105,7 +93,7 @@ static void __init numa_setup_memory(void)
/* Allocate and fill out node_data */
for (nid = 0; nid < MAX_NUMNODES; nid++)
- NODE_DATA(nid) = alloc_node_data();
+ NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh
index 85d78d9309ad..904b8e6e625d 100644
--- a/arch/sh/kernel/syscalls/syscalltbl.sh
+++ b/arch/sh/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
t_entry="$3"
while [ $t_nxt -lt $t_nr ]; do
- printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}"
+ printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
t_nxt=$((t_nxt+1))
done
- printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
+ printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
}
grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 96e9c54a07f5..bd1a9c544767 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -10,7 +10,7 @@
#include <linux/sys.h>
#include <linux/linkage.h>
-#define __SYSCALL(nr, entry, nargs) .long entry
+#define __SYSCALL(nr, entry) .long entry
.data
ENTRY(sys_call_table)
#include <asm/syscall_table.h>
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index a8e5c0e00fca..a0fa4de03dd5 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -192,24 +192,16 @@ void __init page_table_range_init(unsigned long start, unsigned long end,
void __init allocate_pgdat(unsigned int nid)
{
unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- unsigned long phys;
-#endif
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#ifdef CONFIG_NEED_MULTIPLE_NODES
- phys = __memblock_alloc_base(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, end_pfn << PAGE_SHIFT);
- /* Retry with all of system memory */
- if (!phys)
- phys = __memblock_alloc_base(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, memblock_end_of_DRAM());
- if (!phys)
+ NODE_DATA(nid) = memblock_alloc_try_nid_nopanic(
+ sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!NODE_DATA(nid))
panic("Can't allocate pgdat for node %d\n", nid);
-
- NODE_DATA(nid) = __va(phys);
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
#endif
NODE_DATA(nid)->node_start_pfn = start_pfn;
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index 830e8b3684e4..c4bde6148810 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -41,9 +41,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
__add_active_range(nid, start_pfn, end_pfn);
/* Node-local pgdat */
- NODE_DATA(nid) = __va(memblock_alloc_base(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, end));
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, nid);
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1393a8ac596b..22500c3be7a9 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
extern struct page *mem_map_zero;
#define ZERO_PAGE(vaddr) (mem_map_zero)
-/* This macro must be updated when the size of struct page grows above 80
- * or reduces below 64.
- * The idea that compiler optimizes out switch() statement, and only
- * leaves clrx instructions
- */
-#define mm_zero_struct_page(pp) do { \
- unsigned long *_pp = (void *)(pp); \
- \
- /* Check that struct page is either 64, 72, or 80 bytes */ \
- BUILD_BUG_ON(sizeof(struct page) & 7); \
- BUILD_BUG_ON(sizeof(struct page) < 64); \
- BUILD_BUG_ON(sizeof(struct page) > 80); \
- \
- switch (sizeof(struct page)) { \
- case 80: \
- _pp[9] = 0; /* fallthrough */ \
- case 72: \
- _pp[8] = 0; /* fallthrough */ \
- default: \
- _pp[7] = 0; \
- _pp[6] = 0; \
- _pp[5] = 0; \
- _pp[4] = 0; \
- _pp[3] = 0; \
- _pp[2] = 0; \
- _pp[1] = 0; \
- _pp[0] = 0; \
- } \
-} while (0)
-
/* PFNs are real physical page numbers. However, mem_map only begins to record
* per-page information starting at pfn_base. This is to handle systems where
* the first physical page in the machine is at some huge physical address,
diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c
index be71ae086622..0ca08d455e80 100644
--- a/arch/sparc/kernel/pci_fire.c
+++ b/arch/sparc/kernel/pci_fire.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/prom.h>
#include <asm/irq.h>
@@ -416,7 +417,7 @@ static int pci_fire_pbm_init(struct pci_pbm_info *pbm,
struct device_node *dp = op->dev.of_node;
int err;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->pci_ops = &sun4u_pci_ops;
pbm->config_space_reg_bits = 12;
diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c
index 934b97c72f7c..421aba00e6b0 100644
--- a/arch/sparc/kernel/pci_schizo.c
+++ b/arch/sparc/kernel/pci_schizo.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/iommu.h>
#include <asm/irq.h>
@@ -1347,7 +1348,7 @@ static int schizo_pbm_init(struct pci_pbm_info *pbm,
pbm->next = pci_pbm_root;
pci_pbm_root = pbm;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->pci_ops = &sun4u_pci_ops;
pbm->config_space_reg_bits = 8;
diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c
index e897a4ded3a1..c50ff1fee0e6 100644
--- a/arch/sparc/kernel/prom_64.c
+++ b/arch/sparc/kernel/prom_64.c
@@ -34,16 +34,13 @@
void * __init prom_early_alloc(unsigned long size)
{
- unsigned long paddr = memblock_phys_alloc(size, SMP_CACHE_BYTES);
- void *ret;
+ void *ret = memblock_alloc(size, SMP_CACHE_BYTES);
- if (!paddr) {
+ if (!ret) {
prom_printf("prom_early_alloc(%lu) failed\n", size);
prom_halt();
}
- ret = __va(paddr);
- memset(ret, 0, size);
prom_early_allocated += size;
return ret;
diff --git a/arch/sparc/kernel/psycho_common.c b/arch/sparc/kernel/psycho_common.c
index 81aa91e5c0e6..e90bcb6bad7f 100644
--- a/arch/sparc/kernel/psycho_common.c
+++ b/arch/sparc/kernel/psycho_common.c
@@ -5,6 +5,7 @@
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include <asm/upa.h>
@@ -454,7 +455,7 @@ void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op
struct device_node *dp = op->dev.of_node;
pbm->name = dp->full_name;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->chip_type = chip_type;
pbm->chip_version = of_getintprop_default(dp, "version#", 0);
pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0);
diff --git a/arch/sparc/kernel/sbus.c b/arch/sparc/kernel/sbus.c
index 41c5deb581b8..32141e1006c4 100644
--- a/arch/sparc/kernel/sbus.c
+++ b/arch/sparc/kernel/sbus.c
@@ -15,6 +15,7 @@
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/io.h>
@@ -561,7 +562,7 @@ static void __init sbus_iommu_init(struct platform_device *op)
op->dev.archdata.iommu = iommu;
op->dev.archdata.stc = strbuf;
- op->dev.archdata.numa_node = -1;
+ op->dev.archdata.numa_node = NUMA_NO_NODE;
reg_base = regs + SYSIO_IOMMUREG_BASE;
iommu->iommu_control = reg_base + IOMMU_CONTROL;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index b4221d3727d0..ef340e8f209f 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -976,13 +976,13 @@ static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid)
{
int prev_nid, new_nid;
- prev_nid = -1;
+ prev_nid = NUMA_NO_NODE;
for ( ; start < end; start += PAGE_SIZE) {
for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
struct node_mem_mask *p = &node_masks[new_nid];
if ((start & p->mask) == p->match) {
- if (prev_nid == -1)
+ if (prev_nid == NUMA_NO_NODE)
prev_nid = new_nid;
break;
}
@@ -1089,16 +1089,13 @@ static void __init allocate_node_data(int nid)
struct pglist_data *p;
unsigned long start_pfn, end_pfn;
#ifdef CONFIG_NEED_MULTIPLE_NODES
- unsigned long paddr;
- paddr = memblock_phys_alloc_try_nid(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, nid);
- if (!paddr) {
+ NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, nid);
+ if (!NODE_DATA(nid)) {
prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
prom_halt();
}
- NODE_DATA(nid) = __va(paddr);
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
NODE_DATA(nid)->node_id = nid;
#endif
@@ -1208,7 +1205,7 @@ int of_node_to_nid(struct device_node *dp)
md = mdesc_grab();
count = 0;
- nid = -1;
+ nid = NUMA_NO_NODE;
mdesc_for_each_node_by_name(md, grp, "group") {
if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
nid = count;
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 1372553dc0a9..1d1544b6ca74 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -28,6 +28,7 @@ generic-y += preempt.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += shmparam.h
generic-y += sizes.h
generic-y += syscalls.h
generic-y += topology.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 6c6f6301012e..0febf1a07c30 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
include include/uapi/asm-generic/Kbuild.asm
generic-y += kvm_para.h
-generic-y += shmparam.h
generic-y += ucontext.h
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index 040a8c279761..a40219291965 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -141,18 +141,12 @@ static void __init build_mem_type_table(void)
#define vectors_base() (vectors_high() ? 0xffff0000 : 0)
-static void __init *early_alloc(unsigned long sz)
-{
- void *ptr = __va(memblock_phys_alloc(sz, sz));
- memset(ptr, 0, sz);
- return ptr;
-}
-
static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr,
unsigned long prot)
{
if (pmd_none(*pmd)) {
- pte_t *pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t));
+ pte_t *pte = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t),
+ PTRS_PER_PTE * sizeof(pte_t));
__pmd_populate(pmd, __pa(pte) | prot);
}
BUG_ON(pmd_bad(*pmd));
@@ -354,7 +348,7 @@ static void __init devicemaps_init(void)
/*
* Allocate the vector page early.
*/
- vectors = early_alloc(PAGE_SIZE);
+ vectors = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE)
pmd_clear(pmd_off_k(addr));
@@ -431,7 +425,7 @@ void __init paging_init(void)
top_pmd = pmd_off_k(0xffff0000);
/* allocate the zero page. */
- zero_page = early_alloc(PAGE_SIZE);
+ zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
bootmem_init();
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 662963681ea6..e662f987dfa2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/pat.h>
#include <asm/x86_init.h>
@@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus)
int node;
node = __pcibus_to_node(bus);
- return (node == -1) ? cpu_online_mask :
+ return (node == NUMA_NO_NODE) ? cpu_online_mask :
cpumask_of_node(node);
}
#endif
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index a555da094157..1e225528f0d7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -27,6 +27,7 @@
#include <linux/crash_dump.h>
#include <linux/reboot.h>
#include <linux/memory.h>
+#include <linux/numa.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void)
}
/* Set socket -> node values: */
- lnid = -1;
+ lnid = NUMA_NO_NODE;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
int apicid, sockid;
@@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void)
new_hub->pnode = 0xffff;
new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
- new_hub->memory_nid = -1;
+ new_hub->memory_nid = NUMA_NO_NODE;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
@@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void)
uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
- if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
/* Init memoryless node: */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ccd1f2a8e557..c91ff9f9fe8a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/numa.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -841,7 +842,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
- static int current_node = -1;
+ static int current_node = NUMA_NO_NODE;
int node = early_cpu_to_node(cpu);
static int width, node_width;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2ff25ad33233..dea3725b1dc0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1031,7 +1031,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+ vm_fault_t fault)
{
struct task_struct *tsk = current;
diff --git a/block/blk-core.c b/block/blk-core.c
index 3c5f61ceeb67..ab17c055faa6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -496,8 +496,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q->stats)
goto fail_stats;
- q->backing_dev_info->ra_pages =
- (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info->name = "block";
q->node = node_id;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 9a6f40cd8df6..83302ecdc8db 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -40,6 +40,7 @@
#include <linux/export.h>
#include <linux/debugfs.h>
#include <linux/prefetch.h>
+#include <linux/numa.h>
#include "mtip32xx.h"
#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
@@ -4013,9 +4014,9 @@ static int get_least_used_cpu_on_node(int node)
/* Helper for selecting a node in round robin mode */
static inline int mtip_get_next_rr_node(void)
{
- static int next_node = -1;
+ static int next_node = NUMA_NO_NODE;
- if (next_node == -1) {
+ if (next_node == NUMA_NO_NODE) {
next_node = first_online_node;
return next_node;
}
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index f1a441ab395d..3a11b1092e80 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -63,6 +63,7 @@
#include <linux/acpi_dma.h>
#include <linux/of_dma.h>
#include <linux/mempool.h>
+#include <linux/numa.h>
static DEFINE_MUTEX(dma_list_mutex);
static DEFINE_IDA(dma_ida);
@@ -386,7 +387,8 @@ EXPORT_SYMBOL(dma_issue_pending_all);
static bool dma_chan_is_local(struct dma_chan *chan, int cpu)
{
int node = dev_to_node(chan->device->dev);
- return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node));
+ return node == NUMA_NO_NODE ||
+ cpumask_test_cpu(cpu, cpumask_of_node(node));
}
/**
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 7c6349a50ef1..b32036cbb7a4 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -681,8 +681,13 @@ static struct notifier_block hv_memory_nb = {
/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
- if (!has_pfn_is_backed(has, page_to_pfn(pg)))
+ if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
+ if (!PageOffline(pg))
+ __SetPageOffline(pg);
return;
+ }
+ if (PageOffline(pg))
+ __ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
__online_page_set_limits(pg);
@@ -771,7 +776,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
}
}
-static void hv_online_page(struct page *pg)
+static int hv_online_page(struct page *pg, unsigned int order)
{
struct hv_hotadd_state *has;
unsigned long flags;
@@ -783,10 +788,12 @@ static void hv_online_page(struct page *pg)
if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
continue;
- hv_page_online_one(has, pg);
+ hv_bring_pgs_online(has, pfn, (1UL << order));
break;
}
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
+
+ return 0;
}
static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
@@ -1201,6 +1208,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm,
for (i = 0; i < num_pages; i++) {
pg = pfn_to_page(i + start_frame);
+ __ClearPageOffline(pg);
__free_page(pg);
dm->num_pages_ballooned--;
}
@@ -1213,7 +1221,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
struct dm_balloon_response *bl_resp,
int alloc_unit)
{
- unsigned int i = 0;
+ unsigned int i, j;
struct page *pg;
if (num_pages < alloc_unit)
@@ -1245,6 +1253,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
if (alloc_unit != 1)
split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
+ /* mark all pages offline */
+ for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
+ __SetPageOffline(pg + j);
+
bl_resp->range_count++;
bl_resp->range_array[i].finfo.start_page =
page_to_pfn(pg);
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 2baf38cc1e23..4fe662c3bbc1 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -48,6 +48,7 @@
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include "hfi.h"
#include "affinity.h"
@@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
mutex_unlock(&node_affinity.lock);
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
}
/*
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 7835eb52e7c5..441b06e2a154 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -54,6 +54,7 @@
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
+#include <linux/numa.h>
#include <rdma/rdma_vt.h>
#include "hfi.h"
@@ -1303,7 +1304,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
dd->unit = ret;
list_add(&dd->list, &hfi1_dev_list);
}
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
spin_unlock_irqrestore(&hfi1_devs_lock, flags);
idr_preload_end();
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index dc9f14811e0f..6b7df25e1488 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -39,6 +39,7 @@
#include <linux/dmi.h>
#include <linux/slab.h>
#include <linux/iommu.h>
+#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/iommu_table.h>
@@ -477,7 +478,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
if (!node_online(node))
- node = -1;
+ node = NUMA_NO_NODE;
drhd->iommu->node = node;
return 0;
}
@@ -1062,7 +1063,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
iommu->msagaw = msagaw;
iommu->segment = drhd->segment;
- iommu->node = -1;
+ iommu->node = NUMA_NO_NODE;
ver = readl(iommu->reg + DMAR_VER_REG);
pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 2bd9ac285c0d..048b5ab36a02 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -47,6 +47,7 @@
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
+#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
@@ -1716,7 +1717,7 @@ static struct dmar_domain *alloc_domain(int flags)
return NULL;
memset(domain, 0, sizeof(*domain));
- domain->nid = -1;
+ domain->nid = NUMA_NO_NODE;
domain->flags = flags;
domain->has_iotlb_device = false;
INIT_LIST_HEAD(&domain->devices);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 3a7c36326589..0b096ddc9c1e 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,7 +16,6 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
-#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
@@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **srcs = flex_array_get(percpu->scribble, 0);
+ struct page **srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
@@ -196,8 +195,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
- NULL, sh, flex_array_get(percpu->scribble, 0)
- + sizeof(struct page *) * (sh->disks + 2));
+ NULL, sh, (void *) (srcs + sh->disks + 2));
if (count == 1)
tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4990f0319f6c..c92e26fbcd7d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,7 +54,6 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
-#include <linux/flex_array.h>
#include <trace/events/block.h>
#include <linux/list_sort.h>
@@ -1394,22 +1393,16 @@ static void ops_complete_compute(void *stripe_head_ref)
}
/* return a pointer to the address conversion region of the scribble buffer */
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
- struct raid5_percpu *percpu, int i)
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
- void *addr;
-
- addr = flex_array_get(percpu->scribble, i);
- return addr + sizeof(struct page *) * (sh->disks + 2);
+ return percpu->scribble + i * percpu->scribble_obj_size;
}
/* return a pointer to the address conversion region of the scribble buffer */
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+ struct raid5_percpu *percpu, int i)
{
- void *addr;
-
- addr = flex_array_get(percpu->scribble, i);
- return addr;
+ return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
}
static struct dma_async_tx_descriptor *
@@ -2238,21 +2231,23 @@ static int grow_stripes(struct r5conf *conf, int num)
* calculate over all devices (not just the data blocks), using zeros in place
* of the P and Q blocks.
*/
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+static int scribble_alloc(struct raid5_percpu *percpu,
+ int num, int cnt, gfp_t flags)
{
- struct flex_array *ret;
- size_t len;
+ size_t obj_size =
+ sizeof(struct page *) * (num+2) +
+ sizeof(addr_conv_t) * (num+2);
+ void *scribble;
- len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
- ret = flex_array_alloc(len, cnt, flags);
- if (!ret)
- return NULL;
- /* always prealloc all elements, so no locking is required */
- if (flex_array_prealloc(ret, 0, cnt, flags)) {
- flex_array_free(ret);
- return NULL;
- }
- return ret;
+ scribble = kvmalloc_array(cnt, obj_size, flags);
+ if (!scribble)
+ return -ENOMEM;
+
+ kvfree(percpu->scribble);
+
+ percpu->scribble = scribble;
+ percpu->scribble_obj_size = obj_size;
+ return 0;
}
static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
@@ -2270,23 +2265,18 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
return 0;
mddev_suspend(conf->mddev);
get_online_cpus();
+
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
- struct flex_array *scribble;
percpu = per_cpu_ptr(conf->percpu, cpu);
- scribble = scribble_alloc(new_disks,
- new_sectors / STRIPE_SECTORS,
- GFP_NOIO);
-
- if (scribble) {
- flex_array_free(percpu->scribble);
- percpu->scribble = scribble;
- } else {
- err = -ENOMEM;
+ err = scribble_alloc(percpu, new_disks,
+ new_sectors / STRIPE_SECTORS,
+ GFP_NOIO);
+ if (err)
break;
- }
}
+
put_online_cpus();
mddev_resume(conf->mddev);
if (!err) {
@@ -6738,25 +6728,26 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
safe_put_page(percpu->spare_page);
- if (percpu->scribble)
- flex_array_free(percpu->scribble);
percpu->spare_page = NULL;
+ kvfree(percpu->scribble);
percpu->scribble = NULL;
}
static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
- if (conf->level == 6 && !percpu->spare_page)
+ if (conf->level == 6 && !percpu->spare_page) {
percpu->spare_page = alloc_page(GFP_KERNEL);
- if (!percpu->scribble)
- percpu->scribble = scribble_alloc(max(conf->raid_disks,
- conf->previous_raid_disks),
- max(conf->chunk_sectors,
- conf->prev_chunk_sectors)
- / STRIPE_SECTORS,
- GFP_KERNEL);
-
- if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
+ if (!percpu->spare_page)
+ return -ENOMEM;
+ }
+
+ if (scribble_alloc(percpu,
+ max(conf->raid_disks,
+ conf->previous_raid_disks),
+ max(conf->chunk_sectors,
+ conf->prev_chunk_sectors)
+ / STRIPE_SECTORS,
+ GFP_KERNEL)) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8474c224127b..cf991f13403e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -638,10 +638,11 @@ struct r5conf {
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
- struct flex_array *scribble; /* space for constructing buffer
- * lists and performing address
- * conversions
- */
+ void *scribble; /* space for constructing buffer
+ * lists and performing address
+ * conversions
+ */
+ int scribble_obj_size;
} __percpu *percpu;
int scribble_disks;
int scribble_sectors;
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 0441abe87880..9e443df44b3b 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/numa.h>
#include <asm/uv/uv_hub.h>
#if defined CONFIG_X86_64
#include <asm/uv/bios.h>
@@ -61,7 +62,7 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
XPC_NOTIFY_MSG_SIZE_UV)
#define XPC_NOTIFY_IRQ_NAME "xpc_notify"
-static int xpc_mq_node = -1;
+static int xpc_mq_node = NUMA_NO_NODE;
static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c
index 426ad912b441..d054e2842a5f 100644
--- a/drivers/misc/sram-exec.c
+++ b/drivers/misc/sram-exec.c
@@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
if (!part)
return NULL;
- if (!addr_in_gen_pool(pool, (unsigned long)dst, size))
+ if (!gen_pool_has_addr(pool, (unsigned long)dst, size))
return NULL;
base = (unsigned long)part->base;
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index f8240b87df22..869ec842729e 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -557,6 +557,36 @@ vmballoon_page_in_frames(enum vmballoon_page_size_type page_size)
}
/**
+ * vmballoon_mark_page_offline() - mark a page as offline
+ * @page: pointer for the page.
+ * @page_size: the size of the page.
+ */
+static void
+vmballoon_mark_page_offline(struct page *page,
+ enum vmballoon_page_size_type page_size)
+{
+ int i;
+
+ for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
+ __SetPageOffline(page + i);
+}
+
+/**
+ * vmballoon_mark_page_online() - mark a page as online
+ * @page: pointer for the page.
+ * @page_size: the size of the page.
+ */
+static void
+vmballoon_mark_page_online(struct page *page,
+ enum vmballoon_page_size_type page_size)
+{
+ int i;
+
+ for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
+ __ClearPageOffline(page + i);
+}
+
+/**
* vmballoon_send_get_target() - Retrieve desired balloon size from the host.
*
* @b: pointer to the balloon.
@@ -612,6 +642,7 @@ static int vmballoon_alloc_page_list(struct vmballoon *b,
ctl->page_size);
if (page) {
+ vmballoon_mark_page_offline(page, ctl->page_size);
/* Success. Add the page to the list and continue. */
list_add(&page->lru, &ctl->pages);
continue;
@@ -850,6 +881,7 @@ static void vmballoon_release_page_list(struct list_head *page_list,
list_for_each_entry_safe(page, tmp, page_list, lru) {
list_del(&page->lru);
+ vmballoon_mark_page_online(page, page_size);
__free_pages(page, vmballoon_page_order(page_size));
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b53087a980ef..0942e407f1ec 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -27,6 +27,7 @@
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/atomic.h>
+#include <linux/numa.h>
#include <scsi/fc/fc_fcoe.h>
#include <net/udp_tunnel.h>
#include <net/pkt_cls.h>
@@ -6415,7 +6416,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
{
struct device *dev = tx_ring->dev;
int orig_node = dev_to_node(dev);
- int ring_node = -1;
+ int ring_node = NUMA_NO_NODE;
int size;
size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count;
@@ -6509,7 +6510,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
- int ring_node = -1;
+ int ring_node = NUMA_NO_NODE;
int size;
size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index ceb5048de9a7..3ff8f91b1fea 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -345,7 +345,7 @@ static enum bp_state reserve_additional_memory(void)
/*
* add_memory_resource() will call online_pages() which in its turn
- * will call xen_online_page() callback causing deadlock if we don't
+ * will call xen_online_pages() callback causing deadlock if we don't
* release balloon_mutex here. Unlocking here is safe because the
* callers drop the mutex before trying again.
*/
@@ -369,15 +369,23 @@ static enum bp_state reserve_additional_memory(void)
return BP_ECANCELED;
}
-static void xen_online_page(struct page *page)
+static int xen_online_pages(struct page *pg, unsigned int order)
{
- __online_page_set_limits(page);
+ unsigned long i, size = (1 << order);
+ unsigned long start_pfn = page_to_pfn(pg);
+ struct page *p;
+ pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
mutex_lock(&balloon_mutex);
-
- __balloon_append(page);
-
+ for (i = 0; i < size; i++) {
+ p = pfn_to_page(start_pfn + i);
+ __online_page_set_limits(p);
+ __SetPageOffline(p);
+ __balloon_append(p);
+ }
mutex_unlock(&balloon_mutex);
+
+ return 0;
}
static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
@@ -441,6 +449,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
/* Relinquish the page back to the allocator. */
+ __ClearPageOffline(page);
free_reserved_page(page);
}
@@ -467,6 +476,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
state = BP_EAGAIN;
break;
}
+ __SetPageOffline(page);
adjust_managed_page_count(page, -1);
xenmem_reservation_scrub_page(page);
list_add(&page->lru, &pages);
@@ -702,7 +712,7 @@ static int __init balloon_init(void)
balloon_stats.max_retry_count = RETRY_UNLIMITED;
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
- set_online_page_callback(&xen_online_page);
+ set_online_page_callback(&xen_online_pages);
register_memory_notifier(&xen_memory_nb);
register_sysctl_table(xen_root);
#endif
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..10d3bd3f534b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
return ret;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/afs/super.c b/fs/afs/super.c
index d30a7dc56ef0..37520552e56e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -423,7 +423,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d7daa3890156..0175f35ea7e5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2947,7 +2947,7 @@ int open_ctree(struct super_block *sb,
sb->s_bdi->congested_fn = btrfs_congested_fn;
sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
diff --git a/fs/buffer.c b/fs/buffer.c
index 52d024bfdbc1..784de3dbcf28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -954,10 +954,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -977,6 +987,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1032,6 +1045,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1043,6 +1062,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3215,6 +3252,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3253,11 +3295,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3281,6 +3330,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/dax.c b/fs/dax.c
index 6959837cc465..947cee0b2c28 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -788,6 +788,13 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
+ * All the field are populated by follow_pte_pmd() except
+ * the event field.
+ */
+ mmu_notifier_range_init(&range, NULL, 0, -1UL,
+ MMU_NOTIFY_PROTECTION_PAGE);
+
+ /*
* Note because we provide start/end to follow_pte_pmd it will
* call mmu_notifier_invalidate_range_start() on our behalf
* before taking any lock.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail either to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to content with concurrent
+ * events from another file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth to mention is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if poll table was inited
+ * with several wait queues entries. Plural wakeup from different CPUs of a
+ * single wait queue is serialized by wq.lock, but the case when multiple wait
+ * queues are used should be detected accordingly. This is detected using
+ * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
- WRITE_ONCE(ep->ovflist, epi);
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1531,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
goto send_events;
}
diff --git a/fs/file.c b/fs/file.c
index 3209ee271c41..a10487aa0a84 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2bbb7c59d6da..336844d0eb3a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1013,7 +1013,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 832c1759a09a..e8ea6f783f19 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2153,13 +2153,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2243,6 +2260,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 802636d50365..74896525dc1a 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2134,7 +2134,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
* if this had completed successfully
* before sending this lock state to the
* new master */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with cancel pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 63d701cd1e2e..b92d603b0cc1 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -183,6 +183,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
flags, owner);
spin_lock(&res->spinlock);
spin_lock(&lock->spinlock);
+
+ if ((flags & LKM_CANCEL) &&
+ dlm_lock_on_list(&res->granted, lock))
+ status = DLM_CANCELGRANT;
+
/* if the master told us the lock was already granted,
* let the ast handle all of these actions */
if (status == DLM_CANCELGRANT) {
@@ -229,6 +234,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
mlog(0, "clearing convert_type at %smaster node\n",
master_node ? "" : "non-");
lock->ml.convert_type = LKM_IVMODE;
+ lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
/* remove the extra ref on lock */
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..51d5fd8840ab 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 02ca5f1d9523..53c2caeafa16 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
@@ -92,7 +93,6 @@
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
-#include <linux/flex_array.h>
#include <linux/posix-timers.h>
#include <trace/events/oom.h>
#include "internal.h"
@@ -2146,11 +2146,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
+ GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ genradix_init(&fa);
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2182,35 +2183,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > ctx->pos)
- nr_files++;
- }
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
+ p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ if (!p) {
ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
up_read(&mm->mmap_sem);
mmput(mm);
goto out_put_task;
}
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= ctx->pos)
- continue;
- info.start = vma->vm_start;
- info.end = vma->vm_end;
- info.mode = vma->vm_file->f_mode;
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ p->start = vma->vm_start;
+ p->end = vma->vm_end;
+ p->mode = vma->vm_file->f_mode;
}
up_read(&mm->mmap_sem);
mmput(mm);
@@ -2219,7 +2207,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
- p = flex_array_get(fa, i);
+ p = genradix_ptr(&fa, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
buf, len,
@@ -2229,12 +2217,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
break;
ctx->pos++;
}
- if (fa)
- flex_array_free(fa);
out_put_task:
put_task_struct(task);
out:
+ genradix_free(&fa);
return ret;
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ae109429a88..e39bac94dead 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &proc_misc_dentry_ops);
+ d_set_d_op(dentry, de->proc_dops);
return d_splice_alias(inode, dentry);
}
read_unlock(&proc_subdir_lock);
@@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+ ent->proc_dops = &proc_misc_dentry_ops;
+
out:
return ent;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 188eadd3fa1b..141995470367 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,6 +44,7 @@ struct proc_dir_entry {
struct completion *pde_unload_completion;
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
+ const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d5e0fcb3439e..a7b12435519e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+static const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
+static void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->proc_dops = &proc_net_dentry_ops;
+}
+
static int seq_open_net(struct inode *inode, struct file *file)
{
unsigned int state_size = PDE(inode)->state_size;
@@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_seq_fops;
p->seq_ops = ops;
p->state_size = state_size;
@@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_seq_fops;
p->seq_ops = ops;
p->state_size = state_size;
@@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_single_fops;
p->single_show = show;
return proc_register(parent, p);
@@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_single_fops;
p->single_show = show;
p->write = write;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..6976e17dba68 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -653,13 +653,25 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
};
+ unsigned long flags = vma->vm_flags;
size_t i;
+ /*
+ * Disabling thp is possible through both MADV_NOHUGEPAGE and
+ * PR_SET_THP_DISABLE. Both historically used VM_NOHUGEPAGE. Since
+ * the introduction of MMF_DISABLE_THP, however, userspace needs the
+ * ability to detect vmas where thp is not eligible in the same manner.
+ */
+ if (vma->vm_mm && test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) {
+ flags &= ~VM_HUGEPAGE;
+ flags |= VM_NOHUGEPAGE;
+ }
+
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
- if (vma->vm_flags & (1UL << i)) {
+ if (flags & (1UL << i)) {
seq_putc(m, mnemonics[i][0]);
seq_putc(m, mnemonics[i][1]);
seq_putc(m, ' ');
@@ -1143,7 +1155,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
break;
}
- mmu_notifier_range_init(&range, mm, 0, -1UL);
+ mmu_notifier_range_init(&range, mm, 0, -1UL,
+ MMU_NOTIFY_SOFT_DIRTY);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 53051f3d8f25..f111c780ef1d 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -4,15 +4,18 @@
*
* Common interface definitions for making balloon pages movable by compaction.
*
- * Despite being perfectly possible to perform ballooned pages migration, they
- * make a special corner case to compaction scans because balloon pages are not
- * enlisted at any LRU list like the other pages we do compact / migrate.
+ * Balloon page migration makes use of the general non-lru movable page
+ * feature.
+ *
+ * page->private is used to reference the responsible balloon device.
+ * page->mapping is used in context of non-lru page migration to reference
+ * the address space operations for page isolation/migration/compaction.
*
* As the page isolation scanning step a compaction thread does is a lockless
* procedure (from a page standpoint), it might bring some racy situations while
* performing balloon page compaction. In order to sort out these racy scenarios
* and safely perform balloon's page compaction and migration we must, always,
- * ensure following these three simple rules:
+ * ensure following these simple rules:
*
* i. when updating a balloon's page ->mapping element, strictly do it under
* the following lock order, independently of the far superior
@@ -21,19 +24,8 @@
* +--spin_lock_irq(&b_dev_info->pages_lock);
* ... page->mapping updates here ...
*
- * ii. before isolating or dequeueing a balloon page from the balloon device
- * pages list, the page reference counter must be raised by one and the
- * extra refcount must be dropped when the page is enqueued back into
- * the balloon device page list, thus a balloon page keeps its reference
- * counter raised only while it is under our special handling;
- *
- * iii. after the lockless scan step have selected a potential balloon page for
- * isolation, re-test the PageBalloon mark and the PagePrivate flag
- * under the proper page lock, to ensure isolating a valid balloon page
- * (not yet isolated, nor under release procedure)
- *
- * iv. isolation or dequeueing procedure must clear PagePrivate flag under
- * page lock together with removing page from balloon device page list.
+ * ii. isolation or dequeueing procedure must remove the page from balloon
+ * device page list under b_dev_info->pages_lock.
*
* The functions provided by this interface are placed to help on coping with
* the aforementioned balloon page corner case, as well as to ensure the simple
@@ -103,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping,
static inline void balloon_page_insert(struct balloon_dev_info *balloon,
struct page *page)
{
- __SetPageBalloon(page);
+ __SetPageOffline(page);
__SetPageMovable(page, balloon->inode->i_mapping);
set_page_private(page, (unsigned long)balloon);
list_add(&page->lru, &balloon->pages);
@@ -119,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
*/
static inline void balloon_page_delete(struct page *page)
{
- __ClearPageBalloon(page);
+ __ClearPageOffline(page);
__ClearPageMovable(page);
set_page_private(page, 0);
/*
@@ -149,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
static inline void balloon_page_insert(struct balloon_dev_info *balloon,
struct page *page)
{
- __SetPageBalloon(page);
+ __SetPageOffline(page);
list_add(&page->lru, &balloon->pages);
}
static inline void balloon_page_delete(struct page *page)
{
- __ClearPageBalloon(page);
+ __ClearPageOffline(page);
list_del(&page->lru);
}
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 68250a57aace..c960923d9ec2 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -88,14 +88,13 @@ extern int sysctl_compact_memory;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_extfrag_threshold;
-extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, unsigned int alloc_flags,
- const struct alloc_context *ac, enum compact_priority prio);
+ const struct alloc_context *ac, enum compact_priority prio,
+ struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags, int classzone_idx);
diff --git a/include/linux/device.h b/include/linux/device.h
index 6cb4640b6160..4d2f13e8c540 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1095,7 +1095,7 @@ static inline void set_dev_node(struct device *dev, int node)
#else
static inline int dev_to_node(struct device *dev)
{
- return -1;
+ return NUMA_NO_NODE;
}
static inline void set_dev_node(struct device *dev, int node)
{
diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
deleted file mode 100644
index b94fa61b51fb..000000000000
--- a/include/linux/flex_array.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _FLEX_ARRAY_H
-#define _FLEX_ARRAY_H
-
-#include <linux/types.h>
-#include <linux/reciprocal_div.h>
-#include <asm/page.h>
-
-#define FLEX_ARRAY_PART_SIZE PAGE_SIZE
-#define FLEX_ARRAY_BASE_SIZE PAGE_SIZE
-
-struct flex_array_part;
-
-/*
- * This is meant to replace cases where an array-like
- * structure has gotten too big to fit into kmalloc()
- * and the developer is getting tempted to use
- * vmalloc().
- */
-
-struct flex_array {
- union {
- struct {
- int element_size;
- int total_nr_elements;
- int elems_per_part;
- struct reciprocal_value reciprocal_elems;
- struct flex_array_part *parts[];
- };
- /*
- * This little trick makes sure that
- * sizeof(flex_array) == PAGE_SIZE
- */
- char padding[FLEX_ARRAY_BASE_SIZE];
- };
-};
-
-/* Number of bytes left in base struct flex_array, excluding metadata */
-#define FLEX_ARRAY_BASE_BYTES_LEFT \
- (FLEX_ARRAY_BASE_SIZE - offsetof(struct flex_array, parts))
-
-/* Number of pointers in base to struct flex_array_part pages */
-#define FLEX_ARRAY_NR_BASE_PTRS \
- (FLEX_ARRAY_BASE_BYTES_LEFT / sizeof(struct flex_array_part *))
-
-/* Number of elements of size that fit in struct flex_array_part */
-#define FLEX_ARRAY_ELEMENTS_PER_PART(size) \
- (FLEX_ARRAY_PART_SIZE / size)
-
-/*
- * Defines a statically allocated flex array and ensures its parameters are
- * valid.
- */
-#define DEFINE_FLEX_ARRAY(__arrayname, __element_size, __total) \
- struct flex_array __arrayname = { { { \
- .element_size = (__element_size), \
- .total_nr_elements = (__total), \
- } } }; \
- static inline void __arrayname##_invalid_parameter(void) \
- { \
- BUILD_BUG_ON((__total) > FLEX_ARRAY_NR_BASE_PTRS * \
- FLEX_ARRAY_ELEMENTS_PER_PART(__element_size)); \
- }
-
-/**
- * flex_array_alloc() - Creates a flexible array.
- * @element_size: individual object size.
- * @total: maximum number of objects which can be stored.
- * @flags: GFP flags
- *
- * Return: Returns an object of structure flex_array.
- */
-struct flex_array *flex_array_alloc(int element_size, unsigned int total,
- gfp_t flags);
-
-/**
- * flex_array_prealloc() - Ensures that memory for the elements indexed in the
- * range defined by start and nr_elements has been allocated.
- * @fa: array to allocate memory to.
- * @start: start address
- * @nr_elements: number of elements to be allocated.
- * @flags: GFP flags
- *
- */
-int flex_array_prealloc(struct flex_array *fa, unsigned int start,
- unsigned int nr_elements, gfp_t flags);
-
-/**
- * flex_array_free() - Removes all elements of a flexible array.
- * @fa: array to be freed.
- */
-void flex_array_free(struct flex_array *fa);
-
-/**
- * flex_array_free_parts() - Removes all elements of a flexible array, but
- * leaves the array itself in place.
- * @fa: array to be emptied.
- */
-void flex_array_free_parts(struct flex_array *fa);
-
-/**
- * flex_array_put() - Stores data into a flexible array.
- * @fa: array where element is to be stored.
- * @element_nr: position to copy, must be less than the maximum specified when
- * the array was created.
- * @src: data source to be copied into the array.
- * @flags: GFP flags
- *
- * Return: Returns zero on success, a negative error code otherwise.
- */
-int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
- gfp_t flags);
-
-/**
- * flex_array_clear() - Clears an individual element in the array, sets the
- * given element to FLEX_ARRAY_FREE.
- * @element_nr: element position to clear.
- * @fa: array to which element to be cleared belongs.
- *
- * Return: Returns zero on success, -EINVAL otherwise.
- */
-int flex_array_clear(struct flex_array *fa, unsigned int element_nr);
-
-/**
- * flex_array_get() - Retrieves data into a flexible array.
- *
- * @element_nr: Element position to retrieve data from.
- * @fa: array from which data is to be retrieved.
- *
- * Return: Returns a pointer to the data element, or NULL if that
- * particular element has never been allocated.
- */
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-
-/**
- * flex_array_shrink() - Reduces the allocated size of an array.
- * @fa: array to shrink.
- *
- * Return: Returns number of pages of memory actually freed.
- *
- */
-int flex_array_shrink(struct flex_array *fa);
-
-#define flex_array_put_ptr(fa, nr, src, gfp) \
- flex_array_put(fa, nr, (void *)&(src), gfp)
-
-void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr);
-
-#endif /* _FLEX_ARRAY_H */
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index dd0a452373e7..c5e5bed82fbc 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -156,7 +156,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev,
int min_alloc_order, int nid, const char *name);
extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size);
#ifdef CONFIG_OF
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
new file mode 100644
index 000000000000..3a91130a4fbd
--- /dev/null
+++ b/include/linux/generic-radix-tree.h
@@ -0,0 +1,231 @@
+#ifndef _LINUX_GENERIC_RADIX_TREE_H
+#define _LINUX_GENERIC_RADIX_TREE_H
+
+/**
+ * DOC: Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ * reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ * NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ * allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
+ */
+
+#include <asm/page.h>
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+
+struct genradix_root;
+
+struct __genradix {
+ struct genradix_root __rcu *root;
+};
+
+/*
+ * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
+ */
+
+#define __GENRADIX_INITIALIZER \
+ { \
+ .tree = { \
+ .root = NULL, \
+ } \
+ }
+
+/*
+ * We use a 0 size array to stash the type we're storing without taking any
+ * space at runtime - then the various accessor macros can use typeof() to get
+ * to it for casts/sizeof - we also force the alignment so that storing a type
+ * with a ridiculous alignment doesn't blow up the alignment or size of the
+ * genradix.
+ */
+
+#define GENRADIX(_type) \
+struct { \
+ struct __genradix tree; \
+ _type type[0] __aligned(1); \
+}
+
+#define DEFINE_GENRADIX(_name, _type) \
+ GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+
+/**
+ * genradix_init - initialize a genradix
+ * @_radix: genradix to initialize
+ *
+ * Does not fail
+ */
+#define genradix_init(_radix) \
+do { \
+ *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \
+} while (0)
+
+void __genradix_free(struct __genradix *);
+
+/**
+ * genradix_free: free all memory owned by a genradix
+ * @_radix: the genradix to free
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
+#define genradix_free(_radix) __genradix_free(&(_radix)->tree)
+
+static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
+{
+ if (__builtin_constant_p(obj_size))
+ BUILD_BUG_ON(obj_size > PAGE_SIZE);
+ else
+ BUG_ON(obj_size > PAGE_SIZE);
+
+ if (!is_power_of_2(obj_size)) {
+ size_t objs_per_page = PAGE_SIZE / obj_size;
+
+ return (idx / objs_per_page) * PAGE_SIZE +
+ (idx % objs_per_page) * obj_size;
+ } else {
+ return idx * obj_size;
+ }
+}
+
+#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
+#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
+#define __genradix_idx_to_offset(_radix, _idx) \
+ __idx_to_offset(_idx, __genradix_obj_size(_radix))
+
+void *__genradix_ptr(struct __genradix *, size_t);
+
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ *
+ * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+ */
+#define genradix_ptr(_radix, _idx) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)))
+
+void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+ * if necessary
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ * @_gfp: gfp mask
+ *
+ * Returns a pointer to entry at @_idx, or NULL on allocation failure
+ */
+#define genradix_ptr_alloc(_radix, _idx, _gfp) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _gfp))
+
+struct genradix_iter {
+ size_t offset;
+ size_t pos;
+};
+
+/**
+ * genradix_iter_init - initialize a genradix_iter
+ * @_radix: genradix that will be iterated over
+ * @_idx: index to start iterating from
+ */
+#define genradix_iter_init(_radix, _idx) \
+ ((struct genradix_iter) { \
+ .pos = (_idx), \
+ .offset = __genradix_idx_to_offset((_radix), (_idx)),\
+ })
+
+void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
+
+/**
+ * genradix_iter_peek - get first entry at or above iterator's current
+ * position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or above @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek(_iter, _radix) \
+ (__genradix_cast(_radix) \
+ __genradix_iter_peek(_iter, &(_radix)->tree, \
+ PAGE_SIZE / __genradix_obj_size(_radix)))
+
+static inline void __genradix_iter_advance(struct genradix_iter *iter,
+ size_t obj_size)
+{
+ iter->offset += obj_size;
+
+ if (!is_power_of_2(obj_size) &&
+ (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
+ iter->offset = round_up(iter->offset, PAGE_SIZE);
+
+ iter->pos++;
+}
+
+#define genradix_iter_advance(_iter, _radix) \
+ __genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+
+#define genradix_for_each_from(_radix, _iter, _p, _start) \
+ for (_iter = genradix_iter_init(_radix, _start); \
+ (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
+ genradix_iter_advance(&_iter, _radix))
+
+/**
+ * genradix_for_each - iterate over entry in a genradix
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each(_radix, _iter, _p) \
+ genradix_for_each_from(_radix, _iter, _p, 0)
+
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_prealloc - preallocate entries in a generic radix tree
+ * @_radix: genradix to preallocate
+ * @_nr: number of entries to preallocate
+ * @_gfp: gfp mask
+ *
+ * Returns 0 on success, -ENOMEM on failure
+ */
+#define genradix_prealloc(_radix, _nr, _gfp) \
+ __genradix_prealloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _nr + 1),\
+ _gfp)
+
+
+#endif /* _LINUX_GENERIC_RADIX_TREE_H */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5f5e25fd6149..9e58799b730f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -543,7 +543,12 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_unref_page(struct page *page);
-extern void free_unref_page_list(struct list_head *list);
+extern void __free_page_list(struct list_head *list, bool dropref, unsigned long *highest_pfn);
+
+static inline void free_unref_page_list(struct list_head *list)
+{
+ return __free_page_list(list, false, NULL);
+}
struct page_frag_cache;
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 087fd5f48c91..4cc3871b65fc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -493,17 +493,54 @@ static inline pgoff_t basepage_index(struct page *page)
extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
-static inline bool hugepage_migration_supported(struct hstate *h)
-{
+
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#ifndef arch_hugetlb_migration_supported
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
if ((huge_page_shift(h) == PMD_SHIFT) ||
- (huge_page_shift(h) == PGDIR_SHIFT))
+ (huge_page_shift(h) == PUD_SHIFT) ||
+ (huge_page_shift(h) == PGDIR_SHIFT))
return true;
else
return false;
+}
+#endif
#else
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
return false;
+}
#endif
+
+static inline bool hugepage_migration_supported(struct hstate *h)
+{
+ return arch_hugetlb_migration_supported(h);
+}
+
+/*
+ * Movability check is different as compared to migration check.
+ * It determines whether or not a huge page should be placed on
+ * movable zone or not. Movability of any huge page should be
+ * required only if huge page size is supported for migration.
+ * There wont be any reason for the huge page to be movable if
+ * it is not migratable to start with. Also the size of the huge
+ * page should be large enough to be placed under a movable zone
+ * and still feasible enough to be migratable. Just the presence
+ * in movable zone does not make the migration feasible.
+ *
+ * So even though large huge page sizes like the gigantic ones
+ * are migratable they should not be movable because its not
+ * feasible to migrate them from movable zone.
+ */
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+ if (!hugepage_migration_supported(h))
+ return false;
+
+ if (hstate_is_gigantic(h))
+ return false;
+ return true;
}
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
@@ -602,6 +639,11 @@ static inline bool hugepage_migration_supported(struct hstate *h)
return false;
}
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+ return false;
+}
+
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 6ab8c1bada3f..7d5f553a3b24 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -16,6 +16,7 @@ struct user_namespace;
struct ipc_ids {
int in_use;
unsigned short seq;
+ unsigned short deleted;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int max_idx;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8f0e68e250a7..dff02d51a6b3 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -14,6 +14,7 @@
#include <linux/printk.h>
#include <linux/build_bug.h>
#include <asm/byteorder.h>
+#include <asm/div64.h>
#include <uapi/linux/kernel.h>
#define USHRT_MAX ((u16)(~0U))
@@ -204,7 +205,6 @@
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
#ifdef CONFIG_LBDAF
-# include <asm/div64.h>
# define sector_div(a, b) do_div(a, b)
#else
# define sector_div(n, b)( \
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e8164abcf..e48b1e453ff5 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma, unsigned long address);
#else /* !CONFIG_KSM */
@@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page,
static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
}
+static inline bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return false;
+}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 64c41cf45590..60e100fe5922 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -247,6 +247,47 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn,
+ unsigned long *out_epfn);
+/**
+ * for_each_free_mem_range_in_zone - iterate through zone specific free
+ * memblock areas
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone. Available once memblock and an empty zone is initialized. The main
+ * assumption is that the zone start, end, and pgdat have been associated.
+ * This way we can use the zone to determine NUMA node, and if a given part
+ * of the memblock is valid for the zone.
+ */
+#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \
+ for (i = 0, \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \
+ i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+/**
+ * for_each_free_mem_range_in_zone_from - iterate through zone specific
+ * free memblock areas from a given point
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone, continuing from current position. Available as soon as memblock is
+ * initialized.
+ */
+#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
+ for (; i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83ae11cbd12c..b0eb29ea0d9c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1273,12 +1273,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_KMEM
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
-void memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg);
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1300,6 +1300,26 @@ static inline bool memcg_kmem_enabled(void)
return static_branch_unlikely(&memcg_kmem_enabled_key);
}
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ if (memcg_kmem_enabled())
+ return __memcg_kmem_charge(page, gfp, order);
+ return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+ if (memcg_kmem_enabled())
+ __memcg_kmem_uncharge(page, order);
+}
+
+static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp,
+ int order, struct mem_cgroup *memcg)
+{
+ if (memcg_kmem_enabled())
+ return __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ return 0;
+}
/*
* helper for accessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -1325,6 +1345,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order)
{
}
+static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void __memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 07da5c6c5ba0..d56bfbacf7d6 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -87,7 +87,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
unsigned long *valid_start, unsigned long *valid_end);
extern void __offline_isolated_pages(unsigned long, unsigned long);
-typedef void (*online_page_callback_t)(struct page *page);
+typedef int (*online_page_callback_t)(struct page *page, unsigned int order);
extern int set_online_page_callback(online_page_callback_t callback);
extern int restore_online_page_callback(online_page_callback_t callback);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..1e0e2df634f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -26,6 +26,7 @@
#include <linux/page_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
+#include <linux/sizes.h>
struct mempolicy;
struct anon_vma;
@@ -123,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly;
/*
* On some architectures it is expensive to call memset() for small sizes.
- * Those architectures should provide their own implementation of "struct page"
- * zeroing by defining this macro in <asm/pgtable.h>.
+ * If an architecture decides to implement their own version of
+ * mm_zero_struct_page they should wrap the defines below in a #ifndef and
+ * define their own version of this macro in <asm/pgtable.h>
*/
-#ifndef mm_zero_struct_page
+#if BITS_PER_LONG == 64
+/* This function must be updated when the size of struct page grows above 80
+ * or reduces below 56. The idea that compiler optimizes out switch()
+ * statement, and only leaves move/store instructions. Also the compiler can
+ * combine write statments if they are both assignments and can be reordered,
+ * this can result in several of the writes here being dropped.
+ */
+#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
+static inline void __mm_zero_struct_page(struct page *page)
+{
+ unsigned long *_pp = (void *)page;
+
+ /* Check that struct page is either 56, 64, 72, or 80 bytes */
+ BUILD_BUG_ON(sizeof(struct page) & 7);
+ BUILD_BUG_ON(sizeof(struct page) < 56);
+ BUILD_BUG_ON(sizeof(struct page) > 80);
+
+ switch (sizeof(struct page)) {
+ case 80:
+ _pp[9] = 0; /* fallthrough */
+ case 72:
+ _pp[8] = 0; /* fallthrough */
+ case 64:
+ _pp[7] = 0; /* fallthrough */
+ case 56:
+ _pp[6] = 0;
+ _pp[5] = 0;
+ _pp[4] = 0;
+ _pp[3] = 0;
+ _pp[2] = 0;
+ _pp[1] = 0;
+ _pp[0] = 0;
+ }
+}
+#else
#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
#endif
@@ -1192,11 +1228,18 @@ static inline void set_page_node(struct page *page, unsigned long node)
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
+static inline void set_page_reserved(struct page *page, bool reserved)
+{
+ page->flags &= ~(1ul << PG_reserved);
+ page->flags |= (unsigned long)(!!reserved) << PG_reserved;
+}
+
static inline void set_page_links(struct page *page, enum zone_type zone,
- unsigned long node, unsigned long pfn)
+ unsigned long node, unsigned long pfn, bool reserved)
{
set_page_zone(page, zone);
set_page_node(page, node);
+ set_page_reserved(page, reserved);
#ifdef SECTION_IN_PAGE_FLAGS
set_page_section(page, pfn_to_section_nr(pfn));
#endif
@@ -1323,52 +1366,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
}
/*
- * Different kinds of faults, as returned by handle_mm_fault().
- * Used to decide whether a process gets delivered SIGBUS or
- * just gets major/minor fault counters bumped up.
- */
-
-#define VM_FAULT_OOM 0x0001
-#define VM_FAULT_SIGBUS 0x0002
-#define VM_FAULT_MAJOR 0x0004
-#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
-#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
-#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
-#define VM_FAULT_SIGSEGV 0x0040
-
-#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
-#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
-#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
-#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
-#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
-#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables
- * and needs fsync() to complete (for
- * synchronous page faults in DAX) */
-
-#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
- VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
- VM_FAULT_FALLBACK)
-
-#define VM_FAULT_RESULT_TRACE \
- { VM_FAULT_OOM, "OOM" }, \
- { VM_FAULT_SIGBUS, "SIGBUS" }, \
- { VM_FAULT_MAJOR, "MAJOR" }, \
- { VM_FAULT_WRITE, "WRITE" }, \
- { VM_FAULT_HWPOISON, "HWPOISON" }, \
- { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
- { VM_FAULT_SIGSEGV, "SIGSEGV" }, \
- { VM_FAULT_NOPAGE, "NOPAGE" }, \
- { VM_FAULT_LOCKED, "LOCKED" }, \
- { VM_FAULT_RETRY, "RETRY" }, \
- { VM_FAULT_FALLBACK, "FALLBACK" }, \
- { VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
-
-/* Encode hstate index for a hwpoisoned large page */
-#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
-#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
-
-/*
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
*/
extern void pagefault_out_of_memory(void);
@@ -2447,8 +2444,7 @@ int __must_check write_one_page(struct page *page);
void task_dirty_inc(struct task_struct *tsk);
/* readahead.c */
-#define VM_MAX_READAHEAD 128 /* kbytes */
-#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
+#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2c471a2c43fa..6312b02d65ed 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -22,7 +22,6 @@
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
-typedef int vm_fault_t;
struct address_space;
struct mem_cgroup;
@@ -614,6 +613,78 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
struct vm_fault;
+/**
+ * typedef vm_fault_t - Return type for page fault handlers.
+ *
+ * Page fault handlers return a bitmask of %VM_FAULT values.
+ */
+typedef __bitwise unsigned int vm_fault_t;
+
+/**
+ * enum vm_fault_reason - Page fault handlers return a bitmask of
+ * these values to tell the core VM what happened when handling the
+ * fault. Used to decide whether a process gets delivered SIGBUS or
+ * just gets major/minor fault counters bumped up.
+ *
+ * @VM_FAULT_OOM: Out Of Memory
+ * @VM_FAULT_SIGBUS: Bad access
+ * @VM_FAULT_MAJOR: Page read from storage
+ * @VM_FAULT_WRITE: Special case for get_user_pages
+ * @VM_FAULT_HWPOISON: Hit poisoned small page
+ * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded
+ * in upper bits
+ * @VM_FAULT_SIGSEGV: segmentation fault
+ * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page
+ * @VM_FAULT_LOCKED: ->fault locked the returned page
+ * @VM_FAULT_RETRY: ->fault blocked, must retry
+ * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small
+ * @VM_FAULT_DONE_COW: ->fault has fully handled COW
+ * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs
+ * fsync() to complete (for synchronous page faults
+ * in DAX)
+ * @VM_FAULT_HINDEX_MASK: mask HINDEX value
+ *
+ */
+enum vm_fault_reason {
+ VM_FAULT_OOM = (__force vm_fault_t)0x000001,
+ VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002,
+ VM_FAULT_MAJOR = (__force vm_fault_t)0x000004,
+ VM_FAULT_WRITE = (__force vm_fault_t)0x000008,
+ VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010,
+ VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
+ VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040,
+ VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100,
+ VM_FAULT_LOCKED = (__force vm_fault_t)0x000200,
+ VM_FAULT_RETRY = (__force vm_fault_t)0x000400,
+ VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
+ VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
+ VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
+ VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
+};
+
+/* Encode hstate index for a hwpoisoned large page */
+#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
+#define VM_FAULT_GET_HINDEX(x) (((x) >> 16) & 0xf)
+
+#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \
+ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \
+ VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)
+
+#define VM_FAULT_RESULT_TRACE \
+ { VM_FAULT_OOM, "OOM" }, \
+ { VM_FAULT_SIGBUS, "SIGBUS" }, \
+ { VM_FAULT_MAJOR, "MAJOR" }, \
+ { VM_FAULT_WRITE, "WRITE" }, \
+ { VM_FAULT_HWPOISON, "HWPOISON" }, \
+ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
+ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \
+ { VM_FAULT_NOPAGE, "NOPAGE" }, \
+ { VM_FAULT_LOCKED, "LOCKED" }, \
+ { VM_FAULT_RETRY, "RETRY" }, \
+ { VM_FAULT_FALLBACK, "FALLBACK" }, \
+ { VM_FAULT_DONE_COW, "DONE_COW" }, \
+ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..29f7b9670ba3 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -25,10 +25,39 @@ struct mmu_notifier_mm {
spinlock_t lock;
};
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
+ * move the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
+ * ie using the vma access permission (vm_page_prot) to update the whole range
+ * is enough no need to inspect changes to the CPU page table (mprotect()
+ * syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
+ * pages in the range so to mirror those changes the user must inspect the CPU
+ * page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
+ * access flags)
+ */
+enum mmu_notifier_event {
+ MMU_NOTIFY_UNMAP = 0,
+ MMU_NOTIFY_CLEAR,
+ MMU_NOTIFY_PROTECTION_VMA,
+ MMU_NOTIFY_PROTECTION_PAGE,
+ MMU_NOTIFY_SOFT_DIRTY,
+};
+
struct mmu_notifier_range {
struct mm_struct *mm;
unsigned long start;
unsigned long end;
+ enum mmu_notifier_event event;
bool blockable;
};
@@ -320,11 +349,13 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ enum mmu_notifier_event event)
{
range->mm = mm;
range->start = start;
range->end = end;
+ range->event = event;
}
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
@@ -452,7 +483,7 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
range->end = end;
}
-#define mmu_notifier_range_init(range, mm, start, end) \
+#define mmu_notifier_range_init(range, mm, start, end, event) \
_mmu_notifier_range_init(range, start, end)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 842f9189537b..90c13cdeefb5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -480,6 +480,8 @@ struct zone {
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
+ unsigned long compact_init_migrate_pfn;
+ unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 5a30ad594ccc..2fc99ac677f3 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -98,6 +98,14 @@
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
extern nodemask_t _unused_nodemask_arg_;
+#if MAX_NUMNODES > 1
+extern int nr_node_ids;
+extern int nr_online_nodes;
+#else
+#define nr_node_ids 1
+#define nr_online_nodes 1
+#endif
+
/**
* nodemask_pr_args - printf args to output a nodemask
* @maskp: nodemask to be printed
@@ -108,7 +116,7 @@ extern nodemask_t _unused_nodemask_arg_;
__nodemask_pr_bits(maskp)
static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
- return m ? MAX_NUMNODES : 0;
+ return m ? nr_node_ids : 0;
}
static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
@@ -444,9 +452,6 @@ static inline int next_memory_node(int nid)
return next_node(nid, node_states[N_MEMORY]);
}
-extern int nr_node_ids;
-extern int nr_online_nodes;
-
static inline void node_set_online(int nid)
{
node_set_state(nid, N_ONLINE);
@@ -485,8 +490,6 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
-#define nr_node_ids 1
-#define nr_online_nodes 1
#define node_set_online(node) node_set_state((node), N_ONLINE)
#define node_set_offline(node) node_clear_state((node), N_ONLINE)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 39b4494e29f1..808b4183e30d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap)
/* Reserve 0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
-#define PG_balloon 0x00000100
+#define PG_offline 0x00000100
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
@@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \
PAGE_TYPE_OPS(Buddy, buddy)
/*
- * PageBalloon() is true for pages that are on the balloon page list
- * (see mm/balloon_compaction.c).
+ * PageOffline() indicates that the page is logically offline although the
+ * containing section is online. (e.g. inflated in a balloon driver or
+ * not onlined when onlining the section).
+ * The content of these pages is effectively stale. Such pages should not
+ * be touched (read/write/dump/save) except by their owner.
*/
-PAGE_TYPE_OPS(Balloon, balloon)
+PAGE_TYPE_OPS(Offline, offline)
/*
* If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e2d7039af6a3..1020e6f40880 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -252,6 +252,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_WRITE 0x00000008
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
+#define FGP_FOR_MMAP 0x00000040
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t cache_gfp_mask);
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 15927ebc22f2..10173f989a43 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -83,9 +83,6 @@
#define MUTEX_DEBUG_FREE 0x22
#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA)
-/********** lib/flex_array.c **********/
-#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */
-
/********** security/ **********/
#define KEY_DESTROY 0xbd
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2f90fa92468..6c2e441b50ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,6 +47,7 @@ struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
+struct capture_control;
struct robust_list_head;
struct sched_attr;
struct sched_param;
@@ -964,6 +965,9 @@ struct task_struct {
struct io_context *io_context;
+#ifdef CONFIG_COMPACTION
+ struct capture_control *capture_control;
+#endif
/* Ptrace state: */
unsigned long ptrace_message;
kernel_siginfo_t *last_siginfo;
@@ -1168,6 +1172,9 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
+
+ /* Used by memcontrol for high relcaim: */
+ struct mem_cgroup *memcg_high_reclaim;
#endif
#ifdef CONFIG_BLK_CGROUP
@@ -1208,6 +1215,13 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 13789d10a50e..76b8399b17f6 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -417,10 +417,20 @@ static inline void set_restore_sigmask(void)
set_thread_flag(TIF_RESTORE_SIGMASK);
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
+
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
+
static inline void clear_restore_sigmask(void)
{
clear_thread_flag(TIF_RESTORE_SIGMASK);
}
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
static inline bool test_restore_sigmask(void)
{
return test_thread_flag(TIF_RESTORE_SIGMASK);
@@ -438,6 +448,10 @@ static inline void set_restore_sigmask(void)
current->restore_sigmask = true;
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ tsk->restore_sigmask = false;
+}
static inline void clear_restore_sigmask(void)
{
current->restore_sigmask = false;
@@ -446,6 +460,10 @@ static inline bool test_restore_sigmask(void)
{
return current->restore_sigmask;
}
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ return tsk->restore_sigmask;
+}
static inline bool test_and_clear_restore_sigmask(void)
{
if (!current->restore_sigmask)
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 3a1a1dbc6f49..201a635be846 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,7 +81,7 @@ struct kmem_cache_order_objects {
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab;
- /* Used for retriving partial slabs etc */
+ /* Used for retrieving partial slabs etc */
slab_flags_t flags;
unsigned long min_partial;
unsigned int size; /* The size of an object including meta data */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 622025ac1461..928550bd28f3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,8 +175,9 @@ enum {
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
+ SWP_VALID = (1 << 13), /* swap is valid to be operated on? */
/* add others here before... */
- SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
-extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
+extern int __swap_count(swp_entry_t entry);
extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
@@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
+extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+ preempt_enable();
+}
#else /* CONFIG_SWAP */
@@ -576,7 +583,7 @@ static inline int page_swapcount(struct page *page)
return 0;
}
-static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+static inline int __swap_count(swp_entry_t entry)
{
return 0;
}
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 003020eb6e66..e21c477990e4 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -48,6 +48,7 @@
#define __sctp_structs_h__
#include <linux/ktime.h>
+#include <linux/generic-radix-tree.h>
#include <linux/rhashtable-types.h>
#include <linux/socket.h> /* linux/in.h needs this!! */
#include <linux/in.h> /* We get struct sockaddr_in. */
@@ -57,7 +58,6 @@
#include <linux/atomic.h> /* This gets us atomic counters. */
#include <linux/skbuff.h> /* We need sk_buff_head. */
#include <linux/workqueue.h> /* We need tq_struct. */
-#include <linux/flex_array.h> /* We need flex_array. */
#include <linux/sctp.h> /* We need sctp* header structs. */
#include <net/sctp/auth.h> /* We need auth specific structs */
#include <net/ip.h> /* For inet_skb_parm */
@@ -1445,8 +1445,9 @@ struct sctp_stream_in {
};
struct sctp_stream {
- struct flex_array *out;
- struct flex_array *in;
+ GENRADIX(struct sctp_stream_out) out;
+ GENRADIX(struct sctp_stream_in) in;
+
__u16 outcnt;
__u16 incnt;
/* Current stream being sent, if any */
@@ -1469,17 +1470,17 @@ struct sctp_stream {
};
static inline struct sctp_stream_out *sctp_stream_out(
- const struct sctp_stream *stream,
+ struct sctp_stream *stream,
__u16 sid)
{
- return flex_array_get(stream->out, sid);
+ return genradix_ptr(&stream->out, sid);
}
static inline struct sctp_stream_in *sctp_stream_in(
- const struct sctp_stream *stream,
+ struct sctp_stream *stream,
__u16 sid)
{
- return flex_array_get(stream->in, sid);
+ return genradix_ptr(&stream->in, sid);
}
#define SCTP_SO(s, i) sctp_stream_out((s), (i))
diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h
index 4abad03a8853..689025d9c185 100644
--- a/include/uapi/linux/binfmts.h
+++ b/include/uapi/linux/binfmts.h
@@ -16,6 +16,6 @@ struct pt_regs;
#define MAX_ARG_STRINGS 0x7FFFFFFF
/* sizeof(linux_binprm->buf) */
-#define BINPRM_BUF_SIZE 128
+#define BINPRM_BUF_SIZE 256
#endif /* _UAPI_LINUX_BINFMTS_H */
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 21b9113c69da..6f2f2720f3ac 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -32,7 +32,7 @@
#define KPF_KSM 21
#define KPF_THP 22
-#define KPF_BALLOON 23
+#define KPF_OFFLINE 23
#define KPF_ZERO_PAGE 24
#define KPF_IDLE 25
#define KPF_PGTABLE 26
diff --git a/init/init_task.c b/init/init_task.c
index 5aebe3be4d7c..26131e73aa6d 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/audit.h>
+#include <linux/numa.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
@@ -154,7 +155,7 @@ struct task_struct init_task
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
- .numa_preferred_nid = -1,
+ .numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
#endif
diff --git a/init/main.c b/init/main.c
index e2e80ca3165a..a56f65aa4f4b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -695,7 +695,9 @@ asmlinkage __visible void __init start_kernel(void)
initrd_start = 0;
}
#endif
+#ifndef CONFIG_DEFERRED_STRUCT_PAGE_INIT
page_ext_init();
+#endif
kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
@@ -1131,6 +1133,9 @@ static noinline void __init kernel_init_freeable(void)
sched_init_smp();
page_alloc_init_late();
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ page_ext_init();
+#endif
do_basic_setup();
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 49f9bf4ffc7f..d9ac6caf6a5e 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -120,7 +120,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
static int zero;
static int one = 1;
static int int_max = INT_MAX;
-static int ipc_mni = IPCMNI;
+int ipc_mni = IPCMNI;
+int ipc_mni_shift = IPCMNI_SHIFT;
+bool ipc_mni_extended;
static struct ctl_table ipc_kern_table[] = {
{
@@ -246,3 +248,13 @@ static int __init ipc_sysctl_init(void)
}
device_initcall(ipc_sysctl_init);
+
+static int __init ipc_mni_extend(char *str)
+{
+ ipc_mni = IPCMNI_EXTEND;
+ ipc_mni_shift = IPCMNI_EXTEND_SHIFT;
+ ipc_mni_extended = true;
+ pr_info("IPCMNI extended to %d.\n", ipc_mni);
+ return 0;
+}
+early_param("ipcmni_extend", ipc_mni_extend);
diff --git a/ipc/util.c b/ipc/util.c
index 0af05752969f..3f11a81cf9b7 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -110,12 +110,13 @@ static const struct rhashtable_params ipc_kht_params = {
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids idr.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
ids->in_use = 0;
- ids->seq = 0;
+ ids->deleted = false;
+ ids->seq = ipc_mni_extended ? 0 : -1; /* seq # is pre-incremented */
init_rwsem(&ids->rwsem);
rhashtable_init(&ids->key_ht, &ipc_kht_params);
idr_init(&ids->ipcs_idr);
@@ -198,6 +199,11 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
int idx, next_id = -1;
+/*
+ * To conserve sequence number space with extended ipc_mni when new ID
+ * is built, the sequence number is incremented only when one or more
+ * IDs have been removed previously.
+ */
#ifdef CONFIG_CHECKPOINT_RESTORE
next_id = ids->next_id;
ids->next_id = -1;
@@ -216,9 +222,13 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
*/
if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
- new->seq = ids->seq++;
- if (ids->seq > IPCID_SEQ_MAX)
- ids->seq = 0;
+ if (!ipc_mni_extended || ids->deleted) {
+ ids->seq++;
+ if (ids->seq > IPCID_SEQ_MAX)
+ ids->seq = 0;
+ ids->deleted = false;
+ }
+ new->seq = ids->seq;
idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
} else {
new->seq = ipcid_to_seqx(next_id);
@@ -226,7 +236,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
0, GFP_NOWAIT);
}
if (idx >= 0)
- new->id = SEQ_MULTIPLIER * new->seq + idx;
+ new->id = (new->seq << IPCMNI_SEQ_SHIFT) + idx;
return idx;
}
@@ -254,8 +264,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
- if (limit > IPCMNI)
- limit = IPCMNI;
+ if (limit > ipc_mni)
+ limit = ipc_mni;
if (ids->in_use >= limit)
return -ENOSPC;
@@ -436,6 +446,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
idr_remove(&ids->ipcs_idr, idx);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
+ ids->deleted = true;
ipcp->deleted = true;
if (unlikely(idx == ids->max_idx)) {
@@ -738,7 +749,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
if (total >= ids->in_use)
return NULL;
- for (; pos < IPCMNI; pos++) {
+ for (; pos < ipc_mni; pos++) {
ipc = idr_find(&ids->ipcs_idr, pos);
if (ipc != NULL) {
*new_pos = pos + 1;
diff --git a/ipc/util.h b/ipc/util.h
index d768fdbed515..127c0fedbd09 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,36 @@
#include <linux/err.h>
#include <linux/ipc_namespace.h>
-#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
-#define SEQ_MULTIPLIER (IPCMNI)
+/*
+ * The IPC ID contains 2 separate numbers - index and sequence number.
+ * By default,
+ * bits 0-14: index (32k, 15 bits)
+ * bits 15-30: sequence number (64k, 16 bits)
+ *
+ * When IPCMNI extension mode is turned on, the composition changes:
+ * bits 0-22: index (8M, 23 bits)
+ * bits 23-30: sequence number (256, 8 bits)
+ */
+#define IPCMNI_SHIFT 15
+#define IPCMNI_EXTEND_SHIFT 23
+#define IPCMNI (1 << IPCMNI_SHIFT)
+#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+extern int ipc_mni;
+extern int ipc_mni_shift;
+extern bool ipc_mni_extended;
+
+#define IPCMNI_SEQ_SHIFT ipc_mni_shift
+#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1)
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+#define ipc_mni IPCMNI
+#define ipc_mni_extended false
+#define IPCMNI_SEQ_SHIFT IPCMNI_SHIFT
+#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
+#endif /* CONFIG_SYSVIPC_SYSCTL */
void sem_init(void);
void msg_init(void);
@@ -96,9 +124,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
#define IPC_MSG_IDS 1
#define IPC_SHM_IDS 2
-#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
-#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
-#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
+#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK)
+#define ipcid_to_seqx(id) ((id) >> IPCMNI_SEQ_SHIFT)
+#define IPCID_SEQ_MAX (INT_MAX >> IPCMNI_SEQ_SHIFT)
/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
@@ -123,8 +151,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
if (ids->in_use == 0)
return -1;
- if (ids->in_use == IPCMNI)
- return IPCMNI - 1;
+ if (ids->in_use == ipc_mni)
+ return ipc_mni - 1;
return ids->max_idx;
}
@@ -219,10 +247,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
static inline int sem_check_semmni(struct ipc_namespace *ns) {
/*
- * Check semmni range [0, IPCMNI]
+ * Check semmni range [0, ipc_mni]
* semmni is the last element of sem_ctls[4] array
*/
- return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI))
+ return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni))
? -ERANGE : 0;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..093c9f917ed0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..2b750f13bc8f 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,7 +158,7 @@ out:
bool dma_in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8aef47ee7bfa..ad415f7c2715 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -174,7 +174,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE,
+ MMU_NOTIFY_CLEAR);
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index 284f2fe9a293..8495991af784 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -664,9 +664,6 @@ static void forget_original_parent(struct task_struct *father,
{
struct task_struct *p, *t, *reaper;
- if (unlikely(!list_empty(&father->ptraced)))
- exit_ptrace(father, dead);
-
/* Can drop and reacquire tasklist_lock */
reaper = find_child_reaper(father);
if (list_empty(&father->children))
@@ -705,8 +702,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
- forget_original_parent(tsk, &dead);
+ if (unlikely(!list_empty(&tsk->ptraced)))
+ exit_ptrace(tsk, &dead);
+ write_unlock_irq(&tasklist_lock);
+
+ /* Ptraced tasks have to be released before zap_pid_ns_processes(). */
+ list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+ write_lock_irq(&tasklist_lock);
+ forget_original_parent(tsk, &dead);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
diff --git a/kernel/fork.c b/kernel/fork.c
index b69248e6f0e0..c3e51fe46c3b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -919,6 +919,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->memcg_high_reclaim = NULL;
#endif
return tsk;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 087d18d771b5..ebebbcf3c5de 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -675,7 +676,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 95932333a48b..029aaabf3169 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -50,6 +50,7 @@
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/nmi.h>
+#include <linux/rwsem.h>
#include <asm/sections.h>
@@ -3550,6 +3551,24 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
+#if defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) && defined(CONFIG_DEBUG_AID_FOR_SYZBOT)
+ if (hlock->read && curr->mm) {
+ struct rw_semaphore *sem = container_of(lock,
+ struct rw_semaphore,
+ dep_map);
+
+ if (sem == &curr->mm->mmap_sem) {
+#if defined(CONFIG_RWSEM_SPIN_ON_OWNER)
+ pr_warn("mmap_sem: hlock->read=%d count=%ld current=%px, owner=%px\n",
+ hlock->read, atomic_long_read(&sem->count),
+ curr, READ_ONCE(sem->owner));
+#else
+ pr_warn("mmap_sem: hlock->read=%d count=%ld\n",
+ hlock->read, atomic_long_read(&sem->count));
+#endif
+ }
+ }
+#endif
WARN(hlock->read, "downgrading a read lock");
hlock->read = 1;
hlock->acquire_ip = ip;
diff --git a/kernel/panic.c b/kernel/panic.c
index f121e6ba7e11..0ae0d7332f12 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
- NULL,
- clear_warn_once_set,
- "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
static __init int register_warn_debugfs(void)
{
/* Don't care about failure */
- debugfs_create_file("clear_warn_once", 0200, NULL,
- NULL, &clear_warn_once_fops);
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
return 0;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 640b2034edd6..4802b039b89f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a674c7db2f29..ae5beb3ed09e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2177,6 +2177,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 50aa2aba69bd..7303e0b6a0d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1160,7 +1160,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1180,13 +1180,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1400,7 +1400,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1848,7 +1848,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -2095,7 +2095,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2638,7 +2638,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 268bed80244f..f73c28143011 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2945,7 +2945,7 @@ static bool known_siginfo_layout(unsigned sig, int si_code)
if (si_code == SI_KERNEL)
return true;
else if ((si_code > SI_USER)) {
- if (sig_specific_sicodes(sig)) {
+ if (sig && sig_specific_sicodes(sig)) {
if (si_code <= sig_sicodes[sig].limit)
return true;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba4d9e85feb8..c69b1f32853b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,6 +67,8 @@
#include <linux/bpf.h>
#include <linux/mount.h>
+#include "../lib/kstrtox.h"
+
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -127,6 +129,7 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -1446,7 +1449,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
@@ -1722,6 +1725,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2092,6 +2097,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
}
}
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
#define TMPBUFLEN 22
/**
* proc_get_long - reads an ASCII formatted integer from a user buffer
@@ -2135,7 +2175,8 @@ static int proc_get_long(char **buf, size_t *size,
if (!isdigit(*p))
return -EINVAL;
- *val = simple_strtoul(p, &p, 0);
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
len = p - tmp;
@@ -2804,6 +2845,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
break;
if (neg)
continue;
+ if ((max && val > *max) || (min && val < *min)) {
+ err = -EINVAL;
+ break;
+ }
val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max))
continue;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6d92718df707..d0a23cb12b8b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1892,6 +1892,18 @@ config TEST_LKM
If unsure, say N.
+config TEST_VMALLOC
+ tristate "Test module for stress/performance analysis of vmalloc allocator"
+ default n
+ depends on m
+ help
+ This builds the "test_vmalloc" module that should be used for
+ stress and performance analysis. So, any new change for vmalloc
+ subsystem can be evaluated from performance and stability point
+ of view.
+
+ If unsure, say N.
+
config TEST_USER_COPY
tristate "Test user/kernel boundary protections"
depends on m
@@ -2105,6 +2117,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/Makefile b/lib/Makefile
index a1cfa1975ca3..c7c8c29399f6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -35,10 +35,11 @@ obj-y += lockref.o
obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
- gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
+ gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o rhashtable.o reciprocal_div.o \
- once.o refcount.o usercopy.o errseq.o bucket_locks.o
+ once.o refcount.o usercopy.o errseq.o bucket_locks.o \
+ generic-radix-tree.o
obj-$(CONFIG_STRING_SELFTEST) += test_string.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
@@ -60,6 +61,7 @@ UBSAN_SANITIZE_test_ubsan.o := y
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
obj-$(CONFIG_TEST_LKM) += test_module.o
+obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o
obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8d666ab84b5c..087a3e9a0202 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -5,6 +5,7 @@
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
+#include <linux/numa.h>
/**
* cpumask_next - get the next cpu in a cpumask
@@ -206,7 +207,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
- if (node == -1) {
+ if (node == NUMA_NO_NODE) {
for_each_cpu(cpu, cpu_online_mask)
if (i-- == 0)
return cpu;
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 55437fd5128b..1546a3c00709 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -375,6 +375,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool check_object_on_stack = false;
fill_pool();
@@ -391,7 +392,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
debug_objects_oom();
return;
}
- debug_object_is_on_stack(addr, onstack);
+ check_object_on_stack = true;
}
switch (obj->state) {
@@ -402,20 +403,24 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "init");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "init");
debug_object_fixup(descr->fixup_init, addr, state);
return;
case ODEBUG_STATE_DESTROYED:
+ raw_spin_unlock_irqrestore(&db->lock, flags);
debug_print_object(obj, "init");
- break;
+ return;
default:
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (check_object_on_stack)
+ debug_object_is_on_stack(addr, onstack);
+
}
/**
@@ -473,33 +478,34 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
obj = lookup_object(addr, db);
if (obj) {
+ ret = 0;
switch (obj->state) {
case ODEBUG_STATE_INIT:
case ODEBUG_STATE_INACTIVE:
obj->state = ODEBUG_STATE_ACTIVE;
- ret = 0;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "activate");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "activate");
ret = debug_object_fixup(descr->fixup_activate, addr, state);
return ret ? 0 : -EINVAL;
case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "activate");
ret = -EINVAL;
break;
default:
- ret = 0;
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (ret)
+ debug_print_object(obj, "activate");
return ret;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+
/*
* We are here when a static object is activated. We
* let the type specific code confirm whether this is
@@ -548,24 +554,30 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
if (!obj->astate)
obj->state = ODEBUG_STATE_INACTIVE;
else
- debug_print_object(obj, "deactivate");
+ goto out_unlock_print;
break;
case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "deactivate");
- break;
+ goto out_unlock_print;
+
default:
break;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
debug_print_object(&o, "deactivate");
}
+ return;
+out_unlock_print:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "deactivate");
}
EXPORT_SYMBOL_GPL(debug_object_deactivate);
@@ -599,15 +611,16 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
obj->state = ODEBUG_STATE_DESTROYED;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "destroy");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "destroy");
debug_object_fixup(descr->fixup_destroy, addr, state);
return;
case ODEBUG_STATE_DESTROYED:
+ raw_spin_unlock_irqrestore(&db->lock, flags);
debug_print_object(obj, "destroy");
- break;
+ return;
default:
break;
}
@@ -641,9 +654,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free, addr, state);
return;
default:
@@ -731,22 +744,27 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr,
if (obj->astate == expect)
obj->astate = next;
else
- debug_print_object(obj, "active_state");
+ goto out_unlock_print;
break;
default:
- debug_print_object(obj, "active_state");
- break;
+ goto out_unlock_print;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
debug_print_object(&o, "active_state");
}
+ return;
+out_unlock_print:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "active_state");
}
EXPORT_SYMBOL_GPL(debug_object_active_state);
@@ -782,10 +800,10 @@ repeat:
switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
descr = obj->descr;
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free,
(void *) oaddr, state);
goto repeat;
@@ -978,19 +996,24 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
struct debug_obj *obj;
unsigned long flags;
int res = -EINVAL;
+ enum debug_obj_state obj_state;
db = get_bucket((unsigned long) addr);
raw_spin_lock_irqsave(&db->lock, flags);
obj = lookup_object(addr, db);
+ obj_state = obj ? obj->state : state;
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+
if (!obj && state != ODEBUG_STATE_NONE) {
WARN(1, KERN_ERR "ODEBUG: selftest object not found\n");
goto out;
}
- if (obj && obj->state != state) {
+ if (obj_state != state) {
WARN(1, KERN_ERR "ODEBUG: selftest wrong state: %d != %d\n",
- obj->state, state);
+ obj_state, state);
goto out;
}
if (fixups != debug_objects_fixups) {
@@ -1005,7 +1028,6 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
}
res = 0;
out:
- raw_spin_unlock_irqrestore(&db->lock, flags);
if (res)
debug_objects_enabled = 0;
return res;
diff --git a/lib/flex_array.c b/lib/flex_array.c
deleted file mode 100644
index 2eed22fa507c..000000000000
--- a/lib/flex_array.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Flexible array managed in PAGE_SIZE parts
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2009
- *
- * Author: Dave Hansen <dave@linux.vnet.ibm.com>
- */
-
-#include <linux/flex_array.h>
-#include <linux/slab.h>
-#include <linux/stddef.h>
-#include <linux/export.h>
-#include <linux/reciprocal_div.h>
-
-struct flex_array_part {
- char elements[FLEX_ARRAY_PART_SIZE];
-};
-
-/*
- * If a user requests an allocation which is small
- * enough, we may simply use the space in the
- * flex_array->parts[] array to store the user
- * data.
- */
-static inline int elements_fit_in_base(struct flex_array *fa)
-{
- int data_size = fa->element_size * fa->total_nr_elements;
- if (data_size <= FLEX_ARRAY_BASE_BYTES_LEFT)
- return 1;
- return 0;
-}
-
-/**
- * flex_array_alloc - allocate a new flexible array
- * @element_size: the size of individual elements in the array
- * @total: total number of elements that this should hold
- * @flags: page allocation flags to use for base array
- *
- * Note: all locking must be provided by the caller.
- *
- * @total is used to size internal structures. If the user ever
- * accesses any array indexes >=@total, it will produce errors.
- *
- * The maximum number of elements is defined as: the number of
- * elements that can be stored in a page times the number of
- * page pointers that we can fit in the base structure or (using
- * integer math):
- *
- * (PAGE_SIZE/element_size) * (PAGE_SIZE-8)/sizeof(void *)
- *
- * Here's a table showing example capacities. Note that the maximum
- * index that the get/put() functions is just nr_objects-1. This
- * basically means that you get 4MB of storage on 32-bit and 2MB on
- * 64-bit.
- *
- *
- * Element size | Objects | Objects |
- * PAGE_SIZE=4k | 32-bit | 64-bit |
- * ---------------------------------|
- * 1 bytes | 4177920 | 2088960 |
- * 2 bytes | 2088960 | 1044480 |
- * 3 bytes | 1392300 | 696150 |
- * 4 bytes | 1044480 | 522240 |
- * 32 bytes | 130560 | 65408 |
- * 33 bytes | 126480 | 63240 |
- * 2048 bytes | 2040 | 1020 |
- * 2049 bytes | 1020 | 510 |
- * void * | 1044480 | 261120 |
- *
- * Since 64-bit pointers are twice the size, we lose half the
- * capacity in the base structure. Also note that no effort is made
- * to efficiently pack objects across page boundaries.
- */
-struct flex_array *flex_array_alloc(int element_size, unsigned int total,
- gfp_t flags)
-{
- struct flex_array *ret;
- int elems_per_part = 0;
- int max_size = 0;
- struct reciprocal_value reciprocal_elems = { 0 };
-
- if (element_size) {
- elems_per_part = FLEX_ARRAY_ELEMENTS_PER_PART(element_size);
- reciprocal_elems = reciprocal_value(elems_per_part);
- max_size = FLEX_ARRAY_NR_BASE_PTRS * elems_per_part;
- }
-
- /* max_size will end up 0 if element_size > PAGE_SIZE */
- if (total > max_size)
- return NULL;
- ret = kzalloc(sizeof(struct flex_array), flags);
- if (!ret)
- return NULL;
- ret->element_size = element_size;
- ret->total_nr_elements = total;
- ret->elems_per_part = elems_per_part;
- ret->reciprocal_elems = reciprocal_elems;
- if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO))
- memset(&ret->parts[0], FLEX_ARRAY_FREE,
- FLEX_ARRAY_BASE_BYTES_LEFT);
- return ret;
-}
-EXPORT_SYMBOL(flex_array_alloc);
-
-static int fa_element_to_part_nr(struct flex_array *fa,
- unsigned int element_nr)
-{
- /*
- * if element_size == 0 we don't get here, so we never touch
- * the zeroed fa->reciprocal_elems, which would yield invalid
- * results
- */
- return reciprocal_divide(element_nr, fa->reciprocal_elems);
-}
-
-/**
- * flex_array_free_parts - just free the second-level pages
- * @fa: the flex array from which to free parts
- *
- * This is to be used in cases where the base 'struct flex_array'
- * has been statically allocated and should not be free.
- */
-void flex_array_free_parts(struct flex_array *fa)
-{
- int part_nr;
-
- if (elements_fit_in_base(fa))
- return;
- for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++)
- kfree(fa->parts[part_nr]);
-}
-EXPORT_SYMBOL(flex_array_free_parts);
-
-void flex_array_free(struct flex_array *fa)
-{
- flex_array_free_parts(fa);
- kfree(fa);
-}
-EXPORT_SYMBOL(flex_array_free);
-
-static unsigned int index_inside_part(struct flex_array *fa,
- unsigned int element_nr,
- unsigned int part_nr)
-{
- unsigned int part_offset;
-
- part_offset = element_nr - part_nr * fa->elems_per_part;
- return part_offset * fa->element_size;
-}
-
-static struct flex_array_part *
-__fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags)
-{
- struct flex_array_part *part = fa->parts[part_nr];
- if (!part) {
- part = kmalloc(sizeof(struct flex_array_part), flags);
- if (!part)
- return NULL;
- if (!(flags & __GFP_ZERO))
- memset(part, FLEX_ARRAY_FREE,
- sizeof(struct flex_array_part));
- fa->parts[part_nr] = part;
- }
- return part;
-}
-
-/**
- * flex_array_put - copy data into the array at @element_nr
- * @fa: the flex array to copy data into
- * @element_nr: index of the position in which to insert
- * the new element.
- * @src: address of data to copy into the array
- * @flags: page allocation flags to use for array expansion
- *
- *
- * Note that this *copies* the contents of @src into
- * the array. If you are trying to store an array of
- * pointers, make sure to pass in &ptr instead of ptr.
- * You may instead wish to use the flex_array_put_ptr()
- * helper function.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
- gfp_t flags)
-{
- int part_nr = 0;
- struct flex_array_part *part;
- void *dst;
-
- if (element_nr >= fa->total_nr_elements)
- return -ENOSPC;
- if (!fa->element_size)
- return 0;
- if (elements_fit_in_base(fa))
- part = (struct flex_array_part *)&fa->parts[0];
- else {
- part_nr = fa_element_to_part_nr(fa, element_nr);
- part = __fa_get_part(fa, part_nr, flags);
- if (!part)
- return -ENOMEM;
- }
- dst = &part->elements[index_inside_part(fa, element_nr, part_nr)];
- memcpy(dst, src, fa->element_size);
- return 0;
-}
-EXPORT_SYMBOL(flex_array_put);
-
-/**
- * flex_array_clear - clear element in array at @element_nr
- * @fa: the flex array of the element.
- * @element_nr: index of the position to clear.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_clear(struct flex_array *fa, unsigned int element_nr)
-{
- int part_nr = 0;
- struct flex_array_part *part;
- void *dst;
-
- if (element_nr >= fa->total_nr_elements)
- return -ENOSPC;
- if (!fa->element_size)
- return 0;
- if (elements_fit_in_base(fa))
- part = (struct flex_array_part *)&fa->parts[0];
- else {
- part_nr = fa_element_to_part_nr(fa, element_nr);
- part = fa->parts[part_nr];
- if (!part)
- return -EINVAL;
- }
- dst = &part->elements[index_inside_part(fa, element_nr, part_nr)];
- memset(dst, FLEX_ARRAY_FREE, fa->element_size);
- return 0;
-}
-EXPORT_SYMBOL(flex_array_clear);
-
-/**
- * flex_array_prealloc - guarantee that array space exists
- * @fa: the flex array for which to preallocate parts
- * @start: index of first array element for which space is allocated
- * @nr_elements: number of elements for which space is allocated
- * @flags: page allocation flags
- *
- * This will guarantee that no future calls to flex_array_put()
- * will allocate memory. It can be used if you are expecting to
- * be holding a lock or in some atomic context while writing
- * data into the array.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_prealloc(struct flex_array *fa, unsigned int start,
- unsigned int nr_elements, gfp_t flags)
-{
- int start_part;
- int end_part;
- int part_nr;
- unsigned int end;
- struct flex_array_part *part;
-
- if (!start && !nr_elements)
- return 0;
- if (start >= fa->total_nr_elements)
- return -ENOSPC;
- if (!nr_elements)
- return 0;
-
- end = start + nr_elements - 1;
-
- if (end >= fa->total_nr_elements)
- return -ENOSPC;
- if (!fa->element_size)
- return 0;
- if (elements_fit_in_base(fa))
- return 0;
- start_part = fa_element_to_part_nr(fa, start);
- end_part = fa_element_to_part_nr(fa, end);
- for (part_nr = start_part; part_nr <= end_part; part_nr++) {
- part = __fa_get_part(fa, part_nr, flags);
- if (!part)
- return -ENOMEM;
- }
- return 0;
-}
-EXPORT_SYMBOL(flex_array_prealloc);
-
-/**
- * flex_array_get - pull data back out of the array
- * @fa: the flex array from which to extract data
- * @element_nr: index of the element to fetch from the array
- *
- * Returns a pointer to the data at index @element_nr. Note
- * that this is a copy of the data that was passed in. If you
- * are using this to store pointers, you'll get back &ptr. You
- * may instead wish to use the flex_array_get_ptr helper.
- *
- * Locking must be provided by the caller.
- */
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr)
-{
- int part_nr = 0;
- struct flex_array_part *part;
-
- if (!fa->element_size)
- return NULL;
- if (element_nr >= fa->total_nr_elements)
- return NULL;
- if (elements_fit_in_base(fa))
- part = (struct flex_array_part *)&fa->parts[0];
- else {
- part_nr = fa_element_to_part_nr(fa, element_nr);
- part = fa->parts[part_nr];
- if (!part)
- return NULL;
- }
- return &part->elements[index_inside_part(fa, element_nr, part_nr)];
-}
-EXPORT_SYMBOL(flex_array_get);
-
-/**
- * flex_array_get_ptr - pull a ptr back out of the array
- * @fa: the flex array from which to extract data
- * @element_nr: index of the element to fetch from the array
- *
- * Returns the pointer placed in the flex array at element_nr using
- * flex_array_put_ptr(). This function should not be called if the
- * element in question was not set using the _put_ptr() helper.
- */
-void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr)
-{
- void **tmp;
-
- tmp = flex_array_get(fa, element_nr);
- if (!tmp)
- return NULL;
-
- return *tmp;
-}
-EXPORT_SYMBOL(flex_array_get_ptr);
-
-static int part_is_free(struct flex_array_part *part)
-{
- int i;
-
- for (i = 0; i < sizeof(struct flex_array_part); i++)
- if (part->elements[i] != FLEX_ARRAY_FREE)
- return 0;
- return 1;
-}
-
-/**
- * flex_array_shrink - free unused second-level pages
- * @fa: the flex array to shrink
- *
- * Frees all second-level pages that consist solely of unused
- * elements. Returns the number of pages freed.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_shrink(struct flex_array *fa)
-{
- struct flex_array_part *part;
- int part_nr;
- int ret = 0;
-
- if (!fa->total_nr_elements || !fa->element_size)
- return 0;
- if (elements_fit_in_base(fa))
- return ret;
- for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) {
- part = fa->parts[part_nr];
- if (!part)
- continue;
- if (part_is_free(part)) {
- fa->parts[part_nr] = NULL;
- kfree(part);
- ret++;
- }
- }
- return ret;
-}
-EXPORT_SYMBOL(flex_array_shrink);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 7e85d1e37a6e..2e45b3a94313 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -424,7 +424,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool,
EXPORT_SYMBOL(gen_pool_for_each_chunk);
/**
- * addr_in_gen_pool - checks if an address falls within the range of a pool
+ * gen_pool_has_addr - checks if an address falls within the range of a pool
* @pool: the generic memory pool
* @start: start address
* @size: size of the region
@@ -432,7 +432,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk);
* Check if the range of addresses falls within the specified pool. Returns
* true if the entire range is contained in the pool and false otherwise.
*/
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size)
{
bool found = false;
@@ -451,6 +451,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
rcu_read_unlock();
return found;
}
+EXPORT_SYMBOL(gen_pool_has_addr);
/**
* gen_pool_avail - get available free space of the pool
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
new file mode 100644
index 000000000000..a7bafc413730
--- /dev/null
+++ b/lib/generic-radix-tree.c
@@ -0,0 +1,217 @@
+
+#include <linux/export.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/gfp.h>
+
+#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
+
+struct genradix_node {
+ union {
+ /* Interior node: */
+ struct genradix_node *children[GENRADIX_ARY];
+
+ /* Leaf: */
+ u8 data[PAGE_SIZE];
+ };
+};
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+ return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+ return 1UL << genradix_depth_shift(depth);
+}
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+ DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK \
+ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+ return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+ return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
+/*
+ * Returns pointer to the specified byte @offset within @radix, or NULL if not
+ * allocated
+ */
+void *__genradix_ptr(struct __genradix *radix, size_t offset)
+{
+ struct genradix_root *r = READ_ONCE(radix->root);
+ struct genradix_node *n = genradix_root_to_node(r);
+ unsigned level = genradix_root_to_depth(r);
+
+ if (ilog2(offset) >= genradix_depth_shift(level))
+ return NULL;
+
+ while (1) {
+ if (!n)
+ return NULL;
+ if (!level)
+ break;
+
+ level--;
+
+ n = n->children[offset >> genradix_depth_shift(level)];
+ offset &= genradix_depth_size(level) - 1;
+ }
+
+ return &n->data[offset];
+}
+EXPORT_SYMBOL(__genradix_ptr);
+
+/*
+ * Returns pointer to the specified byte @offset within @radix, allocating it if
+ * necessary - newly allocated slots are always zeroed out:
+ */
+void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
+ gfp_t gfp_mask)
+{
+ struct genradix_root *v = READ_ONCE(radix->root);
+ struct genradix_node *n, *new_node = NULL;
+ unsigned level;
+
+ /* Increase tree depth if necessary: */
+ while (1) {
+ struct genradix_root *r = v, *new_root;
+
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (n && ilog2(offset) < genradix_depth_shift(level))
+ break;
+
+ if (!new_node) {
+ new_node = (void *)
+ __get_free_page(gfp_mask|__GFP_ZERO);
+ if (!new_node)
+ return NULL;
+ }
+
+ new_node->children[0] = n;
+ new_root = ((struct genradix_root *)
+ ((unsigned long) new_node | (n ? level + 1 : 0)));
+
+ if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
+ v = new_root;
+ new_node = NULL;
+ }
+ }
+
+ while (level--) {
+ struct genradix_node **p =
+ &n->children[offset >> genradix_depth_shift(level)];
+ offset &= genradix_depth_size(level) - 1;
+
+ n = READ_ONCE(*p);
+ if (!n) {
+ if (!new_node) {
+ new_node = (void *)
+ __get_free_page(gfp_mask|__GFP_ZERO);
+ if (!new_node)
+ return NULL;
+ }
+
+ if (!(n = cmpxchg_release(p, NULL, new_node)))
+ swap(n, new_node);
+ }
+ }
+
+ if (new_node)
+ free_page((unsigned long) new_node);
+
+ return &n->data[offset];
+}
+EXPORT_SYMBOL(__genradix_ptr_alloc);
+
+void *__genradix_iter_peek(struct genradix_iter *iter,
+ struct __genradix *radix,
+ size_t objs_per_page)
+{
+ struct genradix_root *r;
+ struct genradix_node *n;
+ unsigned level, i;
+restart:
+ r = READ_ONCE(radix->root);
+ if (!r)
+ return NULL;
+
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (ilog2(iter->offset) >= genradix_depth_shift(level))
+ return NULL;
+
+ while (level) {
+ level--;
+
+ i = (iter->offset >> genradix_depth_shift(level)) &
+ (GENRADIX_ARY - 1);
+
+ while (!n->children[i]) {
+ i++;
+ iter->offset = round_down(iter->offset +
+ genradix_depth_size(level),
+ genradix_depth_size(level));
+ iter->pos = (iter->offset >> PAGE_SHIFT) *
+ objs_per_page;
+ if (i == GENRADIX_ARY)
+ goto restart;
+ }
+
+ n = n->children[i];
+ }
+
+ return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek);
+
+static void genradix_free_recurse(struct genradix_node *n, unsigned level)
+{
+ if (level) {
+ unsigned i;
+
+ for (i = 0; i < GENRADIX_ARY; i++)
+ if (n->children[i])
+ genradix_free_recurse(n->children[i], level - 1);
+ }
+
+ free_page((unsigned long) n);
+}
+
+int __genradix_prealloc(struct __genradix *radix, size_t size,
+ gfp_t gfp_mask)
+{
+ size_t offset;
+
+ for (offset = 0; offset < size; offset += PAGE_SIZE)
+ if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(__genradix_prealloc);
+
+void __genradix_free(struct __genradix *radix)
+{
+ struct genradix_root *r = xchg(&radix->root, NULL);
+
+ genradix_free_recurse(genradix_root_to_node(r),
+ genradix_root_to_depth(r));
+}
+EXPORT_SYMBOL(__genradix_free);
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
new file mode 100644
index 000000000000..83cdcaa82bf6
--- /dev/null
+++ b/lib/test_vmalloc.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for stress and analyze performance of vmalloc allocator.
+ * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/kthread.h>
+#include <linux/moduleparam.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/rwsem.h>
+#include <linux/mm.h>
+
+#define __param(type, name, init, msg) \
+ static type name = init; \
+ module_param(name, type, 0444); \
+ MODULE_PARM_DESC(name, msg) \
+
+__param(bool, single_cpu_test, false,
+ "Use single first online CPU to run tests");
+
+__param(bool, sequential_test_order, false,
+ "Use sequential stress tests order");
+
+__param(int, test_repeat_count, 1,
+ "Set test repeat counter");
+
+__param(int, test_loop_count, 1000000,
+ "Set test loop counter");
+
+__param(int, run_test_mask, INT_MAX,
+ "Set tests specified in the mask.\n\n"
+ "\t\tid: 1, name: fix_size_alloc_test\n"
+ "\t\tid: 2, name: full_fit_alloc_test\n"
+ "\t\tid: 4, name: long_busy_list_alloc_test\n"
+ "\t\tid: 8, name: random_size_alloc_test\n"
+ "\t\tid: 16, name: fix_align_alloc_test\n"
+ "\t\tid: 32, name: random_size_align_alloc_test\n"
+ "\t\tid: 64, name: align_shift_alloc_test\n"
+ "\t\tid: 128, name: pcpu_alloc_test\n"
+ /* Add a new test case description here. */
+);
+
+/*
+ * Depends on single_cpu_test parameter. If it is true, then
+ * use first online CPU to trigger a test on, otherwise go with
+ * all online CPUs.
+ */
+static cpumask_t cpus_run_test_mask = CPU_MASK_NONE;
+
+/*
+ * Read write semaphore for synchronization of setup
+ * phase that is done in main thread and workers.
+ */
+static DECLARE_RWSEM(prepare_for_test_rwsem);
+
+/*
+ * Completion tracking for worker threads.
+ */
+static DECLARE_COMPLETION(test_all_done_comp);
+static atomic_t test_n_undone = ATOMIC_INIT(0);
+
+static inline void
+test_report_one_done(void)
+{
+ if (atomic_dec_and_test(&test_n_undone))
+ complete(&test_all_done_comp);
+}
+
+static int random_size_align_alloc_test(void)
+{
+ unsigned long size, align, rnd;
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ get_random_bytes(&rnd, sizeof(rnd));
+
+ /*
+ * Maximum 1024 pages, if PAGE_SIZE is 4096.
+ */
+ align = 1 << (rnd % 23);
+
+ /*
+ * Maximum 10 pages.
+ */
+ size = ((rnd % 10) + 1) * PAGE_SIZE;
+
+ ptr = __vmalloc_node_range(size, align,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+/*
+ * This test case is supposed to be failed.
+ */
+static int align_shift_alloc_test(void)
+{
+ unsigned long align;
+ void *ptr;
+ int i;
+
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ align = ((unsigned long) 1) << i;
+
+ ptr = __vmalloc_node_range(PAGE_SIZE, align,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int fix_align_alloc_test(void)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr = __vmalloc_node_range(5 * PAGE_SIZE,
+ THREAD_ALIGN << 1,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int random_size_alloc_test(void)
+{
+ unsigned int n;
+ void *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ get_random_bytes(&n, sizeof(i));
+ n = (n % 100) + 1;
+
+ p = vmalloc(n * PAGE_SIZE);
+
+ if (!p)
+ return -1;
+
+ *((__u8 *)p) = 1;
+ vfree(p);
+ }
+
+ return 0;
+}
+
+static int long_busy_list_alloc_test(void)
+{
+ void *ptr_1, *ptr_2;
+ void **ptr;
+ int rv = -1;
+ int i;
+
+ ptr = vmalloc(sizeof(void *) * 15000);
+ if (!ptr)
+ return rv;
+
+ for (i = 0; i < 15000; i++)
+ ptr[i] = vmalloc(1 * PAGE_SIZE);
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr_1 = vmalloc(100 * PAGE_SIZE);
+ if (!ptr_1)
+ goto leave;
+
+ ptr_2 = vmalloc(1 * PAGE_SIZE);
+ if (!ptr_2) {
+ vfree(ptr_1);
+ goto leave;
+ }
+
+ *((__u8 *)ptr_1) = 0;
+ *((__u8 *)ptr_2) = 1;
+
+ vfree(ptr_1);
+ vfree(ptr_2);
+ }
+
+ /* Success */
+ rv = 0;
+
+leave:
+ for (i = 0; i < 15000; i++)
+ vfree(ptr[i]);
+
+ vfree(ptr);
+ return rv;
+}
+
+static int full_fit_alloc_test(void)
+{
+ void **ptr, **junk_ptr, *tmp;
+ int junk_length;
+ int rv = -1;
+ int i;
+
+ junk_length = fls(num_online_cpus());
+ junk_length *= (32 * 1024 * 1024 / PAGE_SIZE);
+
+ ptr = vmalloc(sizeof(void *) * junk_length);
+ if (!ptr)
+ return rv;
+
+ junk_ptr = vmalloc(sizeof(void *) * junk_length);
+ if (!junk_ptr) {
+ vfree(ptr);
+ return rv;
+ }
+
+ for (i = 0; i < junk_length; i++) {
+ ptr[i] = vmalloc(1 * PAGE_SIZE);
+ junk_ptr[i] = vmalloc(1 * PAGE_SIZE);
+ }
+
+ for (i = 0; i < junk_length; i++)
+ vfree(junk_ptr[i]);
+
+ for (i = 0; i < test_loop_count; i++) {
+ tmp = vmalloc(1 * PAGE_SIZE);
+
+ if (!tmp)
+ goto error;
+
+ *((__u8 *)tmp) = 1;
+ vfree(tmp);
+ }
+
+ /* Success */
+ rv = 0;
+
+error:
+ for (i = 0; i < junk_length; i++)
+ vfree(ptr[i]);
+
+ vfree(ptr);
+ vfree(junk_ptr);
+
+ return rv;
+}
+
+static int fix_size_alloc_test(void)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr = vmalloc(3 * PAGE_SIZE);
+
+ if (!ptr)
+ return -1;
+
+ *((__u8 *)ptr) = 0;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int
+pcpu_alloc_test(void)
+{
+ int rv = 0;
+#ifndef CONFIG_NEED_PER_CPU_KM
+ void __percpu **pcpu;
+ size_t size, align;
+ int i;
+
+ pcpu = vmalloc(sizeof(void __percpu *) * 35000);
+ if (!pcpu)
+ return -1;
+
+ for (i = 0; i < 35000; i++) {
+ unsigned int r;
+
+ get_random_bytes(&r, sizeof(i));
+ size = (r % (PAGE_SIZE / 4)) + 1;
+
+ /*
+ * Maximum PAGE_SIZE
+ */
+ get_random_bytes(&r, sizeof(i));
+ align = 1 << ((i % 11) + 1);
+
+ pcpu[i] = __alloc_percpu(size, align);
+ if (!pcpu[i])
+ rv = -1;
+ }
+
+ for (i = 0; i < 35000; i++)
+ free_percpu(pcpu[i]);
+
+ vfree(pcpu);
+#endif
+ return rv;
+}
+
+struct test_case_desc {
+ const char *test_name;
+ int (*test_func)(void);
+};
+
+static struct test_case_desc test_case_array[] = {
+ { "fix_size_alloc_test", fix_size_alloc_test },
+ { "full_fit_alloc_test", full_fit_alloc_test },
+ { "long_busy_list_alloc_test", long_busy_list_alloc_test },
+ { "random_size_alloc_test", random_size_alloc_test },
+ { "fix_align_alloc_test", fix_align_alloc_test },
+ { "random_size_align_alloc_test", random_size_align_alloc_test },
+ { "align_shift_alloc_test", align_shift_alloc_test },
+ { "pcpu_alloc_test", pcpu_alloc_test },
+ /* Add a new test case here. */
+};
+
+struct test_case_data {
+ int test_failed;
+ int test_passed;
+ u64 time;
+};
+
+/* Split it to get rid of: WARNING: line over 80 characters */
+static struct test_case_data
+ per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)];
+
+static struct test_driver {
+ struct task_struct *task;
+ unsigned long start;
+ unsigned long stop;
+ int cpu;
+} per_cpu_test_driver[NR_CPUS];
+
+static void shuffle_array(int *arr, int n)
+{
+ unsigned int rnd;
+ int i, j, x;
+
+ for (i = n - 1; i > 0; i--) {
+ get_random_bytes(&rnd, sizeof(rnd));
+
+ /* Cut the range. */
+ j = rnd % i;
+
+ /* Swap indexes. */
+ x = arr[i];
+ arr[i] = arr[j];
+ arr[j] = x;
+ }
+}
+
+static int test_func(void *private)
+{
+ struct test_driver *t = private;
+ cpumask_t newmask = CPU_MASK_NONE;
+ int random_array[ARRAY_SIZE(test_case_array)];
+ int index, i, j, ret;
+ ktime_t kt;
+ u64 delta;
+
+ cpumask_set_cpu(t->cpu, &newmask);
+ set_cpus_allowed_ptr(current, &newmask);
+
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++)
+ random_array[i] = i;
+
+ if (!sequential_test_order)
+ shuffle_array(random_array, ARRAY_SIZE(test_case_array));
+
+ /*
+ * Block until initialization is done.
+ */
+ down_read(&prepare_for_test_rwsem);
+
+ t->start = get_cycles();
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
+ index = random_array[i];
+
+ /*
+ * Skip tests if run_test_mask has been specified.
+ */
+ if (!((run_test_mask & (1 << index)) >> index))
+ continue;
+
+ kt = ktime_get();
+ for (j = 0; j < test_repeat_count; j++) {
+ ret = test_case_array[index].test_func();
+ if (!ret)
+ per_cpu_test_data[t->cpu][index].test_passed++;
+ else
+ per_cpu_test_data[t->cpu][index].test_failed++;
+ }
+
+ /*
+ * Take an average time that test took.
+ */
+ delta = (u64) ktime_us_delta(ktime_get(), kt);
+ do_div(delta, (u32) test_repeat_count);
+
+ per_cpu_test_data[t->cpu][index].time = delta;
+ }
+ t->stop = get_cycles();
+
+ up_read(&prepare_for_test_rwsem);
+ test_report_one_done();
+
+ /*
+ * Wait for the kthread_stop() call.
+ */
+ while (!kthread_should_stop())
+ msleep(10);
+
+ return 0;
+}
+
+static void
+init_test_configurtion(void)
+{
+ /*
+ * Reset all data of all CPUs.
+ */
+ memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data));
+
+ if (single_cpu_test)
+ cpumask_set_cpu(cpumask_first(cpu_online_mask),
+ &cpus_run_test_mask);
+ else
+ cpumask_and(&cpus_run_test_mask, cpu_online_mask,
+ cpu_online_mask);
+
+ if (test_repeat_count <= 0)
+ test_repeat_count = 1;
+
+ if (test_loop_count <= 0)
+ test_loop_count = 1;
+}
+
+static void do_concurrent_test(void)
+{
+ int cpu, ret;
+
+ /*
+ * Set some basic configurations plus sanity check.
+ */
+ init_test_configurtion();
+
+ /*
+ * Put on hold all workers.
+ */
+ down_write(&prepare_for_test_rwsem);
+
+ for_each_cpu(cpu, &cpus_run_test_mask) {
+ struct test_driver *t = &per_cpu_test_driver[cpu];
+
+ t->cpu = cpu;
+ t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu);
+
+ if (!IS_ERR(t->task))
+ /* Success. */
+ atomic_inc(&test_n_undone);
+ else
+ pr_err("Failed to start kthread for %d CPU\n", cpu);
+ }
+
+ /*
+ * Now let the workers do their job.
+ */
+ up_write(&prepare_for_test_rwsem);
+
+ /*
+ * Sleep quiet until all workers are done with 1 second
+ * interval. Since the test can take a lot of time we
+ * can run into a stack trace of the hung task. That is
+ * why we go with completion_timeout and HZ value.
+ */
+ do {
+ ret = wait_for_completion_timeout(&test_all_done_comp, HZ);
+ } while (!ret);
+
+ for_each_cpu(cpu, &cpus_run_test_mask) {
+ struct test_driver *t = &per_cpu_test_driver[cpu];
+ int i;
+
+ if (!IS_ERR(t->task))
+ kthread_stop(t->task);
+
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
+ if (!((run_test_mask & (1 << i)) >> i))
+ continue;
+
+ pr_info(
+ "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n",
+ test_case_array[i].test_name,
+ per_cpu_test_data[cpu][i].test_passed,
+ per_cpu_test_data[cpu][i].test_failed,
+ test_repeat_count, test_loop_count,
+ per_cpu_test_data[cpu][i].time);
+ }
+
+ pr_info("All test took CPU%d=%lu cycles\n",
+ cpu, t->stop - t->start);
+ }
+}
+
+static int vmalloc_test_init(void)
+{
+ do_concurrent_test();
+ return -EAGAIN; /* Fail will directly unload the module */
+}
+
+static void vmalloc_test_exit(void)
+{
+}
+
+module_init(vmalloc_test_init)
+module_exit(vmalloc_test_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Uladzislau Rezki");
+MODULE_DESCRIPTION("vmalloc test module");
diff --git a/mm/compaction.c b/mm/compaction.c
index ef29490b0f46..9830f81cd27f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -52,21 +52,15 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
static unsigned long release_freepages(struct list_head *freelist)
{
- struct page *page, *next;
- unsigned long high_pfn = 0;
+ unsigned long high_pfn;
- list_for_each_entry_safe(page, next, freelist, lru) {
- unsigned long pfn = page_to_pfn(page);
- list_del(&page->lru);
- __free_page(page);
- if (pfn > high_pfn)
- high_pfn = pfn;
- }
+ __free_page_list(freelist, true, &high_pfn);
+ INIT_LIST_HEAD(freelist);
return high_pfn;
}
-static void map_pages(struct list_head *list)
+static void split_map_pages(struct list_head *list)
{
unsigned int i, order, nr_pages;
struct page *page, *next;
@@ -237,6 +231,62 @@ static bool pageblock_skip_persistent(struct page *page)
return false;
}
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+ bool check_target)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct page *end_page;
+
+ if (!page)
+ return false;
+ if (zone != page_zone(page))
+ return false;
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ /*
+ * If skip is already cleared do no further checking once the
+ * restart points have been set.
+ */
+ if (check_source && check_target && !get_pageblock_skip(page))
+ return true;
+
+ /*
+ * If clearing skip for the target scanner, do not select a
+ * non-movable pageblock as the starting point.
+ */
+ if (!check_source && check_target &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ return false;
+
+ /*
+ * Only clear the hint if a sample indicates there is either a
+ * free page or an LRU page in the block. One or other condition
+ * is necessary for the block to be a migration source/target.
+ */
+ page = pfn_to_page(pageblock_start_pfn(pfn));
+ if (zone != page_zone(page))
+ return false;
+ end_page = page + pageblock_nr_pages;
+
+ do {
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ } while (page < end_page);
+
+ return false;
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -244,30 +294,54 @@ static bool pageblock_skip_persistent(struct page *page)
*/
static void __reset_isolation_suitable(struct zone *zone)
{
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- unsigned long pfn;
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone);
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
zone->compact_blockskip_flush = false;
- /* Walk the zone and mark every pageblock as suitable for isolation */
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- struct page *page;
-
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
cond_resched();
- page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- if (zone != page_zone(page))
- continue;
- if (pageblock_skip_persistent(page))
- continue;
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
- clear_pageblock_skip(page);
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
}
- reset_cached_positions(zone);
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -286,15 +360,53 @@ void reset_isolation_suitable(pg_data_t *pgdat)
}
/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * locks are not required for read/writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ bool skip;
+
+ /* Do no update if skip hint is being ignored */
+ if (cc->ignore_skip_hint)
+ return false;
+
+ if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ return false;
+
+ skip = get_pageblock_skip(page);
+ if (!skip && !cc->no_set_skip_hint)
+ set_pageblock_skip(page);
+
+ return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Set for isolation rather than compaction */
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
+/*
* If no pages were isolated then mark this pageblock to be skipped in the
* future. The information is later cleared by __reset_isolation_suitable().
*/
static void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
struct zone *zone = cc->zone;
- unsigned long pfn;
if (cc->no_set_skip_hint)
return;
@@ -302,24 +414,11 @@ static void update_pageblock_skip(struct compact_control *cc,
if (!page)
return;
- if (nr_isolated)
- return;
-
set_pageblock_skip(page);
- pfn = page_to_pfn(page);
-
/* Update where async and sync compaction should restart */
- if (migrate_scanner) {
- if (pfn > zone->compact_cached_migrate_pfn[0])
- zone->compact_cached_migrate_pfn[0] = pfn;
- if (cc->mode != MIGRATE_ASYNC &&
- pfn > zone->compact_cached_migrate_pfn[1])
- zone->compact_cached_migrate_pfn[1] = pfn;
- } else {
- if (pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
@@ -334,35 +433,52 @@ static inline bool pageblock_skip_persistent(struct page *page)
}
static inline void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ return false;
+}
#endif /* CONFIG_COMPACTION */
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. For async compaction, back out if the lock cannot
- * be taken immediately. For sync compaction, spin on the lock if needed.
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
*
- * Returns true if the lock is held
- * Returns false if the lock is not held and compaction should abort
+ * Always returns true which makes it easier to track lock state in callers.
*/
-static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
{
- if (cc->mode == MIGRATE_ASYNC) {
- if (!spin_trylock_irqsave(lock, *flags)) {
- cc->contended = true;
- return false;
- }
- } else {
- spin_lock_irqsave(lock, *flags);
+ /* Track if the lock is contended in async mode */
+ if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+ if (spin_trylock_irqsave(lock, *flags))
+ return true;
+
+ cc->contended = true;
}
+ spin_lock_irqsave(lock, *flags);
return true;
}
+/* Avoid soft-lockups due to long scan times */
+static inline void compact_check_resched(struct compact_control *cc)
+{
+ if (need_resched())
+ cond_resched();
+}
+
/*
* Compaction requires the taking of some coarse locks that are potentially
* very heavily contended. The lock should be periodically unlocked to avoid
@@ -391,37 +507,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
return true;
}
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
- cond_resched();
- }
-
- return false;
-}
-
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_unlock_should_abort() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
-{
- /* async compaction aborts if contended */
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
-
- cond_resched();
- }
+ compact_check_resched(cc);
return false;
}
@@ -435,19 +521,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
+ unsigned int stride,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor, *valid_page = NULL;
+ struct page *cursor;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;
+ /* Strict mode is for isolation, speed is secondary */
+ if (strict)
+ stride = 1;
+
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
int isolated;
struct page *page = cursor;
@@ -465,9 +556,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
- if (!valid_page)
- valid_page = page;
-
/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
@@ -484,6 +572,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
goto isolate_fail;
}
+ /*
+ * A reserved page is never freed and tend to be clustered in
+ * the same pageblock. Skip the block.
+ */
+ if (PageReserved(page)) {
+ blockpfn = end_pfn;
+ break;
+ }
+
if (!PageBuddy(page))
goto isolate_fail;
@@ -495,18 +592,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* recheck as well.
*/
if (!locked) {
- /*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
- */
- locked = compact_trylock_irqsave(&cc->zone->lock,
+ locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
- if (!locked)
- break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -565,10 +652,6 @@ isolate_fail:
if (strict && blockpfn < end_pfn)
total_isolated = 0;
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
-
cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
@@ -626,7 +709,7 @@ isolate_freepages_range(struct compact_control *cc,
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, &freelist, true);
+ block_end_pfn, &freelist, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -644,7 +727,7 @@ isolate_freepages_range(struct compact_control *cc,
}
/* __isolate_free_page() does not map the pages */
- map_pages(&freelist);
+ split_map_pages(&freelist);
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
@@ -702,6 +785,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
+ bool skip_updated = false;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -719,8 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
return 0;
}
- if (compact_should_abort(cc))
- return 0;
+ compact_check_resched(cc);
if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
@@ -768,8 +851,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page = pfn_to_page(low_pfn);
- if (!valid_page)
+ /*
+ * Check if the pageblock has already been marked skipped.
+ * Only the aligned PFN is checked as the caller isolates
+ * COMPACT_CLUSTER_MAX at a time so the second call must
+ * not falsely conclude that the block should be skipped.
+ */
+ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ low_pfn = end_pfn;
+ goto isolate_abort;
+ }
valid_page = page;
+ }
/*
* Skip if free. We read page order here without zone lock
@@ -827,6 +921,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_success;
}
+ /*
+ * A reserved page is never freed and tend to be
+ * clustered in the same pageblocks. Skip the block.
+ */
+ if (PageReserved(page))
+ low_pfn = end_pfn;
+
goto isolate_fail;
}
@@ -848,10 +949,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(zone_lru_lock(zone),
+ locked = compact_lock_irqsave(zone_lru_lock(zone),
&flags, cc);
- if (!locked)
- break;
+
+ /* Try get exclusive access under lock */
+ if (!skip_updated) {
+ skip_updated = true;
+ if (test_and_set_skip(cc, page, low_pfn))
+ goto isolate_abort;
+ }
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
@@ -887,16 +993,13 @@ isolate_success:
nr_isolated++;
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that was isolated and likely be
- * then freed by migration.
+ * Avoid isolating too much unless this block is being
+ * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * or a lock is contended. For contention, isolate quickly to
+ * potentially remove one source of contention.
*/
- if (!cc->last_migrated_pfn)
- cc->last_migrated_pfn = low_pfn;
-
- /* Avoid isolating too much */
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ !cc->rescan && !cc->contended) {
++low_pfn;
break;
}
@@ -918,7 +1021,6 @@ isolate_fail:
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- cc->last_migrated_pfn = 0;
nr_isolated = 0;
}
@@ -939,15 +1041,23 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+isolate_abort:
if (locked)
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
/*
- * Update the pageblock-skip information and cached scanner pfn,
- * if the whole pageblock was scanned without isolating any page.
+ * Updated the cached scanner pfn once the pageblock has been scanned
+ * Pages will either be migrated in which case there is no point
+ * scanning in the near future or migration failed in which case the
+ * failure reason may persist. The block is marked for skipping if
+ * there were no pages isolated in the block or if the block is
+ * rescanned twice in a row.
*/
- if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (valid_page && !skip_updated)
+ set_pageblock_skip(valid_page);
+ update_cached_migrate(cc, low_pfn);
+ }
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
@@ -1013,6 +1123,9 @@ static bool suitable_migration_source(struct compact_control *cc,
{
int block_mt;
+ if (pageblock_skip_persistent(page))
+ return false;
+
if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
return true;
@@ -1050,6 +1163,12 @@ static bool suitable_migration_target(struct compact_control *cc,
return false;
}
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+ return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+}
+
/*
* Test whether the free scanner has reached the same or lower pageblock than
* the migration scanner, and compaction should thus terminate.
@@ -1060,6 +1179,237 @@ static inline bool compact_scanners_met(struct compact_control *cc)
<= (cc->migrate_pfn >> pageblock_order);
}
+/* Reorder the free list to reduce repeated future searches */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_position(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+static void
+fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *page = pfn_to_page(pfn);
+
+ /* Do not search around if there are enough pages already */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+
+ /* Minimise scanning during async compaction */
+ if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
+ return;
+
+ /* Pageblock boundaries */
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+
+ /* Scan before */
+ if (start_pfn != pfn) {
+ isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+ }
+
+ /* Scan after */
+ start_pfn = pfn + nr_isolated;
+ if (start_pfn != end_pfn)
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+
+ /* Skip this pageblock in the future as it's full or nearly full */
+ if (cc->nr_freepages < cc->nr_migratepages)
+ set_pageblock_skip(page);
+}
+
+/* Search orders in round-robin fashion */
+static int next_search_order(struct compact_control *cc, int order)
+{
+ order--;
+ if (order < 0)
+ order = cc->order - 1;
+
+ /* Search wrapped around? */
+ if (order == cc->search_order) {
+ cc->search_order--;
+ if (cc->search_order < 0)
+ cc->search_order = cc->order - 1;
+ return -1;
+ }
+
+ return order;
+}
+
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+ unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int order_scanned = 0, nr_scanned = 0;
+ unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long nr_isolated = 0;
+ unsigned long distance;
+ struct page *page = NULL;
+ bool scan_start = false;
+ int order;
+
+ /* Full compaction passes in a negative order */
+ if (cc->order <= 0)
+ return cc->free_pfn;
+
+ /*
+ * If starting the scan, use a deeper search and use the highest
+ * PFN found if a suitable one is not found.
+ */
+ if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+ limit = pageblock_nr_pages >> 1;
+ scan_start = true;
+ }
+
+ /*
+ * Preferred point is in the top quarter of the scan space but take
+ * a pfn from the top half if the search is problematic.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn);
+ low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+ min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+ if (WARN_ON_ONCE(min_pfn > low_pfn))
+ low_pfn = min_pfn;
+
+ /*
+ * Search starts from the last successful isolation order or the next
+ * order to search after a previous failure
+ */
+ cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+ for (order = cc->search_order;
+ !page && order >= 0;
+ order = next_search_order(cc, order)) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ struct page *freepage;
+ unsigned long flags;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry_reverse(freepage, freelist, lru) {
+ unsigned long pfn;
+
+ order_scanned++;
+ nr_scanned++;
+ pfn = page_to_pfn(freepage);
+
+ if (pfn >= highest)
+ highest = pageblock_start_pfn(pfn);
+
+ if (pfn >= low_pfn) {
+ cc->fast_search_fail = 0;
+ cc->search_order = order;
+ page = freepage;
+ break;
+ }
+
+ if (pfn >= min_pfn && pfn > high_pfn) {
+ high_pfn = pfn;
+
+ /* Shorten the scan if a candidate is found */
+ limit >>= 1;
+ }
+
+ if (order_scanned >= limit)
+ break;
+ }
+
+ /* Use a minimum pfn if a preferred one was not found */
+ if (!page && high_pfn) {
+ page = pfn_to_page(high_pfn);
+
+ /* Update freepage for the list reorder below */
+ freepage = page;
+ }
+
+ /* Reorder to so a future search skips recent pages */
+ move_freelist_head(freelist, freepage);
+
+ /* Isolate the page if available */
+ if (page) {
+ if (__isolate_free_page(page, order)) {
+ set_page_private(page, order);
+ nr_isolated = 1 << order;
+ cc->nr_freepages += nr_isolated;
+ list_add_tail(&page->lru, &cc->freepages);
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+ } else {
+ /* If isolation fails, abort the search */
+ order = -1;
+ page = NULL;
+ }
+ }
+
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /*
+ * Smaller scan on next order so the total scan ig related
+ * to freelist_scan_limit.
+ */
+ if (order_scanned >= limit)
+ limit = min(1U, limit >> 1);
+ }
+
+ if (!page) {
+ cc->fast_search_fail++;
+ if (scan_start) {
+ /*
+ * Use the highest PFN found above min. If one was
+ * not found, be pessemistic for direct compaction
+ * and use the min mark.
+ */
+ if (highest) {
+ page = pfn_to_page(highest);
+ cc->free_pfn = highest;
+ } else {
+ if (cc->direct_compaction) {
+ page = pfn_to_page(min_pfn);
+ cc->free_pfn = min_pfn;
+ }
+ }
+ }
+ }
+
+ if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+ highest -= pageblock_nr_pages;
+ cc->zone->compact_cached_free_pfn = highest;
+ }
+
+ cc->total_free_scanned += nr_scanned;
+ if (!page)
+ return cc->free_pfn;
+
+ low_pfn = page_to_pfn(page);
+ fast_isolate_around(cc, low_pfn, nr_isolated);
+ return low_pfn;
+}
+
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
@@ -1073,6 +1423,12 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;
+ unsigned int stride;
+
+ /* Try a small search of the free lists for a candidate */
+ isolate_start_pfn = fast_isolate_freepages(cc);
+ if (cc->nr_freepages)
+ goto splitmap;
/*
* Initialise the free scanner. The starting point is where we last
@@ -1086,10 +1442,11 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = pageblock_start_pfn(cc->free_pfn);
+ block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+ stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
/*
* Isolate free pages until enough are available to migrate the
@@ -1100,14 +1457,14 @@ static void isolate_freepages(struct compact_control *cc)
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
+ unsigned long nr_isolated;
+
/*
* This can iterate a massively long zone without finding any
- * suitable migration targets, so periodically check if we need
- * to schedule, or even abort async compaction.
+ * suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ compact_check_resched(cc);
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
@@ -1123,15 +1480,15 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
- freelist, false);
+ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, freelist, stride, false);
- /*
- * If we isolated enough freepages, or aborted due to lock
- * contention, terminate.
- */
- if ((cc->nr_freepages >= cc->nr_migratepages)
- || cc->contended) {
+ /* Update the skip hint if the full pageblock was scanned */
+ if (isolate_start_pfn == block_end_pfn)
+ update_pageblock_skip(cc, page, block_start_pfn);
+
+ /* Are enough freepages isolated? */
+ if (cc->nr_freepages >= cc->nr_migratepages) {
if (isolate_start_pfn >= block_end_pfn) {
/*
* Restart at previous pageblock if more
@@ -1148,10 +1505,14 @@ static void isolate_freepages(struct compact_control *cc)
*/
break;
}
- }
- /* __isolate_free_page() does not map the pages */
- map_pages(freelist);
+ /* Adjust stride depending on isolation */
+ if (nr_isolated) {
+ stride = 1;
+ continue;
+ }
+ stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+ }
/*
* Record where the free scanner will restart next time. Either we
@@ -1160,6 +1521,10 @@ static void isolate_freepages(struct compact_control *cc)
* and the loop terminated due to isolate_start_pfn < low_pfn
*/
cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(freelist);
}
/*
@@ -1172,13 +1537,8 @@ static struct page *compaction_alloc(struct page *migratepage,
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /*
- * Isolate free pages if necessary, and if we are not aborting due to
- * contention.
- */
if (list_empty(&cc->freepages)) {
- if (!cc->contended)
- isolate_freepages(cc);
+ isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -1217,6 +1577,158 @@ typedef enum {
*/
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+ if (cc->fast_start_pfn == ULONG_MAX)
+ return;
+
+ if (!cc->fast_start_pfn)
+ cc->fast_start_pfn = pfn;
+
+ cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+
+static inline void
+reinit_migrate_pfn(struct compact_control *cc)
+{
+ if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+ return;
+
+ cc->migrate_pfn = cc->fast_start_pfn;
+ cc->fast_start_pfn = ULONG_MAX;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+ unsigned int limit = freelist_scan_limit(cc);
+ unsigned int nr_scanned = 0;
+ unsigned long distance;
+ unsigned long pfn = cc->migrate_pfn;
+ unsigned long high_pfn;
+ int order;
+
+ /* Skip hints are relied on to avoid repeats on the fast search */
+ if (cc->ignore_skip_hint)
+ return pfn;
+
+ /*
+ * If the migrate_pfn is not at the start of a zone or the start
+ * of a pageblock then assume this is a continuation of a previous
+ * scan restarted due to COMPACT_CLUSTER_MAX.
+ */
+ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+ return pfn;
+
+ /*
+ * For smaller orders, just linearly scan as the number of pages
+ * to migrate should be relatively small and does not necessarily
+ * justify freeing up a large block for a small allocation.
+ */
+ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return pfn;
+
+ /*
+ * Only allow kcompactd and direct requests for movable pages to
+ * quickly clear out a MOVABLE pageblock for allocation. This
+ * reduces the risk that a large movable pageblock is freed for
+ * an unmovable/reclaimable small allocation.
+ */
+ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+ return pfn;
+
+ /*
+ * When starting the migration scanner, pick any pageblock within the
+ * first half of the search space. Otherwise try and pick a pageblock
+ * within the first eighth to reduce the chances that a migration
+ * target later becomes a source.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+ if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+ distance >>= 2;
+ high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+ for (order = cc->order - 1;
+ order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order--) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ unsigned long nr_skipped = 0;
+ unsigned long flags;
+ struct page *freepage;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry(freepage, freelist, lru) {
+ unsigned long free_pfn;
+
+ nr_scanned++;
+ free_pfn = page_to_pfn(freepage);
+ if (free_pfn < high_pfn) {
+ /*
+ * Avoid if skipped recently. Move to the tail
+ * of the list so it will not be found again
+ * soon
+ */
+ if (get_pageblock_skip(freepage)) {
+
+ if (list_is_last(freelist, &freepage->lru))
+ break;
+
+ nr_skipped++;
+ list_del(&freepage->lru);
+ list_add_tail(&freepage->lru, freelist);
+ if (nr_skipped > 2)
+ break;
+ continue;
+ }
+
+ /* Reorder to so a future search skips recent pages */
+ move_freelist_tail(freelist, freepage);
+
+ update_fast_start_pfn(cc, free_pfn);
+ pfn = pageblock_start_pfn(free_pfn);
+ cc->fast_search_fail = 0;
+ break;
+ }
+
+ /*
+ * If low PFNs are being found and discarded then
+ * limit the scan as fast searching is finding
+ * poor candidates.
+ */
+ if (free_pfn < cc->migrate_pfn)
+ limit >>= 1;
+
+ if (nr_scanned >= limit) {
+ cc->fast_search_fail++;
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+
+ cc->total_migrate_scanned += nr_scanned;
+
+ /*
+ * If fast scanning failed then use a cached entry for a page block
+ * that had free pages as the basis for starting a linear scan.
+ */
+ if (pfn == cc->migrate_pfn)
+ reinit_migrate_pfn(cc);
+
+ return pfn;
+}
+
/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
@@ -1235,9 +1747,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/*
* Start at where we last stopped, or beginning of the zone as
- * initialized by compact_zone()
+ * initialized by compact_zone(). The first failure will use
+ * the lowest PFN as the starting point for linear scanning.
*/
- low_pfn = cc->migrate_pfn;
+ low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
@@ -1253,38 +1766,48 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
-
/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
- * need to schedule, or even abort async compaction.
+ * need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ compact_check_resched(cc);
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;
- /* If isolation recently failed, do not retry */
- if (!isolation_suitable(cc, page))
+ /*
+ * If isolation recently failed, do not retry. Only check the
+ * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
+ * to be visited multiple times. Assume skip was checked
+ * before making it "skip" so other compaction instances do
+ * not scan the same block.
+ */
+ if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ !isolation_suitable(cc, page))
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks.
- * Async compaction is optimistic to see if the minimum amount
- * of work satisfies the allocation.
+ * For async compaction, also only scan in MOVABLE blocks
+ * without huge pages. Async compaction is optimistic to see
+ * if the minimum amount of work satisfies the allocation.
+ * The cached PFN is updated as it's possible that all
+ * remaining blocks between source and target are suitable
+ * and the compaction scanners fail to meet.
*/
- if (!suitable_migration_source(cc, page))
+ if (!suitable_migration_source(cc, page)) {
+ update_cached_migrate(cc, block_end_pfn);
continue;
+ }
/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn)
return ISOLATE_ABORT;
/*
@@ -1310,19 +1833,16 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
const int migratetype = cc->migratetype;
-
- if (cc->contended || fatal_signal_pending(current))
- return COMPACT_CONTENDED;
+ int ret;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- reset_cached_positions(zone);
+ reset_cached_positions(cc->zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1331,7 +1851,7 @@ static enum compact_result __compact_finished(struct zone *zone,
* based on an allocation request.
*/
if (cc->direct_compaction)
- zone->compact_blockskip_flush = true;
+ cc->zone->compact_blockskip_flush = true;
if (cc->whole_zone)
return COMPACT_COMPLETE;
@@ -1342,20 +1862,19 @@ static enum compact_result __compact_finished(struct zone *zone,
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- if (cc->finishing_block) {
- /*
- * We have finished the pageblock, but better check again that
- * we really succeeded.
- */
- if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
- cc->finishing_block = false;
- else
- return COMPACT_CONTINUE;
- }
+ /*
+ * Always finish scanning a pageblock to reduce the possibility of
+ * fallbacks in the future. This is particularly important when
+ * migration source is unmovable/reclaimable but it's not worth
+ * special casing.
+ */
+ if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
+ ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
+ struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
/* Job done if page is free of the right migratetype */
@@ -1393,21 +1912,23 @@ static enum compact_result __compact_finished(struct zone *zone,
return COMPACT_SUCCESS;
}
- cc->finishing_block = true;
- return COMPACT_CONTINUE;
+ ret = COMPACT_CONTINUE;
+ break;
}
}
- return COMPACT_NO_SUITABLE_PAGE;
+ if (cc->contended || fatal_signal_pending(current))
+ ret = COMPACT_CONTENDED;
+
+ return ret;
}
-static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result compact_finished(struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc);
- trace_mm_compaction_finished(zone, cc->order, ret);
+ ret = __compact_finished(cc);
+ trace_mm_compaction_finished(cc->zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1534,15 +2055,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
return false;
}
-static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
{
enum compact_result ret;
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long start_pfn = cc->zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(cc->zone);
+ unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
+ bool update_cached;
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
- ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
+ ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
@@ -1555,8 +2079,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
* Clear pageblock skip if there were failures recently and compaction
* is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order))
- __reset_isolation_suitable(zone);
+ if (compaction_restarting(cc->zone, cc->order))
+ __reset_isolation_suitable(cc->zone);
/*
* Setup to move all movable pages to the end of the zone. Used cached
@@ -1564,43 +2088,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
* want to compact the whole zone), but check that it is initialised
* by ensuring the values are within zone boundaries.
*/
+ cc->fast_start_pfn = 0;
if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
} else {
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
- cc->free_pfn = zone->compact_cached_free_pfn;
+ cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = cc->zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
- zone->compact_cached_free_pfn = cc->free_pfn;
+ cc->zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- if (cc->migrate_pfn == start_pfn)
+ if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
cc->whole_zone = true;
}
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
+
+ /*
+ * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+ * the basis that some migrations will fail in ASYNC mode. However,
+ * if the cached PFNs match and pageblocks are skipped due to having
+ * no isolation candidates, then the sync state does not matter.
+ * Until a pageblock with isolation candidates is found, keep the
+ * cached PFNs in sync to avoid revisiting the same blocks.
+ */
+ update_cached = !sync &&
+ cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
migrate_prep_local();
- while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
+ unsigned long start_pfn = cc->migrate_pfn;
+
+ /*
+ * Avoid multiple rescans which can happen if a page cannot be
+ * isolated (dirty/writeback in async mode) or if the migrated
+ * pages are being allocated before the pageblock is cleared.
+ * The first rescan will capture the entire pageblock for
+ * migration. If it fails, it'll be marked skip and scanning
+ * will proceed as normal.
+ */
+ cc->rescan = false;
+ if (pageblock_start_pfn(last_migrated_pfn) ==
+ pageblock_start_pfn(start_pfn)) {
+ cc->rescan = true;
+ }
- switch (isolate_migratepages(zone, cc)) {
+ switch (isolate_migratepages(cc->zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ last_migrated_pfn = 0;
goto out;
case ISOLATE_NONE:
+ if (update_cached) {
+ cc->zone->compact_cached_migrate_pfn[1] =
+ cc->zone->compact_cached_migrate_pfn[0];
+ }
+
/*
* We haven't isolated and migrated anything, but
* there might still be unflushed migrations from
@@ -1608,6 +2165,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
*/
goto check_drain;
case ISOLATE_SUCCESS:
+ update_cached = false;
+ last_migrated_pfn = start_pfn;
;
}
@@ -1639,8 +2198,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
/* Draining pcplists is useless in this case */
- cc->last_migrated_pfn = 0;
-
+ last_migrated_pfn = 0;
}
}
@@ -1652,21 +2210,26 @@ check_drain:
* compact_finished() can detect immediately if allocation
* would succeed.
*/
- if (cc->order > 0 && cc->last_migrated_pfn) {
+ if (cc->order > 0 && last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
- if (cc->last_migrated_pfn < current_block_start) {
+ if (last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
- drain_local_pages(zone);
+ drain_local_pages(cc->zone);
put_cpu();
/* No more flushing until we migrate again */
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
}
}
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
}
out:
@@ -1685,8 +2248,8 @@ out:
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
*/
- if (free_pfn > zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = free_pfn;
+ if (free_pfn > cc->zone->compact_cached_free_pfn)
+ cc->zone->compact_cached_free_pfn = free_pfn;
}
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
@@ -1700,7 +2263,8 @@ out:
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx,
+ struct page **capture)
{
enum compact_result ret;
struct compact_control cc = {
@@ -1709,6 +2273,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.total_migrate_scanned = 0,
.total_free_scanned = 0,
.order = order,
+ .search_order = order,
.gfp_mask = gfp_mask,
.zone = zone,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
@@ -1720,14 +2285,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
+ struct capture_control capc = {
+ .cc = &cc,
+ .page = NULL,
+ };
+
+ if (capture)
+ current->capture_control = &capc;
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- ret = compact_zone(zone, &cc);
+ ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
+ *capture = capc.page;
+ current->capture_control = NULL;
+
return ret;
}
@@ -1745,7 +2320,7 @@ int sysctl_extfrag_threshold = 500;
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum compact_priority prio)
+ enum compact_priority prio, struct page **capture)
{
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
@@ -1772,8 +2347,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
continue;
}
+ /*
+ * Do not compact remote memory. It's expensive and high-order
+ * small allocations are expected to prefer or require local
+ * memory. Similarly, larger requests such as THP can fallback
+ * to base pages in preference to remote huge pages if
+ * __GFP_THISNODE is not specified
+ */
+ if (zone_to_nid(zone) != zone_to_nid(ac->preferred_zoneref->zone))
+ continue;
+
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac));
+ alloc_flags, ac_classzone_idx(ac), capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -1841,7 +2426,7 @@ static void compact_node(int nid)
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- compact_zone(zone, &cc);
+ compact_zone(&cc, NULL);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
@@ -1876,14 +2461,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
return 0;
}
-int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- proc_dointvec_minmax(table, write, buffer, length, ppos);
-
- return 0;
-}
-
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
@@ -1948,6 +2525,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
+ .search_order = pgdat->kcompactd_max_order,
.total_migrate_scanned = 0,
.total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
@@ -1983,7 +2561,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (kthread_should_stop())
return;
- status = compact_zone(zone, &cc);
+ status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
compaction_defer_reset(zone, cc.order, false);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f5e323e883e..df2bc9c531f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1564,6 +1564,9 @@ EXPORT_SYMBOL(find_lock_entry);
* @gfp_mask and added to the page cache and the VM's LRU
* list. The page is returned locked and with an increased
* refcount. Otherwise, NULL is returned.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ * its own locking dance if the page is already in cache, or unlock the page
+ * before returning if we had to add the page to pagecache.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
@@ -1616,7 +1619,7 @@ no_page:
if (!page)
return NULL;
- if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
/* Init accessed so avoid atomic mark_page_accessed later */
@@ -1630,6 +1633,13 @@ no_page:
if (err == -EEXIST)
goto repeat;
}
+
+ /*
+ * add_to_page_cache_lru locks the page, and for mmap we expect
+ * an unlocked page.
+ */
+ if (fgp_flags & FGP_FOR_MMAP)
+ unlock_page(page);
}
return page;
@@ -1837,16 +1847,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- /*
- * must check mapping and index after taking the ref.
- * otherwise we can get both false positives and false
- * negatives, which is just confusing to the caller.
- */
- if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
- put_page(page);
- break;
- }
-
pages[ret] = page;
if (++ret == nr_pages)
break;
@@ -2354,62 +2354,93 @@ out:
EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file: file to read
- * @offset: page index
- * @gfp_mask: memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
{
- struct address_space *mapping = file->f_mapping;
- struct page *page;
- int ret;
+ int flags = vmf->flags;
- do {
- page = __page_cache_alloc(gfp_mask);
- if (!page)
- return -ENOMEM;
+ if (fpin)
+ return fpin;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
- if (ret == 0)
- ret = mapping->a_ops->readpage(file, page);
- else if (ret == -EEXIST)
- ret = 0; /* losing race to add is OK */
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
+}
- put_page(page);
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+ struct file **fpin)
+{
+ if (trylock_page(page))
+ return 1;
- } while (ret == AOP_TRUNCATED_PAGE);
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
- return ret;
+ *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+ if (vmf->flags & FAULT_FLAG_KILLABLE) {
+ if (__lock_page_killable(page)) {
+ /*
+ * We didn't have the right flags to drop the mmap_sem,
+ * but all fault_handlers only check for fatal signals
+ * if we return VM_FAULT_RETRY, so we need to drop the
+ * mmap_sem here and return 0 if we don't have a fpin.
+ */
+ if (*fpin == NULL)
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
}
-#define MMAP_LOTSAMISS (100)
/*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
*/
-static void do_sync_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- pgoff_t offset)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (!ra->ra_pages)
- return;
+ return fpin;
- if (vma->vm_flags & VM_SEQ_READ) {
+ if (vmf->vma->vm_flags & VM_SEQ_READ) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
- return;
+ return fpin;
}
/* Avoid banging the cache line if not needed */
@@ -2421,37 +2452,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
* stop bothering with read-ahead. It will only hurt.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
- return;
+ return fpin;
/*
* mmap read-around
*/
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra_submit(ra, mapping, file);
+ return fpin;
}
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
*/
-static void do_async_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- struct page *page,
- pgoff_t offset)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+ struct page *page)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
- if (PageReadahead(page))
+ if (PageReadahead(page)) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
+ }
+ return fpin;
}
/**
@@ -2481,6 +2519,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
+ struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
@@ -2502,23 +2541,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
- do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
+ fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
/* No page in the page cache at all */
- do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
+ fpin = do_sync_mmap_readahead(vmf);
retry_find:
- page = find_get_page(mapping, offset);
- if (!page)
- goto no_cached_page;
+ page = pagecache_get_page(mapping, offset,
+ FGP_CREAT|FGP_FOR_MMAP,
+ vmf->gfp_mask);
+ if (!page) {
+ if (fpin)
+ goto out_retry;
+ return vmf_error(-ENOMEM);
+ }
}
- if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
- put_page(page);
- return ret | VM_FAULT_RETRY;
- }
+ if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ goto out_retry;
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
@@ -2536,6 +2578,16 @@ retry_find:
goto page_not_uptodate;
/*
+ * We've made it this far and we had to drop our mmap_sem, now is the
+ * time to return to the upper layer and have it re-find the vma and
+ * redo the fault.
+ */
+ if (fpin) {
+ unlock_page(page);
+ goto out_retry;
+ }
+
+ /*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
@@ -2549,28 +2601,6 @@ retry_find:
vmf->page = page;
return ret | VM_FAULT_LOCKED;
-no_cached_page:
- /*
- * We're only likely to ever get here if MADV_RANDOM is in
- * effect.
- */
- error = page_cache_read(file, offset, vmf->gfp_mask);
-
- /*
- * The page we want has now been added to the page cache.
- * In the unlikely event that someone removed it in the
- * meantime, we'll just come back here and read it again.
- */
- if (error >= 0)
- goto retry_find;
-
- /*
- * An error return from page_cache_read can result if the
- * system is low on memory, or a problem occurs while trying
- * to schedule I/O.
- */
- return vmf_error(error);
-
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -2579,12 +2609,15 @@ page_not_uptodate:
* and we need to check for errors.
*/
ClearPageError(page);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
+ if (fpin)
+ goto out_retry;
put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2593,6 +2626,18 @@ page_not_uptodate:
/* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
return VM_FAULT_SIGBUS;
+
+out_retry:
+ /*
+ * We dropped the mmap_sem, we need to return to the fault handler to
+ * re-find the vma and come back and find our hopefully still populated
+ * page.
+ */
+ if (page)
+ put_page(page);
+ if (fpin)
+ fput(fpin);
+ return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index faf357eaf0ce..f5f1d4324fe2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -1183,7 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
}
mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ haddr + HPAGE_PMD_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1346,7 +1348,8 @@ alloc:
__SetPageUptodate(new_page);
mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ haddr + HPAGE_PMD_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
spin_lock(vmf->ptl);
@@ -1475,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
@@ -1520,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
@@ -1528,7 +1531,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
@@ -1549,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto out_unlock;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto clear_pmdnuma;
}
@@ -1618,7 +1621,7 @@ out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
@@ -2024,7 +2027,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
struct mmu_notifier_range range;
mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
- (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+ (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
@@ -2242,7 +2246,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
struct mmu_notifier_range range;
mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
- (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+ (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df2e7dd5ff17..791d2d9f5f45 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- int node = -1;
+ int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
@@ -919,7 +920,7 @@ retry_cpuset:
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_migration_supported(h))
+ if (hugepage_movable_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -3247,7 +3248,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow) {
mmu_notifier_range_init(&range, src, vma->vm_start,
- vma->vm_end);
+ vma->vm_end, MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
}
@@ -3358,7 +3359,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, mm, start, end, MMU_NOTIFY_CLEAR);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
address = start;
@@ -3626,7 +3627,8 @@ retry_avoidcopy:
__SetPageUptodate(new_page);
set_page_huge_active(new_page);
- mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+ mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h),
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
/*
@@ -4268,7 +4270,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4346,7 +4349,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, mm, start, end,
+ MMU_NOTIFY_PROTECTION_VMA);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
diff --git a/mm/internal.h b/mm/internal.h
index f4a7bb02decf..9eeaf2b95166 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
@@ -183,14 +184,16 @@ extern int user_min_free_kbytes;
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
+ unsigned int nr_freepages; /* Number of isolated free pages */
+ unsigned int nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
- unsigned long nr_freepages; /* Number of isolated free pages */
- unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long total_migrate_scanned;
unsigned long total_free_scanned;
- unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
- unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ unsigned short fast_search_fail;/* failures to use free list searches */
+ short search_order; /* order to start a fast search at */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
@@ -203,7 +206,16 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
- bool finishing_block; /* Finishing current pageblock */
+ bool rescan; /* Rescanning the same pageblock */
+};
+
+/*
+ * Used in direct compaction when a page should be taken from the freelists
+ * immediately when one is created during the free path.
+ */
+struct capture_control {
+ struct compact_control *cc;
+ struct page *page;
};
unsigned long
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4f017339ddb2..ceb242ca6ef6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
- mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c48ad13b4c9..839b3313a760 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
chain->chain_prune_time = jiffies;
chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
- chain->nid = -1; /* debug */
+ chain->nid = NUMA_NO_NODE; /* debug */
#endif
ksm_stable_node_chains++;
@@ -706,8 +706,9 @@ again:
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
- * but if page is swapcache in migrate_page_move_mapping(), it might
- * still be our page, in which case it's essential to keep the node.
+ * the same is in reuse_ksm_page() case; but if page is swapcache
+ * in migrate_page_move_mapping(), it might still be our page,
+ * in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
/*
@@ -1052,7 +1053,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
mmu_notifier_range_init(&range, mm, pvmw.address,
- pvmw.address + PAGE_SIZE);
+ pvmw.address + PAGE_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
if (!page_vma_mapped_walk(&pvmw))
@@ -1139,7 +1141,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -2642,6 +2645,31 @@ again:
goto again;
}
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
+ WARN_ON(!page_mapped(page)) ||
+ WARN_ON(!PageLocked(page))) {
+ dump_page(page, "reuse_ksm_page");
+ return false;
+ }
+#endif
+
+ if (PageSwapCache(page) || !page_stable_node(page))
+ return false;
+ /* Prohibit parallel get_ksm_page() */
+ if (!page_ref_freeze(page, 1))
+ return false;
+
+ page_move_anon_rmap(page, vma);
+ page->index = linear_page_index(vma, address);
+ page_ref_unfreeze(page, 1);
+
+ return true;
+}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index 21a7881a2db4..d220ad7087ed 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
- mmu_notifier_range_init(&range, mm, range.start, range.end);
+ mmu_notifier_range_init(&range, mm, range.start, range.end,
+ MMU_NOTIFY_CLEAR);
lru_add_drain();
tlb_gather_mmu(&tlb, mm, range.start, range.end);
diff --git a/mm/memblock.c b/mm/memblock.c
index 022d4cbb3618..a32db30c6492 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1247,6 +1247,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index af7f18b32389..a4ac554be7e8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1387,10 +1387,22 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.gfp_mask = gfp_mask,
.order = order,
};
- bool ret;
+ bool ret = true;
mutex_lock(&oom_lock);
+
+ /*
+ * multi-threaded tasks might race with oom_reaper and gain
+ * MMF_OOM_SKIP before reaching out_of_memory which can lead
+ * to out_of_memory failure if the task is the last one in
+ * memcg which would be a false possitive failure reported
+ */
+ if (tsk_is_oom_victim(current))
+ goto unlock;
+
ret = out_of_memory(&oc);
+
+unlock:
mutex_unlock(&oom_lock);
return ret;
}
@@ -2161,7 +2173,8 @@ void mem_cgroup_handle_over_high(void)
if (likely(!nr_pages))
return;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = current->memcg_high_reclaim;
+ current->memcg_high_reclaim = NULL;
reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
@@ -2317,10 +2330,10 @@ done_restock:
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
* if __GFP_RECLAIM but let's always punt for simplicity and so that
- * GFP_KERNEL can consistently be used during reclaim. @memcg is
- * not recorded as it most likely matches current's and won't
- * change in the meantime. As high limit is checked again before
- * reclaim, the cost of mismatch is negligible.
+ * GFP_KERNEL can consistently be used during reclaim. Record the memcg
+ * for the return-to-userland high reclaim. If the memcg is already
+ * recorded and the recorded memcg is not the descendant of the memcg
+ * needing high reclaim, punt the high reclaim to the work queue.
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
@@ -2328,6 +2341,13 @@ done_restock:
if (in_interrupt()) {
schedule_work(&memcg->high_work);
break;
+ } else if (!current->memcg_high_reclaim) {
+ css_get(&memcg->css);
+ current->memcg_high_reclaim = memcg;
+ } else if (!mem_cgroup_is_descendant(
+ current->memcg_high_reclaim, memcg)) {
+ schedule_work(&memcg->high_work);
+ break;
}
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
@@ -2573,7 +2593,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
}
/**
- * memcg_kmem_charge_memcg: charge a kmem page
+ * __memcg_kmem_charge_memcg: charge a kmem page
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
@@ -2581,7 +2601,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
@@ -2604,24 +2624,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
}
/**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
+ if (memcg_kmem_bypass())
return 0;
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
__SetPageKmemcg(page);
}
@@ -2629,11 +2649,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
return ret;
}
/**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -3626,8 +3646,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..efc3d02175f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -1009,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow = is_cow_mapping(vma->vm_flags);
if (is_cow) {
- mmu_notifier_range_init(&range, src_mm, addr, end);
+ mmu_notifier_range_init(&range, src_mm, addr, end,
+ MMU_NOTIFY_PROTECTION_PAGE);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1333,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb,
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, vma->vm_mm, start_addr,
+ end_addr, MMU_NOTIFY_UNMAP);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
@@ -1355,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+ mmu_notifier_range_init(&range, vma->vm_mm, start,
+ start + size, MMU_NOTIFY_CLEAR);
tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1381,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+ mmu_notifier_range_init(&range, vma->vm_mm, address,
+ address + size, MMU_NOTIFY_CLEAR);
tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -2272,7 +2277,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
__SetPageUptodate(new_page);
mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
- (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ (vmf->address & PAGE_MASK) + PAGE_SIZE,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
/*
@@ -2504,8 +2510,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
+ if (PageAnon(vmf->page)) {
int total_map_swapcount;
+ if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
+ page_count(vmf->page) != 1))
+ goto copy;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2520,6 +2529,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
+ if (PageKsm(vmf->page)) {
+ bool reused = reuse_ksm_page(vmf->page, vmf->vma,
+ vmf->address);
+ unlock_page(vmf->page);
+ if (!reused)
+ goto copy;
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+ }
if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
if (total_map_swapcount == 1) {
/*
@@ -2540,7 +2558,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
-
+copy:
/*
* Ok, we need to copy. Oh, well..
*/
@@ -2699,7 +2717,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(si, entry) == 1) {
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -2809,7 +2827,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -2823,6 +2840,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
@@ -3586,7 +3604,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
- int page_nid = -1;
+ int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
@@ -3653,7 +3671,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
@@ -3667,7 +3685,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATE_FAIL;
out:
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
@@ -4082,7 +4100,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (range) {
mmu_notifier_range_init(range, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
+ (address & PMD_MASK) + PMD_SIZE,
+ range->event);
mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
@@ -4100,7 +4119,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (range) {
mmu_notifier_range_init(range, mm, address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
+ (address & PAGE_MASK) + PAGE_SIZE,
+ range->event);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9a667d36c55..ec22c86d9f89 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -47,7 +47,7 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page);
+static int generic_online_page(struct page *page, unsigned int order);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -656,30 +656,55 @@ void __online_page_free(struct page *page)
}
EXPORT_SYMBOL_GPL(__online_page_free);
-static void generic_online_page(struct page *page)
+static int generic_online_page(struct page *page, unsigned int order)
{
- __online_page_set_limits(page);
- __online_page_increment_counters(page);
- __online_page_free(page);
+ __free_pages_core(page, order);
+ totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages_add(1UL << order);
+#endif
+ return 0;
+}
+
+static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
+{
+ unsigned long end = start + nr_pages;
+ int order, ret, onlined_pages = 0;
+
+ while (start < end) {
+ order = min(MAX_ORDER - 1,
+ get_order(PFN_PHYS(end) - PFN_PHYS(start)));
+
+ /*
+ * External callback providers can deny onlining pages. So check
+ * for return value.
+ * zero : if all pages are onlined.
+ * positive : if only few pages are onlined.
+ * negative : if none of the pages are onlined.
+ */
+ ret = (*online_page_callback)(pfn_to_page(start), order);
+ if (!ret)
+ onlined_pages += (1UL << order);
+ else if (ret > 0)
+ onlined_pages += ret;
+
+ start += (1UL << order);
+ }
+ return onlined_pages;
}
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
- struct page *page;
if (PageReserved(pfn_to_page(start_pfn)))
- for (i = 0; i < nr_pages; i++) {
- page = pfn_to_page(start_pfn + i);
- (*online_page_callback)(page);
- onlined_pages++;
- }
+ onlined_pages = online_pages_blocks(start_pfn, nr_pages);
online_mem_sections(start_pfn, start_pfn + nr_pages);
- *(unsigned long *)arg = onlined_pages;
+ *(unsigned long *)arg += onlined_pages;
return 0;
}
@@ -689,9 +714,9 @@ static void node_states_check_changes_online(unsigned long nr_pages,
{
int nid = zone_to_nid(zone);
- arg->status_change_nid = -1;
- arg->status_change_nid_normal = -1;
- arg->status_change_nid_high = -1;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
@@ -1344,7 +1369,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
- int not_managed = 0;
int ret = 0;
LIST_HEAD(source);
@@ -1392,7 +1416,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
else
ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
- put_page(page);
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
@@ -1401,22 +1424,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
- put_page(page);
- /* Because we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page)) {
- not_managed++;
- ret = -EBUSY;
- break;
- }
}
+ put_page(page);
}
if (!list_empty(&source)) {
- if (not_managed) {
- putback_movable_pages(&source);
- goto out;
- }
-
/* Allocate a new page from the nearest neighbor node */
ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
@@ -1429,7 +1440,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
putback_movable_pages(&source);
}
}
-out:
+
return ret;
}
@@ -1499,9 +1510,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
unsigned long present_pages = 0;
enum zone_type zt;
- arg->status_change_nid = -1;
- arg->status_change_nid_normal = -1;
- arg->status_change_nid_high = -1;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
/*
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1617,7 +1628,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
cond_resched();
lru_add_drain_all();
- drain_all_pages(zone);
pfn = scan_movable_pages(pfn, end_pfn);
if (pfn) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4496d9d34f5..1da2f1f09675 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
- int polnid = -1;
+ int polnid = NUMA_NO_NODE;
int ret = -1;
pol = get_vma_policy(vma, addr);
diff --git a/mm/migrate.c b/mm/migrate.c
index a16b15090df3..4512afab46ac 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -916,7 +916,7 @@ static int fallback_migrate_page(struct address_space *mapping,
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
- return -EAGAIN;
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
@@ -1289,7 +1289,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
struct anon_vma *anon_vma = NULL;
/*
- * Movability of hugepages depends on architectures and hugepage size.
+ * Migratability of hugepages depends on architectures and their size.
* This check is necessary because some callers of hugepage migration
* like soft offline and memory hotremove don't walk through page
* tables or check whether the hugepage is pmd-based or not before
@@ -2343,7 +2343,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
mm_walk.private = migrate;
mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
- migrate->end);
+ migrate->end, MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->start, migrate->end, &mm_walk);
mmu_notifier_invalidate_range_end(&range);
@@ -2751,7 +2751,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
mmu_notifier_range_init(&range,
migrate->vma->vm_mm,
- addr, migrate->end);
+ addr, migrate->end,
+ MMU_NOTIFY_CLEAR);
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 36cb358db170..d5bbe5ca61ac 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -185,7 +185,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
- mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+ mmu_notifier_range_init(&range, vma->vm_mm, addr, end,
+ MMU_NOTIFY_PROTECTION_VMA);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 3320616ed93f..f2efb720be7f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+ mmu_notifier_range_init(&range, vma->vm_mm, old_addr,
+ old_end, MMU_NOTIFY_UNMAP);
mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f0e8cd9edb1a..0930b4365be7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -532,7 +532,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_gather tlb;
mmu_notifier_range_init(&range, mm, vma->vm_start,
- vma->vm_end);
+ vma->vm_end, MMU_NOTIFY_CLEAR);
tlb_gather_mmu(&tlb, mm, range.start, range.end);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
tlb_finish_mmu(&tlb, range.start, range.end);
@@ -892,6 +892,7 @@ static void __oom_kill_process(struct task_struct *victim)
*/
rcu_read_lock();
for_each_process(p) {
+ struct task_struct *t;
if (!process_shares_mm(p, mm))
continue;
if (same_thread_group(p, victim))
@@ -911,6 +912,11 @@ static void __oom_kill_process(struct task_struct *victim)
if (unlikely(p->flags & PF_KTHREAD))
continue;
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ mark_oom_victim(t);
+ task_unlock(t);
}
rcu_read_unlock();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d295c9bc01a8..d7a521971a05 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -336,38 +336,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns true when the remaining initialisation should be deferred until
- * later in the boot cycle when it can be parallelised.
+ * Calculate first_deferred_pfn in case:
+ * - in MEMMAP_EARLY context
+ * - this is the last zone
+ *
+ * If the first aligned section doesn't exceed the end_pfn, set it to
+ * first_deferred_pfn and return it.
*/
-static bool __meminit
-defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+unsigned long __meminit
+defer_pfn(int nid, unsigned long start_pfn, unsigned long end_pfn,
+ enum memmap_context context)
{
- static unsigned long prev_end_pfn, nr_initialised;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ unsigned long pfn;
- /*
- * prev_end_pfn static that contains the end of previous zone
- * No need to protect because called very early in boot before smp_init.
- */
- if (prev_end_pfn != end_pfn) {
- prev_end_pfn = end_pfn;
- nr_initialised = 0;
- }
+ if (context != MEMMAP_EARLY)
+ return end_pfn;
- /* Always populate low zones for address-constrained allocations */
- if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
- return false;
+ /* Always populate low zones */
+ if (end_pfn < pgdat_end_pfn(pgdat))
+ return end_pfn;
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- nr_initialised++;
- if ((nr_initialised > PAGES_PER_SECTION) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- NODE_DATA(nid)->first_deferred_pfn = pfn;
- return true;
+ pfn = roundup(start_pfn + PAGES_PER_SECTION - 1, PAGES_PER_SECTION);
+ if (end_pfn > pfn) {
+ pgdat->first_deferred_pfn = pfn;
+ end_pfn = pfn;
}
- return false;
+ return end_pfn;
}
#else
#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
@@ -377,9 +372,11 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+unsigned long __meminit
+defer_pfn(int nid, unsigned long start_pfn, unsigned long end_pfn,
+ enum memmap_context context)
{
- return false;
+ return end_pfn;
}
#endif
@@ -789,6 +786,41 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page, int order)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page, int order)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -822,6 +854,7 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -837,6 +870,12 @@ static inline void __free_one_page(struct page *page,
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order)) {
+ if (likely(!is_migrate_isolate(migratetype)))
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -1056,7 +1095,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1213,17 +1252,23 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
+static void __meminit __init_struct_page_nolru(struct page *page,
+ unsigned long pfn,
+ unsigned long zone, int nid,
+ bool is_reserved)
{
mm_zero_struct_page(page);
- set_page_links(page, zone, nid, pfn);
+
+ /*
+ * We can use a non-atomic operation for setting the
+ * PG_reserved flag as we are still initializing the pages.
+ */
+ set_page_links(page, zone, nid, pfn, is_reserved);
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
page_kasan_tag_reset(page);
- INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
@@ -1231,6 +1276,74 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
#endif
}
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ __init_struct_page_nolru(page, pfn, zone, nid, false);
+ INIT_LIST_HEAD(&page->lru);
+}
+
+static void __meminit __init_pageblock(unsigned long start_pfn,
+ unsigned long nr_pages,
+ unsigned long zone, int nid,
+ struct dev_pagemap *pgmap,
+ bool is_reserved)
+{
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ struct page *start_page = pfn_to_page(start_pfn);
+ unsigned long pfn = start_pfn + nr_pages - 1;
+ struct page *page;
+
+ /*
+ * Enforce the following requirements:
+ * size > 0
+ * size < pageblock_nr_pages
+ * start_pfn -> pfn does not cross pageblock_nr_pages boundary
+ */
+ VM_BUG_ON(((start_pfn ^ pfn) | (nr_pages - 1)) > nr_pgmask);
+
+ /*
+ * Work from highest page to lowest, this way we will still be
+ * warm in the cache when we call set_pageblock_migratetype
+ * below.
+ *
+ * The loop is based around the page pointer as the main index
+ * instead of the pfn because pfn is not used inside the loop if
+ * the section number is not in page flags and WANT_PAGE_VIRTUAL
+ * is not defined.
+ */
+ for (page = start_page + nr_pages; page-- != start_page; pfn--) {
+ __init_struct_page_nolru(page, pfn, zone, nid, is_reserved);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer and hmm_data. It is a bug if a ZONE_DEVICE
+ * page is ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ if (!pgmap)
+ INIT_LIST_HEAD(&page->lru);
+ }
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
+ */
+ if (!(start_pfn & nr_pgmask))
+ set_pageblock_migratetype(start_page, MIGRATE_MOVABLE);
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
@@ -1303,7 +1416,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1344,36 +1457,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1382,7 +1481,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1457,32 +1556,6 @@ void clear_zone_contiguous(struct zone *zone)
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
- unsigned long nr_pages)
-{
- struct page *page;
- unsigned long i;
-
- if (!nr_pages)
- return;
-
- page = pfn_to_page(pfn);
-
- /* Free a large naturally-aligned chunk if possible */
- if (nr_pages == pageblock_nr_pages &&
- (pfn & (pageblock_nr_pages - 1)) == 0) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
- return;
- }
-
- for (i = 0; i < nr_pages; i++, page++, pfn++) {
- if ((pfn & (pageblock_nr_pages - 1)) == 0)
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
- }
-}
-
/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
@@ -1494,57 +1567,89 @@ static inline void __init pgdat_init_report_one_done(void)
}
/*
- * Returns true if page needs to be initialized or freed to buddy allocator.
+ * Returns count if page range needs to be initialized or freed
*
- * First we check if pfn is valid on architectures where it is possible to have
- * holes within pageblock_nr_pages. On systems where it is not possible, this
- * function is optimized out.
+ * First we check if the contiguous pfns are valid on architectures where it
+ * is possible to have holes within pageblock_nr_pages. On systems where it
+ * is not possible, this function is optimized out.
*
- * Then, we check if a current large page is valid by only checking the validity
- * of the head pfn.
+ * Then, we check if a current large page is valid by only checking the
+ * validity of the head pfn.
*
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static unsigned long __next_pfn_valid_range(unsigned long *pfn,
+ unsigned long *i,
+ unsigned long end_pfn)
{
- if (!pfn_valid_within(pfn))
- return false;
- if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
- return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
- return true;
+ unsigned long start_pfn = *i;
+
+ while (start_pfn < end_pfn) {
+ unsigned long t = ALIGN(start_pfn + 1, pageblock_nr_pages);
+ unsigned long pageblock_pfn = min(t, end_pfn);
+ unsigned long count = 0;
+
+#ifndef CONFIG_HOLES_IN_ZONE
+ if (pfn_valid(start_pfn))
+ count = pageblock_pfn - start_pfn;
+ start_pfn = pageblock_pfn;
+#else
+ while (start_pfn < pageblock_pfn) {
+ if (pfn_valid(start_pfn++)) {
+ count++;
+ continue;
+ }
+
+ if (!count)
+ continue;
+
+ /*
+ * The last PFN was invalid, report the block of
+ * PFNs we currently have available and skip over
+ * the invalid one.
+ */
+ *pfn = start_pfn - (count + 1);
+ *i = start_pfn;
+ return count;
+ }
+#endif
+ if (!count)
+ continue;
+
+ *pfn = start_pfn - count;
+ *i = start_pfn;
+ return count;
+ }
+
+ return 0;
}
+#define for_each_deferred_pfn_valid_range(pfn, count, i, start_pfn, end_pfn) \
+ for (i = (start_pfn), \
+ count = __next_pfn_valid_range(&pfn, &i, (end_pfn)); \
+ count; \
+ count = __next_pfn_valid_range(&pfn, &i, (end_pfn)))
+
/*
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
- unsigned long nr_free = 0;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 0;
- } else if (!(pfn & nr_pgmask)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 1;
- touch_nmi_watchdog();
+ unsigned long i, pfn, count;
+
+ for_each_deferred_pfn_valid_range(pfn, count, i, start_pfn, end_pfn) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (count == pageblock_nr_pages) {
+ __free_pages_core(page, pageblock_order);
} else {
- nr_free++;
+ while (count--)
+ __free_pages_core(page++, 0);
}
+
+ touch_nmi_watchdog();
}
- /* Free the last block of pages to allocator */
- deferred_free_range(pfn - nr_free, nr_free);
}
/*
@@ -1552,43 +1657,121 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
- unsigned long pfn,
+static unsigned long __init deferred_init_pages(struct zone *zone,
+ unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
+ unsigned long i, pfn, count;
unsigned long nr_pages = 0;
- struct page *page = NULL;
+ int zid = zone_idx(zone);
+
+ for_each_deferred_pfn_valid_range(pfn, count, i, start_pfn, end_pfn) {
+ nr_pages += count;
+ __init_pageblock(pfn, count, zid, nid, NULL, false);
+
+ touch_nmi_watchdog();
+ }
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
- page = NULL;
+ return nr_pages;
+}
+
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
continue;
- } else if (!page || !(pfn & nr_pgmask)) {
- page = pfn_to_page(pfn);
- touch_nmi_watchdog();
- } else {
- page++;
- }
- __init_single_page(page, pfn, zid, nid);
- nr_pages++;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, than free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ nr_pages += deferred_init_pages(zone, spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ j = *i;
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ deferred_free_pages(*start_pfn, t);
+ *start_pfn = t;
+
+ if (mo_pfn < *end_pfn)
+ break;
}
- return (nr_pages);
+
+ /* Store our current values to be reused on the next iteration */
+ *i = j;
+
+ return nr_pages;
}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
struct zone *zone;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1614,31 +1797,27 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1662,14 +1841,11 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1698,37 +1874,24 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ return true;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
-
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
-
- if (nr_pages >= nr_pages_needed)
- break;
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn && nr_pages < nr_pages_needed) {
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ first_deferred_pfn = spfn;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
pgdat->first_deferred_pfn = first_deferred_pfn;
pgdat_resize_unlock(pgdat, &flags);
@@ -1945,8 +2108,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -2876,18 +3039,26 @@ void free_unref_page(struct page *page)
/*
* Free a list of 0-order pages
*/
-void free_unref_page_list(struct list_head *list)
+void __free_page_list(struct list_head *list, bool dropref,
+ unsigned long *highest_pfn)
{
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ if (highest_pfn)
+ *highest_pfn = 0;
+
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
+ if (dropref)
+ WARN_ON_ONCE(!put_page_testzero(page));
pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn))
list_del(&page->lru);
set_page_private(page, pfn);
+ if (highest_pfn && pfn > *highest_pfn)
+ *highest_pfn = pfn;
}
local_irq_save(flags);
@@ -2950,7 +3121,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -3698,7 +3869,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
@@ -3709,13 +3880,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
+ prio, &page);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE)
+ if (*compact_result <= COMPACT_INACTIVE) {
+ WARN_ON_ONCE(page);
return NULL;
+ }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3723,7 +3896,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -4556,7 +4735,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -5630,6 +5809,36 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return false;
}
+static void __meminit __memmap_init_hotplug(unsigned long size, int nid,
+ unsigned long zone,
+ unsigned long start_pfn,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn = start_pfn + size;
+
+ while (pfn != start_pfn) {
+ unsigned long stride = pfn;
+
+ pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn);
+ stride -= pfn;
+
+ /*
+ * The last argument of __init_pageblock is a boolean
+ * value indicating if the page will be marked as reserved.
+ *
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * Under certain circumstances ZONE_DEVICE pages may not
+ * need to be marked as reserved, however there is still
+ * code that is depending on this being set for now.
+ */
+ __init_pageblock(pfn, stride, zone, nid, pgmap, true);
+
+ cond_resched();
+ }
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5640,49 +5849,59 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
struct vmem_altmap *altmap)
{
unsigned long pfn, end_pfn = start_pfn + size;
- struct page *page;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
+ if (context == MEMMAP_HOTPLUG) {
#ifdef CONFIG_ZONE_DEVICE
- /*
- * Honor reservation requested by the driver for this ZONE_DEVICE
- * memory. We limit the total number of pages to initialize to just
- * those that might contain the memory mapping. We will defer the
- * ZONE_DEVICE page initialization until after we have released
- * the hotplug lock.
- */
- if (zone == ZONE_DEVICE) {
- if (!altmap)
- return;
+ /*
+ * Honor reservation requested by the driver for this
+ * ZONE_DEVICE memory. We limit the total number of pages to
+ * initialize to just those that might contain the memory
+ * mapping. We will defer the ZONE_DEVICE page initialization
+ * until after we have released the hotplug lock.
+ */
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn +
+ vmem_altmap_offset(altmap);
+ }
+#endif
+ /*
+ * For these ZONE_DEVICE pages we don't need to record the
+ * pgmap as they should represent only those pages used to
+ * store the memory map. The actual ZONE_DEVICE pages will
+ * be initialized later.
+ */
+ __memmap_init_hotplug(end_pfn - start_pfn, nid, zone,
+ start_pfn, NULL);
- if (start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
- end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ return;
}
-#endif
+
+ end_pfn = defer_pfn(nid, start_pfn, end_pfn, context);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page;
+
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (overlap_memmap_init(zone, &pfn))
- continue;
- if (defer_init(nid, pfn, end_pfn))
- break;
- }
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
- __SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -5721,7 +5940,6 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long size,
struct dev_pagemap *pgmap)
{
- unsigned long pfn, end_pfn = start_pfn + size;
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
@@ -5737,53 +5955,13 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
if (pgmap->altmap_valid) {
struct vmem_altmap *altmap = &pgmap->altmap;
+ unsigned long end_pfn = start_pfn + size;
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
size = end_pfn - start_pfn;
}
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct page *page = pfn_to_page(pfn);
-
- __init_single_page(page, pfn, zone_idx, nid);
-
- /*
- * Mark page reserved as it will need to wait for onlining
- * phase for it to be fully associated with a zone.
- *
- * We can use the non-atomic __set_bit operation for setting
- * the flag as we are still initializing the pages.
- */
- __SetPageReserved(page);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer and hmm_data. It is a bug if a ZONE_DEVICE
- * page is ever freed or placed on a driver-private list.
- */
- page->pgmap = pgmap;
- page->hmm_data = 0;
-
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
- */
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
- }
+ __memmap_init_hotplug(size, nid, zone_idx, start_pfn, pgmap);
pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
size, jiffies_to_msecs(jiffies - start));
@@ -6016,7 +6194,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -6771,7 +6949,7 @@ unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -8196,7 +8374,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ae44f7adbe07..0cfaa064e7a6 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -300,7 +300,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == -1) {
+ if (nid == NUMA_NO_NODE) {
/*
* In this case, "nid" already exists and contains valid memory.
* "start_pfn" passed to us is a pfn which is an arg for
@@ -399,9 +399,8 @@ void __init page_ext_init(void)
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
*
- * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (early_pfn_to_nid(pfn) != nid)
+ if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 28b06524939f..3bba0ddd3ee4 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -542,7 +542,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
unsigned long block_end_pfn;
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
index f0c15e9017c0..e546b70e592a 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+#include <linux/kasan.h>
static bool want_page_poisoning __read_mostly;
@@ -40,6 +41,7 @@ static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
+ kasan_unpoison_shadow(addr, PAGE_SIZE);
memset(addr, PAGE_POISON, PAGE_SIZE);
kunmap_atomic(addr);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0454ecc29537..63a978c22e47 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -898,7 +898,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, vma->vm_mm, address,
min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ (PAGE_SIZE << compound_order(page))),
+ MMU_NOTIFY_PROTECTION_PAGE);
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1373,7 +1374,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, vma->vm_mm, address,
min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ (PAGE_SIZE << compound_order(page))),
+ MMU_NOTIFY_CLEAR);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9..cde51d7f631f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return 0;
if (is_root_cache(s))
return 0;
return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
@@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return;
memcg_kmem_uncharge(page, order);
}
diff --git a/mm/slub.c b/mm/slub.c
index 1e3d0ec4e200..6c632fef8ef1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2235,8 +2235,8 @@ static void unfreeze_partials(struct kmem_cache *s,
}
/*
- * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available.
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
+ * partial page slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
@@ -2463,8 +2463,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
- } else
- freelist = NULL;
+ }
return freelist;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fd2f21e1c60a..5a1cc9387151 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
+ struct swap_info_struct *si;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
@@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
+ struct page *found_page = NULL, *new_page = NULL;
+ struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, swp_offset(entry));
+ si = get_swap_device(entry);
+ if (!si)
+ break;
+ found_page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
if (found_page)
break;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbac1d49469d..f0edf7244256 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,6 +38,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/stop_machine.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -1174,6 +1175,64 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
return usage;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * preempt_disable() in get_swap_device() or after the
+ * preempt_enable() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long type, offset;
+
+ if (!entry.val)
+ goto out;
+ type = swp_type(entry);
+ if (type >= nr_swapfiles)
+ goto bad_nofile;
+ si = swap_info[type];
+
+ preempt_disable();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ preempt_enable();
+ return NULL;
+}
+
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1345,11 +1404,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1374,9 +1440,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1772,8 +1840,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1782,6 +1848,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -2428,9 +2496,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2455,7 +2523,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2474,6 +2546,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
add_to_avail_list(p);
}
+static int swap_onoff_stop(void *arg)
+{
+ return 0;
+}
+
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
@@ -2482,7 +2559,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are used
+ * between get/put_swap_device() only if SWP_VALID bit is set
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2491,7 +2578,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2594,6 +2682,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -3358,22 +3457,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
+ p = get_swap_device(entry);
+ if (!p)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;
-
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3419,11 +3512,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3515,6 +3606,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3522,15 +3614,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3548,9 +3640,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3602,10 +3693,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 871e41c55e23..e83961767dc1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1505,7 +1505,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = find_vmap_area((unsigned long)addr)->vm;
+ area = find_vm_area(addr);
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
@@ -1565,6 +1565,14 @@ void vfree_atomic(const void *addr)
__vfree_deferred(addr);
}
+static void __vfree(const void *addr)
+{
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
+ __vunmap(addr, 1);
+}
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1591,10 +1599,8 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
- __vunmap(addr, 1);
+
+ __vfree(addr);
}
EXPORT_SYMBOL(vfree);
@@ -1709,7 +1715,7 @@ fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
- vfree(area->addr);
+ __vfree(area->addr);
return NULL;
}
@@ -1768,6 +1774,15 @@ fail:
return NULL;
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node_range);
+#endif
+
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
@@ -1859,18 +1874,10 @@ EXPORT_SYMBOL(vzalloc);
*/
void *vmalloc_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);
@@ -1964,16 +1971,10 @@ EXPORT_SYMBOL(vmalloc_32);
*/
void *vmalloc_32_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);
@@ -2248,7 +2249,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 6ac919847ce6..f3f5a78cd062 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -158,6 +158,7 @@
#include <linux/etherdevice.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
+#include <linux/mmzone.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ipv6.h>
@@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_cfi = 0;
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
- pkt_dev->node = -1;
+ pkt_dev->node = NUMA_NO_NODE;
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index ba01fc4270bd..5b8e5bd7457b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -30,7 +30,6 @@
#include <linux/in6.h>
#include <linux/jiffies.h>
#include <linux/time.h>
-#include <linux/flex_array.h>
#include <linux/cpumask.h>
#include <net/inet_ecn.h>
#include <net/ip_tunnels.h>
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 6657606b2b47..66f9553758a5 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -30,7 +30,6 @@
#include <linux/in6.h>
#include <linux/jiffies.h>
#include <linux/time.h>
-#include <linux/flex_array.h>
#include <net/inet_ecn.h>
#include <net/ip_tunnels.h>
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 80ea2a71852e..cfb0098c9a01 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -111,29 +111,6 @@ int ovs_flow_tbl_count(const struct flow_table *table)
return table->count;
}
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
- struct flex_array *buckets;
- int i, err;
-
- buckets = flex_array_alloc(sizeof(struct hlist_head),
- n_buckets, GFP_KERNEL);
- if (!buckets)
- return NULL;
-
- err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
- if (err) {
- flex_array_free(buckets);
- return NULL;
- }
-
- for (i = 0; i < n_buckets; i++)
- INIT_HLIST_HEAD((struct hlist_head *)
- flex_array_get(buckets, i));
-
- return buckets;
-}
-
static void flow_free(struct sw_flow *flow)
{
int cpu;
@@ -168,31 +145,30 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred)
flow_free(flow);
}
-static void free_buckets(struct flex_array *buckets)
-{
- flex_array_free(buckets);
-}
-
-
static void __table_instance_destroy(struct table_instance *ti)
{
- free_buckets(ti->buckets);
+ kvfree(ti->buckets);
kfree(ti);
}
static struct table_instance *table_instance_alloc(int new_size)
{
struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+ int i;
if (!ti)
return NULL;
- ti->buckets = alloc_buckets(new_size);
-
+ ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
+ GFP_KERNEL);
if (!ti->buckets) {
kfree(ti);
return NULL;
}
+
+ for (i = 0; i < new_size; i++)
+ INIT_HLIST_HEAD(&ti->buckets[i]);
+
ti->n_buckets = new_size;
ti->node_ver = 0;
ti->keep_flows = false;
@@ -249,7 +225,7 @@ static void table_instance_destroy(struct table_instance *ti,
for (i = 0; i < ti->n_buckets; i++) {
struct sw_flow *flow;
- struct hlist_head *head = flex_array_get(ti->buckets, i);
+ struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
int ver = ti->node_ver;
int ufid_ver = ufid_ti->node_ver;
@@ -294,7 +270,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
ver = ti->node_ver;
while (*bucket < ti->n_buckets) {
i = 0;
- head = flex_array_get(ti->buckets, *bucket);
+ head = &ti->buckets[*bucket];
hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
if (i < *last) {
i++;
@@ -313,8 +289,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
{
hash = jhash_1word(hash, ti->hash_seed);
- return flex_array_get(ti->buckets,
- (hash & (ti->n_buckets - 1)));
+ return &ti->buckets[hash & (ti->n_buckets - 1)];
}
static void table_instance_insert(struct table_instance *ti,
@@ -347,9 +322,7 @@ static void flow_table_copy_flows(struct table_instance *old,
/* Insert in new table. */
for (i = 0; i < old->n_buckets; i++) {
struct sw_flow *flow;
- struct hlist_head *head;
-
- head = flex_array_get(old->buckets, i);
+ struct hlist_head *head = &old->buckets[i];
if (ufid)
hlist_for_each_entry(flow, head,
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 2dd9900f533d..de5ec6cf5174 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -29,7 +29,6 @@
#include <linux/in6.h>
#include <linux/jiffies.h>
#include <linux/time.h>
-#include <linux/flex_array.h>
#include <net/inet_ecn.h>
#include <net/ip_tunnels.h>
@@ -37,7 +36,7 @@
#include "flow.h"
struct table_instance {
- struct flex_array *buckets;
+ struct hlist_head *buckets;
unsigned int n_buckets;
struct rcu_head rcu;
int node_ver;
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 86e1e37eb4e8..b37e6e0a1026 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -15,6 +15,7 @@
#include <linux/netlink.h>
#include <linux/qrtr.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/numa.h>
#include <net/sock.h>
@@ -101,7 +102,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
return container_of(sk, struct qrtr_sock, sk);
}
-static unsigned int qrtr_local_nid = -1;
+static unsigned int qrtr_local_nid = NUMA_NO_NODE;
/* for node ids */
static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 3892e7630f3a..09b9c7dc592c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -37,53 +37,6 @@
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>
-static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count,
- gfp_t gfp)
-{
- struct flex_array *result;
- int err;
-
- result = flex_array_alloc(elem_size, elem_count, gfp);
- if (result) {
- err = flex_array_prealloc(result, 0, elem_count, gfp);
- if (err) {
- flex_array_free(result);
- result = NULL;
- }
- }
-
- return result;
-}
-
-static void fa_free(struct flex_array *fa)
-{
- if (fa)
- flex_array_free(fa);
-}
-
-static void fa_copy(struct flex_array *fa, struct flex_array *from,
- size_t index, size_t count)
-{
- void *elem;
-
- while (count--) {
- elem = flex_array_get(from, index);
- flex_array_put(fa, index, elem, 0);
- index++;
- }
-}
-
-static void fa_zero(struct flex_array *fa, size_t index, size_t count)
-{
- void *elem;
-
- while (count--) {
- elem = flex_array_get(fa, index);
- memset(elem, 0, fa->element_size);
- index++;
- }
-}
-
/* Migrates chunks from stream queues to new stream queues if needed,
* but not across associations. Also, removes those chunks to streams
* higher than the new max.
@@ -138,46 +91,32 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
gfp_t gfp)
{
- struct flex_array *out;
- size_t elem_size = sizeof(struct sctp_stream_out);
-
- out = fa_alloc(elem_size, outcnt, gfp);
- if (!out)
- return -ENOMEM;
-
- if (stream->out) {
- fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
- fa_free(stream->out);
- }
+ int ret;
- if (outcnt > stream->outcnt)
- fa_zero(out, stream->outcnt, (outcnt - stream->outcnt));
+ if (outcnt <= stream->outcnt)
+ return 0;
- stream->out = out;
+ ret = genradix_prealloc(&stream->out, outcnt, gfp);
+ if (ret)
+ return ret;
+ stream->outcnt = outcnt;
return 0;
}
static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
gfp_t gfp)
{
- struct flex_array *in;
- size_t elem_size = sizeof(struct sctp_stream_in);
-
- in = fa_alloc(elem_size, incnt, gfp);
- if (!in)
- return -ENOMEM;
-
- if (stream->in) {
- fa_copy(in, stream->in, 0, min(incnt, stream->incnt));
- fa_free(stream->in);
- }
+ int ret;
- if (incnt > stream->incnt)
- fa_zero(in, stream->incnt, (incnt - stream->incnt));
+ if (incnt <= stream->incnt)
+ return 0;
- stream->in = in;
+ ret = genradix_prealloc(&stream->in, incnt, gfp);
+ if (ret)
+ return ret;
+ stream->incnt = incnt;
return 0;
}
@@ -204,7 +143,6 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
if (ret)
goto out;
- stream->outcnt = outcnt;
for (i = 0; i < stream->outcnt; i++)
SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
@@ -218,14 +156,11 @@ in:
ret = sctp_stream_alloc_in(stream, incnt, gfp);
if (ret) {
sched->free(stream);
- fa_free(stream->out);
- stream->out = NULL;
+ genradix_free(&stream->out);
stream->outcnt = 0;
goto out;
}
- stream->incnt = incnt;
-
out:
return ret;
}
@@ -250,8 +185,8 @@ void sctp_stream_free(struct sctp_stream *stream)
sched->free(stream);
for (i = 0; i < stream->outcnt; i++)
kfree(SCTP_SO(stream, i)->ext);
- fa_free(stream->out);
- fa_free(stream->in);
+ genradix_free(&stream->out);
+ genradix_free(&stream->in);
}
void sctp_stream_clear(struct sctp_stream *stream)
@@ -282,8 +217,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
sched->sched_all(stream);
- new->out = NULL;
- new->in = NULL;
+ new->out.tree.root = NULL;
+ new->in.tree.root = NULL;
new->outcnt = 0;
new->incnt = 0;
}
@@ -535,8 +470,6 @@ int sctp_send_add_streams(struct sctp_association *asoc,
goto out;
}
- stream->outcnt = outcnt;
-
asoc->strreset_outstanding = !!out + !!in;
out:
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index a6bf21579466..102c6fefe38c 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -101,7 +101,7 @@ static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
static bool sctp_validate_data(struct sctp_chunk *chunk)
{
- const struct sctp_stream *stream;
+ struct sctp_stream *stream;
__u16 sid, ssn;
if (chunk->chunk_hdr->type != SCTP_CID_DATA)
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index b737ca9d7204..155fa9305166 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2696,8 +2696,10 @@ sub process {
($line =~ /^\s*(?:WARNING:|BUG:)/ ||
$line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
# timestamp
- $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
- # stack dump address
+ $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) ||
+ $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ ||
+ $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) {
+ # stack dump address styles
$commit_log_possible_stack_dump = 1;
}
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 98a7d63a723e..bcdd45df3f51 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -37,6 +37,13 @@ parse_symbol() {
symbol=${symbol#\(}
symbol=${symbol%\)}
+ # Strip segment
+ local segment
+ if [[ $symbol == *:* ]] ; then
+ segment=${symbol%%:*}:
+ symbol=${symbol#*:}
+ fi
+
# Strip the symbol name so that we could look it up
local name=${symbol%+*}
@@ -84,7 +91,7 @@ parse_symbol() {
code=${code//$'\n'/' '}
# Replace old address with pretty line numbers
- symbol="$name ($code)"
+ symbol="$segment$name ($code)"
}
decode_code() {
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index c0417cf17fee..8c5800750fa8 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -93,12 +93,10 @@ avtab_insert_node(struct avtab *h, int hvalue,
newnode->next = prev->next;
prev->next = newnode;
} else {
- newnode->next = flex_array_get_ptr(h->htable, hvalue);
- if (flex_array_put_ptr(h->htable, hvalue, newnode,
- GFP_KERNEL|__GFP_ZERO)) {
- kmem_cache_free(avtab_node_cachep, newnode);
- return NULL;
- }
+ struct avtab_node **n = &h->htable[hvalue];
+
+ newnode->next = *n;
+ *n = newnode;
}
h->nel++;
@@ -111,11 +109,11 @@ static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_dat
struct avtab_node *prev, *cur, *newnode;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
- if (!h || !h->htable)
+ if (!h)
return -EINVAL;
hvalue = avtab_hash(key, h->mask);
- for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+ for (prev = NULL, cur = h->htable[hvalue];
cur;
prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -156,10 +154,10 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datu
struct avtab_node *prev, *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
- if (!h || !h->htable)
+ if (!h)
return NULL;
hvalue = avtab_hash(key, h->mask);
- for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+ for (prev = NULL, cur = h->htable[hvalue];
cur;
prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -186,11 +184,11 @@ struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
- if (!h || !h->htable)
+ if (!h)
return NULL;
hvalue = avtab_hash(key, h->mask);
- for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+ for (cur = h->htable[hvalue]; cur;
cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -222,11 +220,11 @@ avtab_search_node(struct avtab *h, struct avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
- if (!h || !h->htable)
+ if (!h)
return NULL;
hvalue = avtab_hash(key, h->mask);
- for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+ for (cur = h->htable[hvalue]; cur;
cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -281,11 +279,11 @@ void avtab_destroy(struct avtab *h)
int i;
struct avtab_node *cur, *temp;
- if (!h || !h->htable)
+ if (!h)
return;
for (i = 0; i < h->nslot; i++) {
- cur = flex_array_get_ptr(h->htable, i);
+ cur = h->htable[i];
while (cur) {
temp = cur;
cur = cur->next;
@@ -295,7 +293,7 @@ void avtab_destroy(struct avtab *h)
kmem_cache_free(avtab_node_cachep, temp);
}
}
- flex_array_free(h->htable);
+ kvfree(h->htable);
h->htable = NULL;
h->nslot = 0;
h->mask = 0;
@@ -303,6 +301,7 @@ void avtab_destroy(struct avtab *h)
int avtab_init(struct avtab *h)
{
+ kvfree(h->htable);
h->htable = NULL;
h->nel = 0;
return 0;
@@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
nslot = MAX_AVTAB_HASH_BUCKETS;
mask = nslot - 1;
- h->htable = flex_array_alloc(sizeof(struct avtab_node *), nslot,
- GFP_KERNEL | __GFP_ZERO);
+ h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL);
if (!h->htable)
return -ENOMEM;
@@ -353,7 +351,7 @@ void avtab_hash_eval(struct avtab *h, char *tag)
max_chain_len = 0;
chain2_len_sum = 0;
for (i = 0; i < h->nslot; i++) {
- cur = flex_array_get_ptr(h->htable, i);
+ cur = h->htable[i];
if (cur) {
slots_used++;
chain_len = 0;
@@ -646,7 +644,7 @@ int avtab_write(struct policydb *p, struct avtab *a, void *fp)
return rc;
for (i = 0; i < a->nslot; i++) {
- for (cur = flex_array_get_ptr(a->htable, i); cur;
+ for (cur = a->htable[i]; cur;
cur = cur->next) {
rc = avtab_write_item(p, cur, fp);
if (rc)
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index 0d652fad5319..de16673b2314 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -24,7 +24,6 @@
#define _SS_AVTAB_H_
#include "security.h"
-#include <linux/flex_array.h>
struct avtab_key {
u16 source_type; /* source type */
@@ -84,11 +83,10 @@ struct avtab_node {
};
struct avtab {
- struct flex_array *htable;
+ struct avtab_node **htable;
u32 nel; /* number of elements */
u32 nslot; /* number of hash slots */
u32 mask; /* mask to compute hash func */
-
};
int avtab_init(struct avtab *);
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index f49e522e932d..3bbb60345209 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -195,7 +195,6 @@ int cond_index_bool(void *key, void *datum, void *datap)
{
struct policydb *p;
struct cond_bool_datum *booldatum;
- struct flex_array *fa;
booldatum = datum;
p = datap;
@@ -203,10 +202,7 @@ int cond_index_bool(void *key, void *datum, void *datap)
if (!booldatum->value || booldatum->value > p->p_bools.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_BOOLS];
- if (flex_array_put_ptr(fa, booldatum->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+ p->sym_val_to_name[SYM_BOOLS][booldatum->value - 1] = key;
p->bool_val_to_struct[booldatum->value - 1] = booldatum;
return 0;
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index c1c31e33657a..6b576e588725 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -36,7 +36,6 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/audit.h>
-#include <linux/flex_array.h>
#include "security.h"
#include "policydb.h"
@@ -341,17 +340,14 @@ static int common_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct common_datum *comdatum;
- struct flex_array *fa;
comdatum = datum;
p = datap;
if (!comdatum->value || comdatum->value > p->p_commons.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_COMMONS];
- if (flex_array_put_ptr(fa, comdatum->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+ p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key;
+
return 0;
}
@@ -359,16 +355,13 @@ static int class_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct class_datum *cladatum;
- struct flex_array *fa;
cladatum = datum;
p = datap;
if (!cladatum->value || cladatum->value > p->p_classes.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_CLASSES];
- if (flex_array_put_ptr(fa, cladatum->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+
+ p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key;
p->class_val_to_struct[cladatum->value - 1] = cladatum;
return 0;
}
@@ -377,7 +370,6 @@ static int role_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct role_datum *role;
- struct flex_array *fa;
role = datum;
p = datap;
@@ -386,10 +378,7 @@ static int role_index(void *key, void *datum, void *datap)
|| role->bounds > p->p_roles.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_ROLES];
- if (flex_array_put_ptr(fa, role->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+ p->sym_val_to_name[SYM_ROLES][role->value - 1] = key;
p->role_val_to_struct[role->value - 1] = role;
return 0;
}
@@ -398,7 +387,6 @@ static int type_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct type_datum *typdatum;
- struct flex_array *fa;
typdatum = datum;
p = datap;
@@ -408,15 +396,8 @@ static int type_index(void *key, void *datum, void *datap)
|| typdatum->value > p->p_types.nprim
|| typdatum->bounds > p->p_types.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_TYPES];
- if (flex_array_put_ptr(fa, typdatum->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
-
- fa = p->type_val_to_struct_array;
- if (flex_array_put_ptr(fa, typdatum->value - 1, typdatum,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+ p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key;
+ p->type_val_to_struct_array[typdatum->value - 1] = typdatum;
}
return 0;
@@ -426,7 +407,6 @@ static int user_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct user_datum *usrdatum;
- struct flex_array *fa;
usrdatum = datum;
p = datap;
@@ -435,10 +415,7 @@ static int user_index(void *key, void *datum, void *datap)
|| usrdatum->bounds > p->p_users.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_USERS];
- if (flex_array_put_ptr(fa, usrdatum->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+ p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key;
p->user_val_to_struct[usrdatum->value - 1] = usrdatum;
return 0;
}
@@ -447,7 +424,6 @@ static int sens_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct level_datum *levdatum;
- struct flex_array *fa;
levdatum = datum;
p = datap;
@@ -456,10 +432,8 @@ static int sens_index(void *key, void *datum, void *datap)
if (!levdatum->level->sens ||
levdatum->level->sens > p->p_levels.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_LEVELS];
- if (flex_array_put_ptr(fa, levdatum->level->sens - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+
+ p->sym_val_to_name[SYM_LEVELS][levdatum->level->sens - 1] = key;
}
return 0;
@@ -469,7 +443,6 @@ static int cat_index(void *key, void *datum, void *datap)
{
struct policydb *p;
struct cat_datum *catdatum;
- struct flex_array *fa;
catdatum = datum;
p = datap;
@@ -477,10 +450,8 @@ static int cat_index(void *key, void *datum, void *datap)
if (!catdatum->isalias) {
if (!catdatum->value || catdatum->value > p->p_cats.nprim)
return -EINVAL;
- fa = p->sym_val_to_name[SYM_CATS];
- if (flex_array_put_ptr(fa, catdatum->value - 1, key,
- GFP_KERNEL | __GFP_ZERO))
- BUG();
+
+ p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key;
}
return 0;
@@ -568,35 +539,23 @@ static int policydb_index(struct policydb *p)
if (!p->user_val_to_struct)
return -ENOMEM;
- /* Yes, I want the sizeof the pointer, not the structure */
- p->type_val_to_struct_array = flex_array_alloc(sizeof(struct type_datum *),
- p->p_types.nprim,
- GFP_KERNEL | __GFP_ZERO);
+ p->type_val_to_struct_array = kvcalloc(p->p_types.nprim,
+ sizeof(*p->type_val_to_struct_array),
+ GFP_KERNEL);
if (!p->type_val_to_struct_array)
return -ENOMEM;
- rc = flex_array_prealloc(p->type_val_to_struct_array, 0,
- p->p_types.nprim, GFP_KERNEL | __GFP_ZERO);
- if (rc)
- goto out;
-
rc = cond_init_bool_indexes(p);
if (rc)
goto out;
for (i = 0; i < SYM_NUM; i++) {
- p->sym_val_to_name[i] = flex_array_alloc(sizeof(char *),
- p->symtab[i].nprim,
- GFP_KERNEL | __GFP_ZERO);
+ p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim,
+ sizeof(char *),
+ GFP_KERNEL);
if (!p->sym_val_to_name[i])
return -ENOMEM;
- rc = flex_array_prealloc(p->sym_val_to_name[i],
- 0, p->symtab[i].nprim,
- GFP_KERNEL | __GFP_ZERO);
- if (rc)
- goto out;
-
rc = hashtab_map(p->symtab[i].table, index_f[i], p);
if (rc)
goto out;
@@ -810,16 +769,13 @@ void policydb_destroy(struct policydb *p)
hashtab_destroy(p->symtab[i].table);
}
- for (i = 0; i < SYM_NUM; i++) {
- if (p->sym_val_to_name[i])
- flex_array_free(p->sym_val_to_name[i]);
- }
+ for (i = 0; i < SYM_NUM; i++)
+ kvfree(p->sym_val_to_name[i]);
kfree(p->class_val_to_struct);
kfree(p->role_val_to_struct);
kfree(p->user_val_to_struct);
- if (p->type_val_to_struct_array)
- flex_array_free(p->type_val_to_struct_array);
+ kvfree(p->type_val_to_struct_array);
avtab_destroy(&p->te_avtab);
@@ -872,17 +828,9 @@ void policydb_destroy(struct policydb *p)
hashtab_map(p->range_tr, range_tr_destroy, NULL);
hashtab_destroy(p->range_tr);
- if (p->type_attr_map_array) {
- for (i = 0; i < p->p_types.nprim; i++) {
- struct ebitmap *e;
-
- e = flex_array_get(p->type_attr_map_array, i);
- if (!e)
- continue;
- ebitmap_destroy(e);
- }
- flex_array_free(p->type_attr_map_array);
- }
+ for (i = 0; i < p->p_types.nprim; i++)
+ ebitmap_destroy(&p->type_attr_map_array[i]);
+ kvfree(p->type_attr_map_array);
ebitmap_destroy(&p->filename_trans_ttypes);
ebitmap_destroy(&p->policycaps);
@@ -1770,8 +1718,7 @@ static int type_bounds_sanity_check(void *key, void *datum, void *datap)
return -EINVAL;
}
- upper = flex_array_get_ptr(p->type_val_to_struct_array,
- upper->bounds - 1);
+ upper = p->type_val_to_struct_array[upper->bounds - 1];
BUG_ON(!upper);
if (upper->attribute) {
@@ -2543,23 +2490,15 @@ int policydb_read(struct policydb *p, void *fp)
if (rc)
goto bad;
- rc = -ENOMEM;
- p->type_attr_map_array = flex_array_alloc(sizeof(struct ebitmap),
- p->p_types.nprim,
- GFP_KERNEL | __GFP_ZERO);
+ p->type_attr_map_array = kvcalloc(p->p_types.nprim,
+ sizeof(*p->type_attr_map_array),
+ GFP_KERNEL);
if (!p->type_attr_map_array)
goto bad;
- /* preallocate so we don't have to worry about the put ever failing */
- rc = flex_array_prealloc(p->type_attr_map_array, 0, p->p_types.nprim,
- GFP_KERNEL | __GFP_ZERO);
- if (rc)
- goto bad;
-
for (i = 0; i < p->p_types.nprim; i++) {
- struct ebitmap *e = flex_array_get(p->type_attr_map_array, i);
+ struct ebitmap *e = &p->type_attr_map_array[i];
- BUG_ON(!e);
ebitmap_init(e);
if (p->policyvers >= POLICYDB_VERSION_AVTAB) {
rc = ebitmap_read(e, fp);
@@ -3554,9 +3493,8 @@ int policydb_write(struct policydb *p, void *fp)
return rc;
for (i = 0; i < p->p_types.nprim; i++) {
- struct ebitmap *e = flex_array_get(p->type_attr_map_array, i);
+ struct ebitmap *e = &p->type_attr_map_array[i];
- BUG_ON(!e);
rc = ebitmap_write(e, fp);
if (rc)
return rc;
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index 215f8f30ac5a..27039149ff0a 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -24,8 +24,6 @@
#ifndef _SS_POLICYDB_H_
#define _SS_POLICYDB_H_
-#include <linux/flex_array.h>
-
#include "symtab.h"
#include "avtab.h"
#include "sidtab.h"
@@ -251,13 +249,13 @@ struct policydb {
#define p_cats symtab[SYM_CATS]
/* symbol names indexed by (value - 1) */
- struct flex_array *sym_val_to_name[SYM_NUM];
+ char **sym_val_to_name[SYM_NUM];
/* class, role, and user attributes indexed by (value - 1) */
struct class_datum **class_val_to_struct;
struct role_datum **role_val_to_struct;
struct user_datum **user_val_to_struct;
- struct flex_array *type_val_to_struct_array;
+ struct type_datum **type_val_to_struct_array;
/* type enforcement access vectors and transitions */
struct avtab te_avtab;
@@ -294,7 +292,7 @@ struct policydb {
struct hashtab *range_tr;
/* type -> attribute reverse mapping */
- struct flex_array *type_attr_map_array;
+ struct ebitmap *type_attr_map_array;
struct ebitmap policycaps;
@@ -369,9 +367,7 @@ static inline int put_entry(const void *buf, size_t bytes, int num, struct polic
static inline char *sym_name(struct policydb *p, unsigned int sym_num, unsigned int element_nr)
{
- struct flex_array *fa = p->sym_val_to_name[sym_num];
-
- return flex_array_get_ptr(fa, element_nr);
+ return p->sym_val_to_name[sym_num][element_nr];
}
extern u16 string_to_security_class(struct policydb *p, const char *name);
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index d6e7b4856d93..93810afd2ac1 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -49,7 +49,6 @@
#include <linux/sched.h>
#include <linux/audit.h>
#include <linux/mutex.h>
-#include <linux/flex_array.h>
#include <linux/vmalloc.h>
#include <net/netlabel.h>
@@ -545,15 +544,13 @@ static void type_attribute_bounds_av(struct policydb *policydb,
struct type_datum *target;
u32 masked = 0;
- source = flex_array_get_ptr(policydb->type_val_to_struct_array,
- scontext->type - 1);
+ source = policydb->type_val_to_struct_array[scontext->type - 1];
BUG_ON(!source);
if (!source->bounds)
return;
- target = flex_array_get_ptr(policydb->type_val_to_struct_array,
- tcontext->type - 1);
+ target = policydb->type_val_to_struct_array[tcontext->type - 1];
BUG_ON(!target);
memset(&lo_avd, 0, sizeof(lo_avd));
@@ -653,11 +650,9 @@ static void context_struct_compute_av(struct policydb *policydb,
*/
avkey.target_class = tclass;
avkey.specified = AVTAB_AV | AVTAB_XPERMS;
- sattr = flex_array_get(policydb->type_attr_map_array,
- scontext->type - 1);
+ sattr = &policydb->type_attr_map_array[scontext->type - 1];
BUG_ON(!sattr);
- tattr = flex_array_get(policydb->type_attr_map_array,
- tcontext->type - 1);
+ tattr = &policydb->type_attr_map_array[tcontext->type - 1];
BUG_ON(!tattr);
ebitmap_for_each_positive_bit(sattr, snode, i) {
ebitmap_for_each_positive_bit(tattr, tnode, j) {
@@ -900,8 +895,7 @@ int security_bounded_transition(struct selinux_state *state,
index = new_context->type;
while (true) {
- type = flex_array_get_ptr(policydb->type_val_to_struct_array,
- index - 1);
+ type = policydb->type_val_to_struct_array[index - 1];
BUG_ON(!type);
/* not bounded anymore */
@@ -1064,11 +1058,9 @@ void security_compute_xperms_decision(struct selinux_state *state,
avkey.target_class = tclass;
avkey.specified = AVTAB_XPERMS;
- sattr = flex_array_get(policydb->type_attr_map_array,
- scontext->type - 1);
+ sattr = &policydb->type_attr_map_array[scontext->type - 1];
BUG_ON(!sattr);
- tattr = flex_array_get(policydb->type_attr_map_array,
- tcontext->type - 1);
+ tattr = &policydb->type_attr_map_array[tcontext->type - 1];
BUG_ON(!tattr);
ebitmap_for_each_positive_bit(sattr, snode, i) {
ebitmap_for_each_positive_bit(tattr, tnode, j) {
diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h
new file mode 100644
index 000000000000..110b0e5d0fb0
--- /dev/null
+++ b/tools/include/linux/numa.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NUMA_H
+#define _LINUX_NUMA_H
+
+
+#ifdef CONFIG_NODES_SHIFT
+#define NODES_SHIFT CONFIG_NODES_SHIFT
+#else
+#define NODES_SHIFT 0
+#endif
+
+#define MAX_NUMNODES (1 << NODES_SHIFT)
+
+#define NUMA_NO_NODE (-1)
+
+#endif /* _LINUX_NUMA_H */
diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h
index 9fdcd3eaac3b..d29725769107 100644
--- a/tools/include/linux/poison.h
+++ b/tools/include/linux/poison.h
@@ -87,9 +87,6 @@
#define MUTEX_DEBUG_INIT 0x11
#define MUTEX_DEBUG_FREE 0x22
-/********** lib/flex_array.c **********/
-#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */
-
/********** security/ **********/
#define KEY_DESTROY 0xbd
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 44195514b19e..98ad783efc69 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -34,6 +34,7 @@
#include <sys/types.h>
#include <linux/kernel.h>
#include <linux/time64.h>
+#include <linux/numa.h>
#include <numa.h>
#include <numaif.h>
@@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node)
CPU_ZERO(&mask);
- if (target_node == -1) {
+ if (target_node == NUMA_NO_NODE) {
for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
CPU_SET(cpu, &mask);
} else {
@@ -339,7 +340,7 @@ static void bind_to_memnode(int node)
unsigned long nodemask;
int ret;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return;
BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8);
@@ -1363,7 +1364,7 @@ static void init_thread_data(void)
int cpu;
/* Allow all nodes by default: */
- td->bind_node = -1;
+ td->bind_node = NUMA_NO_NODE;
/* Allow all CPUs by default: */
CPU_ZERO(&td->bind_cpumask);
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 82121a81681f..29bac5ef9a93 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -10,4 +10,5 @@
/proc-uptime-002
/read
/self
+/setns-dcache
/thread-self
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index 1c12c34cf85d..434d033ee067 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -14,6 +14,7 @@ TEST_GEN_PROGS += proc-uptime-001
TEST_GEN_PROGS += proc-uptime-002
TEST_GEN_PROGS += read
TEST_GEN_PROGS += self
+TEST_GEN_PROGS += setns-dcache
TEST_GEN_PROGS += thread-self
include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c
index fcff7047000d..471e2aa28077 100644
--- a/tools/testing/selftests/proc/proc-loadavg-001.c
+++ b/tools/testing/selftests/proc/proc-loadavg-001.c
@@ -30,7 +30,7 @@ int main(void)
if (unshare(CLONE_NEWPID) == -1) {
if (errno == ENOSYS || errno == EPERM)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c
index 85744425b08d..762cb01f2ca7 100644
--- a/tools/testing/selftests/proc/proc-self-map-files-002.c
+++ b/tools/testing/selftests/proc/proc-self-map-files-002.c
@@ -63,7 +63,7 @@ int main(void)
p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0);
if (p == MAP_FAILED) {
if (errno == EPERM)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c
index 5ab5f4810e43..b2993a67a4b3 100644
--- a/tools/testing/selftests/proc/proc-self-syscall.c
+++ b/tools/testing/selftests/proc/proc-self-syscall.c
@@ -39,7 +39,7 @@ int main(void)
fd = open("/proc/self/syscall", O_RDONLY);
if (fd == -1) {
if (errno == ENOENT)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c
index a38b2fbaa7ad..b467b98a457d 100644
--- a/tools/testing/selftests/proc/proc-self-wchan.c
+++ b/tools/testing/selftests/proc/proc-self-wchan.c
@@ -27,7 +27,7 @@ int main(void)
fd = open("/proc/self/wchan", O_RDONLY);
if (fd == -1) {
if (errno == ENOENT)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c
index 563e752e6eba..b2bd8da58efe 100644
--- a/tools/testing/selftests/proc/read.c
+++ b/tools/testing/selftests/proc/read.c
@@ -126,7 +126,7 @@ int main(void)
d = opendir("/proc");
if (!d)
- return 2;
+ return 4;
f(d, 0);
return 0;
}
diff --git a/tools/testing/selftests/proc/setns-dcache.c b/tools/testing/selftests/proc/setns-dcache.c
new file mode 100644
index 000000000000..60ab197a73fc
--- /dev/null
+++ b/tools/testing/selftests/proc/setns-dcache.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright © 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that setns(CLONE_NEWNET) points to new /proc/net content even
+ * if old one is in dcache.
+ *
+ * FIXME /proc/net/unix is under CONFIG_UNIX which can be disabled.
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+
+static pid_t pid = -1;
+
+static void f(void)
+{
+ if (pid > 0) {
+ kill(pid, SIGTERM);
+ }
+}
+
+int main(void)
+{
+ int fd[2];
+ char _ = 0;
+ int nsfd;
+
+ atexit(f);
+
+ /* Check for priviledges and syscall availability straight away. */
+ if (unshare(CLONE_NEWNET) == -1) {
+ if (errno == ENOSYS || errno == EPERM) {
+ return 4;
+ }
+ return 1;
+ }
+ /* Distinguisher between two otherwise empty net namespaces. */
+ if (socket(AF_UNIX, SOCK_STREAM, 0) == -1) {
+ return 1;
+ }
+
+ if (pipe(fd) == -1) {
+ return 1;
+ }
+
+ pid = fork();
+ if (pid == -1) {
+ return 1;
+ }
+
+ if (pid == 0) {
+ if (unshare(CLONE_NEWNET) == -1) {
+ return 1;
+ }
+
+ if (write(fd[1], &_, 1) != 1) {
+ return 1;
+ }
+
+ pause();
+
+ return 0;
+ }
+
+ if (read(fd[0], &_, 1) != 1) {
+ return 1;
+ }
+
+ {
+ char buf[64];
+ snprintf(buf, sizeof(buf), "/proc/%u/ns/net", pid);
+ nsfd = open(buf, O_RDONLY);
+ if (nsfd == -1) {
+ return 1;
+ }
+ }
+
+ /* Reliably pin dentry into dcache. */
+ (void)open("/proc/net/unix", O_RDONLY);
+
+ if (setns(nsfd, CLONE_NEWNET) == -1) {
+ return 1;
+ }
+
+ kill(pid, SIGTERM);
+ pid = 0;
+
+ {
+ char buf[4096];
+ ssize_t rv;
+ int fd;
+
+ fd = open("/proc/net/unix", O_RDONLY);
+ if (fd == -1) {
+ return 1;
+ }
+
+#define S "Num RefCount Protocol Flags Type St Inode Path\n"
+ rv = read(fd, buf, sizeof(buf));
+
+ assert(rv == strlen(S));
+ assert(memcmp(buf, S, strlen(S)) == 0);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 584a91ae4a8f..951c507a27f7 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -211,4 +211,20 @@ else
echo "[PASS]"
fi
+echo "------------------------------------"
+echo "running vmalloc stability smoke test"
+echo "------------------------------------"
+./test_vmalloc.sh smoke
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
exit $exitcode
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh
new file mode 100644
index 000000000000..06d2bb109f06
--- /dev/null
+++ b/tools/testing/selftests/vm/test_vmalloc.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+# a) analyse performance of vmalloc allocations;
+# b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# Also it is possible to pass any supported parameters manualy.
+#
+PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="test_repeat_count=20"
+
+check_test_requirements()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit $ksft_skip
+ fi
+
+ if ! which modprobe > /dev/null 2>&1; then
+ echo "$0: You need modprobe installed"
+ exit $ksft_skip
+ fi
+
+ if ! modinfo $DRIVER > /dev/null 2>&1; then
+ echo "$0: You must have the following enabled in your kernel:"
+ echo "CONFIG_TEST_VMALLOC=m"
+ exit $ksft_skip
+ fi
+}
+
+run_perfformance_check()
+{
+ echo "Run performance tests to evaluate how fast vmalloc allocation is."
+ echo "It runs all test cases on one single CPU with sequential order."
+
+ modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Ccheck the kernel message buffer to see the summary."
+}
+
+run_stability_check()
+{
+ echo "Run stability tests. In order to stress vmalloc subsystem we run"
+ echo "all available test cases on all available CPUs simultaneously."
+ echo "It will take time, so be patient."
+
+ modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+run_smoke_check()
+{
+ echo "Run smoke test. Note, this test provides basic coverage."
+ echo "Please check $0 output how it can be used"
+ echo "for deep performance analysis as well as stress testing."
+
+ modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+ echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
+ echo "manual parameters"
+ echo
+ echo "Valid tests and parameters:"
+ echo
+ modinfo $DRIVER
+ echo
+ echo "Example usage:"
+ echo
+ echo "# Shows help message"
+ echo "./${DRIVER}.sh"
+ echo
+ echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs"
+ echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5"
+ echo
+ echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
+ echo "sequential order"
+ echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 "
+ echo "run_test_mask=23"
+ echo
+ echo -n "# Runs all tests on all online CPUs, shuffled order, repeats "
+ echo "20 times"
+ echo "./${DRIVER}.sh test_repeat_count=20"
+ echo
+ echo "# Performance analysis"
+ echo "./${DRIVER}.sh performance"
+ echo
+ echo "# Stress testing"
+ echo "./${DRIVER}.sh stress"
+ echo
+ exit 0
+}
+
+function validate_passed_args()
+{
+ VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+ #
+ # Something has been passed, check it.
+ #
+ for passed_arg in $@; do
+ key=${passed_arg//=*/}
+ val="${passed_arg:$((${#key}+1))}"
+ valid=0
+
+ for valid_arg in $VALID_ARGS; do
+ if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+ valid=1
+ break
+ fi
+ done
+
+ if [[ $valid -ne 1 ]]; then
+ echo "Error: key or value is not correct: ${key} $val"
+ exit $exitcode
+ fi
+ done
+}
+
+function run_manual_check()
+{
+ #
+ # Validate passed parameters. If there is wrong one,
+ # the script exists and does not execute further.
+ #
+ validate_passed_args $@
+
+ echo "Run the test with following parameters: $@"
+ modprobe $DRIVER $@ > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+function run_test()
+{
+ if [ $# -eq 0 ]; then
+ usage
+ else
+ if [[ "$1" = "performance" ]]; then
+ run_perfformance_check
+ elif [[ "$1" = "stress" ]]; then
+ run_stability_check
+ elif [[ "$1" = "smoke" ]]; then
+ run_smoke_check
+ else
+ run_manual_check $@
+ fi
+ fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 1ff3a6c0367b..6f64b2b93234 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -133,7 +133,7 @@ static const char * const page_flag_names[] = {
[KPF_NOPAGE] = "n:nopage",
[KPF_KSM] = "x:ksm",
[KPF_THP] = "t:thp",
- [KPF_BALLOON] = "o:balloon",
+ [KPF_OFFLINE] = "o:offline",
[KPF_PGTABLE] = "g:pgtable",
[KPF_ZERO_PAGE] = "z:zero_page",
[KPF_IDLE] = "i:idle_page",