-rw-r--r--  Documentation/devicetree/bindings/pps/pps-gpio.txt | 8
-rw-r--r--  Documentation/pps/pps.txt | 44
-rw-r--r--  Documentation/rbtree.txt | 33
-rw-r--r--  Documentation/vm/hmm.txt | 384
-rw-r--r--  MAINTAINERS | 19
-rw-r--r--  arch/alpha/include/asm/string.h | 15
-rw-r--r--  arch/alpha/include/asm/vga.h | 2
-rw-r--r--  arch/alpha/lib/memset.S | 10
-rw-r--r--  arch/arm/include/asm/string.h | 14
-rw-r--r--  arch/arm/kernel/armksyms.c | 2
-rw-r--r--  arch/arm/lib/memset.S | 24
-rw-r--r--  arch/arm64/kernel/smp.c | 2
-rw-r--r--  arch/frv/Kconfig | 3
-rw-r--r--  arch/h8300/Kconfig | 3
-rw-r--r--  arch/m32r/configs/m32104ut_defconfig | 4
-rw-r--r--  arch/m32r/configs/m32700ut.smp_defconfig | 3
-rw-r--r--  arch/m32r/configs/m32700ut.up_defconfig | 3
-rw-r--r--  arch/m32r/configs/mappi.nommu_defconfig | 2
-rw-r--r--  arch/m32r/configs/mappi.smp_defconfig | 4
-rw-r--r--  arch/m32r/configs/mappi.up_defconfig | 4
-rw-r--r--  arch/m32r/configs/mappi2.opsp_defconfig | 2
-rw-r--r--  arch/m32r/configs/mappi2.vdec2_defconfig | 2
-rw-r--r--  arch/m32r/configs/mappi3.smp_defconfig | 4
-rw-r--r--  arch/m32r/configs/oaks32r_defconfig | 2
-rw-r--r--  arch/m32r/configs/opsput_defconfig | 2
-rw-r--r--  arch/m32r/configs/usrv_defconfig | 5
-rw-r--r--  arch/m68k/Kconfig | 3
-rw-r--r--  arch/microblaze/Kconfig | 16
-rw-r--r--  arch/microblaze/Makefile | 2
-rw-r--r--  arch/mips/include/asm/vga.h | 7
-rw-r--r--  arch/mn10300/configs/asb2303_defconfig | 6
-rw-r--r--  arch/mn10300/configs/asb2364_defconfig | 8
-rw-r--r--  arch/openrisc/Kconfig | 3
-rw-r--r--  arch/parisc/Kconfig | 3
-rw-r--r--  arch/powerpc/include/asm/vga.h | 8
-rw-r--r--  arch/powerpc/kernel/paca.c | 2
-rw-r--r--  arch/powerpc/kernel/setup-common.c | 2
-rw-r--r--  arch/powerpc/sysdev/xive/native.c | 4
-rw-r--r--  arch/sh/configs/ap325rxa_defconfig | 10
-rw-r--r--  arch/sh/configs/apsh4a3a_defconfig | 9
-rw-r--r--  arch/sh/configs/apsh4ad0a_defconfig | 6
-rw-r--r--  arch/sh/configs/cayman_defconfig | 4
-rw-r--r--  arch/sh/configs/dreamcast_defconfig | 6
-rw-r--r--  arch/sh/configs/ecovec24-romimage_defconfig | 7
-rw-r--r--  arch/sh/configs/ecovec24_defconfig | 9
-rw-r--r--  arch/sh/configs/edosk7705_defconfig | 2
-rw-r--r--  arch/sh/configs/edosk7760_defconfig | 11
-rw-r--r--  arch/sh/configs/espt_defconfig | 10
-rw-r--r--  arch/sh/configs/hp6xx_defconfig | 4
-rw-r--r--  arch/sh/configs/kfr2r09-romimage_defconfig | 5
-rw-r--r--  arch/sh/configs/kfr2r09_defconfig | 7
-rw-r--r--  arch/sh/configs/landisk_defconfig | 4
-rw-r--r--  arch/sh/configs/lboxre2_defconfig | 3
-rw-r--r--  arch/sh/configs/magicpanelr2_defconfig | 9
-rw-r--r--  arch/sh/configs/microdev_defconfig | 3
-rw-r--r--  arch/sh/configs/migor_defconfig | 8
-rw-r--r--  arch/sh/configs/polaris_defconfig | 9
-rw-r--r--  arch/sh/configs/r7780mp_defconfig | 4
-rw-r--r--  arch/sh/configs/r7785rp_defconfig | 3
-rw-r--r--  arch/sh/configs/rsk7201_defconfig | 8
-rw-r--r--  arch/sh/configs/rsk7203_defconfig | 10
-rw-r--r--  arch/sh/configs/rsk7264_defconfig | 3
-rw-r--r--  arch/sh/configs/rsk7269_defconfig | 4
-rw-r--r--  arch/sh/configs/rts7751r2d1_defconfig | 5
-rw-r--r--  arch/sh/configs/rts7751r2dplus_defconfig | 8
-rw-r--r--  arch/sh/configs/sdk7780_defconfig | 9
-rw-r--r--  arch/sh/configs/sdk7786_defconfig | 9
-rw-r--r--  arch/sh/configs/se7206_defconfig | 8
-rw-r--r--  arch/sh/configs/se7343_defconfig | 9
-rw-r--r--  arch/sh/configs/se7619_defconfig | 5
-rw-r--r--  arch/sh/configs/se7705_defconfig | 5
-rw-r--r--  arch/sh/configs/se7712_defconfig | 7
-rw-r--r--  arch/sh/configs/se7721_defconfig | 6
-rw-r--r--  arch/sh/configs/se7722_defconfig | 3
-rw-r--r--  arch/sh/configs/se7724_defconfig | 9
-rw-r--r--  arch/sh/configs/se7750_defconfig | 5
-rw-r--r--  arch/sh/configs/se7751_defconfig | 5
-rw-r--r--  arch/sh/configs/se7780_defconfig | 7
-rw-r--r--  arch/sh/configs/secureedge5410_defconfig | 9
-rw-r--r--  arch/sh/configs/sh03_defconfig | 4
-rw-r--r--  arch/sh/configs/sh2007_defconfig | 8
-rw-r--r--  arch/sh/configs/sh7710voipgw_defconfig | 5
-rw-r--r--  arch/sh/configs/sh7724_generic_defconfig | 3
-rw-r--r--  arch/sh/configs/sh7757lcr_defconfig | 4
-rw-r--r--  arch/sh/configs/sh7763rdp_defconfig | 9
-rw-r--r--  arch/sh/configs/sh7770_generic_defconfig | 3
-rw-r--r--  arch/sh/configs/sh7785lcr_32bit_defconfig | 10
-rw-r--r--  arch/sh/configs/sh7785lcr_defconfig | 9
-rw-r--r--  arch/sh/configs/shmin_defconfig | 4
-rw-r--r--  arch/sh/configs/shx3_defconfig | 6
-rw-r--r--  arch/sh/configs/titan_defconfig | 8
-rw-r--r--  arch/sh/configs/ul2_defconfig | 9
-rw-r--r--  arch/sh/configs/urquell_defconfig | 9
-rw-r--r--  arch/sparc/Kconfig | 3
-rw-r--r--  arch/sparc/include/asm/vga.h | 25
-rw-r--r--  arch/tile/kernel/setup.c | 2
-rw-r--r--  arch/x86/Kconfig | 8
-rw-r--r--  arch/x86/include/asm/pgtable.h | 17
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 14
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 10
-rw-r--r--  arch/x86/include/asm/string_32.h | 24
-rw-r--r--  arch/x86/include/asm/string_64.h | 36
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 22
-rw-r--r--  block/cfq-iosched.c | 81
-rw-r--r--  drivers/base/cpu.c | 4
-rw-r--r--  drivers/base/node.c | 22
-rw-r--r--  drivers/block/zram/zram_drv.c | 13
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2
-rw-r--r--  drivers/gpu/drm/drm_mm.c | 19
-rw-r--r--  drivers/gpu/drm/drm_vma_manager.c | 2
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_userptr.c | 6
-rw-r--r--  drivers/gpu/drm/radeon/radeon.h | 2
-rw-r--r--  drivers/gpu/drm/radeon/radeon_mn.c | 8
-rw-r--r--  drivers/gpu/drm/radeon/radeon_vm.c | 7
-rw-r--r--  drivers/infiniband/core/umem_rbtree.c | 4
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c | 2
-rw-r--r--  drivers/infiniband/hw/hfi1/mmu_rb.c | 10
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom.c | 6
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom.h | 2
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c | 15
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h | 12
-rw-r--r--  drivers/mtd/nand/denali.c | 3
-rw-r--r--  drivers/pps/Kconfig | 7
-rw-r--r--  drivers/pps/clients/Kconfig | 7
-rw-r--r--  drivers/pps/generators/Kconfig | 3
-rw-r--r--  drivers/scsi/scsi_debug.c | 2
-rw-r--r--  drivers/scsi/sym53c8xx_2/sym_hipd.c | 11
-rw-r--r--  drivers/vhost/vhost.c | 2
-rw-r--r--  drivers/vhost/vhost.h | 2
-rw-r--r--  fs/aio.c | 8
-rw-r--r--  fs/autofs4/autofs_i.h | 22
-rw-r--r--  fs/autofs4/dev-ioctl.c | 44
-rw-r--r--  fs/binfmt_flat.c | 8
-rw-r--r--  fs/eventpoll.c | 30
-rw-r--r--  fs/f2fs/data.c | 5
-rw-r--r--  fs/fat/namei_vfat.c | 35
-rw-r--r--  fs/hugetlbfs/inode.c | 11
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/namei.c | 15
-rw-r--r--  fs/proc/generic.c | 34
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/proc_net.c | 2
-rw-r--r--  fs/proc/root.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 81
-rw-r--r--  fs/proc/task_nommu.c | 5
-rw-r--r--  fs/ubifs/file.c | 5
-rw-r--r--  fs/userfaultfd.c | 20
-rw-r--r--  include/asm-generic/pgtable.h | 52
-rw-r--r--  include/drm/drm_mm.h | 2
-rw-r--r--  include/linux/bitmap.h | 32
-rw-r--r--  include/linux/bitops.h | 5
-rw-r--r--  include/linux/byteorder/big_endian.h | 4
-rw-r--r--  include/linux/byteorder/little_endian.h | 4
-rw-r--r--  include/linux/cpumask.h | 21
-rw-r--r--  include/linux/fs.h | 7
-rw-r--r--  include/linux/hmm.h | 520
-rw-r--r--  include/linux/huge_mm.h | 24
-rw-r--r--  include/linux/init_task.h | 5
-rw-r--r--  include/linux/interval_tree.h | 8
-rw-r--r--  include/linux/interval_tree_generic.h | 48
-rw-r--r--  include/linux/ioport.h | 2
-rw-r--r--  include/linux/ipc.h | 6
-rw-r--r--  include/linux/ipc_namespace.h | 8
-rw-r--r--  include/linux/kernel.h | 7
-rw-r--r--  include/linux/kmod.h | 60
-rw-r--r--  include/linux/memory_hotplug.h | 11
-rw-r--r--  include/linux/memremap.h | 99
-rw-r--r--  include/linux/migrate.h | 152
-rw-r--r--  include/linux/migrate_mode.h | 5
-rw-r--r--  include/linux/mm.h | 58
-rw-r--r--  include/linux/mm_types.h | 6
-rw-r--r--  include/linux/mmzone.h | 24
-rw-r--r--  include/linux/pps-gpio.h | 2
-rw-r--r--  include/linux/pps_kernel.h | 16
-rw-r--r--  include/linux/proc_fs.h | 8
-rw-r--r--  include/linux/rbtree.h | 21
-rw-r--r--  include/linux/rbtree_augmented.h | 33
-rw-r--r--  include/linux/rhashtable.h | 2
-rw-r--r--  include/linux/rmap.h | 7
-rw-r--r--  include/linux/rtmutex.h | 11
-rw-r--r--  include/linux/sched.h | 3
-rw-r--r--  include/linux/string.h | 30
-rw-r--r--  include/linux/swap.h | 24
-rw-r--r--  include/linux/swapops.h | 143
-rw-r--r--  include/linux/umh.h | 69
-rw-r--r--  include/linux/vmstat.h | 33
-rw-r--r--  include/linux/vt_buffer.h | 13
-rw-r--r--  include/rdma/ib_umem_odp.h | 11
-rw-r--r--  include/rdma/ib_verbs.h | 2
-rw-r--r--  include/uapi/linux/auto_dev-ioctl.h | 2
-rw-r--r--  include/uapi/linux/auto_fs4.h | 2
-rw-r--r--  include/uapi/linux/pps.h | 4
-rw-r--r--  init/main.c | 13
-rw-r--r--  ipc/msg.c | 10
-rw-r--r--  ipc/msgutil.c | 2
-rw-r--r--  ipc/namespace.c | 24
-rw-r--r--  ipc/sem.c | 25
-rw-r--r--  ipc/shm.c | 12
-rw-r--r--  ipc/util.c | 106
-rw-r--r--  ipc/util.h | 21
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/kcov.c | 1
-rw-r--r--  kernel/kmod.c | 563
-rw-r--r--  kernel/locking/rtmutex-debug.c | 2
-rw-r--r--  kernel/locking/rtmutex.c | 35
-rw-r--r--  kernel/locking/rtmutex_common.h | 12
-rw-r--r--  kernel/memremap.c | 60
-rw-r--r--  kernel/rcu/tree.c | 2
-rw-r--r--  kernel/rcu/tree_plugin.h | 2
-rw-r--r--  kernel/sched/deadline.c | 50
-rw-r--r--  kernel/sched/debug.c | 2
-rw-r--r--  kernel/sched/fair.c | 39
-rw-r--r--  kernel/sched/sched.h | 9
-rw-r--r--  kernel/sched/topology.c | 2
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/umh.c | 568
-rw-r--r--  lib/Kconfig | 3
-rw-r--r--  lib/Kconfig.debug | 11
-rw-r--r--  lib/Makefile | 1
-rw-r--r--  lib/bitmap.c | 18
-rw-r--r--  lib/cmdline.c | 1
-rw-r--r--  lib/cpumask.c | 16
-rw-r--r--  lib/hexdump.c | 5
-rw-r--r--  lib/interval_tree_test.c | 4
-rw-r--r--  lib/oid_registry.c | 4
-rw-r--r--  lib/radix-tree.c | 9
-rw-r--r--  lib/rbtree.c | 65
-rw-r--r--  lib/rbtree_test.c | 230
-rw-r--r--  lib/string.c | 207
-rw-r--r--  lib/test_bitmap.c | 91
-rw-r--r--  lib/test_debug_virtual.c | 49
-rw-r--r--  lib/test_kmod.c | 4
-rw-r--r--  mm/Kconfig | 54
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/balloon_compaction.c | 8
-rw-r--r--  mm/fadvise.c | 6
-rw-r--r--  mm/gup.c | 29
-rw-r--r--  mm/hmm.c | 1257
-rw-r--r--  mm/huge_memory.c | 174
-rw-r--r--  mm/interval_tree.c | 10
-rw-r--r--  mm/madvise.c | 2
-rw-r--r--  mm/memcontrol.c | 256
-rw-r--r--  mm/memory.c | 138
-rw-r--r--  mm/memory_hotplug.c | 14
-rw-r--r--  mm/mempolicy.c | 142
-rw-r--r--  mm/migrate.c | 1007
-rw-r--r--  mm/mlock.c | 10
-rw-r--r--  mm/mmap.c | 10
-rw-r--r--  mm/mprotect.c | 18
-rw-r--r--  mm/mremap.c | 2
-rw-r--r--  mm/page_alloc.c | 13
-rw-r--r--  mm/page_vma_mapped.c | 28
-rw-r--r--  mm/pgtable-generic.c | 3
-rw-r--r--  mm/rmap.c | 50
-rw-r--r--  mm/slub.c | 2
-rw-r--r--  mm/sparse.c | 2
-rw-r--r--  mm/swap.c | 11
-rw-r--r--  mm/swapfile.c | 3
-rw-r--r--  mm/vmstat.c | 154
-rw-r--r--  mm/zsmalloc.c | 17
-rwxr-xr-x  scripts/checkpatch.pl | 61
-rw-r--r--  sound/soc/codecs/pcm512x.c | 3
-rw-r--r--  tools/testing/selftests/kcmp/kcmp_test.c | 60
271 files changed, 7222 insertions, 2011 deletions
diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt
index 40bf9c3564a5..0de23b793657 100644
--- a/Documentation/devicetree/bindings/pps/pps-gpio.txt
+++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt
@@ -13,8 +13,12 @@ Optional properties:
Example:
pps {
- compatible = "pps-gpio";
- gpios = <&gpio2 6 0>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_pps>;
+ gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>;
assert-falling-edge;
+
+ compatible = "pps-gpio";
+ status = "okay";
};
diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt
index 1fdbd5447216..99f5d8c4c652 100644
--- a/Documentation/pps/pps.txt
+++ b/Documentation/pps/pps.txt
@@ -48,12 +48,12 @@ problem:
time_pps_create().
This implies that the source has a /dev/... entry. This assumption is
-ok for the serial and parallel port, where you can do something
+OK for the serial and parallel port, where you can do something
useful besides(!) the gathering of timestamps as it is the central
-task for a PPS-API. But this assumption does not work for a single
+task for a PPS API. But this assumption does not work for a single
purpose GPIO line. In this case even basic file-related functionality
(like read() and write()) makes no sense at all and should not be a
-precondition for the use of a PPS-API.
+precondition for the use of a PPS API.
The problem can be simply solved if you consider that a PPS source is
not always connected with a GPS data source.
@@ -88,13 +88,13 @@ Coding example
--------------
To register a PPS source into the kernel you should define a struct
-pps_source_info_s as follows:
+pps_source_info as follows:
static struct pps_source_info pps_ktimer_info = {
.name = "ktimer",
.path = "",
- .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \
- PPS_ECHOASSERT | \
+ .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT |
+ PPS_ECHOASSERT |
PPS_CANWAIT | PPS_TSFMT_TSPEC,
.echo = pps_ktimer_echo,
.owner = THIS_MODULE,
@@ -108,13 +108,13 @@ initialization routine as follows:
The pps_register_source() prototype is:
- int pps_register_source(struct pps_source_info_s *info, int default_params)
+ int pps_register_source(struct pps_source_info *info, int default_params)
where "info" is a pointer to a structure that describes a particular
PPS source, "default_params" tells the system what the initial default
parameters for the device should be (it is obvious that these parameters
must be a subset of ones defined in the struct
-pps_source_info_s which describe the capabilities of the driver).
+pps_source_info which describe the capabilities of the driver).
Once you have registered a new PPS source into the system you can
signal an assert event (for example in the interrupt handler routine)
@@ -142,8 +142,10 @@ If the SYSFS filesystem is enabled in the kernel it provides a new class:
Every directory is the ID of a PPS sources defined in the system and
inside you find several files:
- $ ls /sys/class/pps/pps0/
- assert clear echo mode name path subsystem@ uevent
+ $ ls -F /sys/class/pps/pps0/
+ assert dev mode path subsystem@
+ clear echo name power/ uevent
+
Inside each "assert" and "clear" file you can find the timestamp and a
sequence number:
@@ -154,32 +156,32 @@ sequence number:
Where before the "#" is the timestamp in seconds; after it is the
sequence number. Other files are:
-* echo: reports if the PPS source has an echo function or not;
+ * echo: reports if the PPS source has an echo function or not;
-* mode: reports available PPS functioning modes;
+ * mode: reports available PPS functioning modes;
-* name: reports the PPS source's name;
+ * name: reports the PPS source's name;
-* path: reports the PPS source's device path, that is the device the
- PPS source is connected to (if it exists).
+ * path: reports the PPS source's device path, that is the device the
+ PPS source is connected to (if it exists).
Testing the PPS support
-----------------------
In order to test the PPS support even without specific hardware you can use
-the ktimer driver (see the client subsection in the PPS configuration menu)
+the pps-ktimer driver (see the client subsection in the PPS configuration menu)
and the userland tools available in your distribution's pps-tools package,
-http://linuxpps.org , or https://github.com/ago/pps-tools .
+http://linuxpps.org , or https://github.com/redlab-i/pps-tools.
-Once you have enabled the compilation of ktimer just modprobe it (if
+Once you have enabled the compilation of pps-ktimer just modprobe it (if
not statically compiled):
- # modprobe ktimer
+ # modprobe pps-ktimer
and the run ppstest as follow:
- $ ./ppstest /dev/pps0
+ $ ./ppstest /dev/pps1
trying PPS source "/dev/pps1"
found PPS source "/dev/pps1"
ok, found 1 source(s), now start fetching data...
@@ -187,7 +189,7 @@ and the run ppstest as follow:
source 0 - assert 1186592700.388931295, sequence: 365 - clear 0.000000000, sequence: 0
source 0 - assert 1186592701.389032765, sequence: 366 - clear 0.000000000, sequence: 0
-Please, note that to compile userland programs you need the file timepps.h .
+Please note that to compile userland programs, you need the file timepps.h.
This is available in the pps-tools repository mentioned above.
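
Tying the earlier coding example together, the registration call in a driver's
init path might look roughly like the sketch below. This follows the
pps_register_source() prototype as quoted in this document (returning an
integer source id); the source variable and the error handling are purely
illustrative:

	static int source;

	static int __init pps_ktimer_init(void)
	{
		/* Initial defaults must be a subset of pps_ktimer_info.mode. */
		source = pps_register_source(&pps_ktimer_info,
					     PPS_CAPTUREASSERT | PPS_OFFSETASSERT);
		if (source < 0)
			pr_err("cannot register the PPS source\n");
		return source < 0 ? source : 0;
	}
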
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index b8a8c70b0188..c42a21b99046 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -193,6 +193,39 @@ Example::
for (node = rb_first(&mytree); node; node = rb_next(node))
printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
+Cached rbtrees
+--------------
+
+Computing the leftmost (smallest) node is quite a common task for binary
+search trees, such as for traversals or users relying on the particular
+order for their own logic. To this end, users can use 'struct rb_root_cached'
+to optimize O(logN) rb_first() calls to a simple pointer fetch, avoiding
+potentially expensive tree iterations. This is done at negligible runtime
+overhead for maintenance, albeit with a larger memory footprint.
+
+Similar to the rb_root structure, cached rbtrees are initialized to be
+empty via:
+
+ struct rb_root_cached mytree = RB_ROOT_CACHED;
+
+A cached rbtree is simply a regular rb_root with an extra pointer to cache the
+leftmost node. This allows rb_root_cached to exist wherever rb_root does,
+which permits augmented trees to be supported as well, with only a few extra
+interfaces:
+
+ struct rb_node *rb_first_cached(struct rb_root_cached *tree);
+ void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
+ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
+
+Both the insert and erase calls have their respective counterparts for
+augmented trees:
+
+ void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
+ bool, struct rb_augment_callbacks *);
+ void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *,
+ struct rb_augment_callbacks *);
+
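+An illustrative sketch of insertion into a cached rbtree follows (the mytype
+struct and my_insert() function are hypothetical names, mirroring the rb_root
+example earlier in this document); the leftmost flag passed to
+rb_insert_color_cached() is true only when the new node becomes the new
+smallest element:
+
+  struct mytype {
+  	struct rb_node node;
+  	int key;
+  };
+
+  void my_insert(struct rb_root_cached *root, struct mytype *new)
+  {
+  	struct rb_node **link = &root->rb_root.rb_node, *parent = NULL;
+  	bool leftmost = true;
+
+  	while (*link) {
+  		struct mytype *this = rb_entry(*link, struct mytype, node);
+
+  		parent = *link;
+  		if (new->key < this->key) {
+  			link = &(*link)->rb_left;
+  		} else {
+  			link = &(*link)->rb_right;
+  			leftmost = false;
+  		}
+  	}
+
+  	rb_link_node(&new->node, parent, link);
+  	rb_insert_color_cached(&new->node, root, leftmost);
+  }
+
+rb_first_cached() on the same tree then returns the cached leftmost node in
+O(1) instead of walking down from the root.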
+
Support for Augmented rbtrees
-----------------------------
diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
new file mode 100644
index 000000000000..4d3aac9f4a5d
--- /dev/null
+++ b/Documentation/vm/hmm.txt
@@ -0,0 +1,384 @@
+Heterogeneous Memory Management (HMM)
+
+HMM transparently allows any component of a program to use any memory region
+of that program with a device, without going through a device-specific memory
+allocator. This is becoming a requirement to simplify the use of advanced
+heterogeneous computing, where GPUs, DSPs or FPGAs are used to perform various
+computations.
+
+This document is divided as follows: the first section exposes the problems
+related to using a device-specific allocator. The second section covers the
+hardware limitations that are inherent to many platforms. The third section
+gives an overview of the HMM design. The fourth section explains how CPU page
+table mirroring works and what HMM's purpose is in this context. The fifth
+section deals with how device memory is represented inside the kernel.
+Finally, the last section presents the new migration helpers that allow
+leveraging the device DMA engine.
+
+
+1) Problems of using device specific memory allocator:
+2) System bus, device memory characteristics
+3) Shared address space and migration
+4) Address space mirroring implementation and API
+5) Represent and manage device memory from core kernel point of view
+6) Migrate to and from device memory
+7) Memory cgroup (memcg) and rss accounting
+
+
+-------------------------------------------------------------------------------
+
+1) Problems of using device specific memory allocator:
+
+Devices with a large amount of on-board memory (several gigabytes), like GPUs,
+have historically managed their memory through a dedicated, driver-specific
+API. This creates a disconnect between memory allocated and managed by the
+device driver and regular application memory (private anonymous, shared memory
+or regular file-backed memory). From here on I will refer to this aspect as
+split address space, and I use shared address space to refer to the opposite
+situation, i.e. one in which any memory region can be used by the device
+transparently.
+
+The address space is split because the device can only access memory allocated
+through the device-specific API. This implies that not all memory objects in a
+program are equal from the device's point of view, which complicates large
+programs that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage a device like a GPU
+needs to copy objects between generically allocated memory (malloc, private or
+shared mmap) and memory allocated through the device driver API (which still
+ends up as an mmap, but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve,
+but complex data sets (list, tree, ...) are hard to get right. Duplicating a
+complex data set requires re-mapping all the pointer relations between each of
+its elements. This is error prone, and programs get harder to debug because of
+the duplicated data set.
+
+A split address space also means that libraries cannot transparently use data
+they get from the core program or from another library, and thus each library
+might have to duplicate its input data set using the device-specific memory
+allocator. Large projects suffer from this and waste resources because of the
+various memory copies.
+
+Duplicating each library API to accept as input or output memory allocated by
+each device-specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high-level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage a GPU or
+other devices without the programmer even knowing. Some compiler-identified
+patterns are only doable with a shared address space. It is also more
+reasonable to use a shared address space for all the other patterns.
+
+
+-------------------------------------------------------------------------------
+
+2) System bus, device memory characteristics
+
+System buses cripple shared address spaces due to a few limitations. Most
+system buses only allow basic memory access from the device to main memory;
+even cache coherency is often optional. Access to device memory from the CPU
+is even more limited: more often than not it is not cache coherent.
+
+If we only consider the PCIe bus, then a device can access main memory (often
+through an IOMMU) and be cache coherent with the CPUs. However, it only allows
+a limited set of atomic operations from the device on main memory. This is
+worse in the other direction: the CPUs can only access a limited range of the
+device memory and cannot perform atomic operations on it. Thus device memory
+cannot be considered the same as regular memory from the kernel's point of
+view.
+
+Another crippling factor is the limited bandwidth (~32 GB/s with PCIe 4.0 and
+16 lanes). This is 33 times less than the fastest GPU memory (1 TB/s).
+The final limitation is latency: access to main memory from the device has an
+order of magnitude higher latency than when the device accesses its own
+memory.
+
+Some platforms are developing new system buses or additions/modifications to
+PCIe to address some of these limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between CPU and device and allow all the atomic
+operations the architecture supports. Sadly, not all platforms are following
+this trend, and some major architectures are left without hardware solutions
+to these problems.
+
+So for a shared address space to make sense, not only must we allow the device
+to access any memory, but we must also permit any memory to be migrated to
+device memory while the device is using it (blocking CPU access while it
+happens).
+
+
+-------------------------------------------------------------------------------
+
+3) Shared address space and migration
+
+HMM intends to provide two main features. The first one is to share the
+address space by duplicating the CPU page table into the device page table, so
+that the same address points to the same memory, for any valid main memory
+address in the process address space.
+
+To achieve this, HMM offers a set of helpers to populate the device page table
+while keeping track of CPU page table updates. Device page table updates are
+not as easy as CPU page table updates. To update the device page table you
+must allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
+specific commands into it to perform the update (unmap, cache invalidation and
+flush, ...). This cannot be done through common code for all devices. Hence
+HMM provides helpers to factor out everything that can be, while leaving the
+gory details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of the device memory. Those
+pages are special because the CPU cannot map them. However, they allow
+migrating main memory to device memory using the existing migration mechanism,
+and from the CPU's point of view everything looks as if the page was swapped
+out to disk. Using a struct page gives the easiest and cleanest integration
+with existing mm mechanisms. Here again HMM only provides helpers, first to
+hotplug new ZONE_DEVICE memory for the device memory and second to perform
+migration. Policy decisions on what and when to migrate are left to the device
+driver.
+
+Note that any CPU access to a device page triggers a page fault and a
+migration back to main memory, i.e. when a page backing a given address A is
+migrated from a main memory page to a device page, then any CPU access to
+address A triggers a page fault and initiates a migration back to main memory.
+
+
+With these two features, HMM not only allows a device to mirror a process
+address space, keeping the CPU and device page tables synchronized, but also
+allows leveraging device memory by migrating the part of a data set that is
+actively used by a device.
+
+
+-------------------------------------------------------------------------------
+
+4) Address space mirroring implementation and API
+
+Address space mirroring's main objective is to allow duplicating a range of
+CPU page table entries into a device page table; HMM helps keep both
+synchronized. A device driver that wants to mirror a process address space
+must start by registering an hmm_mirror struct:
+
+ int hmm_mirror_register(struct hmm_mirror *mirror,
+ struct mm_struct *mm);
+ int hmm_mirror_register_locked(struct hmm_mirror *mirror,
+ struct mm_struct *mm);
+
+The locked variant is to be used when the driver is already holding mmap_sem
+of the mm in write mode. The mirror struct has a set of callbacks that are
+used to propagate CPU page table updates:
+
+ struct hmm_mirror_ops {
+ /* sync_cpu_device_pagetables() - synchronize page tables
+ *
+ * @mirror: pointer to struct hmm_mirror
+ * @update_type: type of update that occurred to the CPU page table
+ * @start: virtual start address of the range to update
+ * @end: virtual end address of the range to update
+ *
+ * This callback ultimately originates from mmu_notifiers when the CPU
+ * page table is updated. The device driver must update its page table
+ * in response to this callback. The update argument tells what action
+ * to perform.
+ *
+ * The device driver must not return from this callback until the device
+ * page tables are completely updated (TLBs flushed, etc); this is a
+ * synchronous call.
+ */
+ void (*update)(struct hmm_mirror *mirror,
+ enum hmm_update action,
+ unsigned long start,
+ unsigned long end);
+ };
+
+The device driver must perform the update to the range following the action
+(turn the range read only, fully unmap it, ...). Once the driver callback
+returns, the device must be done with the update.
+
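+For illustration only, a driver might wire this together roughly as follows
+(driver_mirror_ops, driver_update and my_driver are hypothetical names, and it
+is assumed here that the hmm_mirror struct carries a pointer to its ops; error
+handling is omitted):
+
+ static const struct hmm_mirror_ops driver_mirror_ops = {
+     /* Member name as listed in the struct above. */
+     .update = driver_update,
+ };
+
+ int driver_mirror_process(struct my_driver *drv, struct mm_struct *mm)
+ {
+     drv->mirror.ops = &driver_mirror_ops;
+     /* Not holding mmap_sem here, so use the unlocked variant. */
+     return hmm_mirror_register(&drv->mirror, mm);
+ }
+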
+
+When the device driver wants to populate a range of virtual addresses it can
+use either:
+ int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns);
+ int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block);
+
+The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+entries and will not trigger a page fault on missing or non-present entries.
+The second one does trigger a page fault on missing or read-only entries if
+the write parameter is true. Page faults use the generic mm page fault code
+path, just like a CPU page fault.
+
+Both functions copy the CPU page table into their pfns array argument. Each
+entry in that array corresponds to an address in the virtual range. HMM
+provides a set of flags to help the driver identify special CPU page table
+entries.
+
+Locking around the update() callback is the most important aspect the driver
+must respect in order to keep things properly synchronized. The usage pattern
+is:
+
+ int driver_populate_range(...)
+ {
+      struct hmm_range range;
+      ...
+ again:
+      ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
+      if (ret)
+          return ret;
+      take_lock(driver->update);
+      if (!hmm_vma_range_done(vma, &range)) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      // Use pfns array content to update device page table
+
+      release_lock(driver->update);
+      return 0;
+ }
+
+The driver->update lock is the same lock that the driver takes inside its
+update() callback. That lock must be held before calling hmm_vma_range_done()
+to avoid any race with a concurrent CPU page table update.
+
+HMM implements all of this on top of the mmu_notifier API because we wanted a
+simpler API and also to be able to perform optimizations later on, like doing
+concurrent device updates in a multi-device scenario.
+
+HMM also serves as an impedance-mismatch layer between how CPU page table
+updates are done (by the CPU writing to the page table and flushing TLBs) and
+how devices update their own page tables. A device update is a multi-step
+process: first the appropriate commands are written to a buffer, then this
+buffer is scheduled for execution on the device. It is only once the device
+has executed the commands in the buffer that the update is done. Creating and
+scheduling the update command buffers can happen concurrently for multiple
+devices. Waiting for each device to report commands as executed is serialized
+(there is no point in doing this concurrently).
+
+
+-------------------------------------------------------------------------------
+
+5) Represent and manage device memory from core kernel point of view
+
+Several different designs were tried to support device memory. The first one
+used a device-specific data structure to keep information about migrated
+memory, and HMM hooked itself into various places of the mm code to handle any
+access to addresses that were backed by device memory. It turned out that this
+ended up replicating most of the fields of struct page and also required many
+kernel code paths to be updated to understand this new kind of memory.
+
+The thing is, most kernel code paths never try to access the memory behind a
+page but only care about the struct page contents. Because of this, HMM
+switched to directly using struct page for device memory, which leaves most
+kernel code paths unaware of the difference. We only need to make sure that no
+one ever tries to map those pages from the CPU side.
+
+HMM provides a set of helpers to register and hotplug device memory as a new
+region needing struct page. This is offered through a very simple API:
+
+ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size);
+ void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+The hmm_devmem_ops is where most of the important things are:
+
+ struct hmm_devmem_ops {
+ void (*free)(struct hmm_devmem *devmem, struct page *page);
+ int (*fault)(struct hmm_devmem *devmem,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ struct page *page,
+ unsigned flags,
+ pmd_t *pmdp);
+ };
+
+The first callback (free()) happens when the last reference on a device page
+is dropped. This means the device page is now free and no longer used by
+anyone. The second callback happens whenever the CPU tries to access a device
+page, which it cannot do. This second callback must trigger a migration back
+to system memory.
+
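+For illustration only, hotplugging device memory at probe time might look
+roughly like the sketch below (my_devmem_ops, my_probe and MY_DEVMEM_SIZE are
+hypothetical driver names, and hmm_devmem_add() is assumed to return an
+ERR_PTR() value on failure):
+
+ static const struct hmm_devmem_ops my_devmem_ops = {
+     .free = my_devmem_free,
+     .fault = my_devmem_fault,
+ };
+
+ static int my_probe(struct device *device)
+ {
+     struct hmm_devmem *devmem;
+
+     devmem = hmm_devmem_add(&my_devmem_ops, device, MY_DEVMEM_SIZE);
+     if (IS_ERR(devmem))
+         return PTR_ERR(devmem);
+
+     /* Remember devmem so pages can later be migrated into it. */
+     return 0;
+ }
+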
+
+-------------------------------------------------------------------------------
+
+6) Migrate to and from device memory
+
+Because the CPU cannot access device memory, migration must use the device DMA
+engine to perform copies from and to device memory. For this we need a new
+migration helper:
+
+ int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long mentries,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private);
+
+Unlike other migration functions it works on a range of virtual addresses;
+there are two reasons for that. First, a device DMA copy has a high setup
+overhead cost, so batching multiple pages is needed, as otherwise the
+migration overhead makes the whole exercise pointless. The second reason is
+that the driver triggers such migrations based on the range of addresses the
+device is actively accessing.
+
+The migrate_vma_ops struct defines two callbacks. The first one
+(alloc_and_copy()) controls destination memory allocation and the copy
+operation. The second one is there to allow the device driver to perform
+cleanup operations after migration.
+
+ struct migrate_vma_ops {
+ void (*alloc_and_copy)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+ void (*finalize_and_map)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ const unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+ };
+
+It is important to stress that these migration helpers allow for holes in the
+virtual address range. Some pages in the range might not be migrated for all
+the usual reasons (page is pinned, page is locked, ...). The helper does not
+fail, but just skips over those pages.
+
+The alloc_and_copy() callback may also decide not to migrate all pages in the
+range (for reasons under the callback's control). For those, the callback just
+has to leave the corresponding dst entries empty.
+
+Finally, the migration of the struct page might fail (for file-backed pages)
+for various reasons (failure to freeze the reference, or to update the page
+cache, ...). If that happens, then finalize_and_map() can catch any page that
+was not migrated. Note that those pages were still copied to a new page, and
+thus we wasted bandwidth, but this is considered a rare event and a price we
+are willing to pay to keep all the code simpler.
+
+
+-------------------------------------------------------------------------------
+
+7) Memory cgroup (memcg) and rss accounting
+
+For now device memory is accounted like any regular page in the rss counters
+(either anonymous if the device page is used for anonymous memory, file if the
+device page is used for a file-backed page, or shmem if the device page is
+used for shared memory). This is a deliberate choice so that existing
+applications, which might start using device memory without knowing about it,
+keep running unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory, and thus not free much
+system memory. We want to gather more real-world experience of how
+applications and the system react under memory pressure in the presence of
+device memory before deciding to account device memory differently.
+
+
+The same decision was made for memory cgroups. Device memory pages are
+accounted against the same memory cgroup a regular page would be accounted to.
+This does simplify migration to and from device memory. This also means that
+migration back from device memory to regular memory cannot fail because it
+would go above the memory cgroup limit. We might revisit this choice later on
+once we get more experience in how device memory is used and its impact on
+memory resource control.
+
+
+Note that device memory can never be pinned, neither by a device driver nor
+through GUP, and thus such memory is always freed upon process exit, or, in
+the case of shared memory or file-backed memory, when the last reference is
+dropped.
diff --git a/MAINTAINERS b/MAINTAINERS
index fe7a27ed0bdb..7f32b510fdea 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7457,6 +7457,13 @@ S: Maintained
F: tools/testing/selftests/
F: Documentation/dev-tools/kselftest*
+KERNEL USERMODE HELPER
+M: "Luis R. Rodriguez" <mcgrof@kernel.org>
+L: linux-kernel@vger.kernel.org
+S: Maintained
+F: kernel/umh.c
+F: include/linux/umh.h
+
KERNEL VIRTUAL MACHINE (KVM)
M: Paolo Bonzini <pbonzini@redhat.com>
M: Radim Krčmář <rkrcmar@redhat.com>
@@ -7632,7 +7639,7 @@ F: include/linux/kmemleak.h
F: mm/kmemleak.c
F: mm/kmemleak-test.c
-KMOD MODULE USERMODE HELPER
+KMOD KERNEL MODULE LOADER - USERMODE HELPER
M: "Luis R. Rodriguez" <mcgrof@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
@@ -7790,6 +7797,13 @@ M: Sasha Levin <alexander.levin@verizon.com>
S: Maintained
F: tools/lib/lockdep/
+HMM - Heterogeneous Memory Management
+M: Jérôme Glisse <jglisse@redhat.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: mm/hmm*
+F: include/linux/hmm*
+
LIBNVDIMM BLK: MMIO-APERTURE DRIVER
M: Ross Zwisler <ross.zwisler@linux.intel.com>
L: linux-nvdimm@lists.01.org
@@ -10727,8 +10741,11 @@ W: http://wiki.enneenne.com/index.php/LinuxPPS_support
L: linuxpps@ml.enneenne.com (subscribers-only)
S: Maintained
F: Documentation/pps/
+F: Documentation/devicetree/bindings/pps/pps-gpio.txt
+F: Documentation/ABI/testing/sysfs-pps
F: drivers/pps/
F: include/linux/pps*.h
+F: include/uapi/linux/pps.h
PPTP DRIVER
M: Dmitry Kozlov <xeb@mail.ru>
diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h
index c2911f591704..9eb9933d845f 100644
--- a/arch/alpha/include/asm/string.h
+++ b/arch/alpha/include/asm/string.h
@@ -65,13 +65,14 @@ extern void * memchr(const void *, int, size_t);
aligned values. The DEST and COUNT parameters must be even for
correct operation. */
-#define __HAVE_ARCH_MEMSETW
-extern void * __memsetw(void *dest, unsigned short, size_t count);
-
-#define memsetw(s, c, n) \
-(__builtin_constant_p(c) \
- ? __constant_c_memset((s),0x0001000100010001UL*(unsigned short)(c),(n)) \
- : __memsetw((s),(c),(n)))
+#define __HAVE_ARCH_MEMSET16
+extern void * __memset16(void *dest, unsigned short, size_t count);
+static inline void *memset16(uint16_t *p, uint16_t v, size_t n)
+{
+ if (__builtin_constant_p(v))
+ return __constant_c_memset(p, 0x0001000100010001UL * v, n * 2);
+ return __memset16(p, v, n * 2);
+}
#endif /* __KERNEL__ */
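
For reference, the new helper takes its count in 16-bit elements rather than
bytes (the inline wrapper above multiplies by 2 before calling the assembly
routine); a hypothetical caller:

	u16 buf[80];

	/* Fill all 80 elements with the 16-bit pattern 0x0720. */
	memset16(buf, 0x0720, ARRAY_SIZE(buf));
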
diff --git a/arch/alpha/include/asm/vga.h b/arch/alpha/include/asm/vga.h
index c00106bac521..3c1c2b6128e7 100644
--- a/arch/alpha/include/asm/vga.h
+++ b/arch/alpha/include/asm/vga.h
@@ -34,7 +34,7 @@ static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
if (__is_ioaddr(s))
memsetw_io((u16 __iomem *) s, c, count);
else
- memsetw(s, c, count);
+ memset16(s, c, count / 2);
}
/* Do not trust that the usage will be correct; analyze the arguments. */
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S
index 89a26f5e89de..f824969e9e77 100644
--- a/arch/alpha/lib/memset.S
+++ b/arch/alpha/lib/memset.S
@@ -20,7 +20,7 @@
.globl memset
.globl __memset
.globl ___memset
- .globl __memsetw
+ .globl __memset16
.globl __constant_c_memset
.ent ___memset
@@ -110,8 +110,8 @@ EXPORT_SYMBOL(___memset)
EXPORT_SYMBOL(__constant_c_memset)
.align 5
- .ent __memsetw
-__memsetw:
+ .ent __memset16
+__memset16:
.prologue 0
inswl $17,0,$1 /* E0 */
@@ -123,8 +123,8 @@ __memsetw:
or $1,$4,$17 /* E0 */
br __constant_c_memset /* .. E1 */
- .end __memsetw
-EXPORT_SYMBOL(__memsetw)
+ .end __memset16
+EXPORT_SYMBOL(__memset16)
memset = ___memset
__memset = ___memset
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index cf4f3aad0fc1..fe1c6af3a1b1 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -24,6 +24,20 @@ extern void * memchr(const void *, int, __kernel_size_t);
#define __HAVE_ARCH_MEMSET
extern void * memset(void *, int, __kernel_size_t);
+#define __HAVE_ARCH_MEMSET32
+extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
+static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
+{
+ return __memset32(p, v, n * 4);
+}
+
+#define __HAVE_ARCH_MEMSET64
+extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi);
+static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
+{
+ return __memset64(p, v, n * 8, v >> 32);
+}
+
extern void __memzero(void *ptr, __kernel_size_t n);
#define memset(p,v,n) \
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 8e8d20cdbce7..5266fd9ad6b4 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -87,6 +87,8 @@ EXPORT_SYMBOL(__raw_writesl);
EXPORT_SYMBOL(strchr);
EXPORT_SYMBOL(strrchr);
EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(__memset32);
+EXPORT_SYMBOL(__memset64);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(memchr);
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 3c65e3bd790f..ed6d35d9cdb5 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -28,7 +28,7 @@ UNWIND( .fnstart )
1: orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
- cmp r2, #16
+7: cmp r2, #16
blt 4f
#if ! CALGN(1)+0
@@ -41,7 +41,7 @@ UNWIND( .fnend )
UNWIND( .fnstart )
UNWIND( .save {r8, lr} )
mov r8, r1
- mov lr, r1
+ mov lr, r3
2: subs r2, r2, #64
stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
@@ -73,11 +73,11 @@ UNWIND( .fnend )
UNWIND( .fnstart )
UNWIND( .save {r4-r8, lr} )
mov r4, r1
- mov r5, r1
+ mov r5, r3
mov r6, r1
- mov r7, r1
+ mov r7, r3
mov r8, r1
- mov lr, r1
+ mov lr, r3
cmp r2, #96
tstgt ip, #31
@@ -114,7 +114,7 @@ UNWIND( .fnstart )
tst r2, #4
strne r1, [ip], #4
/*
- * When we get here, we've got less than 4 bytes to zero. We
+ * When we get here, we've got less than 4 bytes to set. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
@@ -135,3 +135,15 @@ UNWIND( .fnstart )
UNWIND( .fnend )
ENDPROC(memset)
ENDPROC(mmioset)
+
+ENTRY(__memset32)
+UNWIND( .fnstart )
+ mov r3, r1 @ copy r1 to r3 and fall into memset64
+UNWIND( .fnend )
+ENDPROC(__memset32)
+ENTRY(__memset64)
+UNWIND( .fnstart )
+ mov ip, r0 @ preserve r0 as return value
+ b 7b @ jump into the middle of memset
+UNWIND( .fnend )
+ENDPROC(__memset64)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index ffe089942ac4..9f7195a5773e 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -690,7 +690,7 @@ void __init smp_init_cpus(void)
acpi_parse_gic_cpu_interface, 0);
if (cpu_count > nr_cpu_ids)
- pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n",
+ pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n",
cpu_count, nr_cpu_ids);
if (!bootcpu_valid) {
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index eefd9a4ed156..1cce8243449e 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -17,6 +17,9 @@ config FRV
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_NO_COHERENT_DMA_MMAP
+config CPU_BIG_ENDIAN
+ def_bool y
+
config ZONE_DMA
bool
default y
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 6e3d36f37a02..3089f7fe2abd 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -23,6 +23,9 @@ config H8300
select HAVE_ARCH_HASH
select CPU_NO_EFFICIENT_FFS
+config CPU_BIG_ENDIAN
+ def_bool y
+
config RWSEM_GENERIC_SPINLOCK
def_bool y
diff --git a/arch/m32r/configs/m32104ut_defconfig b/arch/m32r/configs/m32104ut_defconfig
index be30e094db71..4aa42acbd512 100644
--- a/arch/m32r/configs/m32104ut_defconfig
+++ b/arch/m32r/configs/m32104ut_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -40,7 +39,6 @@ CONFIG_NETFILTER_XT_MATCH_REALM=m
CONFIG_NETFILTER_XT_MATCH_SCTP=m
CONFIG_NETFILTER_XT_MATCH_STRING=m
CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
-CONFIG_IP_NF_QUEUE=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_ADDRTYPE=m
CONFIG_IP_NF_MATCH_ECN=m
@@ -48,7 +46,6 @@ CONFIG_IP_NF_MATCH_TTL=m
CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_LOG=m
-CONFIG_IP_NF_TARGET_ULOG=m
CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
@@ -106,7 +103,6 @@ CONFIG_SENSORS_SMSC47M1=m
CONFIG_SENSORS_W83781D=m
CONFIG_SENSORS_W83L785TS=m
CONFIG_SENSORS_W83627HF=m
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT2_FS_XATTR=y
CONFIG_EXT2_FS_POSIX_ACL=y
diff --git a/arch/m32r/configs/m32700ut.smp_defconfig b/arch/m32r/configs/m32700ut.smp_defconfig
index a3d727ed6a16..41a0495b65df 100644
--- a/arch/m32r/configs/m32700ut.smp_defconfig
+++ b/arch/m32r/configs/m32700ut.smp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -30,7 +29,6 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=m
@@ -63,7 +61,6 @@ CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_SERIAL_M32R_PLDSIO=y
CONFIG_HW_RANDOM=y
CONFIG_DS1302=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_FB=y
CONFIG_FIRMWARE_EDID=y
CONFIG_FB_S1D13XXX=y
diff --git a/arch/m32r/configs/m32700ut.up_defconfig b/arch/m32r/configs/m32700ut.up_defconfig
index b8334163099d..20078a866f45 100644
--- a/arch/m32r/configs/m32700ut.up_defconfig
+++ b/arch/m32r/configs/m32700ut.up_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -29,7 +28,6 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=m
@@ -62,7 +60,6 @@ CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_SERIAL_M32R_PLDSIO=y
CONFIG_HW_RANDOM=y
CONFIG_DS1302=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_FB=y
CONFIG_FIRMWARE_EDID=y
CONFIG_FB_S1D13XXX=y
diff --git a/arch/m32r/configs/mappi.nommu_defconfig b/arch/m32r/configs/mappi.nommu_defconfig
index 7c90ce2fc42b..4bf3820e054a 100644
--- a/arch/m32r/configs/mappi.nommu_defconfig
+++ b/arch/m32r/configs/mappi.nommu_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
CONFIG_LOG_BUF_SHIFT=14
@@ -39,7 +38,6 @@ CONFIG_NETDEVICES=y
# CONFIG_VT is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
CONFIG_NFS_FS=y
diff --git a/arch/m32r/configs/mappi.smp_defconfig b/arch/m32r/configs/mappi.smp_defconfig
index 367d07cebcd3..f9ed7bdbf4de 100644
--- a/arch/m32r/configs/mappi.smp_defconfig
+++ b/arch/m32r/configs/mappi.smp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -31,9 +30,7 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_IPV6 is not set
# CONFIG_STANDALONE is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_NBD=m
@@ -50,7 +47,6 @@ CONFIG_NETDEVICES=y
# CONFIG_VT is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
CONFIG_ISO9660_FS=y
diff --git a/arch/m32r/configs/mappi.up_defconfig b/arch/m32r/configs/mappi.up_defconfig
index cb11384386ce..289ae7421e12 100644
--- a/arch/m32r/configs/mappi.up_defconfig
+++ b/arch/m32r/configs/mappi.up_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -29,9 +28,7 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_IPV6 is not set
# CONFIG_STANDALONE is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_NBD=m
@@ -48,7 +45,6 @@ CONFIG_NETDEVICES=y
# CONFIG_VT is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
CONFIG_ISO9660_FS=y
diff --git a/arch/m32r/configs/mappi2.opsp_defconfig b/arch/m32r/configs/mappi2.opsp_defconfig
index 3bff779259b4..2852f6e7e246 100644
--- a/arch/m32r/configs/mappi2.opsp_defconfig
+++ b/arch/m32r/configs/mappi2.opsp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -50,7 +49,6 @@ CONFIG_SMC91X=y
# CONFIG_SERIO_I8042 is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
# CONFIG_VGA_CONSOLE is not set
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
diff --git a/arch/m32r/configs/mappi2.vdec2_defconfig b/arch/m32r/configs/mappi2.vdec2_defconfig
index 75246c9c1af8..8da4dbad8510 100644
--- a/arch/m32r/configs/mappi2.vdec2_defconfig
+++ b/arch/m32r/configs/mappi2.vdec2_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -49,7 +48,6 @@ CONFIG_SMC91X=y
# CONFIG_SERIO_I8042 is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
# CONFIG_VGA_CONSOLE is not set
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
diff --git a/arch/m32r/configs/mappi3.smp_defconfig b/arch/m32r/configs/mappi3.smp_defconfig
index 27cefd41ac1f..5605b23e2faf 100644
--- a/arch/m32r/configs/mappi3.smp_defconfig
+++ b/arch/m32r/configs/mappi3.smp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -29,9 +28,7 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_NBD=m
@@ -50,7 +47,6 @@ CONFIG_SMC91X=y
# CONFIG_VT is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
CONFIG_ISO9660_FS=y
diff --git a/arch/m32r/configs/oaks32r_defconfig b/arch/m32r/configs/oaks32r_defconfig
index 5087a510ca4f..5ccab127f6ad 100644
--- a/arch/m32r/configs/oaks32r_defconfig
+++ b/arch/m32r/configs/oaks32r_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
@@ -37,7 +36,6 @@ CONFIG_NETDEVICES=y
# CONFIG_VT is not set
CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
diff --git a/arch/m32r/configs/opsput_defconfig b/arch/m32r/configs/opsput_defconfig
index 50c6f525db20..3ce1d08355e5 100644
--- a/arch/m32r/configs/opsput_defconfig
+++ b/arch/m32r/configs/opsput_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -46,7 +45,6 @@ CONFIG_SERIAL_M32R_SIO_CONSOLE=y
CONFIG_SERIAL_M32R_PLDSIO=y
CONFIG_HW_RANDOM=y
CONFIG_DS1302=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
CONFIG_ISO9660_FS=m
diff --git a/arch/m32r/configs/usrv_defconfig b/arch/m32r/configs/usrv_defconfig
index a3cfaaedab60..cb8c051c3d46 100644
--- a/arch/m32r/configs/usrv_defconfig
+++ b/arch/m32r/configs/usrv_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -34,9 +33,6 @@ CONFIG_INET_ESP=y
CONFIG_INET_IPCOMP=y
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -62,7 +58,6 @@ CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
# CONFIG_SERIAL_M32R_SIO is not set
# CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_FS_XATTR is not set
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 5abb548f0e70..353d90487c2b 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -24,6 +24,9 @@ config M68K
select OLD_SIGSUSPEND3
select OLD_SIGACTION
+config CPU_BIG_ENDIAN
+ def_bool y
+
config RWSEM_GENERIC_SPINLOCK
bool
default y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 4ed8ebf33509..9d26abdf0dc1 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -36,6 +36,22 @@ config MICROBLAZE
select VIRT_TO_BUS
select CPU_NO_EFFICIENT_FFS
+# Endianness selection
+choice
+ prompt "Endianness selection"
+ default CPU_BIG_ENDIAN
+ help
+ microblaze architectures can be configured for either little or
+ big endian formats. Be sure to select the appropriate mode.
+
+config CPU_BIG_ENDIAN
+ bool "Big endian"
+
+config CPU_LITTLE_ENDIAN
+ bool "Little endian"
+
+endchoice
+
config SWAP
def_bool n
diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile
index 740f2b82a182..1f6c486826a0 100644
--- a/arch/microblaze/Makefile
+++ b/arch/microblaze/Makefile
@@ -35,6 +35,8 @@ endif
CPUFLAGS-$(CONFIG_XILINX_MICROBLAZE0_USE_DIV) += -mno-xl-soft-div
CPUFLAGS-$(CONFIG_XILINX_MICROBLAZE0_USE_BARREL) += -mxl-barrel-shift
CPUFLAGS-$(CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR) += -mxl-pattern-compare
+CPUFLAGS-$(CONFIG_BIG_ENDIAN) += -mbig-endian
+CPUFLAGS-$(CONFIG_LITTLE_ENDIAN) += -mlittle-endian
CPUFLAGS-1 += $(call cc-option,-mcpu=v$(CPU_VER))
diff --git a/arch/mips/include/asm/vga.h b/arch/mips/include/asm/vga.h
index f82c83749a08..975ff51f80c4 100644
--- a/arch/mips/include/asm/vga.h
+++ b/arch/mips/include/asm/vga.h
@@ -6,6 +6,7 @@
#ifndef _ASM_VGA_H
#define _ASM_VGA_H
+#include <linux/string.h>
#include <asm/addrspace.h>
#include <asm/byteorder.h>
@@ -40,9 +41,15 @@ static inline u16 scr_readw(volatile const u16 *addr)
return le16_to_cpu(*addr);
}
+static inline void scr_memsetw(u16 *s, u16 v, unsigned int count)
+{
+ memset16(s, cpu_to_le16(v), count / 2);
+}
+
#define scr_memcpyw(d, s, c) memcpy(d, s, c)
#define scr_memmovew(d, s, c) memmove(d, s, c)
#define VT_BUF_HAVE_MEMCPYW
#define VT_BUF_HAVE_MEMMOVEW
+#define VT_BUF_HAVE_MEMSETW
#endif /* _ASM_VGA_H */
diff --git a/arch/mn10300/configs/asb2303_defconfig b/arch/mn10300/configs/asb2303_defconfig
index 1fd41ec1dfb5..d06dae131139 100644
--- a/arch/mn10300/configs/asb2303_defconfig
+++ b/arch/mn10300/configs/asb2303_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_TINY_RCU=y
@@ -28,16 +27,13 @@ CONFIG_IP_PNP_BOOTP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
CONFIG_MTD=y
CONFIG_MTD_DEBUG=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_CFI=y
CONFIG_MTD_JEDECPROBE=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -48,8 +44,6 @@ CONFIG_MTD_PHYSMAP=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
diff --git a/arch/mn10300/configs/asb2364_defconfig b/arch/mn10300/configs/asb2364_defconfig
index cd0a6cb17dee..b1d80cee97ee 100644
--- a/arch/mn10300/configs/asb2364_defconfig
+++ b/arch/mn10300/configs/asb2364_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -40,7 +39,6 @@ CONFIG_IP_PNP_BOOTP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
CONFIG_IPV6=y
# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set
@@ -50,10 +48,8 @@ CONFIG_IPV6=y
CONFIG_CONNECTOR=y
CONFIG_MTD=y
CONFIG_MTD_DEBUG=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_CFI=y
CONFIG_MTD_JEDECPROBE=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -64,8 +60,6 @@ CONFIG_MTD_PHYSMAP=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -77,7 +71,6 @@ CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
# CONFIG_HW_RANDOM is not set
# CONFIG_HWMON is not set
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_PROC_KCORE=y
# CONFIG_PROC_PAGE_MONITOR is not set
@@ -93,4 +86,3 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_BUGVERBOSE is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 1e95920b0737..a0f2e4a323c1 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -29,6 +29,9 @@ config OPENRISC
select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
select NO_BOOTMEM
+config CPU_BIG_ENDIAN
+ def_bool y
+
config MMU
def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 13648519bd41..ba7b7ddc3844 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -59,6 +59,9 @@ config PARISC
config CPU_BIG_ENDIAN
def_bool y
+config CPU_BIG_ENDIAN
+ def_bool y
+
config MMU
def_bool y
diff --git a/arch/powerpc/include/asm/vga.h b/arch/powerpc/include/asm/vga.h
index ab3acd2f2786..7a7b541b7493 100644
--- a/arch/powerpc/include/asm/vga.h
+++ b/arch/powerpc/include/asm/vga.h
@@ -33,8 +33,16 @@ static inline u16 scr_readw(volatile const u16 *addr)
return le16_to_cpu(*addr);
}
+#define VT_BUF_HAVE_MEMSETW
+static inline void scr_memsetw(u16 *s, u16 v, unsigned int n)
+{
+ memset16(s, cpu_to_le16(v), n / 2);
+}
+
#define VT_BUF_HAVE_MEMCPYW
+#define VT_BUF_HAVE_MEMMOVEW
#define scr_memcpyw memcpy
+#define scr_memmovew memmove
#endif /* !CONFIG_VGA_CONSOLE && !CONFIG_MDA_CONSOLE */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 70f073d6c3b2..2ff2b8a19f71 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -224,7 +224,7 @@ void __init allocate_pacas(void)
paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit));
memset(paca, 0, paca_size);
- printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n",
+ printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n",
paca_size, nr_cpu_ids, paca);
allocate_lppacas(nr_cpu_ids, limit);
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 7de73589d8e2..0ac741fae90e 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -551,7 +551,7 @@ void __init smp_setup_cpu_maps(void)
if (maxcpus > nr_cpu_ids) {
printk(KERN_WARNING
"Partition configured for %d cpus, "
- "operating system maximum is %d.\n",
+ "operating system maximum is %u.\n",
maxcpus, nr_cpu_ids);
maxcpus = nr_cpu_ids;
} else
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 44f3a25ca630..ebc244b08d67 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -511,13 +511,13 @@ static bool xive_parse_provisioning(struct device_node *np)
static void xive_native_setup_pools(void)
{
/* Allocate a pool big enough */
- pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids);
+ pr_debug("XIVE: Allocating VP block for pool size %u\n", nr_cpu_ids);
xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids);
if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP))
pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n");
- pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n",
+ pr_debug("XIVE: Pool VPs allocated at 0x%x for %u max CPUs\n",
xive_pool_vps, nr_cpu_ids);
}
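
The three powerpc hunks above switch the nr_cpu_ids format specifiers from %d to %u, matching the variable's unsigned type so large counts are not rendered as negative numbers. A small stand-alone sketch of the difference (ordinary C, not kernel code; the variable here is a local stand-in):

/*
 * Sketch only: with %d, an unsigned value above INT_MAX typically shows
 * up as a negative number; %u prints it as intended.
 */
#include <stdio.h>

int main(void)
{
	unsigned int nr_cpu_ids = 3000000000u;		/* stand-in value above INT_MAX */

	printf("with %%d: %d\n", nr_cpu_ids);		/* specifier/type mismatch: negative output */
	printf("with %%u: %u\n", nr_cpu_ids);		/* prints 3000000000 */
	return 0;
}
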
diff --git a/arch/sh/configs/ap325rxa_defconfig b/arch/sh/configs/ap325rxa_defconfig
index e5335123b5e9..72b72e50a92e 100644
--- a/arch/sh/configs/ap325rxa_defconfig
+++ b/arch/sh/configs/ap325rxa_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -28,14 +27,10 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -51,8 +46,6 @@ CONFIG_NETDEVICES=y
CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -82,7 +75,6 @@ CONFIG_FB=y
CONFIG_FB_SH_MOBILE_LCDC=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_LOGO=y
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_MMC=y
CONFIG_MMC_SPI=y
@@ -110,8 +102,6 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRYPTO=y
CONFIG_CRYPTO_CBC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/apsh4a3a_defconfig b/arch/sh/configs/apsh4a3a_defconfig
index 6cb327977d13..4710df43a5b5 100644
--- a/arch/sh/configs/apsh4a3a_defconfig
+++ b/arch/sh/configs/apsh4a3a_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -28,15 +27,11 @@ CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -46,8 +41,6 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
@@ -66,7 +59,6 @@ CONFIG_FONTS=y
CONFIG_FONT_8x8=y
CONFIG_FONT_8x16=y
CONFIG_LOGO=y
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
@@ -96,7 +88,6 @@ CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_PREEMPT is not set
# CONFIG_DEBUG_BUGVERBOSE is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_FTRACE is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
# CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig
index fe45d2c9b151..825c641726c4 100644
--- a/arch/sh/configs/apsh4ad0a_defconfig
+++ b/arch/sh/configs/apsh4ad0a_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -53,7 +52,6 @@ CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_NET_KEY=y
CONFIG_INET=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -70,8 +68,6 @@ CONFIG_NETDEVICES=y
CONFIG_MDIO_BITBANG=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_KEYBOARD is not set
@@ -83,7 +79,6 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_LEGACY_PTYS is not set
# CONFIG_HW_RANDOM is not set
# CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=y
CONFIG_FB=y
CONFIG_FB_SH7785FB=y
CONFIG_FRAMEBUFFER_CONSOLE=y
@@ -124,6 +119,5 @@ CONFIG_DEBUG_SHIRQ=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_VM=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_DWARF_UNWINDER=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/cayman_defconfig b/arch/sh/configs/cayman_defconfig
index 67e150631ea5..5a90e24aa8a6 100644
--- a/arch/sh/configs/cayman_defconfig
+++ b/arch/sh/configs/cayman_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
@@ -19,7 +18,6 @@ CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_PNP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
@@ -38,7 +36,6 @@ CONFIG_NET_ETHERNET=y
CONFIG_HW_RANDOM=y
CONFIG_I2C=m
CONFIG_WATCHDOG=y
-CONFIG_VIDEO_OUTPUT_CONTROL=y
CONFIG_FB=y
CONFIG_FIRMWARE_EDID=y
CONFIG_FB_MODE_HELPERS=y
@@ -67,5 +64,4 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_SCHEDSTATS=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig
index ec243ca29529..3f08dc54480b 100644
--- a/arch/sh/configs/dreamcast_defconfig
+++ b/arch/sh/configs/dreamcast_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_LOG_BUF_SHIFT=14
@@ -32,7 +31,6 @@ CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_STANDALONE is not set
@@ -43,8 +41,6 @@ CONFIG_NET_ETHERNET=y
CONFIG_NET_PCI=y
CONFIG_8139TOO=y
# CONFIG_8139TOO_PIO is not set
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_KEYBOARD_ATKBD is not set
CONFIG_KEYBOARD_MAPLE=y
# CONFIG_MOUSE_PS2 is not set
@@ -56,7 +52,6 @@ CONFIG_HW_RANDOM=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_SH_WDT=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_FB=y
CONFIG_FIRMWARE_EDID=y
CONFIG_FB_PVR2=y
@@ -74,5 +69,4 @@ CONFIG_LOGO=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_HUGETLBFS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/ecovec24-romimage_defconfig b/arch/sh/configs/ecovec24-romimage_defconfig
index 5fcb17bff24a..0c5dfccbfe37 100644
--- a/arch/sh/configs/ecovec24-romimage_defconfig
+++ b/arch/sh/configs/ecovec24-romimage_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -26,19 +25,15 @@ CONFIG_INET=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_MISC_DEVICES is not set
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SH_ETH=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -51,7 +46,6 @@ CONFIG_I2C=y
CONFIG_I2C_SH_MOBILE=y
CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB=y
CONFIG_USB_R8A66597_HCD=y
CONFIG_USB_STORAGE=y
@@ -64,4 +58,3 @@ CONFIG_TMPFS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index 0b364e3b0ff8..3568310c2c2f 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -29,16 +28,12 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_IRDA=y
CONFIG_SH_SIR=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -53,8 +48,6 @@ CONFIG_NETDEVICES=y
CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SH_ETH=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_KEYBOARD_ATKBD is not set
@@ -140,8 +133,6 @@ CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRYPTO=y
CONFIG_CRYPTO_CBC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig
index 41fa3a7eed96..db756e099052 100644
--- a/arch/sh/configs/edosk7705_defconfig
+++ b/arch/sh/configs/edosk7705_defconfig
@@ -20,7 +20,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y
CONFIG_SH_EDOSK7705=y
CONFIG_SH_PCLK_FREQ=31250000
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_MISC_DEVICES is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
@@ -35,5 +34,4 @@ CONFIG_SH_PCLK_FREQ=31250000
# CONFIG_SYSFS is not set
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index e1077a041ac3..aab4ff1e247c 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_LOCALVERSION="_edosk7760"
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -31,7 +30,6 @@ CONFIG_IP_PNP_BOOTP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
@@ -39,10 +37,7 @@ CONFIG_DEBUG_DRIVER=y
CONFIG_DEBUG_DEVRES=y
CONFIG_MTD=y
CONFIG_MTD_DEBUG=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_JEDECPROBE=y
@@ -62,12 +57,9 @@ CONFIG_MTD_ABSENT=y
CONFIG_MTD_PHYSMAP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=26000
-# CONFIG_MISC_DEVICES is not set
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -92,7 +84,6 @@ CONFIG_SND=y
# CONFIG_SND_VERBOSE_PROCFS is not set
CONFIG_SND_VERBOSE_PRINTK=y
CONFIG_SND_SOC=y
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_EXT2_FS=y
CONFIG_EXT2_FS_XATTR=y
@@ -119,8 +110,6 @@ CONFIG_DETECT_HUNG_TASK=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_TIMER_STATS=y
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRYPTO=y
CONFIG_CRYPTO_MD5=y
CONFIG_CRYPTO_DES=y
diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig
index 67cb1094a033..2985fe7c6d50 100644
--- a/arch/sh/configs/espt_defconfig
+++ b/arch/sh/configs/espt_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -26,13 +25,10 @@ CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_JEDECPROBE=y
@@ -43,14 +39,11 @@ CONFIG_MTD_CFI_GEOMETRY=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_COMPLEX_MAPPINGS=y
CONFIG_MTD_PHYSMAP=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SH_ETH=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -65,7 +58,6 @@ CONFIG_FB_FOREIGN_ENDIAN=y
CONFIG_FB_SH7760=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_LOGO=y
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB=y
CONFIG_USB_MON=y
CONFIG_USB_OHCI_HCD=y
@@ -73,7 +65,6 @@ CONFIG_USB_STORAGE=y
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_AUTOFS_FS=y
CONFIG_AUTOFS4_FS=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
@@ -123,6 +114,5 @@ CONFIG_NLS_UTF8=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig
index 496edcdf95a3..4dcf7f552582 100644
--- a/arch/sh/configs/hp6xx_defconfig
+++ b/arch/sh/configs/hp6xx_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -37,7 +36,6 @@ CONFIG_SERIAL_SH_SCI_NR_UARTS=3
CONFIG_SERIAL_SH_SCI_CONSOLE=y
CONFIG_LEGACY_PTY_COUNT=64
# CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=y
CONFIG_FB=y
CONFIG_FIRMWARE_EDID=y
CONFIG_FB_HIT=y
@@ -46,7 +44,6 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FONTS=y
CONFIG_FONT_PEARL_8x8=y
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_SH=y
@@ -55,7 +52,6 @@ CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
CONFIG_PROC_KCORE=y
CONFIG_NLS_CODEPAGE_850=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_CRYPTO=y
CONFIG_CRYPTO_CBC=y
CONFIG_CRYPTO_ECB=y
diff --git a/arch/sh/configs/kfr2r09-romimage_defconfig b/arch/sh/configs/kfr2r09-romimage_defconfig
index 029a506ca325..9cc37f29e3b4 100644
--- a/arch/sh/configs/kfr2r09-romimage_defconfig
+++ b/arch/sh/configs/kfr2r09-romimage_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -26,12 +25,10 @@ CONFIG_INET=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_MISC_DEVICES is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -44,7 +41,6 @@ CONFIG_I2C=y
CONFIG_I2C_SH_MOBILE=y
CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB_GADGET=y
CONFIG_USB_CDC_COMPOSITE=y
# CONFIG_DNOTIFY is not set
@@ -55,5 +51,4 @@ CONFIG_TMPFS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/kfr2r09_defconfig b/arch/sh/configs/kfr2r09_defconfig
index fac13ded07b2..46693d033644 100644
--- a/arch/sh/configs/kfr2r09_defconfig
+++ b/arch/sh/configs/kfr2r09_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -33,15 +32,12 @@ CONFIG_INET=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_INTELEXT=y
@@ -49,7 +45,6 @@ CONFIG_MTD_PHYSMAP=y
CONFIG_MTD_ONENAND=y
CONFIG_MTD_ONENAND_GENERIC=y
CONFIG_MTD_UBI=y
-# CONFIG_MISC_DEVICES is not set
# CONFIG_INPUT_MOUSEDEV is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_KEYBOARD_ATKBD is not set
@@ -77,7 +72,6 @@ CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_CLUT224 is not set
# CONFIG_LOGO_SUPERH_MONO is not set
# CONFIG_LOGO_SUPERH_CLUT224 is not set
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB_GADGET=y
CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
@@ -91,4 +85,3 @@ CONFIG_TMPFS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index 6783f31315c7..467f4d2d8e87 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_SYSCTL_SYSCALL is not set
@@ -24,10 +23,8 @@ CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_NETFILTER=y
-CONFIG_IP_NF_QUEUE=m
CONFIG_ATALK=m
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_BLK_DEV_LOOP=y
@@ -118,7 +115,6 @@ CONFIG_NFSD_V3=y
CONFIG_SMB_FS=m
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig
index e3c0894b1bb4..9e3edfdf9b2e 100644
--- a/arch/sh/configs/lboxre2_defconfig
+++ b/arch/sh/configs/lboxre2_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_SYSCTL_SYSCALL is not set
@@ -28,7 +27,6 @@ CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_NETFILTER=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -61,7 +59,6 @@ CONFIG_VFAT_FS=y
CONFIG_TMPFS=y
CONFIG_ROMFS_FS=y
CONFIG_NLS_CODEPAGE_437=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig
index 9479872b1ae6..fb7415dbc102 100644
--- a/arch/sh/configs/magicpanelr2_defconfig
+++ b/arch/sh/configs/magicpanelr2_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -35,16 +34,13 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_STANDALONE is not set
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -55,8 +51,6 @@ CONFIG_NETDEVICES=y
CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_MOUSE_PS2 is not set
CONFIG_SERIAL_8250=y
@@ -68,7 +62,6 @@ CONFIG_SERIAL_SH_SCI=y
CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_HW_RANDOM is not set
# CONFIG_HWMON is not set
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
@@ -96,7 +89,5 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_KOBJECT=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRC_CCITT=m
CONFIG_CRC16=m
diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig
index f1d2e1b5ee41..c3f7d5899922 100644
--- a/arch/sh/configs/microdev_defconfig
+++ b/arch/sh/configs/microdev_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_BLK_DEV_INITRD=y
@@ -19,7 +18,6 @@ CONFIG_SUPERHYWAY=y
CONFIG_NET=y
CONFIG_INET=y
CONFIG_IP_PNP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
@@ -45,6 +43,5 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_CRYPTO_ECB=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig
index cc61eda44922..e04f21be0756 100644
--- a/arch/sh/configs/migor_defconfig
+++ b/arch/sh/configs/migor_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -26,15 +25,11 @@ CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_FW_LOADER=m
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -47,8 +42,6 @@ CONFIG_BLK_DEV_SD=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_KEYBOARD_ATKBD is not set
@@ -101,7 +94,6 @@ CONFIG_TMPFS=y
CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_CRYPTO_MANAGER=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
# CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig
index f3d5d9f76310..0a432b5f50e7 100644
--- a/arch/sh/configs/polaris_defconfig
+++ b/arch/sh/configs/polaris_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
@@ -37,14 +36,11 @@ CONFIG_IP_MULTICAST=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FIRMWARE_IN_KERNEL is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -57,8 +53,6 @@ CONFIG_NETDEVICES=y
CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -71,7 +65,6 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_LEGACY_PTYS is not set
# CONFIG_HW_RANDOM is not set
# CONFIG_HWMON is not set
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_SH=y
@@ -91,5 +84,3 @@ CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_SG=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index 920b8471ceb7..435bcd66c667 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -35,13 +34,11 @@ CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_BRIDGE=m
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_FW_LOADER=m
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_COMPLEX_MAPPINGS=y
@@ -110,7 +107,6 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_PREEMPT is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_CRYPTO_ECB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_HMAC=y
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index c77da6be06b8..5877e6d1f285 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -42,7 +41,6 @@ CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_BRIDGE=m
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -104,7 +102,6 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_4KSTACKS=y
diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig
index 5df916d931c5..b195bc01e406 100644
--- a/arch/sh/configs/rsk7201_defconfig
+++ b/arch/sh/configs/rsk7201_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -37,10 +36,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -58,8 +54,6 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_HW_RANDOM is not set
# CONFIG_HWMON is not set
CONFIG_THERMAL=y
-CONFIG_VIDEO_OUTPUT_CONTROL=y
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_SH=y
@@ -71,5 +65,3 @@ CONFIG_ROMFS_FS=y
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig
index 3c4f6f4d52b0..8c471959bbc7 100644
--- a/arch/sh/configs/rsk7203_defconfig
+++ b/arch/sh/configs/rsk7203_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -44,7 +43,6 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -52,10 +50,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -64,8 +59,6 @@ CONFIG_NETDEVICES=y
CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
CONFIG_INPUT_FF_MEMLESS=m
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
@@ -81,7 +74,6 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_HWMON is not set
CONFIG_THERMAL=y
CONFIG_REGULATOR=y
-CONFIG_VIDEO_OUTPUT_CONTROL=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
@@ -130,6 +122,4 @@ CONFIG_DEBUG_VM=y
CONFIG_DEBUG_LIST=y
CONFIG_DEBUG_SG=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/sh/configs/rsk7264_defconfig b/arch/sh/configs/rsk7264_defconfig
index eecdf65bb789..2b9b731fc86b 100644
--- a/arch/sh/configs/rsk7264_defconfig
+++ b/arch/sh/configs/rsk7264_defconfig
@@ -35,7 +35,6 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -61,11 +60,9 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_HWMON is not set
CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_R8A66597_HCD=y
CONFIG_USB_STORAGE=y
CONFIG_USB_STORAGE_DEBUG=y
-CONFIG_USB_LIBUSUAL=y
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
diff --git a/arch/sh/configs/rsk7269_defconfig b/arch/sh/configs/rsk7269_defconfig
index 8370b10df357..d041f7bcb84c 100644
--- a/arch/sh/configs/rsk7269_defconfig
+++ b/arch/sh/configs/rsk7269_defconfig
@@ -24,7 +24,6 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -44,11 +43,9 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_HWMON is not set
CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_R8A66597_HCD=y
CONFIG_USB_STORAGE=y
CONFIG_USB_STORAGE_DEBUG=y
-CONFIG_USB_LIBUSUAL=y
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
@@ -60,5 +57,4 @@ CONFIG_PARTITION_ADVANCED=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
# CONFIG_FTRACE is not set
diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig
index a3d081095ce2..379d673f5ce8 100644
--- a/arch/sh/configs/rts7751r2d1_defconfig
+++ b/arch/sh/configs/rts7751r2d1_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
@@ -22,7 +21,6 @@ CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_FW_LOADER=m
@@ -48,7 +46,6 @@ CONFIG_HW_RANDOM=y
CONFIG_SPI=y
CONFIG_SPI_SH_SCI=y
CONFIG_MFD_SM501=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_FB=y
CONFIG_FB_SH_MOBILE_LCDC=m
CONFIG_FB_SM501=y
@@ -83,7 +80,6 @@ CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_OHCI_HCD=y
CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_R9701=y
CONFIG_EXT2_FS=y
@@ -94,6 +90,5 @@ CONFIG_TMPFS=y
CONFIG_MINIX_FS=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index b1a04f3c598b..11177bceda83 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
@@ -22,15 +21,11 @@ CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_FW_LOADER=m
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
@@ -56,7 +51,6 @@ CONFIG_HW_RANDOM=y
CONFIG_SPI=y
CONFIG_SPI_SH_SCI=y
CONFIG_MFD_SM501=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_FB=y
CONFIG_FB_SH_MOBILE_LCDC=m
CONFIG_FB_SM501=y
@@ -91,7 +85,6 @@ CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_OHCI_HCD=y
CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_R9701=y
CONFIG_EXT2_FS=y
@@ -102,6 +95,5 @@ CONFIG_TMPFS=y
CONFIG_MINIX_FS=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig
index bbd4c2298708..95e5208b8260 100644
--- a/arch/sh/configs/sdk7780_defconfig
+++ b/arch/sh/configs/sdk7780_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_LOCALVERSION="_SDK7780"
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -39,7 +38,6 @@ CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_BOOTP=y
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
CONFIG_IPV6=y
# CONFIG_INET6_XFRM_MODE_BEET is not set
CONFIG_NET_SCHED=y
@@ -47,7 +45,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_PARPORT=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_IDE=y
CONFIG_BLK_DEV_IDECD=y
CONFIG_BLK_DEV_PLATFORM=y
@@ -63,8 +60,6 @@ CONFIG_BLK_DEV_DM=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
CONFIG_NETCONSOLE=y
CONFIG_INPUT_FF_MEMLESS=m
CONFIG_INPUT_EVDEV=y
@@ -78,7 +73,6 @@ CONFIG_SSB=y
CONFIG_SSB_DRIVER_PCICORE=y
CONFIG_FB=y
CONFIG_FB_SH_MOBILE_LCDC=m
-CONFIG_DISPLAY_SUPPORT=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
CONFIG_LOGO=y
@@ -101,7 +95,6 @@ CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
@@ -144,8 +137,6 @@ CONFIG_DETECT_HUNG_TASK=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_TIMER_STATS=y
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_SH_STANDARD_BIOS=y
CONFIG_CRYPTO_MD5=y
CONFIG_CRYPTO_DES=y
diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig
index 36642ec2cb97..e9ee0c878ead 100644
--- a/arch/sh/configs/sdk7786_defconfig
+++ b/arch/sh/configs/sdk7786_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_KERNEL_LZO=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -90,13 +89,11 @@ CONFIG_NET_KEY=y
CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
CONFIG_MTD_BLOCK=y
CONFIG_FTL=y
@@ -119,7 +116,6 @@ CONFIG_MTD_UBI_GLUEBI=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_CRYPTOLOOP=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_IDE=y
CONFIG_BLK_DEV_IDECD=y
CONFIG_BLK_DEV_PLATFORM=y
@@ -140,8 +136,6 @@ CONFIG_MDIO_BITBANG=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
CONFIG_VT_HW_CONSOLE_BINDING=y
CONFIG_SERIAL_SH_SCI=y
@@ -157,7 +151,6 @@ CONFIG_SPI=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_SH_WDT=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_USB=y
CONFIG_USB_MON=y
CONFIG_USB_OHCI_HCD=y
@@ -223,9 +216,7 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_TIMER_STATS=y
CONFIG_DEBUG_MEMORY_INIT=y
-# CONFIG_RCU_CPU_STALL_VERBOSE is not set
CONFIG_LATENCYTOP=y
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_FUNCTION_TRACER=y
# CONFIG_FUNCTION_GRAPH_TRACER is not set
CONFIG_DMA_API_DEBUG=y
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index 91853a67ec34..3553acd5edb1 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -57,7 +56,6 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -65,9 +63,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -78,8 +73,6 @@ CONFIG_EEPROM_93CX6=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
@@ -109,7 +102,6 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y
CONFIG_DEBUG_VM=y
CONFIG_DEBUG_LIST=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_LZO=y
diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig
index 201acb4652f7..fc77a67b16e7 100644
--- a/arch/sh/configs/se7343_defconfig
+++ b/arch/sh/configs/se7343_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -27,26 +26,19 @@ CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_SYN_COOKIES=y
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_RAM=y
CONFIG_MTD_PHYSMAP=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_SCSI=y
CONFIG_SCSI_MULTI_LUN=y
# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_NETDEVICES=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
CONFIG_USB_USBNET=y
# CONFIG_USB_NET_AX8817X is not set
CONFIG_USB_NET_DM9601=y
@@ -104,5 +96,4 @@ CONFIG_CRAMFS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_NFSD=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig
index 9a9ad9adf959..f54722dbc8f5 100644
--- a/arch/sh/configs/se7619_defconfig
+++ b/arch/sh/configs/se7619_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_UID16 is not set
@@ -24,10 +23,7 @@ CONFIG_BINFMT_ZFLAT=y
# CONFIG_STANDALONE is not set
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -48,4 +44,3 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_SYSFS is not set
CONFIG_ROMFS_FS=y
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig
index 044e0844fda1..ddfc69841955 100644
--- a/arch/sh/configs/se7705_defconfig
+++ b/arch/sh/configs/se7705_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_LOG_BUF_SHIFT=14
CONFIG_BLK_DEV_INITRD=y
@@ -27,11 +26,8 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -58,5 +54,4 @@ CONFIG_PROC_KCORE=y
CONFIG_JFFS2_FS=y
CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig
index 1248635e4f88..5a1097641247 100644
--- a/arch/sh/configs/se7712_defconfig
+++ b/arch/sh/configs/se7712_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
@@ -47,7 +46,6 @@ CONFIG_SYN_COOKIES=y
CONFIG_INET_AH=y
CONFIG_INET_ESP=y
CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_NET_SCHED=y
@@ -68,9 +66,6 @@ CONFIG_NET_CLS_FW=y
CONFIG_NET_CLS_IND=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -104,8 +99,6 @@ CONFIG_ROOT_NFS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRYPTO_ECB=m
CONFIG_CRYPTO_PCBC=m
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig
index c3ba6e8a9818..9c0ef13bee10 100644
--- a/arch/sh/configs/se7721_defconfig
+++ b/arch/sh/configs/se7721_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
@@ -46,7 +45,6 @@ CONFIG_SYN_COOKIES=y
CONFIG_INET_AH=y
CONFIG_INET_ESP=y
CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_NET_SCHED=y
@@ -67,9 +65,6 @@ CONFIG_NET_CLS_FW=y
CONFIG_NET_CLS_IND=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -132,6 +127,5 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_CCITT=y
diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig
index ae998c7e2ee0..ccc7fc423fde 100644
--- a/arch/sh/configs/se7722_defconfig
+++ b/arch/sh/configs/se7722_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -26,7 +25,6 @@ CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
@@ -57,6 +55,5 @@ CONFIG_PRINTK_TIME=y
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig
index 1faa788aecae..aedb3a2d9a10 100644
--- a/arch/sh/configs/se7724_defconfig
+++ b/arch/sh/configs/se7724_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -30,14 +29,10 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -53,8 +48,6 @@ CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SH_ETH=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_KEYBOARD_ATKBD is not set
@@ -137,8 +130,6 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRYPTO=y
CONFIG_CRYPTO_CBC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig
index 912c98590e22..b23f67542728 100644
--- a/arch/sh/configs/se7750_defconfig
+++ b/arch/sh/configs/se7750_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -25,11 +24,8 @@ CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_BOOTP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -58,5 +54,4 @@ CONFIG_ROOT_NFS=y
CONFIG_PARTITION_ADVANCED=y
# CONFIG_MSDOS_PARTITION is not set
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig
index 56b5e4ce8d4a..162343683937 100644
--- a/arch/sh/configs/se7751_defconfig
+++ b/arch/sh/configs/se7751_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_LOG_BUF_SHIFT=14
@@ -25,12 +24,9 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_NETFILTER=y
-CONFIG_IP_NF_QUEUE=y
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -48,5 +44,4 @@ CONFIG_EXT2_FS=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_JFFS2_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/se7780_defconfig b/arch/sh/configs/se7780_defconfig
index b0ef63ce525a..ec32c82646ed 100644
--- a/arch/sh/configs/se7780_defconfig
+++ b/arch/sh/configs/se7780_defconfig
@@ -24,7 +24,6 @@ CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_PNP=y
-# CONFIG_INET_LRO is not set
CONFIG_IPV6=y
# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET6_XFRM_MODE_TUNNEL is not set
@@ -32,8 +31,6 @@ CONFIG_IPV6=y
# CONFIG_IPV6_SIT is not set
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -54,8 +51,6 @@ CONFIG_SMSC_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
CONFIG_NET_PCI=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
CONFIG_INPUT_FF_MEMLESS=m
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
# CONFIG_INPUT_KEYBOARD is not set
@@ -94,7 +89,6 @@ CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_OHCI_HCD=y
@@ -110,5 +104,4 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/secureedge5410_defconfig b/arch/sh/configs/secureedge5410_defconfig
index 7eae4e59d7f0..360592d63a2f 100644
--- a/arch/sh/configs/secureedge5410_defconfig
+++ b/arch/sh/configs/secureedge5410_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_LOG_BUF_SHIFT=14
CONFIG_BLK_DEV_INITRD=y
@@ -18,12 +17,9 @@ CONFIG_INET=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK_RO=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -34,14 +30,11 @@ CONFIG_MTD_CFI_GEOMETRY=y
CONFIG_MTD_CFI_INTELEXT=y
CONFIG_MTD_PLATRAM=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_NET_PCI=y
CONFIG_8139CP=y
CONFIG_8139TOO=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -51,7 +44,6 @@ CONFIG_SERIAL_SH_SCI=y
CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_HW_RANDOM is not set
# CONFIG_HWMON is not set
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_DS1302=y
@@ -60,4 +52,3 @@ CONFIG_EXT2_FS=y
CONFIG_TMPFS=y
CONFIG_CRAMFS=y
CONFIG_ROMFS_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig
index 0cf4097b71e8..2156223405a1 100644
--- a/arch/sh/configs/sh03_defconfig
+++ b/arch/sh/configs/sh03_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -34,7 +33,6 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_STANDALONE is not set
@@ -70,7 +68,6 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_AUTOFS_FS=y
CONFIG_AUTOFS4_FS=y
CONFIG_ISO9660_FS=m
CONFIG_JOLIET=y
@@ -126,7 +123,6 @@ CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=m
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
CONFIG_CRYPTO_ECB=m
CONFIG_CRYPTO_HMAC=y
diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig
index df25ae774ee0..34094e05e892 100644
--- a/arch/sh/configs/sh2007_defconfig
+++ b/arch/sh/configs/sh2007_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -42,7 +41,6 @@ CONFIG_NET_IPIP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_NETWORK_SECMARK=y
CONFIG_NET_PKTGEN=y
@@ -50,7 +48,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_CDROM_PKTCDVD=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_RAID_ATTRS=y
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
@@ -72,8 +69,6 @@ CONFIG_TUN=y
CONFIG_VETH=y
CONFIG_NET_ETHERNET=y
CONFIG_SMSC911X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
CONFIG_INPUT_FF_MEMLESS=y
# CONFIG_INPUT_MOUSEDEV is not set
@@ -95,9 +90,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
CONFIG_LOGO=y
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_MON=y
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
@@ -172,7 +165,6 @@ CONFIG_DEBUG_KERNEL=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
CONFIG_CRYPTO_NULL=y
CONFIG_CRYPTO_AUTHENC=y
diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig
index f92ad17cd629..65a1aad899c8 100644
--- a/arch/sh/configs/sh7710voipgw_defconfig
+++ b/arch/sh/configs/sh7710voipgw_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -24,7 +23,6 @@ CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_SYN_COOKIES=y
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
CONFIG_NETFILTER=y
@@ -36,8 +34,6 @@ CONFIG_NET_CLS_ROUTE4=y
CONFIG_NET_CLS_U32=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -59,5 +55,4 @@ CONFIG_THERMAL=y
# CONFIG_DNOTIFY is not set
CONFIG_JFFS2_FS=y
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
index f83ac7b0b031..d15e53647983 100644
--- a/arch/sh/configs/sh7724_generic_defconfig
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_CGROUPS=y
@@ -18,7 +17,6 @@ CONFIG_HIBERNATION=y
CONFIG_CPU_IDLE=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_MISC_DEVICES is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
@@ -44,5 +42,4 @@ CONFIG_UIO_PDRV_GENIRQ=y
# CONFIG_MISC_FILESYSTEMS is not set
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig
index cfde98ddb29d..b0c4bc830fb8 100644
--- a/arch/sh/configs/sh7757lcr_defconfig
+++ b/arch/sh/configs/sh7757lcr_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -32,13 +31,11 @@ CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
CONFIG_IPV6=y
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_M25P80=y
CONFIG_BLK_DEV_RAM=y
@@ -48,7 +45,6 @@ CONFIG_NETDEVICES=y
CONFIG_VITESSE_PHY=y
CONFIG_NET_ETHERNET=y
CONFIG_SH_ETH=y
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
# CONFIG_KEYBOARD_ATKBD is not set
# CONFIG_MOUSE_PS2 is not set
diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig
index 479536440264..2ef780fb9813 100644
--- a/arch/sh/configs/sh7763rdp_defconfig
+++ b/arch/sh/configs/sh7763rdp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
@@ -26,11 +25,9 @@ CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
CONFIG_MTD_BLKDEVS=y
CONFIG_MTD_CFI=y
@@ -43,14 +40,11 @@ CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_CFI_STAA=y
CONFIG_MTD_COMPLEX_MAPPINGS=y
CONFIG_MTD_PHYSMAP=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SH_ETH=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT_MOUSEDEV is not set
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
@@ -65,7 +59,6 @@ CONFIG_FB_FOREIGN_ENDIAN=y
CONFIG_FB_SH7760=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_LOGO=y
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB=y
CONFIG_USB_MON=y
CONFIG_USB_OHCI_HCD=y
@@ -74,7 +67,6 @@ CONFIG_MMC=y
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_AUTOFS_FS=y
CONFIG_AUTOFS4_FS=y
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
@@ -124,6 +116,5 @@ CONFIG_NLS_UTF8=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig
index 025bd3ac5ab0..742634b37c0a 100644
--- a/arch/sh/configs/sh7770_generic_defconfig
+++ b/arch/sh/configs/sh7770_generic_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_CGROUPS=y
@@ -20,7 +19,6 @@ CONFIG_HIBERNATION=y
CONFIG_CPU_IDLE=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_MISC_DEVICES is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
@@ -46,5 +44,4 @@ CONFIG_UIO_PDRV_GENIRQ=y
# CONFIG_MISC_FILESYSTEMS is not set
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index 2fce54d9c388..2ddf5ca7094e 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -44,13 +43,9 @@ CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -58,7 +53,6 @@ CONFIG_MTD_PHYSMAP=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_RAM=y
-# CONFIG_MISC_DEVICES is not set
# CONFIG_SCSI_PROC_FS is not set
CONFIG_BLK_DEV_SD=y
# CONFIG_SCSI_LOWLEVEL is not set
@@ -69,7 +63,6 @@ CONFIG_NET_ETHERNET=y
CONFIG_NET_VENDOR_3COM=y
CONFIG_VORTEX=y
CONFIG_R8169=y
-# CONFIG_NETDEV_10000 is not set
# CONFIG_WLAN is not set
CONFIG_INPUT_FF_MEMLESS=m
CONFIG_INPUT_EVDEV=y
@@ -113,7 +106,6 @@ CONFIG_SND_CMIPCI=y
CONFIG_SND_EMU10K1=y
# CONFIG_SND_SUPERH is not set
CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_R8A66597_HCD=y
CONFIG_USB_STORAGE=y
CONFIG_MMC=y
@@ -154,9 +146,7 @@ CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_LATENCYTOP=y
-CONFIG_SYSCTL_SYSCALL_CHECK=y
# CONFIG_FTRACE is not set
CONFIG_CRYPTO_HMAC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh7785lcr_defconfig b/arch/sh/configs/sh7785lcr_defconfig
index d29da4a0f6c2..7098828d392e 100644
--- a/arch/sh/configs/sh7785lcr_defconfig
+++ b/arch/sh/configs/sh7785lcr_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -26,27 +25,21 @@ CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_BLK_DEV_SD=y
# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_ATA=y
CONFIG_SATA_SIL=y
CONFIG_NETDEVICES=y
CONFIG_R8169=y
-# CONFIG_NETDEV_10000 is not set
CONFIG_INPUT_FF_MEMLESS=m
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
# CONFIG_KEYBOARD_ATKBD is not set
@@ -121,8 +114,6 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_BUGVERBOSE is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_CRYPTO_HMAC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
# CONFIG_CRYPTO_HW is not set
diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig
index 4802e14a4649..d589cfdfb7eb 100644
--- a/arch/sh/configs/shmin_defconfig
+++ b/arch/sh/configs/shmin_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_SWAP is not set
CONFIG_LOG_BUF_SHIFT=14
# CONFIG_UID16 is not set
@@ -28,10 +27,8 @@ CONFIG_NET=y
CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_PNP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
@@ -53,6 +50,5 @@ CONFIG_CRAMFS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig
index 4a4269ad5b04..755c4f73c718 100644
--- a/arch/sh/configs/shx3_defconfig
+++ b/arch/sh/configs/shx3_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -56,7 +55,6 @@ CONFIG_NET=y
CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
CONFIG_CAN=m
CONFIG_CAN_RAW=m
CONFIG_CAN_BCM=m
@@ -70,8 +68,6 @@ CONFIG_PATA_PLATFORM=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
CONFIG_SMC91X=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
@@ -82,7 +78,6 @@ CONFIG_I2C=m
CONFIG_SPI=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
-CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_USB=y
CONFIG_USB_MON=y
CONFIG_USB_R8A66597_HCD=m
@@ -104,7 +99,6 @@ CONFIG_DEBUG_SHIRQ=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEBUG_VM=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_SH_STANDARD_BIOS=y
CONFIG_DEBUG_STACK_USAGE=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index a77b778c745b..ceb48e9b70f4 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -49,7 +48,6 @@ CONFIG_SYN_COOKIES=y
CONFIG_INET_AH=y
CONFIG_INET_ESP=y
CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
CONFIG_INET_DIAG=m
CONFIG_IPV6=y
CONFIG_IPV6_PRIVACY=y
@@ -79,7 +77,6 @@ CONFIG_NETFILTER_XT_MATCH_REALM=m
CONFIG_NETFILTER_XT_MATCH_SCTP=m
CONFIG_NETFILTER_XT_MATCH_STRING=m
CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
-CONFIG_IP_NF_QUEUE=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_ADDRTYPE=m
CONFIG_IP_NF_MATCH_AH=m
@@ -88,7 +85,6 @@ CONFIG_IP_NF_MATCH_TTL=m
CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_LOG=m
-CONFIG_IP_NF_TARGET_ULOG=m
CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
@@ -96,7 +92,6 @@ CONFIG_IP_NF_RAW=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_IP6_NF_QUEUE=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -106,7 +101,6 @@ CONFIG_IP6_NF_MATCH_HL=m
CONFIG_IP6_NF_MATCH_IPV6HEADER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_MANGLE=m
@@ -154,7 +148,6 @@ CONFIG_FW_LOADER=m
CONFIG_CONNECTOR=m
CONFIG_MTD=m
CONFIG_MTD_DEBUG=y
-CONFIG_MTD_CHAR=m
CONFIG_MTD_BLOCK=m
CONFIG_FTL=m
CONFIG_NFTL=m
@@ -261,7 +254,6 @@ CONFIG_NLS_UTF8=m
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_BUGVERBOSE is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_ECB=y
CONFIG_CRYPTO_MD4=m
diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig
index 2d288b887fbd..5f2921a85192 100644
--- a/arch/sh/configs/ul2_defconfig
+++ b/arch/sh/configs/ul2_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_IKCONFIG=y
@@ -29,7 +28,6 @@ CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_CFG80211=y
CONFIG_MAC80211=y
@@ -37,9 +35,6 @@ CONFIG_MAC80211_RC_PID=y
# CONFIG_MAC80211_RC_MINSTREL is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
@@ -50,8 +45,6 @@ CONFIG_ATA=y
CONFIG_PATA_PLATFORM=y
CONFIG_NETDEVICES=y
CONFIG_NET_ETHERNET=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
CONFIG_LIBERTAS=m
CONFIG_LIBERTAS_SDIO=m
CONFIG_LIBERTAS_DEBUG=y
@@ -70,7 +63,6 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_UNIX98_PTYS is not set
# CONFIG_LEGACY_PTYS is not set
# CONFIG_HW_RANDOM is not set
-# CONFIG_HID_SUPPORT is not set
CONFIG_USB=y
CONFIG_USB_MON=y
CONFIG_USB_R8A66597_HCD=y
@@ -92,6 +84,5 @@ CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_CRYPTO_MICHAEL_MIC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index 01c9a91ee896..7d5591b7c088 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
@@ -46,20 +45,15 @@ CONFIG_INET=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
-# CONFIG_INET_LRO is not set
# CONFIG_IPV6 is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
# CONFIG_FW_LOADER is not set
CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_MISC_DEVICES is not set
CONFIG_BLK_DEV_SD=y
# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_ATA=y
@@ -73,7 +67,6 @@ CONFIG_NET_PCI=y
CONFIG_8139CP=y
CONFIG_SKY2=y
CONFIG_SKY2_DEBUG=y
-# CONFIG_NETDEV_10000 is not set
CONFIG_INPUT_FF_MEMLESS=m
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
# CONFIG_KEYBOARD_ATKBD is not set
@@ -150,8 +143,6 @@ CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
# CONFIG_FTRACE is not set
# CONFIG_DUMP_CODE is not set
CONFIG_CRYPTO_HMAC=y
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index a4a626199c47..0be3828752e5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -97,6 +97,9 @@ config ARCH_PROC_KCORE_TEXT
config CPU_BIG_ENDIAN
def_bool y
+config CPU_BIG_ENDIAN
+ def_bool y
+
config ARCH_ATU
bool
default y if SPARC64
diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h
index ec0e9967d93d..f54e8b6fb197 100644
--- a/arch/sparc/include/asm/vga.h
+++ b/arch/sparc/include/asm/vga.h
@@ -8,9 +8,13 @@
#define _LINUX_ASM_VGA_H_
#include <linux/bug.h>
+#include <linux/string.h>
#include <asm/types.h>
#define VT_BUF_HAVE_RW
+#define VT_BUF_HAVE_MEMSETW
+#define VT_BUF_HAVE_MEMCPYW
+#define VT_BUF_HAVE_MEMMOVEW
#undef scr_writew
#undef scr_readw
@@ -29,6 +33,27 @@ static inline u16 scr_readw(const u16 *addr)
return *addr;
}
+static inline void scr_memsetw(u16 *p, u16 v, unsigned int n)
+{
+ BUG_ON((long) p >= 0);
+
+ memset16(p, cpu_to_le16(v), n / 2);
+}
+
+static inline void scr_memcpyw(u16 *d, u16 *s, unsigned int n)
+{
+ BUG_ON((long) d >= 0);
+
+ memcpy(d, s, n);
+}
+
+static inline void scr_memmovew(u16 *d, u16 *s, unsigned int n)
+{
+ BUG_ON((long) d >= 0);
+
+ memmove(d, s, n);
+}
+
#define VGA_MAP_MEM(x,s) (x)
#endif
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 443a70bccc1c..6becb96c60a0 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -1200,7 +1200,7 @@ static void __init validate_hv(void)
* We use a struct cpumask for this, so it must be big enough.
*/
if ((smp_height * smp_width) > nr_cpu_ids)
- early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %d\n",
+ early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %u\n",
smp_height, smp_width, nr_cpu_ids);
#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b278a33ccbb..a3e6e6136a47 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2323,6 +2323,10 @@ source "kernel/livepatch/Kconfig"
endmenu
+config ARCH_HAS_ADD_PAGES
+ def_bool y
+ depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
@@ -2343,6 +2347,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
def_bool y
depends on X86_64 && HUGETLB_PAGE && MIGRATION
+config ARCH_ENABLE_THP_MIGRATION
+ def_bool y
+ depends on X86_64 && TRANSPARENT_HUGEPAGE
+
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bbeae4a2bd01..5b4c44d419c5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1172,6 +1172,23 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+#endif
#endif
#define PKRU_AD_BIT 0x1
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2160c1fee920..972a4698c530 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -180,15 +180,21 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/*
* Encode and de-code a swap entry
*
- * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
- * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
- * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+ * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
* there. We also need to avoid using A and D because of an
* erratum where they can be incorrectly set by hardware on
* non-present PTEs.
+ *
+ * SD (1) in swp entry is used to store soft dirty bit, which helps us
+ * remember soft dirty over page migration
+ *
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
*/
#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
#define SWP_TYPE_BITS 5
@@ -204,7 +210,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
((type) << (SWP_TYPE_FIRST_BIT)) \
| ((offset) << SWP_OFFSET_FIRST_BIT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 399261ce904c..f1492473f10e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -99,15 +99,15 @@
/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
- * with swap entry format. On x86 bits 6 and 7 are *not* involved
- * into swap entry computation, but bit 6 is used for nonlinear
- * file mapping, so we borrow bit 7 for soft dirty tracking.
+ * with swap entry format. On x86 bits 1-4 are *not* involved
+ * into swap entry computation, but bit 7 is used for thp migration,
+ * so we borrow bit 1 for soft dirty tracking.
*
* Please note that this bit must be treated as swap dirty page
- * mark if and only if the PTE has present bit clear!
+ * mark if and only if the PTE/PMD has present bit clear!
*/
#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
+#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9ee84873de5..e371e7229042 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -340,6 +340,30 @@ extern void *memset(void *, int, size_t);
#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
/*
* find the first occurrence of byte 'c', or 1 past the area if none
*/
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2a8c822de1fc..f372a70a523f 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -58,6 +58,42 @@ extern void *__memcpy(void *to, const void *from, size_t len);
void *memset(void *s, int c, size_t n);
void *__memset(void *s, int c, size_t n);
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET64
+static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosq"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
void *__memmove(void *dest, const void *src, size_t count);
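The memset16/32/64 helpers added above are what the later driver conversions in this same series rely on (sym53c8xx and zram below). A minimal userspace sketch of the pattern — using a plain C loop in place of the x86 "rep stos" implementation, with a hypothetical `luntbl` array standing in for a driver table — could look like this:

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Portable fallback with the same semantics as the kernel helper;
 * the x86 version above does this with "rep stosl". */
static void *memset32(uint32_t *s, uint32_t v, size_t count)
{
	uint32_t *xs = s;

	while (count--)
		*xs++ = v;
	return s;
}

int main(void)
{
	uint32_t luntbl[64];	/* hypothetical stand-in for a driver table */
	size_t i;

	/* Replaces the open-coded loop:
	 *   for (i = 0; i < 64; i++)
	 *           luntbl[i] = 0xdeadbeef;
	 */
	memset32(luntbl, 0xdeadbeef, 64);

	for (i = 0; i < 64; i++)
		if (luntbl[i] != 0xdeadbeef)
			return 1;
	printf("all 64 entries filled\n");
	return 0;
}
```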
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7834f73efbf1..8315e2f517a7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2097,7 +2097,7 @@ static int allocate_logical_cpuid(int apicid)
/* Allocate a new cpuid. */
if (nr_logical_cpuids >= nr_cpu_ids) {
- WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. "
+ WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. "
"Processor %d/0x%x and the rest are ignored.\n",
nr_cpu_ids, nr_logical_cpuids, apicid);
return -EINVAL;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 6e8fcb6f7e1e..28dafed6c682 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -168,7 +168,7 @@ void __init setup_per_cpu_areas(void)
unsigned long delta;
int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 54b9e89d4d6b..cd6622c3204e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1461,7 +1461,7 @@ __init void prefill_possible_map(void)
/* nr_cpu_ids could be reduced via nr_cpus= */
if (possible > nr_cpu_ids) {
- pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
+ pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
possible, nr_cpu_ids);
possible = nr_cpu_ids;
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..048fbe8fc274 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -761,7 +761,7 @@ void __init paging_init(void)
* After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
* updating.
*/
-static void update_end_of_memory_vars(u64 start, u64 size)
+static void update_end_of_memory_vars(u64 start, u64 size)
{
unsigned long end_pfn = PFN_UP(start + size);
@@ -772,22 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
}
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size);
-
ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start, size);
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
return ret;
}
+
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ init_memory_mapping(start, start + size);
+
+ return add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
EXPORT_SYMBOL_GPL(arch_add_memory);
#define PAGE_INUSE 0xFD
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9b86e9b352e9..9f342ef1ad42 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -93,13 +93,14 @@ struct cfq_ttime {
* move this into the elevator for the rq sorting as well.
*/
struct cfq_rb_root {
- struct rb_root rb;
- struct rb_node *left;
+ struct rb_root_cached rb;
+ struct rb_node *rb_rightmost;
unsigned count;
u64 min_vdisktime;
struct cfq_ttime ttime;
};
-#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \
+#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
+ .rb_rightmost = NULL, \
.ttime = {.last_end_request = ktime_get_ns(),},}
/*
@@ -981,10 +982,9 @@ static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
static void update_min_vdisktime(struct cfq_rb_root *st)
{
- struct cfq_group *cfqg;
+ if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
+ struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
- if (st->left) {
- cfqg = rb_entry_cfqg(st->left);
st->min_vdisktime = max_vdisktime(st->min_vdisktime,
cfqg->vdisktime);
}
@@ -1166,46 +1166,28 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
}
}
-/*
- * The below is leftmost cache rbtree addon
- */
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
/* Service tree is empty */
if (!root->count)
return NULL;
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry(root->left, struct cfq_queue, rb_node);
-
- return NULL;
+ return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
}
static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
{
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry_cfqg(root->left);
-
- return NULL;
+ return rb_entry_cfqg(rb_first_cached(&root->rb));
}
-static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
- rb_erase(n, root);
+ if (root->rb_rightmost == n)
+ root->rb_rightmost = rb_prev(n);
+
+ rb_erase_cached(n, &root->rb);
RB_CLEAR_NODE(n);
-}
-static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
-{
- if (root->left == n)
- root->left = NULL;
- rb_erase_init(n, &root->rb);
--root->count;
}
@@ -1255,29 +1237,30 @@ cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
static void
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
- struct rb_node **node = &st->rb.rb_node;
+ struct rb_node **node = &st->rb.rb_root.rb_node;
struct rb_node *parent = NULL;
struct cfq_group *__cfqg;
s64 key = cfqg_key(st, cfqg);
- int left = 1;
+ bool leftmost = true, rightmost = true;
while (*node != NULL) {
parent = *node;
__cfqg = rb_entry_cfqg(parent);
- if (key < cfqg_key(st, __cfqg))
+ if (key < cfqg_key(st, __cfqg)) {
node = &parent->rb_left;
- else {
+ rightmost = false;
+ } else {
node = &parent->rb_right;
- left = 0;
+ leftmost = false;
}
}
- if (left)
- st->left = &cfqg->rb_node;
+ if (rightmost)
+ st->rb_rightmost = &cfqg->rb_node;
rb_link_node(&cfqg->rb_node, parent, node);
- rb_insert_color(&cfqg->rb_node, &st->rb);
+ rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
}
/*
@@ -1378,7 +1361,7 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
* so that groups get lesser vtime based on their weights, so that
* if group does not loose all if it was not continuously backlogged.
*/
- n = rb_last(&st->rb);
+ n = st->rb_rightmost;
if (n) {
__cfqg = rb_entry_cfqg(n);
cfqg->vdisktime = __cfqg->vdisktime +
@@ -2220,14 +2203,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_queue *__cfqq;
u64 rb_key;
struct cfq_rb_root *st;
- int left;
+ bool leftmost = true;
int new_cfqq = 1;
u64 now = ktime_get_ns();
st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
if (cfq_class_idle(cfqq)) {
rb_key = CFQ_IDLE_DELAY;
- parent = rb_last(&st->rb);
+ parent = st->rb_rightmost;
if (parent && parent != &cfqq->rb_node) {
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
rb_key += __cfqq->rb_key;
@@ -2261,10 +2244,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->service_tree = NULL;
}
- left = 1;
parent = NULL;
cfqq->service_tree = st;
- p = &st->rb.rb_node;
+ p = &st->rb.rb_root.rb_node;
while (*p) {
parent = *p;
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
@@ -2276,16 +2258,13 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
p = &parent->rb_left;
else {
p = &parent->rb_right;
- left = 0;
+ leftmost = false;
}
}
- if (left)
- st->left = &cfqq->rb_node;
-
cfqq->rb_key = rb_key;
rb_link_node(&cfqq->rb_node, parent, p);
- rb_insert_color(&cfqq->rb_node, &st->rb);
+ rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
st->count++;
if (add_front || !new_cfqq)
return;
@@ -2732,7 +2711,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
/* There is nothing to dispatch */
if (!st)
return NULL;
- if (RB_EMPTY_ROOT(&st->rb))
+ if (RB_EMPTY_ROOT(&st->rb.rb_root))
return NULL;
return cfq_rb_first(st);
}
@@ -3219,7 +3198,7 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_group *cfqg;
- if (RB_EMPTY_ROOT(&st->rb))
+ if (RB_EMPTY_ROOT(&st->rb.rb_root))
return NULL;
cfqg = cfq_rb_first_group(st);
update_min_vdisktime(st);
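The cfq conversion above follows the same shape as the other rb_root_cached users in this series: initialize with RB_ROOT_CACHED, track a `leftmost` flag while walking down, hand it to rb_insert_color_cached() so rb_first_cached() stays O(1), and use rb_erase_cached() on removal; cfq additionally keeps its own rb_rightmost pointer because the cached root only tracks the leftmost node. A minimal kernel-style sketch, with a hypothetical `struct foo` keyed by `key`:

```c
#include <linux/types.h>
#include <linux/rbtree.h>

struct foo {
	struct rb_node node;
	u64 key;
};

static struct rb_root_cached foo_tree = RB_ROOT_CACHED;

static void foo_insert(struct foo *new)
{
	struct rb_node **link = &foo_tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct foo *entry;

		parent = *link;
		entry = rb_entry(parent, struct foo, node);
		if (new->key < entry->key) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;	/* went right at least once */
		}
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color_cached(&new->node, &foo_tree, leftmost);
}

static struct foo *foo_first(void)
{
	struct rb_node *n = rb_first_cached(&foo_tree);	/* cached leftmost, no walk */

	return n ? rb_entry(n, struct foo, node) : NULL;
}

static void foo_remove(struct foo *victim)
{
	rb_erase_cached(&victim->node, &foo_tree);
}
```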
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 2c3b359b3536..321cd7b4d817 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -256,9 +256,9 @@ static ssize_t print_cpus_offline(struct device *dev,
buf[n++] = ',';
if (nr_cpu_ids == total_cpus-1)
- n += snprintf(&buf[n], len - n, "%d", nr_cpu_ids);
+ n += snprintf(&buf[n], len - n, "%u", nr_cpu_ids);
else
- n += snprintf(&buf[n], len - n, "%d-%d",
+ n += snprintf(&buf[n], len - n, "%u-%d",
nr_cpu_ids, total_cpus-1);
}
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d8dc83017d8d..3855902f2c5b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
"interleave_hit %lu\n"
"local_node %lu\n"
"other_node %lu\n",
- sum_zone_node_page_state(dev->id, NUMA_HIT),
- sum_zone_node_page_state(dev->id, NUMA_MISS),
- sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
- sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
- sum_zone_node_page_state(dev->id, NUMA_LOCAL),
- sum_zone_node_page_state(dev->id, NUMA_OTHER));
+ sum_zone_numa_state(dev->id, NUMA_HIT),
+ sum_zone_numa_state(dev->id, NUMA_MISS),
+ sum_zone_numa_state(dev->id, NUMA_FOREIGN),
+ sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
+ sum_zone_numa_state(dev->id, NUMA_LOCAL),
+ sum_zone_numa_state(dev->id, NUMA_OTHER));
}
static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
@@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev,
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
sum_zone_node_page_state(nid, i));
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+ sum_zone_numa_state(nid, i));
+#endif
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ n += sprintf(buf+n, "%s %lu\n",
+ vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NUMA_STAT_ITEMS],
node_page_state(pgdat, i));
return n;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 4063f3f59f4f..2981c27d3aae 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -175,20 +175,11 @@ static inline void update_used_max(struct zram *zram,
} while (old_max != cur_max);
}
-static inline void zram_fill_page(char *ptr, unsigned long len,
+static inline void zram_fill_page(void *ptr, unsigned long len,
unsigned long value)
{
- int i;
- unsigned long *page = (unsigned long *)ptr;
-
WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
-
- if (likely(value == 0)) {
- memset(ptr, 0, len);
- } else {
- for (i = 0; i < len / sizeof(*page); i++)
- page[i] = value;
- }
+ memset_l(ptr, value, len / sizeof(unsigned long));
}
static bool page_same_filled(void *ptr, unsigned long *element)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index e1cde6b80027..3b0f2ec6eec7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -51,7 +51,7 @@ struct amdgpu_mn {
/* objects protected by lock */
struct mutex lock;
- struct rb_root objects;
+ struct rb_root_cached objects;
};
struct amdgpu_mn_node {
@@ -76,8 +76,8 @@ static void amdgpu_mn_destroy(struct work_struct *work)
mutex_lock(&adev->mn_lock);
mutex_lock(&rmn->lock);
hash_del(&rmn->node);
- rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects,
- it.rb) {
+ rbtree_postorder_for_each_entry_safe(node, next_node,
+ &rmn->objects.rb_root, it.rb) {
list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
bo->mn = NULL;
list_del_init(&bo->mn_list);
@@ -221,7 +221,7 @@ static struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
rmn->mm = mm;
rmn->mn.ops = &amdgpu_mn_ops;
mutex_init(&rmn->lock);
- rmn->objects = RB_ROOT;
+ rmn->objects = RB_ROOT_CACHED;
r = __mmu_notifier_register(&rmn->mn, mm);
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 6b1343e5541d..b9a5a77eedaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2475,7 +2475,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
u64 flags;
uint64_t init_pde_value = 0;
- vm->va = RB_ROOT;
+ vm->va = RB_ROOT_CACHED;
vm->client_id = atomic64_inc_return(&adev->vm_manager.client_counter);
for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
vm->reserved_vmid[i] = NULL;
@@ -2596,10 +2596,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
amd_sched_entity_fini(vm->entity.sched, &vm->entity);
- if (!RB_EMPTY_ROOT(&vm->va)) {
+ if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
dev_err(adev->dev, "still active bo inside vm\n");
}
- rbtree_postorder_for_each_entry_safe(mapping, tmp, &vm->va, rb) {
+ rbtree_postorder_for_each_entry_safe(mapping, tmp,
+ &vm->va.rb_root, rb) {
list_del(&mapping->list);
amdgpu_vm_it_remove(mapping, &vm->va);
kfree(mapping);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index ba6691b58ee7..6716355403ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -118,7 +118,7 @@ struct amdgpu_vm_pt {
struct amdgpu_vm {
/* tree of virtual addresses mapped */
- struct rb_root va;
+ struct rb_root_cached va;
/* protecting invalidated */
spinlock_t status_lock;
diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index f794089d30ac..61a1c8ea74bc 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -169,7 +169,7 @@ INTERVAL_TREE_DEFINE(struct drm_mm_node, rb,
struct drm_mm_node *
__drm_mm_interval_first(const struct drm_mm *mm, u64 start, u64 last)
{
- return drm_mm_interval_tree_iter_first((struct rb_root *)&mm->interval_tree,
+ return drm_mm_interval_tree_iter_first((struct rb_root_cached *)&mm->interval_tree,
start, last) ?: (struct drm_mm_node *)&mm->head_node;
}
EXPORT_SYMBOL(__drm_mm_interval_first);
@@ -180,6 +180,7 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node,
struct drm_mm *mm = hole_node->mm;
struct rb_node **link, *rb;
struct drm_mm_node *parent;
+ bool leftmost = true;
node->__subtree_last = LAST(node);
@@ -196,9 +197,10 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node,
rb = &hole_node->rb;
link = &hole_node->rb.rb_right;
+ leftmost = false;
} else {
rb = NULL;
- link = &mm->interval_tree.rb_node;
+ link = &mm->interval_tree.rb_root.rb_node;
}
while (*link) {
@@ -208,14 +210,15 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node,
parent->__subtree_last = node->__subtree_last;
if (node->start < parent->start)
link = &parent->rb.rb_left;
- else
+ else {
link = &parent->rb.rb_right;
+			leftmost = false;
+ }
}
rb_link_node(&node->rb, rb, link);
- rb_insert_augmented(&node->rb,
- &mm->interval_tree,
- &drm_mm_interval_tree_augment);
+ rb_insert_augmented_cached(&node->rb, &mm->interval_tree, leftmost,
+ &drm_mm_interval_tree_augment);
}
#define RB_INSERT(root, member, expr) do { \
@@ -577,7 +580,7 @@ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new)
*new = *old;
list_replace(&old->node_list, &new->node_list);
- rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree);
+ rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root);
if (drm_mm_hole_follows(old)) {
list_replace(&old->hole_stack, &new->hole_stack);
@@ -863,7 +866,7 @@ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size)
mm->color_adjust = NULL;
INIT_LIST_HEAD(&mm->hole_stack);
- mm->interval_tree = RB_ROOT;
+ mm->interval_tree = RB_ROOT_CACHED;
mm->holes_size = RB_ROOT;
mm->holes_addr = RB_ROOT;
diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c
index d9100b565198..28f1226576f8 100644
--- a/drivers/gpu/drm/drm_vma_manager.c
+++ b/drivers/gpu/drm/drm_vma_manager.c
@@ -147,7 +147,7 @@ struct drm_vma_offset_node *drm_vma_offset_lookup_locked(struct drm_vma_offset_m
struct rb_node *iter;
unsigned long offset;
- iter = mgr->vm_addr_space_mm.interval_tree.rb_node;
+ iter = mgr->vm_addr_space_mm.interval_tree.rb_root.rb_node;
best = NULL;
while (likely(iter)) {
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index f152a38d7079..23fd18bd1b56 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -49,7 +49,7 @@ struct i915_mmu_notifier {
spinlock_t lock;
struct hlist_node node;
struct mmu_notifier mn;
- struct rb_root objects;
+ struct rb_root_cached objects;
struct workqueue_struct *wq;
};
@@ -123,7 +123,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
struct interval_tree_node *it;
LIST_HEAD(cancelled);
- if (RB_EMPTY_ROOT(&mn->objects))
+ if (RB_EMPTY_ROOT(&mn->objects.rb_root))
return;
/* interval ranges are inclusive, but invalidate range is exclusive */
@@ -172,7 +172,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
spin_lock_init(&mn->lock);
mn->mn.ops = &i915_gem_userptr_notifier;
- mn->objects = RB_ROOT;
+ mn->objects = RB_ROOT_CACHED;
mn->wq = alloc_workqueue("i915-userptr-release", WQ_UNBOUND, 0);
if (mn->wq == NULL) {
kfree(mn);
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index ec63bc5e9de7..8cbaeec090c9 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -924,7 +924,7 @@ struct radeon_vm_id {
struct radeon_vm {
struct mutex mutex;
- struct rb_root va;
+ struct rb_root_cached va;
/* protecting invalidated and freed */
spinlock_t status_lock;
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index 896f2cf51e4e..1d62288b7ee3 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -50,7 +50,7 @@ struct radeon_mn {
/* objects protected by lock */
struct mutex lock;
- struct rb_root objects;
+ struct rb_root_cached objects;
};
struct radeon_mn_node {
@@ -75,8 +75,8 @@ static void radeon_mn_destroy(struct work_struct *work)
mutex_lock(&rdev->mn_lock);
mutex_lock(&rmn->lock);
hash_del(&rmn->node);
- rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects,
- it.rb) {
+ rbtree_postorder_for_each_entry_safe(node, next_node,
+ &rmn->objects.rb_root, it.rb) {
interval_tree_remove(&node->it, &rmn->objects);
list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
@@ -205,7 +205,7 @@ static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
rmn->mm = mm;
rmn->mn.ops = &radeon_mn_ops;
mutex_init(&rmn->lock);
- rmn->objects = RB_ROOT;
+ rmn->objects = RB_ROOT_CACHED;
r = __mmu_notifier_register(&rmn->mn, mm);
if (r)
diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c
index 5e82b408d522..e5c0e635e371 100644
--- a/drivers/gpu/drm/radeon/radeon_vm.c
+++ b/drivers/gpu/drm/radeon/radeon_vm.c
@@ -1185,7 +1185,7 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm)
vm->ids[i].last_id_use = NULL;
}
mutex_init(&vm->mutex);
- vm->va = RB_ROOT;
+ vm->va = RB_ROOT_CACHED;
spin_lock_init(&vm->status_lock);
INIT_LIST_HEAD(&vm->invalidated);
INIT_LIST_HEAD(&vm->freed);
@@ -1232,10 +1232,11 @@ void radeon_vm_fini(struct radeon_device *rdev, struct radeon_vm *vm)
struct radeon_bo_va *bo_va, *tmp;
int i, r;
- if (!RB_EMPTY_ROOT(&vm->va)) {
+ if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
dev_err(rdev->dev, "still active bo inside vm\n");
}
- rbtree_postorder_for_each_entry_safe(bo_va, tmp, &vm->va, it.rb) {
+ rbtree_postorder_for_each_entry_safe(bo_va, tmp,
+ &vm->va.rb_root, it.rb) {
interval_tree_remove(&bo_va->it, &vm->va);
r = radeon_bo_reserve(bo_va->bo, false);
if (!r) {
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
index d176597b4d78..fc801920e341 100644
--- a/drivers/infiniband/core/umem_rbtree.c
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -72,7 +72,7 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
/* @last is not a part of the interval. See comment for function
* node_last.
*/
-int rbt_ib_umem_for_each_in_range(struct rb_root *root,
+int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
u64 start, u64 last,
umem_call_back cb,
void *cookie)
@@ -95,7 +95,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root *root,
}
EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
-struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root,
+struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
u64 addr, u64 length)
{
struct umem_odp_node *node;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index e0cb99860934..4ab30d832ac5 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -118,7 +118,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext->closing = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- ucontext->umem_tree = RB_ROOT;
+ ucontext->umem_tree = RB_ROOT_CACHED;
init_rwsem(&ucontext->umem_rwsem);
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 2f0d285dc278..175002c046ed 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -54,7 +54,7 @@
struct mmu_rb_handler {
struct mmu_notifier mn;
- struct rb_root root;
+ struct rb_root_cached root;
void *ops_arg;
spinlock_t lock; /* protect the RB tree */
struct mmu_rb_ops *ops;
@@ -108,7 +108,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
if (!handlr)
return -ENOMEM;
- handlr->root = RB_ROOT;
+ handlr->root = RB_ROOT_CACHED;
handlr->ops = ops;
handlr->ops_arg = ops_arg;
INIT_HLIST_NODE(&handlr->mn.hlist);
@@ -149,9 +149,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
INIT_LIST_HEAD(&del_list);
spin_lock_irqsave(&handler->lock, flags);
- while ((node = rb_first(&handler->root))) {
+ while ((node = rb_first_cached(&handler->root))) {
rbnode = rb_entry(node, struct mmu_rb_node, node);
- rb_erase(node, &handler->root);
+ rb_erase_cached(node, &handler->root);
/* move from LRU list to delete list */
list_move(&rbnode->list, &del_list);
}
@@ -300,7 +300,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
{
struct mmu_rb_handler *handler =
container_of(mn, struct mmu_rb_handler, mn);
- struct rb_root *root = &handler->root;
+ struct rb_root_cached *root = &handler->root;
struct mmu_rb_node *node, *ptr = NULL;
unsigned long flags;
bool added = false;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index c49db7c33979..4381c0a9a873 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -227,7 +227,7 @@ static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd,
vpn_last = vpn_start + npages - 1;
spin_lock(&pd->lock);
- usnic_uiom_remove_interval(&pd->rb_root, vpn_start,
+ usnic_uiom_remove_interval(&pd->root, vpn_start,
vpn_last, &rm_intervals);
usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd);
@@ -379,7 +379,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last,
(writable) ? IOMMU_WRITE : 0,
IOMMU_WRITE,
- &pd->rb_root,
+ &pd->root,
&sorted_diff_intervals);
if (err) {
usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n",
@@ -395,7 +395,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
}
- err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last,
+ err = usnic_uiom_insert_interval(&pd->root, vpn_start, vpn_last,
(writable) ? IOMMU_WRITE : 0);
if (err) {
usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n",
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index 45ca7c1613a7..431efe4143f4 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -55,7 +55,7 @@ struct usnic_uiom_dev {
struct usnic_uiom_pd {
struct iommu_domain *domain;
spinlock_t lock;
- struct rb_root rb_root;
+ struct rb_root_cached root;
struct list_head devs;
int dev_cnt;
};
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
index 42b4b4c4e452..d399523206c7 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
@@ -100,9 +100,9 @@ static int interval_cmp(void *priv, struct list_head *a, struct list_head *b)
}
static void
-find_intervals_intersection_sorted(struct rb_root *root, unsigned long start,
- unsigned long last,
- struct list_head *list)
+find_intervals_intersection_sorted(struct rb_root_cached *root,
+ unsigned long start, unsigned long last,
+ struct list_head *list)
{
struct usnic_uiom_interval_node *node;
@@ -118,7 +118,7 @@ find_intervals_intersection_sorted(struct rb_root *root, unsigned long start,
int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last,
int flags, int flag_mask,
- struct rb_root *root,
+ struct rb_root_cached *root,
struct list_head *diff_set)
{
struct usnic_uiom_interval_node *interval, *tmp;
@@ -175,7 +175,7 @@ void usnic_uiom_put_interval_set(struct list_head *intervals)
kfree(interval);
}
-int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start,
+int usnic_uiom_insert_interval(struct rb_root_cached *root, unsigned long start,
unsigned long last, int flags)
{
struct usnic_uiom_interval_node *interval, *tmp;
@@ -246,8 +246,9 @@ err_out:
return err;
}
-void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start,
- unsigned long last, struct list_head *removed)
+void usnic_uiom_remove_interval(struct rb_root_cached *root,
+ unsigned long start, unsigned long last,
+ struct list_head *removed)
{
struct usnic_uiom_interval_node *interval;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
index c0b0b876ab90..1d7fc3226bca 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
@@ -48,12 +48,12 @@ struct usnic_uiom_interval_node {
extern void
usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node,
- struct rb_root *root);
+ struct rb_root_cached *root);
extern void
usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node,
- struct rb_root *root);
+ struct rb_root_cached *root);
extern struct usnic_uiom_interval_node *
-usnic_uiom_interval_tree_iter_first(struct rb_root *root,
+usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start,
unsigned long last);
extern struct usnic_uiom_interval_node *
@@ -63,7 +63,7 @@ usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node,
* Inserts {start...last} into {root}. If there are overlaps,
* nodes will be broken up and merged
*/
-int usnic_uiom_insert_interval(struct rb_root *root,
+int usnic_uiom_insert_interval(struct rb_root_cached *root,
unsigned long start, unsigned long last,
int flags);
/*
@@ -71,7 +71,7 @@ int usnic_uiom_insert_interval(struct rb_root *root,
* 'removed.' The caller is responsibile for freeing memory of nodes in
* 'removed.'
*/
-void usnic_uiom_remove_interval(struct rb_root *root,
+void usnic_uiom_remove_interval(struct rb_root_cached *root,
unsigned long start, unsigned long last,
struct list_head *removed);
/*
@@ -81,7 +81,7 @@ void usnic_uiom_remove_interval(struct rb_root *root,
int usnic_uiom_get_intervals_diff(unsigned long start,
unsigned long last, int flags,
int flag_mask,
- struct rb_root *root,
+ struct rb_root_cached *root,
struct list_head *diff_set);
/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */
void usnic_uiom_put_interval_set(struct list_head *intervals);
diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c
index d723be352148..3087b0ba7b7f 100644
--- a/drivers/mtd/nand/denali.c
+++ b/drivers/mtd/nand/denali.c
@@ -980,9 +980,6 @@ static int denali_erase(struct mtd_info *mtd, int page)
return irq_status & INTR__ERASE_COMP ? 0 : NAND_STATUS_FAIL;
}
-#define DIV_ROUND_DOWN_ULL(ll, d) \
- ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })
-
static int denali_setup_data_interface(struct mtd_info *mtd, int chipnr,
const struct nand_data_interface *conf)
{
diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig
index 4b29a7182d7b..c6008f296605 100644
--- a/drivers/pps/Kconfig
+++ b/drivers/pps/Kconfig
@@ -19,9 +19,10 @@ menuconfig PPS
To compile this driver as a module, choose M here: the module
will be called pps_core.ko.
+if PPS
+
config PPS_DEBUG
bool "PPS debugging messages"
- depends on PPS
help
Say Y here if you want the PPS support to produce a bunch of debug
messages to the system log. Select this if you are having a
@@ -29,7 +30,7 @@ config PPS_DEBUG
config NTP_PPS
bool "PPS kernel consumer support"
- depends on PPS && !NO_HZ_COMMON
+ depends on !NO_HZ_COMMON
help
This option adds support for direct in-kernel time
synchronization using an external PPS signal.
@@ -39,3 +40,5 @@ config NTP_PPS
source drivers/pps/clients/Kconfig
source drivers/pps/generators/Kconfig
+
+endif # PPS
diff --git a/drivers/pps/clients/Kconfig b/drivers/pps/clients/Kconfig
index efec021ce662..7f02a9b1a1fd 100644
--- a/drivers/pps/clients/Kconfig
+++ b/drivers/pps/clients/Kconfig
@@ -3,11 +3,9 @@
#
comment "PPS clients support"
- depends on PPS
config PPS_CLIENT_KTIMER
tristate "Kernel timer client (Testing client, use for debug)"
- depends on PPS
help
If you say yes here you get support for a PPS debugging client
which uses a kernel timer to generate the PPS signal.
@@ -17,21 +15,20 @@ config PPS_CLIENT_KTIMER
config PPS_CLIENT_LDISC
tristate "PPS line discipline"
- depends on PPS && TTY
+ depends on TTY
help
If you say yes here you get support for a PPS source connected
with the CD (Carrier Detect) pin of your serial port.
config PPS_CLIENT_PARPORT
tristate "Parallel port PPS client"
- depends on PPS && PARPORT
+ depends on PARPORT
help
If you say yes here you get support for a PPS source connected
with the interrupt pin of your parallel port.
config PPS_CLIENT_GPIO
tristate "PPS client using GPIO"
- depends on PPS
help
If you say yes here you get support for a PPS source using
GPIO. To be useful you must also register a platform device
diff --git a/drivers/pps/generators/Kconfig b/drivers/pps/generators/Kconfig
index 86b59378e71f..e4c4f3dc0728 100644
--- a/drivers/pps/generators/Kconfig
+++ b/drivers/pps/generators/Kconfig
@@ -3,11 +3,10 @@
#
comment "PPS generators support"
- depends on PPS
config PPS_GENERATOR_PARPORT
tristate "Parallel port PPS signal generator"
- depends on PPS && PARPORT && BROKEN
+ depends on PARPORT && BROKEN
help
If you say yes here you get support for a PPS signal generator which
utilizes STROBE pin of a parallel port to send PPS signals. It uses
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 77a0335eb757..09ba494f8896 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5465,7 +5465,7 @@ static int sdebug_driver_probe(struct device * dev)
return error;
}
if (submit_queues > nr_cpu_ids) {
- pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%d\n",
+ pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%u\n",
my_name, submit_queues, nr_cpu_ids);
submit_queues = nr_cpu_ids;
}
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c
index 15ff285a9f8f..ca360daa6a25 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.c
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c
@@ -4985,13 +4985,10 @@ struct sym_lcb *sym_alloc_lcb (struct sym_hcb *np, u_char tn, u_char ln)
* Compute the bus address of this table.
*/
if (ln && !tp->luntbl) {
- int i;
-
tp->luntbl = sym_calloc_dma(256, "LUNTBL");
if (!tp->luntbl)
goto fail;
- for (i = 0 ; i < 64 ; i++)
- tp->luntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa));
+ memset32(tp->luntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64);
tp->head.luntbl_sa = cpu_to_scr(vtobus(tp->luntbl));
}
@@ -5077,8 +5074,7 @@ static void sym_alloc_lcb_tags (struct sym_hcb *np, u_char tn, u_char ln)
/*
* Initialize the task table with invalid entries.
*/
- for (i = 0 ; i < SYM_CONF_MAX_TASK ; i++)
- lp->itlq_tbl[i] = cpu_to_scr(np->notask_ba);
+ memset32(lp->itlq_tbl, cpu_to_scr(np->notask_ba), SYM_CONF_MAX_TASK);
/*
* Fill up the tag buffer with tag numbers.
@@ -5764,8 +5760,7 @@ int sym_hcb_attach(struct Scsi_Host *shost, struct sym_fw *fw, struct sym_nvram
goto attach_failed;
np->badlun_sa = cpu_to_scr(SCRIPTB_BA(np, resel_bad_lun));
- for (i = 0 ; i < 64 ; i++) /* 64 luns/target, no less */
- np->badluntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa));
+ memset32(np->badluntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64);
/*
* Prepare the bus address array that contains the bus
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9cb3f722dce1..d6dbb28245e6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1271,7 +1271,7 @@ static struct vhost_umem *vhost_umem_alloc(void)
if (!umem)
return NULL;
- umem->umem_tree = RB_ROOT;
+ umem->umem_tree = RB_ROOT_CACHED;
umem->numem = 0;
INIT_LIST_HEAD(&umem->umem_list);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index bb7c29b8b9fc..d59a9cc65f9d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_umem_node {
};
struct vhost_umem {
- struct rb_root umem_tree;
+ struct rb_root_cached umem_tree;
struct list_head umem_list;
int numem;
};
diff --git a/fs/aio.c b/fs/aio.c
index 8f0127526299..b5d69f28d8b1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
pgoff_t idx;
int rc;
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the ctx->completion_lock. That does not work with the
+ * migration workflow of MIGRATE_SYNC_NO_COPY.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
rc = 0;
/* mapping->private_lock here protects against the kioctx teardown. */
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index beef981aa54f..4737615f0eaa 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -11,10 +11,21 @@
#include <linux/auto_fs4.h>
#include <linux/auto_dev-ioctl.h>
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/completion.h>
+#include <asm/current.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -24,17 +35,6 @@
#define AUTOFS_DEV_IOCTL_IOC_COUNT \
(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <asm/current.h>
-#include <linux/uaccess.h>
-
#ifdef pr_fmt
#undef pr_fmt
#endif
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index dd9f1bebb5a3..b7c816f39404 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -93,17 +93,17 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
* at the end of the struct.
*/
static struct autofs_dev_ioctl *
- copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
- if (copy_from_user(&tmp, in, sizeof(tmp)))
+ if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE))
return ERR_PTR(-EFAULT);
- if (tmp.size < sizeof(tmp))
+ if (tmp.size < AUTOFS_DEV_IOCTL_SIZE)
return ERR_PTR(-EINVAL);
- if (tmp.size > (PATH_MAX + sizeof(tmp)))
+ if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX)
return ERR_PTR(-ENAMETOOLONG);
res = memdup_user(in, tmp.size);
@@ -133,8 +133,8 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
goto out;
}
- if (param->size > sizeof(*param)) {
- err = invalid_str(param->path, param->size - sizeof(*param));
+ if (param->size > AUTOFS_DEV_IOCTL_SIZE) {
+ err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE);
if (err) {
pr_warn(
"path string terminator missing for cmd(0x%08x)\n",
@@ -258,11 +258,6 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
if (err)
goto out;
- /*
- * Find autofs super block that has the device number
- * corresponding to the autofs fs we want to open.
- */
-
filp = dentry_open(&path, O_RDONLY, current_cred());
path_put(&path);
if (IS_ERR(filp)) {
@@ -451,7 +446,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
dev_t devid;
int err = -ENOENT;
- if (param->size <= sizeof(*param)) {
+ if (param->size <= AUTOFS_DEV_IOCTL_SIZE) {
err = -EINVAL;
goto out;
}
@@ -539,7 +534,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
unsigned int devid, magic;
int err = -ENOENT;
- if (param->size <= sizeof(*param)) {
+ if (param->size <= AUTOFS_DEV_IOCTL_SIZE) {
err = -EINVAL;
goto out;
}
@@ -628,10 +623,6 @@ static int _autofs_dev_ioctl(unsigned int command,
ioctl_fn fn = NULL;
int err = 0;
- /* only root can play with this */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
cmd = _IOC_NR(command);
@@ -640,6 +631,14 @@ static int _autofs_dev_ioctl(unsigned int command,
return -ENOTTY;
}
+ /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD
+ * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
+ */
+ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+ cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* Copy the parameters into kernel space. */
param = copy_dev_ioctl(user);
if (IS_ERR(param))
@@ -706,7 +705,8 @@ out:
return err;
}
-static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
+static long autofs_dev_ioctl(struct file *file, unsigned int command,
+ unsigned long u)
{
int err;
@@ -715,9 +715,10 @@ static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
}
#ifdef CONFIG_COMPAT
-static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u)
+static long autofs_dev_ioctl_compat(struct file *file, unsigned int command,
+ unsigned long u)
{
- return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u));
+ return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u));
}
#else
#define autofs_dev_ioctl_compat NULL
@@ -733,7 +734,8 @@ static const struct file_operations _dev_ioctl_fops = {
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
.name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .fops = &_dev_ioctl_fops,
+ .mode = 0644,
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 604a176df0c2..ce6537c50ec1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -192,13 +192,11 @@ static int decompress_exec(
memset(&strm, 0, sizeof(strm));
strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
- if (strm.workspace == NULL) {
- pr_debug("no memory for decompress workspace\n");
+ if (!strm.workspace)
return -ENOMEM;
- }
+
buf = kmalloc(LBUFSIZE, GFP_KERNEL);
- if (buf == NULL) {
- pr_debug("no memory for read buffer\n");
+ if (!buf) {
retval = -ENOMEM;
goto out_free;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index adbe328b957c..2fabd19cdeea 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -205,7 +205,7 @@ struct eventpoll {
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
- struct rb_root rbr;
+ struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
@@ -796,7 +796,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
- rb_erase(&epi->rbn, &ep->rbr);
+ rb_erase_cached(&epi->rbn, &ep->rbr);
spin_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
@@ -840,7 +840,7 @@ static void ep_free(struct eventpoll *ep)
/*
* Walks through the whole tree by unregistering poll callbacks.
*/
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_unregister_pollwait(ep, epi);
@@ -856,7 +856,7 @@ static void ep_free(struct eventpoll *ep)
* a lockdep warning.
*/
mutex_lock(&ep->mtx);
- while ((rbp = rb_first(&ep->rbr)) != NULL) {
+ while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
cond_resched();
@@ -963,7 +963,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f)
struct rb_node *rbp;
mutex_lock(&ep->mtx);
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
struct inode *inode = file_inode(epi->ffd.file);
@@ -1040,7 +1040,7 @@ static int ep_alloc(struct eventpoll **pep)
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
- ep->rbr = RB_ROOT;
+ ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
@@ -1066,7 +1066,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
struct epoll_filefd ffd;
ep_set_ffd(&ffd, file, fd);
- for (rbp = ep->rbr.rb_node; rbp; ) {
+ for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
epi = rb_entry(rbp, struct epitem, rbn);
kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
if (kcmp > 0)
@@ -1088,7 +1088,7 @@ static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long t
struct rb_node *rbp;
struct epitem *epi;
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (epi->ffd.fd == tfd) {
if (toff == 0)
@@ -1273,20 +1273,22 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
int kcmp;
- struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
+ struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
struct epitem *epic;
+ bool leftmost = true;
while (*p) {
parent = *p;
epic = rb_entry(parent, struct epitem, rbn);
kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
- if (kcmp > 0)
+ if (kcmp > 0) {
p = &parent->rb_right;
- else
+ leftmost = false;
+ } else
p = &parent->rb_left;
}
rb_link_node(&epi->rbn, parent, p);
- rb_insert_color(&epi->rbn, &ep->rbr);
+ rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}
@@ -1530,7 +1532,7 @@ error_remove_epi:
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
- rb_erase(&epi->rbn, &ep->rbr);
+ rb_erase_cached(&epi->rbn, &ep->rbr);
error_unregister:
ep_unregister_pollwait(ep, epi);
@@ -1878,7 +1880,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
mutex_lock_nested(&ep->mtx, call_nests + 1);
ep->visited = 1;
list_add(&ep->visited_list_link, &visited_list);
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
ep_tovisit = epi->ffd.file->private_data;
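The epoll conversion above is one instance of a recurring recipe for lib/rbtree's cached variant: track whether the new node ever descends to the right, and feed that into rb_insert_color_cached() so rb_first_cached() stays O(1). A minimal sketch of that recipe, with an illustrative struct and key field that are not taken from the patch:

#include <linux/rbtree.h>

struct foo {
	struct rb_node node;
	int key;
};

static void foo_insert(struct rb_root_cached *root, struct foo *new)
{
	struct rb_node **p = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*p) {
		struct foo *cur = rb_entry(*p, struct foo, node);

		parent = *p;
		if (new->key < cur->key) {
			p = &parent->rb_left;
		} else {
			p = &parent->rb_right;
			leftmost = false;	/* went right, cannot be the new leftmost */
		}
	}
	rb_link_node(&new->node, parent, p);
	rb_insert_color_cached(&new->node, root, leftmost);
}

Erase and first-node lookups then pair up as rb_erase_cached() and rb_first_cached(), exactly as the hunks above do.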
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a791aac4c5af..fb96bb71da00 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2253,7 +2253,10 @@ int f2fs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
set_page_private(newpage, page_private(page));
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6a7152d0c250..02c066663a3a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -19,6 +19,8 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/namei.h>
+#include <linux/kernel.h>
+
#include "fat.h"
static inline unsigned long vfat_d_version(struct dentry *dentry)
@@ -510,10 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
struct nls_table *nls)
{
const unsigned char *ip;
- unsigned char nc;
unsigned char *op;
- unsigned int ec;
- int i, k, fill;
+ int i, fill;
int charlen;
if (utf8) {
@@ -530,33 +530,22 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
i < len && *outlen < FAT_LFN_LEN;
*outlen += 1) {
if (escape && (*ip == ':')) {
+ u8 uc[2];
+
if (i > len - 5)
return -EINVAL;
- ec = 0;
- for (k = 1; k < 5; k++) {
- nc = ip[k];
- ec <<= 4;
- if (nc >= '0' && nc <= '9') {
- ec |= nc - '0';
- continue;
- }
- if (nc >= 'a' && nc <= 'f') {
- ec |= nc - ('a' - 10);
- continue;
- }
- if (nc >= 'A' && nc <= 'F') {
- ec |= nc - ('A' - 10);
- continue;
- }
+
+ if (hex2bin(uc, ip + 1, 2) < 0)
return -EINVAL;
- }
- *op++ = ec & 0xFF;
- *op++ = ec >> 8;
+
+ *(wchar_t *)op = uc[0] << 8 | uc[1];
+
+ op += 2;
ip += 5;
i += 5;
} else {
charlen = nls->char2uni(ip, len - i,
- (wchar_t *)op);
+ (wchar_t *)op);
if (charlen < 0)
return -EINVAL;
ip += charlen;
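The rewritten escape parser above leans on hex2bin() from lib/hexdump.c (hence the new <linux/kernel.h> include): it consumes two hex digits per destination byte and returns a negative value on any non-hex input. A small illustrative fragment, with a hard-coded escape sequence that is only an example:

	u8 uc[2];

	/* the four hex digits following a ':' escape, e.g. "266c" */
	if (hex2bin(uc, "266c", 2) < 0)
		return -EINVAL;
	/* uc[0] == 0x26, uc[1] == 0x6c -> code point 0x266c */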
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7c02b3f738e1..59073e9f01a4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -334,7 +334,7 @@ static void remove_huge_page(struct page *page)
}
static void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
@@ -498,7 +498,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
@@ -523,7 +523,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT);
@@ -830,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/fs/inode.c b/fs/inode.c
index 6a1626e0edaf..210054157a49 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -353,7 +353,7 @@ void address_space_init_once(struct address_space *mapping)
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
- mapping->i_mmap = RB_ROOT;
+ mapping->i_mmap = RB_ROOT_CACHED;
}
EXPORT_SYMBOL(address_space_init_once);
diff --git a/fs/namei.c b/fs/namei.c
index ddb6a7c2b3d4..1180f9c58093 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd,
* of the daemon to instantiate them before they can be used.
*/
if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
- LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
- path->dentry->d_inode)
- return -EISDIR;
+ LOOKUP_OPEN | LOOKUP_CREATE |
+ LOOKUP_AUTOMOUNT))) {
+ /* A positive dentry that isn't meant to trigger an
+ * automount: EISDIR will allow it to be used;
+ * otherwise there's no mount here "now", so return
+ * ENOENT.
+ */

+ if (path->dentry->d_inode)
+ return -EISDIR;
+ else
+ return -ENOENT;
+ }
if (path->dentry->d_sb->s_user_ns != &init_user_ns)
return -EACCES;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e3cda0b5968f..793a67574668 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -40,8 +40,8 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry
static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
{
- return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
- subdir_node);
+ return rb_entry_safe(rb_first_cached(&dir->subdir),
+ struct proc_dir_entry, subdir_node);
}
static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
@@ -54,7 +54,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
const char *name,
unsigned int len)
{
- struct rb_node *node = dir->subdir.rb_node;
+ struct rb_node *node = dir->subdir.rb_root.rb_node;
while (node) {
struct proc_dir_entry *de = rb_entry(node,
@@ -75,8 +75,9 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
static bool pde_subdir_insert(struct proc_dir_entry *dir,
struct proc_dir_entry *de)
{
- struct rb_root *root = &dir->subdir;
- struct rb_node **new = &root->rb_node, *parent = NULL;
+ struct rb_root_cached *root = &dir->subdir;
+ struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+ bool leftmost = true;
/* Figure out where to put new node */
while (*new) {
@@ -88,15 +89,16 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
parent = *new;
if (result < 0)
new = &(*new)->rb_left;
- else if (result > 0)
+ else if (result > 0) {
new = &(*new)->rb_right;
- else
+ leftmost = false;
+ } else
return false;
}
/* Add new node and rebalance tree. */
rb_link_node(&de->subdir_node, parent, new);
- rb_insert_color(&de->subdir_node, root);
+ rb_insert_color_cached(&de->subdir_node, root, leftmost);
return true;
}
@@ -369,7 +371,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
- ent->subdir = RB_ROOT;
+ ent->subdir = RB_ROOT_CACHED;
atomic_set(&ent->count, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
@@ -499,6 +501,14 @@ out:
}
EXPORT_SYMBOL(proc_create_data);
+struct proc_dir_entry *proc_create(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct file_operations *proc_fops)
+{
+ return proc_create_data(name, mode, parent, proc_fops, NULL);
+}
+EXPORT_SYMBOL(proc_create);
+
void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
de->size = size;
@@ -545,7 +555,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
- rb_erase(&de->subdir_node, &parent->subdir);
+ rb_erase_cached(&de->subdir_node, &parent->subdir);
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -582,13 +592,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ rb_erase_cached(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
- rb_erase(&next->subdir_node, &de->subdir);
+ rb_erase_cached(&next->subdir_node, &de->subdir);
de = next;
continue;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2cbfcd32e884..a34195e92b20 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -40,7 +40,7 @@ struct proc_dir_entry {
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
struct proc_dir_entry *parent;
- struct rb_root subdir;
+ struct rb_root_cached subdir;
struct rb_node subdir_node;
void *data;
atomic_t count; /* use count */
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d72fc40241d9..a2bf369c923d 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -196,7 +196,7 @@ static __net_init int proc_net_ns_init(struct net *net)
if (!netd)
goto out;
- netd->subdir = RB_ROOT;
+ netd->subdir = RB_ROOT_CACHED;
netd->data = net;
netd->nlink = 2;
netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index deecb397daa3..926fb27f4ca2 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -210,7 +210,7 @@ struct proc_dir_entry proc_root = {
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
- .subdir = RB_ROOT,
+ .subdir = RB_ROOT_CACHED,
.name = "/proc",
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a290966f91ec..7b40e11ede9b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -268,8 +268,7 @@ static int do_maps_open(struct inode *inode, struct file *file,
* Indicate if the VMA is a stack for the given task; for
* /proc/PID/maps that is the stack of the main task.
*/
-static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma)
+static int is_stack(struct vm_area_struct *vma)
{
/*
* We make no effort to guess what a given thread considers to be
@@ -302,7 +301,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
- struct proc_maps_private *priv = m->private;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
@@ -350,7 +348,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- if (is_stack(priv, vma))
+ if (is_stack(vma))
name = "[stack]";
}
@@ -549,6 +547,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ else if (is_device_private_entry(swpent))
+ page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = find_get_entry(vma->vm_file->f_mapping,
@@ -608,13 +608,14 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
- smaps_pmd_entry(pmd, addr, walk);
+ if (pmd_present(*pmd))
+ smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
- return 0;
+ goto out;
}
if (pmd_trans_unstable(pmd))
- return 0;
+ goto out;
/*
* The mmap_sem held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
@@ -624,6 +625,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (; addr != end; pte++, addr += PAGE_SIZE)
smaps_pte_entry(pte, addr, walk);
pte_unmap_unlock(pte - 1, ptl);
+out:
cond_resched();
return 0;
}
@@ -712,6 +714,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ else if (is_device_private_entry(swpent))
+ page = device_private_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -977,17 +981,22 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
{
pmd_t pmd = *pmdp;
- /* See comment in change_huge_pmd() */
- pmdp_invalidate(vma, addr, pmdp);
- if (pmd_dirty(*pmdp))
- pmd = pmd_mkdirty(pmd);
- if (pmd_young(*pmdp))
- pmd = pmd_mkyoung(pmd);
-
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_clear_soft_dirty(pmd);
-
- set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ if (pmd_present(pmd)) {
+ /* See comment in change_huge_pmd() */
+ pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(*pmdp))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(*pmdp))
+ pmd = pmd_mkyoung(pmd);
+
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_soft_dirty(pmd);
+
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ pmd = pmd_swp_clear_soft_dirty(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
}
#else
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
@@ -1012,6 +1021,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
goto out;
}
+ if (!pmd_present(*pmd))
+ goto out;
+
page = pmd_page(*pmd);
/* Clear accessed and referenced bits. */
@@ -1254,7 +1266,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
- page = vm_normal_page(vma, addr, pte);
+ page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
@@ -1267,6 +1279,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
+
+ if (is_device_private_entry(entry))
+ page = device_private_entry_to_page(entry);
}
if (page && !PageAnon(page))
@@ -1293,27 +1308,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (ptl) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
+ struct page *page = NULL;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
- /*
- * Currently pmd for thp is always present because thp
- * can not be swapped-out, migrated, or HWPOISONed
- * (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
if (pmd_present(pmd)) {
- struct page *page = pmd_page(pmd);
-
- if (page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ page = pmd_page(pmd);
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ else if (is_swap_pmd(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ frame = swp_type(entry) |
+ (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ flags |= PM_SWAP;
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ page = migration_entry_to_page(entry);
+ }
+#endif
+
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
@@ -1746,7 +1767,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
- } else if (is_stack(proc_priv, vma)) {
+ } else if (is_stack(vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 23266694db11..dea90b566a6e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -125,8 +125,7 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma)
+static int is_stack(struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
@@ -178,7 +177,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm && is_stack(priv, vma)) {
+ } else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
seq_printf(m, "[stack]");
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f90a466ea5db..a02aa59d1e24 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
}
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
#endif
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5419e7da82ba..ef4b48d1ea42 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -381,8 +381,26 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* in __get_user_pages if userfaultfd_release waits on the
* caller of handle_userfault to release the mmap_sem.
*/
- if (unlikely(ACCESS_ONCE(ctx->released)))
+ if (unlikely(ACCESS_ONCE(ctx->released))) {
+ /*
+ * Don't return VM_FAULT_SIGBUS in this case, so a non
+ * cooperative manager can close the uffd after the
+ * last UFFDIO_COPY, without risking triggering an
+ * involuntary SIGBUS if the process was starting the
+ * userfaultfd while the userfaultfd was still armed
+ * (but after the last UFFDIO_COPY). If the uffd
+ * wasn't already closed when the userfault reached
+ * this point, that would normally be solved by
+ * userfaultfd_must_wait returning 'false'.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would instead be forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd.
+ */
+ ret = VM_FAULT_NOPAGE;
goto out;
+ }
/*
* Check that we can return VM_FAULT_RETRY.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 4d7bb98f4134..8e0243036564 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -630,7 +630,24 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
#define arch_start_context_switch(prev) do {} while (0)
#endif
-#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+ return pmd;
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd;
+}
+#endif
+#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
static inline int pte_soft_dirty(pte_t pte)
{
return 0;
@@ -675,6 +692,21 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
return pte;
}
+
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+ return pmd;
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd;
+}
#endif
#ifndef __HAVE_PFNMAP_TRACKING
@@ -846,7 +878,23 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
barrier();
#endif
- if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
+ /*
+ * !pmd_present() checks for pmd migration entries
+ *
+ * The complete check uses is_pmd_migration_entry() in linux/swapops.h
+ * But using that requires moving current function and pmd_trans_unstable()
+ * to linux/swapops.h to resovle dependency, which is too much code move.
+ *
+ * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
+ * because !pmd_present() pages can only be under migration not swapped
+ * out.
+ *
+ * pmd_none() is preserved for future condition checks on pmd migration
+ * entries and to avoid confusion with this function's name, although it is
+ * redundant with !pmd_present().
+ */
+ if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
+ (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval)))
return 1;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h
index 49b292e98fec..8d10fc97801c 100644
--- a/include/drm/drm_mm.h
+++ b/include/drm/drm_mm.h
@@ -172,7 +172,7 @@ struct drm_mm {
* according to the (increasing) start address of the memory node. */
struct drm_mm_node head_node;
/* Keep an interval_tree for fast lookup of drm_mm_nodes by address. */
- struct rb_root interval_tree;
+ struct rb_root_cached interval_tree;
struct rb_root holes_size;
struct rb_root holes_addr;
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 5797ca6fdfe2..700cf5f67118 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -361,6 +361,38 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen,
}
/*
+ * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
+ *
+ * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
+ * integers in 32-bit environment, and 64-bit integers in 64-bit one.
+ *
+ * There are four combinations of endianness and length of the word in linux
+ * ABIs: LE64, BE64, LE32 and BE32.
+ *
+ * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
+ * bitmaps and therefore don't require any special handling.
+ *
+ * On 32-bit kernels the 32-bit LE ABI orders the lo word of a 64-bit number in
+ * memory prior to the hi word, and 32-bit BE orders the hi word prior to the lo
+ * word. The bitmap, on the other hand, is represented as an array of 32-bit
+ * words and the position of bit N may therefore be calculated as: word #(N/32)
+ * and bit #(N%32) in that word. For example, bit #42 is located at the 10th
+ * position of the 2nd word.
+ * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
+ * values in memory as it usually does. But for BE we need to swap hi and lo
+ * words manually.
+ *
+ * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
+ * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps
+ * hi and lo words, as is expected by bitmap.
+ */
+#if __BITS_PER_LONG == 64
+#define BITMAP_FROM_U64(n) (n)
+#else
+#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \
+ ((unsigned long) ((u64)(n) >> 32))
+#endif
+
+/*
* bitmap_from_u64 - Check and swap words within u64.
* @mask: source bitmap
* @dst: destination bitmap
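A worked example of the BITMAP_FROM_U64() helper introduced above, using the same mask value as the GENMASK_ULL() comment in bitops.h (the array name is only illustrative):

	/* bits 21..39 set */
	static const unsigned long frm[] = { BITMAP_FROM_U64(0x000000ffffe00000ULL) };

	/* 64-bit kernel: frm[0] == 0x000000ffffe00000UL */
	/* 32-bit kernel: frm[0] == 0xffe00000UL, frm[1] == 0x000000ffUL (lo word first) */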
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a83c822c35c2..8fbe259b197c 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -19,10 +19,11 @@
* GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
*/
#define GENMASK(h, l) \
- (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+ (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
#define GENMASK_ULL(h, l) \
- (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
+ (((~0ULL) - (1ULL << (l)) + 1) & \
+ (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
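As a quick sanity check, the rewritten GENMASK_ULL() still yields the value quoted in the kerneldoc comment above; working through GENMASK_ULL(39, 21):

	/* (~0ULL) - (1ULL << 21) + 1               == 0xffffffffffe00000 (i.e. ~0ULL << 21) */
	/* (~0ULL) >> (BITS_PER_LONG_LONG - 1 - 39) == 0x000000ffffffffff */
	/* AND of the two                           == 0x000000ffffe00000 */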
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 392041475c72..ffd215988392 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -3,5 +3,9 @@
#include <uapi/linux/byteorder/big_endian.h>
+#ifndef CONFIG_CPU_BIG_ENDIAN
+#warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN
+#endif
+
#include <linux/byteorder/generic.h>
#endif /* _LINUX_BYTEORDER_BIG_ENDIAN_H */
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 08057377aa23..ba910bb9aad0 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -3,5 +3,9 @@
#include <uapi/linux/byteorder/little_endian.h>
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#warning inconsistent configuration, CONFIG_CPU_BIG_ENDIAN is set
+#endif
+
#include <linux/byteorder/generic.h>
#endif /* _LINUX_BYTEORDER_LITTLE_ENDIAN_H */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 4bf4479a3a80..cd415b733c2a 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -32,15 +32,15 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp)
#if NR_CPUS == 1
-#define nr_cpu_ids 1
+#define nr_cpu_ids 1U
#else
-extern int nr_cpu_ids;
+extern unsigned int nr_cpu_ids;
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also,
* not all bits may be allocated. */
-#define nr_cpumask_bits ((unsigned int)nr_cpu_ids)
+#define nr_cpumask_bits nr_cpu_ids
#else
#define nr_cpumask_bits ((unsigned int)NR_CPUS)
#endif
@@ -178,20 +178,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
-/**
- * cpumask_next - get the next cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
- * @srcp: the cpumask pointer
- *
- * Returns >= nr_cpu_ids if no further cpus set.
- */
-static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
-{
- /* -1 is a legal arg here. */
- if (n != -1)
- cpumask_check(n);
- return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
-}
+unsigned int cpumask_next(int n, const struct cpumask *srcp);
/**
* cpumask_next_zero - get the next unset cpu in a cpumask
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 509434aaf5a4..2d0e6748e46e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -392,7 +392,7 @@ struct address_space {
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
atomic_t i_mmap_writable;/* count VM_SHARED mappings */
- struct rb_root i_mmap; /* tree of private and shared mappings */
+ struct rb_root_cached i_mmap; /* tree of private and shared mappings */
struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
@@ -487,7 +487,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping)
*/
static inline int mapping_mapped(struct address_space *mapping)
{
- return !RB_EMPTY_ROOT(&mapping->i_mmap);
+ return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
}
/*
@@ -3043,8 +3043,7 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat)
static inline int vfs_fstatat(int dfd, const char __user *filename,
struct kstat *stat, int flags)
{
- return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- stat, STATX_BASIC_STATS);
+ return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
}
static inline int vfs_fstat(int fd, struct kstat *stat)
{
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000000000000..96e69979f84d
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,520 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/*
+ * Heterogeneous Memory Management (HMM)
+ *
+ * See Documentation/vm/hmm.txt for reasons and an overview of what HMM is and
+ * what it is for. Here we focus on the HMM API description, with some explanation of
+ * the underlying implementation.
+ *
+ * Short description: HMM provides a set of helpers to share a virtual address
+ * space between CPU and a device, so that the device can access any valid
+ * address of the process (while still obeying memory protection). HMM also
+ * provides helpers to migrate process memory to device memory, and back. Each
+ * set of functionality (address space mirroring, and migration to and from
+ * device memory) can be used independently of the other.
+ *
+ *
+ * HMM address space mirroring API:
+ *
+ * Use HMM address space mirroring if you want to mirror a range of the CPU page
+ * table of a process into a device page table. Here, "mirror" means "keep
+ * synchronized". Prerequisites: the device must provide the ability to write-
+ * protect its page tables (at PAGE_SIZE granularity), and must be able to
+ * recover from the resulting potential page faults.
+ *
+ * HMM guarantees that at any point in time, a given virtual address points to
+ * either the same memory in both CPU and device page tables (that is: CPU and
+ * device page tables each point to the same pages), or that one page table (CPU
+ * or device) points to no entry, while the other still points to the old page
+ * for the address. The latter case happens when the CPU page table update
+ * happens first, and then the update is mirrored over to the device page table.
+ * This does not cause any issue, because the CPU page table cannot start
+ * pointing to a new page until the device page table is invalidated.
+ *
+ * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
+ * updates to each device driver that has registered a mirror. It also provides
+ * some API calls to help with taking a snapshot of the CPU page table, and to
+ * synchronize with any updates that might happen concurrently.
+ *
+ *
+ * HMM migration to and from device memory:
+ *
+ * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
+ * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
+ * of the device memory, and allows the device driver to manage its memory
+ * using those struct pages. Having struct pages for device memory makes
+ * migration easier. Because that memory is not addressable by the CPU it must
+ * never be pinned to the device; in other words, any CPU page fault can always
+ * cause the device memory to be migrated (copied/moved) back to regular memory.
+ *
+ * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
+ * allows use of a device DMA engine to perform the copy operation between
+ * regular system memory and device memory.
+ */
+#ifndef LINUX_HMM_H
+#define LINUX_HMM_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_HMM)
+
+#include <linux/device.h>
+#include <linux/migrate.h>
+#include <linux/memremap.h>
+#include <linux/completion.h>
+
+struct hmm;
+
+/*
+ * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
+ *
+ * Flags:
+ * HMM_PFN_VALID: pfn is valid
+ * HMM_PFN_READ: CPU page table has read permission set
+ * HMM_PFN_WRITE: CPU page table has write permission set
+ * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
+ * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
+ * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
+ * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ * be mirrored by a device, because the entry will never have HMM_PFN_VALID
+ * set and the pfn value is undefined.
+ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
+ */
+typedef unsigned long hmm_pfn_t;
+
+#define HMM_PFN_VALID (1 << 0)
+#define HMM_PFN_READ (1 << 1)
+#define HMM_PFN_WRITE (1 << 2)
+#define HMM_PFN_ERROR (1 << 3)
+#define HMM_PFN_EMPTY (1 << 4)
+#define HMM_PFN_SPECIAL (1 << 5)
+#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
+#define HMM_PFN_SHIFT 7
+
+/*
+ * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
+ * @pfn: hmm_pfn_t to convert to struct page
+ * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise
+ *
+ * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page
+ * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL.
+ */
+static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn)
+{
+ if (!(pfn & HMM_PFN_VALID))
+ return NULL;
+ return pfn_to_page(pfn >> HMM_PFN_SHIFT);
+}
+
+/*
+ * hmm_pfn_t_to_pfn() - return the pfn value stored in a hmm_pfn_t
+ * @pfn: hmm_pfn_t to extract pfn from
+ * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise
+ */
+static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn)
+{
+ if (!(pfn & HMM_PFN_VALID))
+ return -1UL;
+ return (pfn >> HMM_PFN_SHIFT);
+}
+
+/*
+ * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page
+ * @page: struct page pointer for which to create the hmm_pfn_t
+ * Returns: valid hmm_pfn_t for the page
+ */
+static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page)
+{
+ return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID;
+}
+
+/*
+ * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn
+ * @pfn: pfn value for which to create the hmm_pfn_t
+ * Returns: valid hmm_pfn_t for the pfn
+ */
+static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn)
+{
+ return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID;
+}
+
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+/*
+ * Mirroring: how to synchronize device page table with CPU page table.
+ *
+ * A device driver that is participating in HMM mirroring must always
+ * synchronize with CPU page table updates. For this, device drivers can either
+ * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
+ * drivers can decide to register one mirror per device per process, or just
+ * one mirror per process for a group of devices. The pattern is:
+ *
+ * int device_bind_address_space(..., struct mm_struct *mm, ...)
+ * {
+ * struct device_address_space *das;
+ *
+ * // Device driver specific initialization, and allocation of das
+ * // which contains an hmm_mirror struct as one of its fields.
+ * ...
+ *
+ * ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops);
+ * if (ret) {
+ * // Cleanup on error
+ * return ret;
+ * }
+ *
+ * // Other device driver specific initialization
+ * ...
+ * }
+ *
+ * Once an hmm_mirror is registered for an address space, the device driver
+ * will get callbacks through sync_cpu_device_pagetables() operation (see
+ * hmm_mirror_ops struct).
+ *
+ * Device driver must not free the struct containing the hmm_mirror struct
+ * before calling hmm_mirror_unregister(). The expected usage is to do that when
+ * the device driver is unbinding from an address space.
+ *
+ *
+ * void device_unbind_address_space(struct device_address_space *das)
+ * {
+ * // Device driver specific cleanup
+ * ...
+ *
+ * hmm_mirror_unregister(&das->mirror);
+ *
+ * // Other device driver specific cleanup, and now das can be freed
+ * ...
+ * }
+ */
+
+struct hmm_mirror;
+
+/*
+ * enum hmm_update_type - type of update
+ * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
+ */
+enum hmm_update_type {
+ HMM_UPDATE_INVALIDATE,
+};
+
+/*
+ * struct hmm_mirror_ops - HMM mirror device operations callback
+ *
+ * @update: callback to update range on a device
+ */
+struct hmm_mirror_ops {
+ /* sync_cpu_device_pagetables() - synchronize page tables
+ *
+ * @mirror: pointer to struct hmm_mirror
+ * @update_type: type of update that occurred to the CPU page table
+ * @start: virtual start address of the range to update
+ * @end: virtual end address of the range to update
+ *
+ * This callback ultimately originates from mmu_notifiers when the CPU
+ * page table is updated. The device driver must update its page table
+ * in response to this callback. The update argument tells what action
+ * to perform.
+ *
+ * The device driver must not return from this callback until the device
+ * page tables are completely updated (TLBs flushed, etc); this is a
+ * synchronous call.
+ */
+ void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+ enum hmm_update_type update_type,
+ unsigned long start,
+ unsigned long end);
+};
+
+/*
+ * struct hmm_mirror - mirror struct for a device driver
+ *
+ * @hmm: pointer to struct hmm (which is unique per mm_struct)
+ * @ops: device driver callback for HMM mirror operations
+ * @list: for list of mirrors of a given mm
+ *
+ * Each address space (mm_struct) being mirrored by a device must register one
+ * instance of an hmm_mirror struct with HMM. HMM will track the list of all
+ * mirrors for each mm_struct.
+ */
+struct hmm_mirror {
+ struct hmm *hmm;
+ const struct hmm_mirror_ops *ops;
+ struct list_head list;
+};
+
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
+void hmm_mirror_unregister(struct hmm_mirror *mirror);
+
+
+/*
+ * struct hmm_range - track invalidation lock on virtual address range
+ *
+ * @list: all range locks are on a list
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of pfns (big enough for the range)
+ * @valid: pfns array did not change since it was filled by an HMM function
+ */
+struct hmm_range {
+ struct list_head list;
+ unsigned long start;
+ unsigned long end;
+ hmm_pfn_t *pfns;
+ bool valid;
+};
+
+/*
+ * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
+ * driver lock that serializes device page table updates, then call
+ * hmm_vma_range_done(), to check if the snapshot is still valid. The same
+ * device driver page table update lock must also be used in the
+ * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
+ * table invalidation serializes on it.
+ *
+ * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
+ * hmm_vma_get_pfns() WITHOUT ERROR !
+ *
+ * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns);
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
+
+
+/*
+ * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
+ * not migrate any device memory back to system memory. The hmm_pfn_t array will
+ * be updated with the fault result and current snapshot of the CPU page table
+ * for the range.
+ *
+ * The mmap_sem must be taken in read mode before entering and it might be
+ * dropped by the function if the block argument is false. In that case, the
+ * function returns -EAGAIN.
+ *
+ * Return value does not reflect if the fault was successful for every single
+ * address or not. Therefore, the caller must inspect the hmm_pfn_t array to
+ * determine fault status for each address.
+ *
+ * Trying to fault inside an invalid vma will result in -EINVAL.
+ *
+ * See the function description in mm/hmm.c for further documentation.
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block);
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+struct hmm_devmem;
+
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+ unsigned long addr);
+
+/*
+ * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
+ *
+ * @free: called when the refcount on a page reaches 1 and it is thus no longer used
+ * @fault: called when there is a page fault to unaddressable memory
+ *
+ * Both callbacks happen from the page_free() and page_fault() callbacks of
+ * struct dev_pagemap, respectively. See include/linux/memremap.h for more
+ * details on those.
+ *
+ * The hmm_devmem_ops callbacks are just here to provide a coherent and
+ * unique API to device drivers; device drivers should not register their
+ * own page_free() or page_fault() but instead rely on the hmm_devmem_ops
+ * callbacks.
+ */
+struct hmm_devmem_ops {
+ /*
+ * free() - free a device page
+ * @devmem: device memory structure (see struct hmm_devmem)
+ * @page: pointer to struct page being freed
+ *
+ * The callback occurs whenever a device page's refcount reaches 1, which
+ * means that no one is holding any reference on the page anymore
+ * (ZONE_DEVICE pages have an elevated refcount of 1 by default so
+ * that they are not released to the general page allocator).
+ *
+ * Note that the callback has exclusive ownership of the page (as no
+ * one is holding any reference).
+ */
+ void (*free)(struct hmm_devmem *devmem, struct page *page);
+ /*
+ * fault() - CPU page fault or get user page (GUP)
+ * @devmem: device memory structure (see struct hmm_devmem)
+ * @vma: virtual memory area containing the virtual address
+ * @addr: virtual address that faulted or for which there is a GUP
+ * @page: pointer to struct page backing virtual address (unreliable)
+ * @flags: FAULT_FLAG_* (see include/linux/mm.h)
+ * @pmdp: page middle directory
+ * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
+ * on error
+ *
+ * The callback occurs whenever there is a CPU page fault or GUP on a
+ * virtual address. This means that the device driver must migrate the
+ * page back to regular memory (CPU accessible).
+ *
+ * The device driver is free to migrate more than one page from the
+ * fault() callback as an optimization. However if device decide to
+ * migrate more than one page it must always priotirize the faulting
+ * address over the others.
+ *
+ * The struct page pointer is only given as a hint to allow quick
+ * lookup of internal device driver data. A concurrent migration
+ * might have already freed that page and the virtual address might
+ * no longer be backed by it. So it should not be modified by the
+ * callback.
+ *
+ * Note that mmap semaphore is held in read mode at least when this
+ * callback occurs, hence the vma is valid upon callback entry.
+ */
+ int (*fault)(struct hmm_devmem *devmem,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp);
+};
+
+/*
+ * struct hmm_devmem - track device memory
+ *
+ * @completion: completion object for device memory
+ * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
+ * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
+ * @resource: IO resource reserved for this chunk of memory
+ * @pagemap: device page map for that chunk
+ * @device: device to bind resource to
+ * @ops: memory operations callback
+ * @ref: per CPU refcount
+ *
+ * This is a helper structure for device drivers that do not wish to implement
+ * the gory details related to hotplugging new memory and allocating struct
+ * pages.
+ *
+ * Device drivers can directly use ZONE_DEVICE memory on their own if they
+ * wish to do so.
+ */
+struct hmm_devmem {
+ struct completion completion;
+ unsigned long pfn_first;
+ unsigned long pfn_last;
+ struct resource *resource;
+ struct device *device;
+ struct dev_pagemap pagemap;
+ const struct hmm_devmem_ops *ops;
+ struct percpu_ref ref;
+};
+
+/*
+ * To add (hotplug) device memory, HMM assumes that there is no real resource
+ * that reserves a range in the physical address space (this is intended to be
+ * used by unaddressable device memory). It will reserve a physical range big
+ * enough and allocate struct pages for it.
+ *
+ * The device driver can wrap the hmm_devmem struct inside a private device
+ * driver struct. The device driver must call hmm_devmem_remove() before the
+ * device goes away and before freeing the hmm_devmem struct memory.
+ */
+struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res);
+void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+/*
+ * hmm_devmem_page_set_drvdata - set per-page driver data field
+ *
+ * @page: pointer to struct page
+ * @data: driver data value to set
+ *
+ * Because the page cannot be on an LRU list, we have an unsigned long that the
+ * driver can use to store a per-page field. This is just a simple helper to do that.
+ */
+static inline void hmm_devmem_page_set_drvdata(struct page *page,
+ unsigned long data)
+{
+ unsigned long *drvdata = (unsigned long *)&page->pgmap;
+
+ drvdata[1] = data;
+}
+
+/*
+ * hmm_devmem_page_get_drvdata - get per page driver data field
+ *
+ * @page: pointer to struct page
+ * Return: driver data value
+ */
+static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page)
+{
+ unsigned long *drvdata = (unsigned long *)&page->pgmap;
+
+ return drvdata[1];
+}
+
+
+/*
+ * struct hmm_device - fake device to hang device memory onto
+ *
+ * @device: device struct
+ * @minor: device minor number
+ */
+struct hmm_device {
+ struct device device;
+ unsigned int minor;
+};
+
+/*
+ * A device driver that wants to handle the memory of multiple devices through a
+ * single fake device can use hmm_device to do so. This is purely a helper; it
+ * is not strictly needed in order to make use of any HMM functionality.
+ */
+struct hmm_device *hmm_device_new(void *drvdata);
+void hmm_device_put(struct hmm_device *hmm_device);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+#endif /* IS_ENABLED(CONFIG_HMM) */
+
+/* Below are for HMM internal use only! Not to be used by device driver! */
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+void hmm_mm_destroy(struct mm_struct *mm);
+
+static inline void hmm_mm_init(struct mm_struct *mm)
+{
+ mm->hmm = NULL;
+}
+#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+static inline void hmm_mm_destroy(struct mm_struct *mm) {}
+static inline void hmm_mm_init(struct mm_struct *mm) {}
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#else /* IS_ENABLED(CONFIG_HMM) */
+static inline void hmm_mm_destroy(struct mm_struct *mm) {}
+static inline void hmm_mm_init(struct mm_struct *mm) {}
+#endif /* LINUX_HMM_H */
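The locking rules that hmm.h spells out for hmm_vma_get_pfns()/hmm_vma_range_done() are easier to see in code. A driver-side sketch of the snapshot-then-validate sequence; struct example_dev, its pt_lock mutex and its update_device_pt() helper are invented for the example and are not part of HMM:

static int example_mirror_range(struct example_dev *dev,
				struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	struct hmm_range range;
	hmm_pfn_t *pfns;
	int ret;

	pfns = kcalloc((end - start) >> PAGE_SHIFT, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return -ENOMEM;

	/* snapshot the CPU page table for [start, end) */
	ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
	if (ret)
		goto out;

	/* same lock taken by the sync_cpu_device_pagetables() callback */
	mutex_lock(&dev->pt_lock);
	if (!hmm_vma_range_done(vma, &range)) {
		/* the CPU page table changed under us: retry or give up */
		mutex_unlock(&dev->pt_lock);
		ret = -EAGAIN;
		goto out;
	}
	ret = dev->update_device_pt(dev, start, end, pfns);
	mutex_unlock(&dev->pt_lock);
out:
	kfree(pfns);
	return ret;
}

Note that hmm_vma_range_done() is called exactly once for the one successful hmm_vma_get_pfns() call, as the header comment requires.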
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ee696347f928..14bc21c2ee7f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -147,7 +147,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
- if (pmd_trans_huge(*____pmd) \
+ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \
|| pmd_devmap(*____pmd)) \
__split_huge_pmd(__vma, __pmd, __address, \
false, NULL); \
@@ -178,12 +178,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma);
extern spinlock_t *__pud_trans_huge_lock(pud_t *pud,
struct vm_area_struct *vma);
+
+static inline int is_swap_pmd(pmd_t pmd)
+{
+ return !pmd_none(pmd) && !pmd_present(pmd);
+}
+
/* mmap_sem must be held on entry */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
return __pmd_trans_huge_lock(pmd, vma);
else
return NULL;
@@ -233,6 +239,11 @@ void mm_put_huge_zero_page(struct mm_struct *mm);
#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
+static inline bool thp_migration_supported(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
+}
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -294,6 +305,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
long adjust_next)
{
}
+static inline int is_swap_pmd(pmd_t pmd)
+{
+ return 0;
+}
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
@@ -336,6 +351,11 @@ static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
{
return NULL;
}
+
+static inline bool thp_migration_supported(void)
+{
+ return false;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
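The new is_swap_pmd() helper distinguishes a PMD holding a migration entry from a mapped huge PMD, which is the check the fs/proc/task_mmu.c hunks above rely on. A condensed sketch of that pattern, assuming pmd points at a locked PMD entry obtained from a page-table walk:

	struct page *page = NULL;
	pmd_t pmdval = *pmd;

	if (is_swap_pmd(pmdval)) {
		/* !pmd_none() && !pmd_present(): a THP migration entry */
		swp_entry_t entry = pmd_to_swp_entry(pmdval);

		if (is_migration_entry(entry))
			page = migration_entry_to_page(entry);
	} else if (pmd_trans_huge(pmdval)) {
		page = pmd_page(pmdval);
	}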
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 0e849715e5be..3c07ace5b431 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -175,9 +175,8 @@ extern struct cred init_cred;
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
- .pi_waiters = RB_ROOT, \
- .pi_top_task = NULL, \
- .pi_waiters_leftmost = NULL,
+ .pi_waiters = RB_ROOT_CACHED, \
+ .pi_top_task = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
#endif
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
index 724556aa3c95..202ee1283f4b 100644
--- a/include/linux/interval_tree.h
+++ b/include/linux/interval_tree.h
@@ -11,13 +11,15 @@ struct interval_tree_node {
};
extern void
-interval_tree_insert(struct interval_tree_node *node, struct rb_root *root);
+interval_tree_insert(struct interval_tree_node *node,
+ struct rb_root_cached *root);
extern void
-interval_tree_remove(struct interval_tree_node *node, struct rb_root *root);
+interval_tree_remove(struct interval_tree_node *node,
+ struct rb_root_cached *root);
extern struct interval_tree_node *
-interval_tree_iter_first(struct rb_root *root,
+interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
extern struct interval_tree_node *
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
index 58370e1862ad..1f97ce26cccc 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -33,7 +33,7 @@
* ITSTATIC: 'static' or empty
* ITPREFIX: prefix to use for the inline tree definitions
*
- * Note - before using this, please consider if non-generic version
+ * Note - before using this, please consider if the generic version
* (interval_tree.h) would work for you...
*/
@@ -65,11 +65,13 @@ RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \
\
/* Insert / remove interval nodes from the tree */ \
\
-ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \
+ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, \
+ struct rb_root_cached *root) \
{ \
- struct rb_node **link = &root->rb_node, *rb_parent = NULL; \
+ struct rb_node **link = &root->rb_root.rb_node, *rb_parent = NULL; \
ITTYPE start = ITSTART(node), last = ITLAST(node); \
ITSTRUCT *parent; \
+ bool leftmost = true; \
\
while (*link) { \
rb_parent = *link; \
@@ -78,18 +80,22 @@ ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \
parent->ITSUBTREE = last; \
if (start < ITSTART(parent)) \
link = &parent->ITRB.rb_left; \
- else \
+ else { \
link = &parent->ITRB.rb_right; \
+ leftmost = false; \
+ } \
} \
\
node->ITSUBTREE = last; \
rb_link_node(&node->ITRB, rb_parent, link); \
- rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \
+ rb_insert_augmented_cached(&node->ITRB, root, \
+ leftmost, &ITPREFIX ## _augment); \
} \
\
-ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \
+ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \
+ struct rb_root_cached *root) \
{ \
- rb_erase_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \
+ rb_erase_augmented_cached(&node->ITRB, root, &ITPREFIX ## _augment); \
} \
\
/* \
@@ -140,15 +146,35 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \
} \
\
ITSTATIC ITSTRUCT * \
-ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \
+ITPREFIX ## _iter_first(struct rb_root_cached *root, \
+ ITTYPE start, ITTYPE last) \
{ \
- ITSTRUCT *node; \
+ ITSTRUCT *node, *leftmost; \
\
- if (!root->rb_node) \
+ if (!root->rb_root.rb_node) \
return NULL; \
- node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \
+ \
+ /* \
+ * Fastpath range intersection/overlap between A: [a0, a1] and \
+ * B: [b0, b1] is given by: \
+ * \
+ * a0 <= b1 && b0 <= a1 \
+ * \
+ * ... where A holds the lock range and B holds the smallest \
+ * 'start' and largest 'last' in the tree. For the latter, we \
+ * rely on the root node, which by augmented interval tree \
+ * property, holds the largest value in its last-in-subtree. \
+ * This allows mitigating some of the tree walk overhead for \
+ * for non-intersecting ranges, maintained and consulted in O(1). \
+ */ \
+ node = rb_entry(root->rb_root.rb_node, ITSTRUCT, ITRB); \
if (node->ITSUBTREE < start) \
return NULL; \
+ \
+ leftmost = rb_entry(root->rb_leftmost, ITSTRUCT, ITRB); \
+ if (ITSTART(leftmost) > last) \
+ return NULL; \
+ \
return ITPREFIX ## _subtree_search(node, start, last); \
} \
\
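With the interval tree now rooted in a struct rb_root_cached, users of the generic wrapper in include/linux/interval_tree.h follow the shape below; a minimal sketch with made-up addresses:

#include <linux/interval_tree.h>

static void interval_tree_cached_example(void)
{
	struct rb_root_cached itree = RB_ROOT_CACHED;
	struct interval_tree_node node = {
		.start = 0x1000,
		.last  = 0x1fff,	/* 'last' is inclusive */
	};

	interval_tree_insert(&node, &itree);

	/* overlaps [0x1800, 0x2800], so this returns &node */
	if (interval_tree_iter_first(&itree, 0x1800, 0x2800))
		interval_tree_remove(&node, &itree);
}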
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6230064d7f95..f5cf32e80041 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -130,6 +130,8 @@ enum {
IORES_DESC_ACPI_NV_STORAGE = 3,
IORES_DESC_PERSISTENT_MEMORY = 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
+ IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
};
/* helpers to define resources */
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index fadd579d577d..92a2ccff80c5 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -3,7 +3,9 @@
#include <linux/spinlock.h>
#include <linux/uidgid.h>
+#include <linux/rhashtable.h>
#include <uapi/linux/ipc.h>
+#include <linux/refcount.h>
#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
@@ -21,8 +23,10 @@ struct kern_ipc_perm {
unsigned long seq;
void *security;
+ struct rhash_head khtnode;
+
struct rcu_head rcu;
- atomic_t refcount;
+ refcount_t refcount;
} ____cacheline_aligned_in_smp __randomize_layout;
#endif /* _LINUX_IPC_H */
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 65327ee0936b..83f0bf7a587d 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -7,19 +7,23 @@
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
+#include <linux/refcount.h>
+#include <linux/rhashtable.h>
struct user_namespace;
struct ipc_ids {
int in_use;
unsigned short seq;
+ bool tables_initialized;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int next_id;
+ struct rhashtable key_ht;
};
struct ipc_namespace {
- atomic_t count;
+ refcount_t count;
struct ipc_ids ids[3];
int sem_ctls[4];
@@ -118,7 +122,7 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags,
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
if (ns)
- atomic_inc(&ns->count);
+ refcount_inc(&ns->count);
return ns;
}
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 6607225d0ea4..0ad4c3044cf9 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -78,8 +78,11 @@
#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
-#define DIV_ROUND_UP_ULL(ll,d) \
- ({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; })
+
+#define DIV_ROUND_DOWN_ULL(ll, d) \
+ ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })
+
+#define DIV_ROUND_UP_ULL(ll, d) DIV_ROUND_DOWN_ULL((ll) + (d) - 1, (d))
#if BITS_PER_LONG == 32
# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d)
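A quick userspace sanity check of the rounding behaviour; the local macros below stand in for the do_div()-based kernel versions and the values are purely illustrative:

	#include <assert.h>

	#define DIV_DOWN(ll, d)	((unsigned long long)(ll) / (d))	/* stand-in for DIV_ROUND_DOWN_ULL */
	#define DIV_UP(ll, d)	DIV_DOWN((ll) + (d) - 1, (d))		/* stand-in for DIV_ROUND_UP_ULL */

	int main(void)
	{
		assert(DIV_DOWN(10, 3) == 3);
		assert(DIV_UP(10, 3) == 4);	/* rounds up */
		assert(DIV_UP(9, 3) == 3);	/* exact multiples stay exact */
		return 0;
	}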
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 655082c88fd9..40c89ad4bea6 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -19,6 +19,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include <linux/umh.h>
#include <linux/gfp.h>
#include <linux/stddef.h>
#include <linux/errno.h>
@@ -44,63 +45,4 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS;
#define try_then_request_module(x, mod...) (x)
#endif
-
-struct cred;
-struct file;
-
-#define UMH_NO_WAIT 0 /* don't wait at all */
-#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */
-#define UMH_WAIT_PROC 2 /* wait for the process to complete */
-#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */
-
-struct subprocess_info {
- struct work_struct work;
- struct completion *complete;
- const char *path;
- char **argv;
- char **envp;
- int wait;
- int retval;
- int (*init)(struct subprocess_info *info, struct cred *new);
- void (*cleanup)(struct subprocess_info *info);
- void *data;
-} __randomize_layout;
-
-extern int
-call_usermodehelper(const char *path, char **argv, char **envp, int wait);
-
-extern struct subprocess_info *
-call_usermodehelper_setup(const char *path, char **argv, char **envp,
- gfp_t gfp_mask,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *), void *data);
-
-extern int
-call_usermodehelper_exec(struct subprocess_info *info, int wait);
-
-extern struct ctl_table usermodehelper_table[];
-
-enum umh_disable_depth {
- UMH_ENABLED = 0,
- UMH_FREEZING,
- UMH_DISABLED,
-};
-
-extern int __usermodehelper_disable(enum umh_disable_depth depth);
-extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
-
-static inline int usermodehelper_disable(void)
-{
- return __usermodehelper_disable(UMH_DISABLED);
-}
-
-static inline void usermodehelper_enable(void)
-{
- __usermodehelper_set_disable_depth(UMH_ENABLED);
-}
-
-extern int usermodehelper_read_trylock(void);
-extern long usermodehelper_read_lock_wait(long timeout);
-extern void usermodehelper_read_unlock(void);
-
#endif /* __LINUX_KMOD_H__ */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 5e6e4cc36ff4..0995e1a2b458 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -133,6 +133,17 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
extern int __add_pages(int nid, unsigned long start_pfn,
unsigned long nr_pages, bool want_memblock);
+#ifndef CONFIG_ARCH_HAS_ADD_PAGES
+static inline int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
+{
+ return __add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
+#else /* ARCH_HAS_ADD_PAGES */
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock);
+#endif /* ARCH_HAS_ADD_PAGES */
+
#ifdef CONFIG_NUMA
extern int memory_add_physaddr_to_nid(u64 start);
#else
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 93416196ba64..79f8ba7c3894 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -4,6 +4,8 @@
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>
+#include <asm/pgtable.h>
+
struct resource;
struct device;
@@ -35,24 +37,107 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
}
#endif
+/*
+ * Specialize ZONE_DEVICE memory into multiple types, each having a different
+ * usage.
+ *
+ * MEMORY_DEVICE_HOST:
+ * Persistent device memory (pmem): struct pages might be allocated in a
+ * different memory region, and the architecture might want to perform special
+ * actions. It is similar
+ * to regular memory, in that the CPU can access it transparently. However,
+ * it is likely to have different bandwidth and latency than regular memory.
+ * See Documentation/nvdimm/nvdimm.txt for more information.
+ *
+ * MEMORY_DEVICE_PRIVATE:
+ * Device memory that is not directly addressable by the CPU: CPU can neither
+ * read nor write private memory. In this case, we do still have struct pages
+ * backing the device memory. Doing so simplifies the implementation, but it is
+ * important to remember that there are certain points at which the struct page
+ * must be treated as an opaque object, rather than a "normal" struct page.
+ *
+ * A more complete discussion of unaddressable memory may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_PUBLIC:
+ * Device memory that is cache coherent from the device and CPU point of view.
+ * This is used on platforms that have an advanced system bus (like CAPI or
+ * CCIX). A driver can hotplug device memory using ZONE_DEVICE with this
+ * memory type. Any page of a process can be migrated to such memory. However,
+ * no one should be allowed to pin such memory so that it can always be evicted.
+ */
+enum memory_type {
+ MEMORY_DEVICE_HOST = 0,
+ MEMORY_DEVICE_PRIVATE,
+ MEMORY_DEVICE_PUBLIC,
+};
+
+/*
+ * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two
+ * callbacks:
+ * page_fault()
+ * page_free()
+ *
+ * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
+ * explanation in include/linux/memory_hotplug.h.
+ *
+ * The page_fault() callback must migrate the page back, from device memory to
+ * system memory, so that the CPU can access it. This might fail for various
+ * reasons (device issues, the device has been unplugged, ...). When such error
+ * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
+ * set the CPU page table entry to "poisoned".
+ *
+ * Note that because memory cgroup charges are transferred to the device memory,
+ * this should never fail due to memory restrictions. However, allocation
+ * of a regular system page might still fail because we are out of memory. If
+ * that happens, the page_fault() callback must return VM_FAULT_OOM.
+ *
+ * The page_fault() callback can also try to migrate back multiple pages in one
+ * chunk, as an optimization. It must, however, prioritize the faulting address
+ * over all the others.
+ *
+ *
+ * The page_free() callback is called once the page refcount reaches 1
+ * (ZONE_DEVICE pages never reach a refcount of 0 unless there is a refcount
+ * bug); this allows the device driver to implement its own memory management.
+ *
+ * For MEMORY_DEVICE_PUBLIC only the page_free() callback matters.
+ */
+typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp);
+typedef void (*dev_page_free_t)(struct page *page, void *data);
+
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
+ * @page_fault: callback when the CPU faults on an unaddressable device page
+ * @page_free: free page callback when page refcount reaches 1
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @res: physical address range covered by @ref
* @ref: reference count that pins the devm_memremap_pages() mapping
* @dev: host device of the mapping for debug
+ * @data: private data pointer for page_free()
+ * @type: memory type: see enum memory_type above
*/
struct dev_pagemap {
+ dev_page_fault_t page_fault;
+ dev_page_free_t page_free;
struct vmem_altmap *altmap;
const struct resource *res;
struct percpu_ref *ref;
struct device *dev;
+ void *data;
+ enum memory_type type;
};
#ifdef CONFIG_ZONE_DEVICE
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap);
struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+
+static inline bool is_zone_device_page(const struct page *page);
#else
static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
@@ -73,6 +158,20 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
}
#endif
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+static inline bool is_device_private_page(const struct page *page)
+{
+ return is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+}
+
+static inline bool is_device_public_page(const struct page *page)
+{
+ return is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+}
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
/**
* get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
* @pfn: page frame number to lookup page_map
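A hedged sketch of how a driver might populate the new fields for device private memory; my_devmem_fault(), my_devmem_free(), my_migrate_to_ram(), my_free_device_page() and struct my_device are hypothetical, only the struct members, typedefs and enum values come from this patch:

	static int my_devmem_fault(struct vm_area_struct *vma, unsigned long addr,
				   const struct page *page, unsigned int flags,
				   pmd_t *pmdp)
	{
		/* Migrate the page back to system memory or return VM_FAULT_SIGBUS. */
		return my_migrate_to_ram(vma, addr, page, pmdp);
	}

	static void my_devmem_free(struct page *page, void *data)
	{
		struct my_device *mdev = data;

		/* Refcount dropped to 1: the page is free for the driver to reuse. */
		my_free_device_page(mdev, page);
	}

	/* ... at device-memory hotplug time ... */
	pgmap->page_fault = my_devmem_fault;
	pgmap->page_free  = my_devmem_free;
	pgmap->data       = mdev;
	pgmap->type       = MEMORY_DEVICE_PRIVATE;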
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3e0d405dc842..643c7ae7d7b4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -35,15 +35,28 @@ static inline struct page *new_page_nodemask(struct page *page,
int preferred_nid, nodemask_t *nodemask)
{
gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
if (PageHuge(page))
return alloc_huge_page_nodemask(page_hstate(compound_head(page)),
preferred_nid, nodemask);
+ if (thp_migration_supported() && PageTransHuge(page)) {
+ order = HPAGE_PMD_ORDER;
+ gfp_mask |= GFP_TRANSHUGE;
+ }
+
if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
gfp_mask |= __GFP_HIGHMEM;
- return __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask);
+ new_page = __alloc_pages_nodemask(gfp_mask, order,
+ preferred_nid, nodemask);
+
+ if (new_page && PageTransHuge(page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
}
#ifdef CONFIG_MIGRATION
@@ -59,6 +72,7 @@ extern void putback_movable_page(struct page *page);
extern int migrate_prep(void);
extern int migrate_prep_local(void);
+extern void migrate_page_states(struct page *newpage, struct page *page);
extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
@@ -79,6 +93,10 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
static inline int migrate_prep(void) { return -ENOSYS; }
static inline int migrate_prep_local(void) { return -ENOSYS; }
+static inline void migrate_page_states(struct page *newpage, struct page *page)
+{
+}
+
static inline void migrate_page_copy(struct page *newpage,
struct page *page) {}
@@ -138,4 +156,136 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
}
#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
+
+#ifdef CONFIG_MIGRATION
+
+/*
+ * Watch out for 32-bit PAE architectures, where an unsigned long might not
+ * have enough bits to store both the physical address and all the flags. So
+ * far we have
+ * enough room for all our flags.
+ */
+#define MIGRATE_PFN_VALID (1UL << 0)
+#define MIGRATE_PFN_MIGRATE (1UL << 1)
+#define MIGRATE_PFN_LOCKED (1UL << 2)
+#define MIGRATE_PFN_WRITE (1UL << 3)
+#define MIGRATE_PFN_DEVICE (1UL << 4)
+#define MIGRATE_PFN_ERROR (1UL << 5)
+#define MIGRATE_PFN_SHIFT 6
+
+static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
+{
+ if (!(mpfn & MIGRATE_PFN_VALID))
+ return NULL;
+ return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT);
+}
+
+static inline unsigned long migrate_pfn(unsigned long pfn)
+{
+ return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
+}
+
+/*
+ * struct migrate_vma_ops - migrate operation callback
+ *
+ * @alloc_and_copy: alloc destination memory and copy source memory to it
+ * @finalize_and_map: allow caller to map the successfully migrated pages
+ *
+ *
+ * The alloc_and_copy() callback happens once all source pages have been locked,
+ * unmapped and checked (checked whether pinned or not). All pages that can be
+ * migrated will have an entry in the src array set with the pfn value of the
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags set (other
+ * flags might be set but should be ignored by the callback).
+ *
+ * The alloc_and_copy() callback can then allocate destination memory and copy
+ * source memory to it for all those entries (i.e. with MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_MIGRATE flags set). Once these are allocated and copied, the
+ * callback must update each corresponding entry in the dst array with the pfn
+ * value of the destination page and with the MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
+ * locked, via lock_page()).
+ *
+ * At this point the alloc_and_copy() callback is done and returns.
+ *
+ * Note that the callback does not have to migrate all the pages that are
+ * marked with the MIGRATE_PFN_MIGRATE flag in the src array, unless this is a
+ * migration from device memory to system memory (i.e. the MIGRATE_PFN_DEVICE
+ * flag is also set in the src array entry). If the device driver cannot
+ * migrate a device page back to system memory, then it must set the
+ * corresponding dst array entry to MIGRATE_PFN_ERROR. This will trigger a
+ * SIGBUS if the CPU tries to access any of the virtual addresses originally
+ * backed by this page. Because
+ * a SIGBUS is such a severe result for the userspace process, the device
+ * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
+ * unrecoverable state.
+ *
+ * For empty entries inside the CPU page table (pte_none() or pmd_none() is
+ * true) we do set the MIGRATE_PFN_MIGRATE flag in the corresponding source
+ * array entry, thus allowing the device driver to allocate device memory for
+ * those unbacked virtual addresses. For this the device driver simply has to
+ * allocate device memory and properly set the destination entry like for
+ * regular migration. Note that this can still fail, and thus the device driver
+ * must check, inside the finalize_and_map() callback, whether the migration
+ * was successful for those entries, just like for regular migration.
+ *
+ * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
+ * OR BAD THINGS WILL HAPPEN !
+ *
+ *
+ * The finalize_and_map() callback happens after struct page migration from
+ * source to destination (destination struct pages are the struct pages for the
+ * memory allocated by the alloc_and_copy() callback). Migration can fail, and
+ * thus the finalize_and_map() allows the driver to inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update the device page table from within the finalize_and_map()
+ * callback because both the destination and source pages are still locked, and the
+ * mmap_sem is held in read mode (hence no one can unmap the range being
+ * migrated).
+ *
+ * Once the callback is done cleaning things up and updating its page table (if
+ * it chose to do so; this is not an obligation), it returns. At this point,
+ * the HMM core will finish up the final steps, and the migration is complete.
+ *
+ * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
+ * ENTRIES OR BAD THINGS WILL HAPPEN !
+ */
+struct migrate_vma_ops {
+ void (*alloc_and_copy)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+ void (*finalize_and_map)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ const unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+};
+
+#if defined(CONFIG_MIGRATE_VMA_HELPER)
+int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private);
+#else
+static inline int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_MIGRATE_VMA_HELPER */
+
+#endif /* CONFIG_MIGRATION */
+
#endif /* _LINUX_MIGRATE_H */
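A hedged, heavily abbreviated sketch of a driver migrating system pages into its own memory with the new helper; my_alloc_device_page() and my_copy_to_device() are hypothetical, while the flag handling follows the contract documented above:

	static void my_alloc_and_copy(struct vm_area_struct *vma,
				      const unsigned long *src, unsigned long *dst,
				      unsigned long start, unsigned long end,
				      void *private)
	{
		unsigned long addr;
		int i;

		for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
			struct page *spage = migrate_pfn_to_page(src[i]);
			struct page *dpage;

			if (!(src[i] & MIGRATE_PFN_MIGRATE))
				continue;		/* this entry cannot be migrated */
			dpage = my_alloc_device_page(private);
			if (!dpage)
				continue;		/* leave dst[i] empty: skipped */
			lock_page(dpage);
			if (spage)			/* NULL for pte_none() holes */
				my_copy_to_device(dpage, spage);
			dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		}
	}

	static void my_finalize_and_map(struct vm_area_struct *vma,
					const unsigned long *src, const unsigned long *dst,
					unsigned long start, unsigned long end,
					void *private)
	{
		/* Check src[i] & MIGRATE_PFN_MIGRATE to see what actually moved
		 * and update the device's own page tables accordingly. */
	}

	static const struct migrate_vma_ops my_migrate_ops = {
		.alloc_and_copy		= my_alloc_and_copy,
		.finalize_and_map	= my_finalize_and_map,
	};

	/* ret = migrate_vma(&my_migrate_ops, vma, start, end, src, dst, driver_data); */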
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index ebf3d89a3919..bdf66af9b937 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -6,11 +6,16 @@
* on most operations but not ->writepage as the potential stall time
* is too significant
* MIGRATE_SYNC will block when migrating pages
+ * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages
+ * with the CPU. Instead, page copy happens outside the migratepage()
+ * callback and is likely using a DMA engine. See migrate_vma() and HMM
+ * (mm/hmm.c) for users of this mode.
*/
enum migrate_mode {
MIGRATE_ASYNC,
MIGRATE_SYNC_LIGHT,
MIGRATE_SYNC,
+ MIGRATE_SYNC_NO_COPY,
};
#endif /* MIGRATE_MODE_H_INCLUDED */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 39db8e54c5d5..f8c10d336e42 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -23,6 +23,7 @@
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page_ref.h>
+#include <linux/memremap.h>
struct mempolicy;
struct anon_vma;
@@ -799,6 +800,28 @@ static inline bool is_zone_device_page(const struct page *page)
}
#endif
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page);
+DECLARE_STATIC_KEY_FALSE(device_private_key);
+#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key)
+static inline bool is_device_private_page(const struct page *page);
+static inline bool is_device_public_page(const struct page *page);
+#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+static inline void put_zone_device_private_or_public_page(struct page *page)
+{
+}
+#define IS_HMM_ENABLED 0
+static inline bool is_device_private_page(const struct page *page)
+{
+ return false;
+}
+static inline bool is_device_public_page(const struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
static inline void get_page(struct page *page)
{
page = compound_head(page);
@@ -814,6 +837,18 @@ static inline void put_page(struct page *page)
{
page = compound_head(page);
+ /*
+ * For private device pages we need to catch the refcount transition from
+ * 2 to 1; when the refcount reaches one it means the private device page is
+ * free and we need to inform the device driver through a callback. See
+ * include/linux/memremap.h and HMM for details.
+ */
+ if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) ||
+ unlikely(is_device_public_page(page)))) {
+ put_zone_device_private_or_public_page(page);
+ return;
+ }
+
if (put_page_testzero(page))
__put_page(page);
}
@@ -1199,8 +1234,10 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap */
};
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte);
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device);
+#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
+
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@@ -1997,13 +2034,13 @@ extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
- struct rb_root *root);
+ struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *prev,
- struct rb_root *root);
+ struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
- struct rb_root *root);
-struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root,
+ struct rb_root_cached *root);
+struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
unsigned long start, unsigned long last);
@@ -2013,11 +2050,12 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
vma; vma = vma_interval_tree_iter_next(vma, start, last))
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
- struct rb_root *root);
+ struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
- struct rb_root *root);
-struct anon_vma_chain *anon_vma_interval_tree_iter_first(
- struct rb_root *root, unsigned long start, unsigned long last);
+ struct rb_root_cached *root);
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
+ unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
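Callers only see the root type change; iteration is unchanged. An illustrative walk, assuming mapping->i_mmap has been converted to struct rb_root_cached by this series (the visit function is hypothetical):

	struct vm_area_struct *vma;

	/* caller holds i_mmap_rwsem for the mapping */
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, last_index) {
		/* every VMA overlapping pages [first_index, last_index] of the file */
		my_visit_vma(vma);
	}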
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f45ad815b7d7..46f4ecf5479a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,6 +23,7 @@
struct address_space;
struct mem_cgroup;
+struct hmm;
/*
* Each physical page in the system has a struct page associated with
@@ -503,6 +504,11 @@ struct mm_struct {
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
+
+#if IS_ENABLED(CONFIG_HMM)
+ /* HMM needs to track a few things per mm */
+ struct hmm *hmm;
+#endif
} __randomize_layout;
extern struct mm_struct init_mm;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e7e92c8f4883..356a814e7c8e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,20 @@ struct zone_padding {
#define ZONE_PADDING(name)
#endif
+#ifdef CONFIG_NUMA
+enum numa_stat_item {
+ NUMA_HIT, /* allocated in intended node */
+ NUMA_MISS, /* allocated in non intended node */
+ NUMA_FOREIGN, /* was intended here, hit elsewhere */
+ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */
+ NUMA_LOCAL, /* allocation from local node */
+ NUMA_OTHER, /* allocation from other node */
+ NR_VM_NUMA_STAT_ITEMS
+};
+#else
+#define NR_VM_NUMA_STAT_ITEMS 0
+#endif
+
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
@@ -132,14 +146,6 @@ enum zone_stat_item {
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
-#ifdef CONFIG_NUMA
- NUMA_HIT, /* allocated in intended node */
- NUMA_MISS, /* allocated in non intended node */
- NUMA_FOREIGN, /* was intended here, hit elsewhere */
- NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */
- NUMA_LOCAL, /* allocation from local node */
- NUMA_OTHER, /* allocation from other node */
-#endif
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
@@ -276,6 +282,7 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
+ u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
@@ -496,6 +503,7 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+ atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
index 0035abe41b9a..56f35dd3d01d 100644
--- a/include/linux/pps-gpio.h
+++ b/include/linux/pps-gpio.h
@@ -29,4 +29,4 @@ struct pps_gpio_platform_data {
const char *gpio_label;
};
-#endif
+#endif /* _PPS_GPIO_H */
diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index 35ac903956c7..80a980cc8d95 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -22,7 +22,6 @@
#define LINUX_PPS_KERNEL_H
#include <linux/pps.h>
-
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/time.h>
@@ -35,9 +34,9 @@ struct pps_device;
/* The specific PPS source info */
struct pps_source_info {
- char name[PPS_MAX_NAME_LEN]; /* simbolic name */
+ char name[PPS_MAX_NAME_LEN]; /* symbolic name */
char path[PPS_MAX_NAME_LEN]; /* path of connected device */
- int mode; /* PPS's allowed mode */
+ int mode; /* PPS allowed mode */
void (*echo)(struct pps_device *pps,
int event, void *data); /* PPS echo function */
@@ -57,10 +56,10 @@ struct pps_event_time {
struct pps_device {
struct pps_source_info info; /* PSS source info */
- struct pps_kparams params; /* PPS's current params */
+ struct pps_kparams params; /* PPS current params */
- __u32 assert_sequence; /* PPS' assert event seq # */
- __u32 clear_sequence; /* PPS' clear event seq # */
+ __u32 assert_sequence; /* PPS assert event seq # */
+ __u32 clear_sequence; /* PPS clear event seq # */
struct pps_ktime assert_tu;
struct pps_ktime clear_tu;
int current_mode; /* PPS mode at event time */
@@ -69,7 +68,7 @@ struct pps_device {
wait_queue_head_t queue; /* PPS event queue */
unsigned int id; /* PPS source unique ID */
- void const *lookup_cookie; /* pps_lookup_dev only */
+ void const *lookup_cookie; /* For pps_lookup_dev() only */
struct cdev cdev;
struct device *dev;
struct fasync_struct *async_queue; /* fasync method */
@@ -101,7 +100,7 @@ extern struct pps_device *pps_register_source(
extern void pps_unregister_source(struct pps_device *pps);
extern void pps_event(struct pps_device *pps,
struct pps_event_time *ts, int event, void *data);
-/* Look up a pps device by magic cookie */
+/* Look up a pps_device by magic cookie */
struct pps_device *pps_lookup_dev(void const *cookie);
static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
@@ -132,4 +131,3 @@ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta
}
#endif /* LINUX_PPS_KERNEL_H */
-
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 2d2bf592d9db..76124dd4e36d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -28,13 +28,7 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
const struct file_operations *,
void *);
-static inline struct proc_dir_entry *proc_create(
- const char *name, umode_t mode, struct proc_dir_entry *parent,
- const struct file_operations *proc_fops)
-{
- return proc_create_data(name, mode, parent, proc_fops, NULL);
-}
-
+struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops);
extern void proc_set_size(struct proc_dir_entry *, loff_t);
extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
extern void *PDE_DATA(const struct inode *);
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index e585018498d5..d574361943ea 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -44,10 +44,25 @@ struct rb_root {
struct rb_node *rb_node;
};
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). It is just not worth it; users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+};
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define RB_ROOT (struct rb_root) { NULL, }
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
@@ -69,6 +84,12 @@ extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
+extern void rb_insert_color_cached(struct rb_node *,
+ struct rb_root_cached *, bool);
+extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
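A minimal sketch of the leftmost-cached insert pattern; the node type and key ordering are illustrative, the rb_* calls are the ones declared above:

	struct my_node {
		unsigned long key;
		struct rb_node rb;
	};

	static struct rb_root_cached my_tree = RB_ROOT_CACHED;

	static void my_insert(struct my_node *new)
	{
		struct rb_node **link = &my_tree.rb_root.rb_node, *parent = NULL;
		bool leftmost = true;

		while (*link) {
			struct my_node *entry = rb_entry(*link, struct my_node, rb);

			parent = *link;
			if (new->key < entry->key) {
				link = &parent->rb_left;
			} else {
				link = &parent->rb_right;
				leftmost = false;	/* went right at least once */
			}
		}
		rb_link_node(&new->rb, parent, link);
		rb_insert_color_cached(&new->rb, &my_tree, leftmost);
	}

	/* rb_first_cached(&my_tree) now returns the smallest key in O(1). */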
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 9702b6e183bc..6bfd2b581f75 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -41,7 +41,9 @@ struct rb_augment_callbacks {
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
-extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+extern void __rb_insert_augmented(struct rb_node *node,
+ struct rb_root *root,
+ bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
* Fixup the rbtree and update the augmented information when rebalancing.
@@ -57,7 +59,16 @@ static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- __rb_insert_augmented(node, root, augment->rotate);
+ __rb_insert_augmented(node, root, false, NULL, augment->rotate);
+}
+
+static inline void
+rb_insert_augmented_cached(struct rb_node *node,
+ struct rb_root_cached *root, bool newleft,
+ const struct rb_augment_callbacks *augment)
+{
+ __rb_insert_augmented(node, &root->rb_root,
+ newleft, &root->rb_leftmost, augment->rotate);
}
#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
@@ -150,6 +161,7 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+ struct rb_node **leftmost,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
@@ -157,6 +169,9 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
struct rb_node *parent, *rebalance;
unsigned long pc;
+ if (leftmost && node == *leftmost)
+ *leftmost = rb_next(node);
+
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
@@ -256,9 +271,21 @@ static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
+ struct rb_node *rebalance = __rb_erase_augmented(node, root,
+ NULL, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
+static __always_inline void
+rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
+ const struct rb_augment_callbacks *augment)
+{
+ struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
+ &root->rb_leftmost,
+ augment);
+ if (rebalance)
+ __rb_erase_color(rebalance, &root->rb_root, augment->rotate);
+}
+
#endif /* _LINUX_RBTREE_AUGMENTED_H */
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 7d56a7ea2b2e..361c08e35dbc 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -127,7 +127,7 @@ struct rhashtable;
* @head_offset: Offset of rhash_head in struct to be hashed
* @max_size: Maximum size while expanding
* @min_size: Minimum size while shrinking
- * @locks_mul: Number of bucket locks to allocate per cpu (default: 128)
+ * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
* @automatic_shrinking: Enable automatic shrinking of tables
* @nulls_base: Base value to generate nulls marker
* @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 43ef2c30cb0f..733d3d8181e2 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -55,7 +55,9 @@ struct anon_vma {
* is serialized by a system wide lock only visible to
* mm_take_all_locks() (mm_all_locks_mutex).
*/
- struct rb_root rb_root; /* Interval tree of private "related" vmas */
+
+ /* Interval tree of private "related" vmas */
+ struct rb_root_cached rb_root;
};
/*
@@ -93,8 +95,9 @@ enum ttu_flags {
TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible
* and caller guarantees they will
* do a final flush if necessary */
- TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock:
+ TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
+ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */
};
#ifdef CONFIG_MMU
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 44fd002f7cd5..53fcbe9de7fd 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -22,18 +22,17 @@ extern int max_lock_depth; /* for sysctl */
* The rt_mutex structure
*
* @wait_lock: spinlock to protect the structure
- * @waiters: rbtree root to enqueue waiters in priority order
- * @waiters_leftmost: top waiter
+ * @waiters: rbtree root to enqueue waiters in priority order;
+ * caches top-waiter (leftmost node).
* @owner: the mutex owner
*/
struct rt_mutex {
raw_spinlock_t wait_lock;
- struct rb_root waiters;
- struct rb_node *waiters_leftmost;
+ struct rb_root_cached waiters;
struct task_struct *owner;
#ifdef CONFIG_DEBUG_RT_MUTEXES
int save_state;
- const char *name, *file;
+ const char *name, *file;
int line;
void *magic;
#endif
@@ -84,7 +83,7 @@ do { \
#define __RT_MUTEX_INITIALIZER(mutexname) \
{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
- , .waiters = RB_ROOT \
+ , .waiters = RB_ROOT_CACHED \
, .owner = NULL \
__DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
__DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68b38335d33c..92fb8dd5a9e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -812,8 +812,7 @@ struct task_struct {
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task: */
- struct rb_root pi_waiters;
- struct rb_node *pi_waiters_leftmost;
+ struct rb_root_cached pi_waiters;
/* Updated under owner's pi_lock and rq lock */
struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
diff --git a/include/linux/string.h b/include/linux/string.h
index a467e617eeb0..c8bdafffd2f0 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -99,6 +99,36 @@ extern __kernel_size_t strcspn(const char *,const char *);
#ifndef __HAVE_ARCH_MEMSET
extern void * memset(void *,int,__kernel_size_t);
#endif
+
+#ifndef __HAVE_ARCH_MEMSET16
+extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+extern void *memset32(uint32_t *, uint32_t, __kernel_size_t);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+extern void *memset64(uint64_t *, uint64_t, __kernel_size_t);
+#endif
+
+static inline void *memset_l(unsigned long *p, unsigned long v,
+ __kernel_size_t n)
+{
+ if (BITS_PER_LONG == 32)
+ return memset32((uint32_t *)p, v, n);
+ else
+ return memset64((uint64_t *)p, v, n);
+}
+
+static inline void *memset_p(void **p, void *v, __kernel_size_t n)
+{
+ if (BITS_PER_LONG == 32)
+ return memset32((uint32_t *)p, (uintptr_t)v, n);
+ else
+ return memset64((uint64_t *)p, (uintptr_t)v, n);
+}
+
#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
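A hedged usage sketch of the new fills; the buffers and values are illustrative, and note that the count arguments are element counts, not byte counts:

	unsigned long longs[64];
	void *ptrs[32];
	u16 cells[80];

	memset_l(longs, 0xdeadbeefUL, ARRAY_SIZE(longs));	/* 64 longs */
	memset_p(ptrs, NULL, ARRAY_SIZE(ptrs));			/* 32 pointers */
	memset16(cells, 0x0720, ARRAY_SIZE(cells));		/* 80 u16 character cells */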
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8bf3487fb204..8a807292037f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -51,6 +51,23 @@ static inline int current_is_kswapd(void)
*/
/*
+ * Unaddressable device memory support. See include/linux/hmm.h and
+ * Documentation/vm/hmm.txt. The short description is that we need struct pages
+ * for device memory that is unaddressable (inaccessible) by the CPU, so that
+ * we can migrate part of a process's memory to device memory.
+ *
+ * When a page is migrated from CPU to device, we set the CPU page table entry
+ * to a special SWP_DEVICE_* entry.
+ */
+#ifdef CONFIG_DEVICE_PRIVATE
+#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
+#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#else
+#define SWP_DEVICE_NUM 0
+#endif
+
+/*
* NUMA node memory migration support
*/
#ifdef CONFIG_MIGRATION
@@ -72,7 +89,8 @@ static inline int current_is_kswapd(void)
#endif
#define MAX_SWAPFILES \
- ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
+ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
/*
* Magic header for a swap area. The first part of the union is
@@ -469,8 +487,8 @@ static inline void show_swap_cache_info(void)
{
}
-#define free_swap_and_cache(swp) is_migration_entry(swp)
-#define swapcache_prepare(swp) is_migration_entry(swp)
+#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c5ff7b217ee6..291c4b534658 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -100,10 +100,79 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+{
+ return swp_entry(write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ,
+ page_to_pfn(page));
+}
+
+static inline bool is_device_private_entry(swp_entry_t entry)
+{
+ int type = swp_type(entry);
+ return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
+}
+
+static inline void make_device_private_entry_read(swp_entry_t *entry)
+{
+ *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
+}
+
+static inline bool is_write_device_private_entry(swp_entry_t entry)
+{
+ return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
+}
+
+static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+{
+ return pfn_to_page(swp_offset(entry));
+}
+
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp);
+#else /* CONFIG_DEVICE_PRIVATE */
+static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+{
+ return swp_entry(0, 0);
+}
+
+static inline void make_device_private_entry_read(swp_entry_t *entry)
+{
+}
+
+static inline bool is_device_private_entry(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline bool is_write_device_private_entry(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ return VM_FAULT_SIGBUS;
+}
+#endif /* CONFIG_DEVICE_PRIVATE */
+
#ifdef CONFIG_MIGRATION
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
- BUG_ON(!PageLocked(page));
+ BUG_ON(!PageLocked(compound_head(page)));
+
return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
page_to_pfn(page));
}
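A hedged sketch of the consumer side of the device-private helpers above in a CPU fault path; vma, address, flags, pmd and orig_pte are assumed to come from the surrounding fault handler:

	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	if (is_device_private_entry(entry)) {
		/* The CPU touched unaddressable device memory: ask the owning
		 * driver to migrate it back via its page_fault() callback. */
		return device_private_entry_fault(vma, address, entry, flags, pmd);
	}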
@@ -126,7 +195,7 @@ static inline struct page *migration_entry_to_page(swp_entry_t entry)
* Any use of migration entries may only occur while the
* corresponding page is locked
*/
- BUG_ON(!PageLocked(p));
+ BUG_ON(!PageLocked(compound_head(p)));
return p;
}
@@ -148,7 +217,11 @@ static inline int is_migration_entry(swp_entry_t swp)
{
return 0;
}
-#define migration_entry_to_page(swp) NULL
+static inline struct page *migration_entry_to_page(swp_entry_t entry)
+{
+ return NULL;
+}
+
static inline void make_migration_entry_read(swp_entry_t *entryp) { }
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl) { }
@@ -163,6 +236,70 @@ static inline int is_write_migration_entry(swp_entry_t entry)
#endif
+struct page_vma_mapped_walk;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page);
+
+extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
+ struct page *new);
+
+extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
+
+static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
+{
+ swp_entry_t arch_entry;
+
+ if (pmd_swp_soft_dirty(pmd))
+ pmd = pmd_swp_clear_soft_dirty(pmd);
+ arch_entry = __pmd_to_swp_entry(pmd);
+ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
+}
+
+static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
+{
+ swp_entry_t arch_entry;
+
+ arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
+ return __swp_entry_to_pmd(arch_entry);
+}
+
+static inline int is_pmd_migration_entry(pmd_t pmd)
+{
+ return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
+}
+#else
+static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page)
+{
+ BUILD_BUG();
+}
+
+static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
+ struct page *new)
+{
+ BUILD_BUG();
+}
+
+static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
+
+static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
+{
+ return swp_entry(0, 0);
+}
+
+static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
+{
+ return __pmd(0);
+}
+
+static inline int is_pmd_migration_entry(pmd_t pmd)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_MEMORY_FAILURE
extern atomic_long_t num_poisoned_pages __read_mostly;
diff --git a/include/linux/umh.h b/include/linux/umh.h
new file mode 100644
index 000000000000..244aff638220
--- /dev/null
+++ b/include/linux/umh.h
@@ -0,0 +1,69 @@
+#ifndef __LINUX_UMH_H__
+#define __LINUX_UMH_H__
+
+#include <linux/gfp.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/sysctl.h>
+
+struct cred;
+struct file;
+
+#define UMH_NO_WAIT 0 /* don't wait at all */
+#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */
+#define UMH_WAIT_PROC 2 /* wait for the process to complete */
+#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */
+
+struct subprocess_info {
+ struct work_struct work;
+ struct completion *complete;
+ const char *path;
+ char **argv;
+ char **envp;
+ int wait;
+ int retval;
+ int (*init)(struct subprocess_info *info, struct cred *new);
+ void (*cleanup)(struct subprocess_info *info);
+ void *data;
+} __randomize_layout;
+
+extern int
+call_usermodehelper(const char *path, char **argv, char **envp, int wait);
+
+extern struct subprocess_info *
+call_usermodehelper_setup(const char *path, char **argv, char **envp,
+ gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *), void *data);
+
+extern int
+call_usermodehelper_exec(struct subprocess_info *info, int wait);
+
+extern struct ctl_table usermodehelper_table[];
+
+enum umh_disable_depth {
+ UMH_ENABLED = 0,
+ UMH_FREEZING,
+ UMH_DISABLED,
+};
+
+extern int __usermodehelper_disable(enum umh_disable_depth depth);
+extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
+
+static inline int usermodehelper_disable(void)
+{
+ return __usermodehelper_disable(UMH_DISABLED);
+}
+
+static inline void usermodehelper_enable(void)
+{
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+}
+
+extern int usermodehelper_read_trylock(void);
+extern long usermodehelper_read_lock_wait(long timeout);
+extern void usermodehelper_read_unlock(void);
+
+#endif /* __LINUX_UMH_H__ */
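The new header is a verbatim move of the usermode-helper API out of kmod.h; a typical (illustrative) caller, with a hypothetical helper path, looks like:

	static int my_run_helper(void)
	{
		char *argv[] = { "/sbin/my-helper", "start", NULL };
		char *envp[] = { "HOME=/",
				 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

		/* UMH_WAIT_PROC: block until the helper process has exited. */
		return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	}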
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 97e11ab573f0..ade7cb5f1359 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -107,8 +107,37 @@ static inline void vm_events_fold_cpu(int cpu)
* Zone and node-based page accounting with per cpu differentials.
*/
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
+#ifdef CONFIG_NUMA
+static inline void zone_numa_state_add(long x, struct zone *zone,
+ enum numa_stat_item item)
+{
+ atomic_long_add(x, &zone->vm_numa_stat[item]);
+ atomic_long_add(x, &vm_numa_stat[item]);
+}
+
+static inline unsigned long global_numa_state(enum numa_stat_item item)
+{
+ long x = atomic_long_read(&vm_numa_stat[item]);
+
+ return x;
+}
+
+static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
+ enum numa_stat_item item)
+{
+ long x = atomic_long_read(&zone->vm_numa_stat[item]);
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+
+ return x;
+}
+#endif /* CONFIG_NUMA */
+
static inline void zone_page_state_add(long x, struct zone *zone,
enum zone_stat_item item)
{
@@ -194,8 +223,10 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
#ifdef CONFIG_NUMA
+extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
extern unsigned long sum_zone_node_page_state(int node,
- enum zone_stat_item item);
+ enum zone_stat_item item);
+extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
#else
diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index f38c10ba3ff5..30b6e0d2a942 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -13,6 +13,7 @@
#ifndef _LINUX_VT_BUFFER_H_
#define _LINUX_VT_BUFFER_H_
+#include <linux/string.h>
#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE)
#include <asm/vga.h>
@@ -26,24 +27,33 @@
#ifndef VT_BUF_HAVE_MEMSETW
static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
{
+#ifdef VT_BUF_HAVE_RW
count /= 2;
while (count--)
scr_writew(c, s++);
+#else
+ memset16(s, c, count / 2);
+#endif
}
#endif
#ifndef VT_BUF_HAVE_MEMCPYW
static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count)
{
+#ifdef VT_BUF_HAVE_RW
count /= 2;
while (count--)
scr_writew(scr_readw(s++), d++);
+#else
+ memcpy(d, s, count);
+#endif
}
#endif
#ifndef VT_BUF_HAVE_MEMMOVEW
static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
{
+#ifdef VT_BUF_HAVE_RW
if (d < s)
scr_memcpyw(d, s, count);
else {
@@ -53,6 +63,9 @@ static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
while (count--)
scr_writew(scr_readw(--s), --d);
}
+#else
+ memmove(d, s, count);
+#endif
}
#endif
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index fb67554aabd6..5eb7f5bc8248 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -111,22 +111,25 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt,
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset,
u64 bound);
-void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root);
-void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root);
+void rbt_ib_umem_insert(struct umem_odp_node *node,
+ struct rb_root_cached *root);
+void rbt_ib_umem_remove(struct umem_odp_node *node,
+ struct rb_root_cached *root);
typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
void *cookie);
/*
* Call the callback on each ib_umem in the range. Returns the logical or of
* the return values of the functions called.
*/
-int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
+int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
+ u64 start, u64 end,
umem_call_back cb, void *cookie);
/*
* Find first region intersecting with address range.
* Return NULL if not found
*/
-struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root,
+struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
u64 addr, u64 length);
static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e6df68048517..bdb1279a415b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1457,7 +1457,7 @@ struct ib_ucontext {
struct pid *tgid;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- struct rb_root umem_tree;
+ struct rb_root_cached umem_tree;
/*
* Protects .umem_rbroot and tree, as well as odp_mrs_count and
* mmu notifiers registration.
diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h
index 744b3d060968..5558db8e6646 100644
--- a/include/uapi/linux/auto_dev-ioctl.h
+++ b/include/uapi/linux/auto_dev-ioctl.h
@@ -16,7 +16,7 @@
#define AUTOFS_DEVICE_NAME "autofs"
#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
-#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
+#define AUTOFS_DEV_IOCTL_VERSION_MINOR 1
#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index 7c6da423d54e..9453e9a07c9d 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -155,8 +155,6 @@ enum {
};
#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int)
-#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI
#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int)
#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int)
diff --git a/include/uapi/linux/pps.h b/include/uapi/linux/pps.h
index c1cb3825a8bc..c29d6b791c08 100644
--- a/include/uapi/linux/pps.h
+++ b/include/uapi/linux/pps.h
@@ -95,8 +95,8 @@ struct pps_kparams {
#define PPS_CAPTURECLEAR 0x02 /* capture clear events */
#define PPS_CAPTUREBOTH 0x03 /* capture assert and clear events */
-#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert ev. */
-#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear ev. */
+#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert event */
+#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear event */
#define PPS_CANWAIT 0x100 /* can we wait for an event? */
#define PPS_CANPOLL 0x200 /* bit reserved for future use */
diff --git a/init/main.c b/init/main.c
index 949306bb5b6a..0ee9c6866ada 100644
--- a/init/main.c
+++ b/init/main.c
@@ -515,12 +515,6 @@ asmlinkage __visible void __init start_kernel(void)
smp_setup_processor_id();
debug_objects_early_init();
- /*
- * Set up the initial canary ASAP:
- */
- add_latent_entropy();
- boot_init_stack_canary();
-
cgroup_init_early();
local_irq_disable();
@@ -534,6 +528,13 @@ asmlinkage __visible void __init start_kernel(void)
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
+ /*
+ * Set up the initial canary and entropy after arch
+ * and after adding latent and command line entropy.
+ */
+ add_latent_entropy();
+ add_device_randomness(command_line, strlen(command_line));
+ boot_init_stack_canary();
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
diff --git a/ipc/msg.c b/ipc/msg.c
index 2c38f10d1483..df82bc9a5531 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1011,7 +1011,7 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
}
-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
{
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
@@ -1019,7 +1019,7 @@ void msg_init_ns(struct ipc_namespace *ns)
atomic_set(&ns->msg_bytes, 0);
atomic_set(&ns->msg_hdrs, 0);
- ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ return ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}
#ifdef CONFIG_IPC_NS
@@ -1027,6 +1027,7 @@ void msg_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
+ rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
}
#endif
@@ -1058,11 +1059,12 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
}
#endif
-void __init msg_init(void)
+int __init msg_init(void)
{
- msg_init_ns(&init_ipc_ns);
+ const int err = msg_init_ns(&init_ipc_ns);
ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
+ return err;
}
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index bf74eaa5c39f..84598025a6ad 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -29,7 +29,7 @@ DEFINE_SPINLOCK(mq_lock);
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.user_ns = &init_user_ns,
.ns.inum = PROC_IPC_INIT_INO,
#ifdef CONFIG_IPC_NS
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b4d80f9f7246..fc850c526698 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -50,20 +50,32 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
goto fail_free;
ns->ns.ops = &ipcns_operations;
- atomic_set(&ns->count, 1);
+ refcount_set(&ns->count, 1);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- err = mq_init_ns(ns);
+ err = sem_init_ns(ns);
if (err)
goto fail_put;
+ err = msg_init_ns(ns);
+ if (err)
+ goto fail_destroy_sem;
+ err = shm_init_ns(ns);
+ if (err)
+ goto fail_destroy_msg;
- sem_init_ns(ns);
- msg_init_ns(ns);
- shm_init_ns(ns);
+ err = mq_init_ns(ns);
+ if (err)
+ goto fail_destroy_shm;
return ns;
+fail_destroy_shm:
+ shm_exit_ns(ns);
+fail_destroy_msg:
+ msg_exit_ns(ns);
+fail_destroy_sem:
+ sem_exit_ns(ns);
fail_put:
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
@@ -144,7 +156,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
*/
void put_ipc_ns(struct ipc_namespace *ns)
{
- if (atomic_dec_and_lock(&ns->count, &mq_lock)) {
+ if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
mq_put_mnt(ns);
diff --git a/ipc/sem.c b/ipc/sem.c
index c6c50370504c..013c7981f3c7 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -122,7 +122,7 @@ struct sem_undo {
* that may be shared among all a CLONE_SYSVSEM task group.
*/
struct sem_undo_list {
- atomic_t refcnt;
+ refcount_t refcnt;
spinlock_t lock;
struct list_head list_proc;
};
@@ -130,8 +130,6 @@ struct sem_undo_list {
#define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS])
-#define sem_checkid(sma, semid) ipc_checkid(&sma->sem_perm, semid)
-
static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
@@ -185,14 +183,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#define sc_semopm sem_ctls[2]
#define sc_semmni sem_ctls[3]
-void sem_init_ns(struct ipc_namespace *ns)
+int sem_init_ns(struct ipc_namespace *ns)
{
ns->sc_semmsl = SEMMSL;
ns->sc_semmns = SEMMNS;
ns->sc_semopm = SEMOPM;
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
- ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
+ return ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}
#ifdef CONFIG_IPC_NS
@@ -200,15 +198,18 @@ void sem_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &sem_ids(ns), freeary);
idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
+ rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
}
#endif
-void __init sem_init(void)
+int __init sem_init(void)
{
- sem_init_ns(&init_ipc_ns);
+ const int err = sem_init_ns(&init_ipc_ns);
+
ipc_init_proc_interface("sysvipc/sem",
" key semid perms nsems uid gid cuid cgid otime ctime\n",
IPC_SEM_IDS, sysvipc_sem_proc_show);
+ return err;
}
/**
@@ -1642,7 +1643,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
if (undo_list == NULL)
return -ENOMEM;
spin_lock_init(&undo_list->lock);
- atomic_set(&undo_list->refcnt, 1);
+ refcount_set(&undo_list->refcnt, 1);
INIT_LIST_HEAD(&undo_list->list_proc);
current->sysvsem.undo_list = undo_list;
@@ -1786,7 +1787,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
if (nsops > ns->sc_semopm)
return -E2BIG;
if (nsops > SEMOPM_FAST) {
- sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
+ sops = kvmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
if (sops == NULL)
return -ENOMEM;
}
@@ -2018,7 +2019,7 @@ out_unlock_free:
rcu_read_unlock();
out_free:
if (sops != fast_sops)
- kfree(sops);
+ kvfree(sops);
return error;
}
@@ -2041,7 +2042,7 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
error = get_undo_list(&undo_list);
if (error)
return error;
- atomic_inc(&undo_list->refcnt);
+ refcount_inc(&undo_list->refcnt);
tsk->sysvsem.undo_list = undo_list;
} else
tsk->sysvsem.undo_list = NULL;
@@ -2070,7 +2071,7 @@ void exit_sem(struct task_struct *tsk)
return;
tsk->sysvsem.undo_list = NULL;
- if (!atomic_dec_and_test(&ulp->refcnt))
+ if (!refcount_dec_and_test(&ulp->refcnt))
return;
for (;;) {
diff --git a/ipc/shm.c b/ipc/shm.c
index 8828b4c3a190..8fc97beb5234 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -72,14 +72,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif
-void shm_init_ns(struct ipc_namespace *ns)
+int shm_init_ns(struct ipc_namespace *ns)
{
ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0;
- ipc_init_ids(&shm_ids(ns));
+ return ipc_init_ids(&shm_ids(ns));
}
/*
@@ -95,7 +95,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
/* Do not find it any more */
- shp->shm_perm.key = IPC_PRIVATE;
+ ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
shm_unlock(shp);
} else
shm_destroy(ns, shp);
@@ -106,13 +106,15 @@ void shm_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
+ rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
}
#endif
static int __init ipc_ns_init(void)
{
- shm_init_ns(&init_ipc_ns);
- return 0;
+ const int err = shm_init_ns(&init_ipc_ns);
+ WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err);
+ return err;
}
pure_initcall(ipc_ns_init);
diff --git a/ipc/util.c b/ipc/util.c
index 1a2cb02467ab..78755873cc5b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -83,27 +83,46 @@ struct ipc_proc_iface {
*/
static int __init ipc_init(void)
{
- sem_init();
- msg_init();
+ int err_sem, err_msg;
+
+ err_sem = sem_init();
+ WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem);
+ err_msg = msg_init();
+ WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg);
shm_init();
- return 0;
+
+ return err_msg ? err_msg : err_sem;
}
device_initcall(ipc_init);
+static const struct rhashtable_params ipc_kht_params = {
+ .head_offset = offsetof(struct kern_ipc_perm, khtnode),
+ .key_offset = offsetof(struct kern_ipc_perm, key),
+ .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key),
+ .locks_mul = 1,
+ .automatic_shrinking = true,
+};
+
/**
* ipc_init_ids - initialise ipc identifiers
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the ids idr.
+ * below IPCMNI) then initialise the keys hashtable and ids idr.
*/
-void ipc_init_ids(struct ipc_ids *ids)
+int ipc_init_ids(struct ipc_ids *ids)
{
+ int err;
ids->in_use = 0;
ids->seq = 0;
ids->next_id = -1;
init_rwsem(&ids->rwsem);
+ err = rhashtable_init(&ids->key_ht, &ipc_kht_params);
+ if (err)
+ return err;
idr_init(&ids->ipcs_idr);
+ ids->tables_initialized = true;
+ return 0;
}
#ifdef CONFIG_PROC_FS
@@ -147,28 +166,20 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
* Returns the locked pointer to the ipc structure if found or NULL
* otherwise. If key is found ipc points to the owning ipc structure
*
- * Called with ipc_ids.rwsem held.
+ * Called with writer ipc_ids.rwsem held.
*/
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
{
- struct kern_ipc_perm *ipc;
- int next_id;
- int total;
-
- for (total = 0, next_id = 0; total < ids->in_use; next_id++) {
- ipc = idr_find(&ids->ipcs_idr, next_id);
-
- if (ipc == NULL)
- continue;
+ struct kern_ipc_perm *ipcp = NULL;
- if (ipc->key != key) {
- total++;
- continue;
- }
+ if (likely(ids->tables_initialized))
+ ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
+ ipc_kht_params);
+ if (ipcp) {
rcu_read_lock();
- ipc_lock_object(ipc);
- return ipc;
+ ipc_lock_object(ipcp);
+ return ipcp;
}
return NULL;
@@ -221,18 +232,18 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
{
kuid_t euid;
kgid_t egid;
- int id;
+ int id, err;
int next_id = ids->next_id;
if (size > IPCMNI)
size = IPCMNI;
- if (ids->in_use >= size)
+ if (!ids->tables_initialized || ids->in_use >= size)
return -ENOSPC;
idr_preload(GFP_KERNEL);
- atomic_set(&new->refcount, 1);
+ refcount_set(&new->refcount, 1);
spin_lock_init(&new->lock);
new->deleted = false;
rcu_read_lock();
@@ -246,6 +257,15 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
(next_id < 0) ? 0 : ipcid_to_idx(next_id), 0,
GFP_NOWAIT);
idr_preload_end();
+
+ if (id >= 0 && new->key != IPC_PRIVATE) {
+ err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
+ ipc_kht_params);
+ if (err < 0) {
+ idr_remove(&ids->ipcs_idr, id);
+ id = err;
+ }
+ }
if (id < 0) {
spin_unlock(&new->lock);
rcu_read_unlock();
@@ -377,6 +397,20 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
return err;
}
+/**
+ * ipc_kht_remove - remove an ipc from the key hashtable
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the key to remove
+ *
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on the exit.
+ */
+static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
+{
+ if (ipcp->key != IPC_PRIVATE)
+ rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
+ ipc_kht_params);
+}
/**
* ipc_rmid - remove an ipc identifier
@@ -391,19 +425,34 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
int lid = ipcid_to_idx(ipcp->id);
idr_remove(&ids->ipcs_idr, lid);
+ ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
}
+/**
+ * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the key to modify
+ *
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on the exit.
+ */
+void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
+{
+ ipc_kht_remove(ids, ipcp);
+ ipcp->key = IPC_PRIVATE;
+}
+
int ipc_rcu_getref(struct kern_ipc_perm *ptr)
{
- return atomic_inc_not_zero(&ptr->refcount);
+ return refcount_inc_not_zero(&ptr->refcount);
}
void ipc_rcu_putref(struct kern_ipc_perm *ptr,
void (*func)(struct rcu_head *head))
{
- if (!atomic_dec_and_test(&ptr->refcount))
+ if (!refcount_dec_and_test(&ptr->refcount))
return;
call_rcu(&ptr->rcu, func);
@@ -485,7 +534,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
}
/**
- * ipc_obtain_object
+ * ipc_obtain_object_idr
* @ids: ipc identifier set
* @id: ipc id to look for
*
@@ -499,6 +548,9 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
struct kern_ipc_perm *out;
int lid = ipcid_to_idx(id);
+ if (unlikely(!ids->tables_initialized))
+ return ERR_PTR(-EINVAL);
+
out = idr_find(&ids->ipcs_idr, lid);
if (!out)
return ERR_PTR(-EINVAL);
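
The new key_ht / ipc_kht_params code above replaces the linear idr scan in ipc_findkey() with an rhashtable keyed on the IPC key. For reference, the shape of that API reduced to a minimal, hypothetical object type (struct obj and the function names below are illustrative, not part of the patch):

	#include <linux/rhashtable.h>
	#include <linux/slab.h>

	struct obj {
		u32 key;
		struct rhash_head node;
	};

	static const struct rhashtable_params obj_params = {
		.head_offset = offsetof(struct obj, node),
		.key_offset  = offsetof(struct obj, key),
		.key_len     = sizeof(u32),
		.automatic_shrinking = true,
	};

	static struct rhashtable ht;

	static int obj_example(void)
	{
		struct obj *o, *found;
		u32 key = 42;
		int err;

		err = rhashtable_init(&ht, &obj_params);
		if (err)
			return err;

		o = kzalloc(sizeof(*o), GFP_KERNEL);
		if (!o) {
			rhashtable_destroy(&ht);
			return -ENOMEM;
		}
		o->key = key;

		err = rhashtable_insert_fast(&ht, &o->node, obj_params);
		if (!err) {
			/* lookup takes a pointer to the key, not to the object */
			found = rhashtable_lookup_fast(&ht, &key, obj_params);
			if (found)
				rhashtable_remove_fast(&ht, &found->node, obj_params);
		}

		/* no concurrent lookups in this toy example, so a plain kfree is fine */
		kfree(o);
		rhashtable_destroy(&ht);
		return err;
	}
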
diff --git a/ipc/util.h b/ipc/util.h
index c692010e6f0a..80c9f51c3f07 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,8 @@
#define SEQ_MULTIPLIER (IPCMNI)
-void sem_init(void);
-void msg_init(void);
+int sem_init(void);
+int msg_init(void);
void shm_init(void);
struct ipc_namespace;
@@ -30,17 +30,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }
#endif
#ifdef CONFIG_SYSVIPC
-void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
-void shm_init_ns(struct ipc_namespace *ns);
+int sem_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
+int shm_init_ns(struct ipc_namespace *ns);
void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
-static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
-static inline void shm_init_ns(struct ipc_namespace *ns) { }
+static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; }
+static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; }
static inline void sem_exit_ns(struct ipc_namespace *ns) { }
static inline void msg_exit_ns(struct ipc_namespace *ns) { }
@@ -79,7 +79,7 @@ struct ipc_ops {
struct seq_file;
struct ipc_ids;
-void ipc_init_ids(struct ipc_ids *);
+int ipc_init_ids(struct ipc_ids *);
#ifdef CONFIG_PROC_FS
void __init ipc_init_proc_interface(const char *path, const char *header,
int ids, int (*show)(struct seq_file *, void *));
@@ -104,6 +104,9 @@ int ipc_get_maxid(struct ipc_ids *);
/* must be called with both locks acquired. */
void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);
+/* must be called with both locks acquired. */
+void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *);
+
/* must be called with ipcp locked */
int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
diff --git a/kernel/Makefile b/kernel/Makefile
index 9c323a6daa46..ed470aac53da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,13 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o
+obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/fork.c b/kernel/fork.c
index 24a4c0be80d5..6f1b0af00bda 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
+#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
+ hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
destroy_context(mm);
+ hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -1459,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- p->pi_waiters = RB_ROOT;
- p->pi_waiters_leftmost = NULL;
+ p->pi_waiters = RB_ROOT_CACHED;
p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
diff --git a/kernel/kcov.c b/kernel/kcov.c
index cd771993f96f..3f693a0f6f3e 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
static const struct file_operations kcov_fops = {
.open = kcov_open,
.unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
.mmap = kcov_mmap,
.release = kcov_close,
};
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2f37acde640b..bc6addd9152b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -1,23 +1,6 @@
/*
- kmod, the new module loader (replaces kerneld)
- Kirk Petersen
-
- Reorganized not to be a daemon by Adam Richter, with guidance
- from Greg Zornetzer.
-
- Modified to avoid chroot and file sharing problems.
- Mikael Pettersson
-
- Limit the concurrent number of kmod modprobes to catch loops from
- "modprobe needs a service that is in a module".
- Keith Owens <kaos@ocs.com.au> December 1999
-
- Unblock all signals when we exec a usermode process.
- Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
-
- call_usermodehelper wait flag, and remove exec_usermodehelper.
- Rusty Russell <rusty@rustcorp.com.au> Jan 2003
-*/
+ * kmod - the kernel module loader
+ */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -45,15 +28,6 @@
#include <trace/events/module.h>
-#define CAP_BSET (void *)1
-#define CAP_PI (void *)2
-
-static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
-static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
-static DEFINE_SPINLOCK(umh_sysctl_lock);
-static DECLARE_RWSEM(umhelper_sem);
-
-#ifdef CONFIG_MODULES
/*
* Assuming:
*
@@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...)
return ret;
}
EXPORT_SYMBOL(__request_module);
-
-#endif /* CONFIG_MODULES */
-
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away
- * or the caller used UMH_NO_WAIT.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
-/*
- * This is the task which runs the usermode application
- */
-static int call_usermodehelper_exec_async(void *data)
-{
- struct subprocess_info *sub_info = data;
- struct cred *new;
- int retval;
-
- spin_lock_irq(&current->sighand->siglock);
- flush_signal_handlers(current, 1);
- spin_unlock_irq(&current->sighand->siglock);
-
- /*
- * Our parent (unbound workqueue) runs with elevated scheduling
- * priority. Avoid propagating that into the userspace child.
- */
- set_user_nice(current, 0);
-
- retval = -ENOMEM;
- new = prepare_kernel_cred(current);
- if (!new)
- goto out;
-
- spin_lock(&umh_sysctl_lock);
- new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
- new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
- new->cap_inheritable);
- spin_unlock(&umh_sysctl_lock);
-
- if (sub_info->init) {
- retval = sub_info->init(sub_info, new);
- if (retval) {
- abort_creds(new);
- goto out;
- }
- }
-
- commit_creds(new);
-
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
-out:
- sub_info->retval = retval;
- /*
- * call_usermodehelper_exec_sync() will call umh_complete
- * if UHM_WAIT_PROC.
- */
- if (!(sub_info->wait & UMH_WAIT_PROC))
- umh_complete(sub_info);
- if (!retval)
- return 0;
- do_exit(0);
-}
-
-/* Handles UMH_WAIT_PROC. */
-static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
-{
- pid_t pid;
-
- /* If SIGCLD is ignored sys_wait4 won't populate the status. */
- kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- sys_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
-
- /* Restore default kernel sig handler */
- kernel_sigaction(SIGCHLD, SIG_IGN);
-
- umh_complete(sub_info);
-}
-
-/*
- * We need to create the usermodehelper kernel thread from a task that is affine
- * to an optimized set of CPUs (or nohz housekeeping ones) such that they
- * inherit a widest affinity irrespective of call_usermodehelper() callers with
- * possibly reduced affinity (eg: per-cpu workqueues). We don't want
- * usermodehelper targets to contend a busy CPU.
- *
- * Unbound workqueues provide such wide affinity and allow to block on
- * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
- *
- * Besides, workqueues provide the privilege level that caller might not have
- * to perform the usermodehelper request.
- *
- */
-static void call_usermodehelper_exec_work(struct work_struct *work)
-{
- struct subprocess_info *sub_info =
- container_of(work, struct subprocess_info, work);
-
- if (sub_info->wait & UMH_WAIT_PROC) {
- call_usermodehelper_exec_sync(sub_info);
- } else {
- pid_t pid;
- /*
- * Use CLONE_PARENT to reparent it to kthreadd; we do not
- * want to pollute current->children, and we need a parent
- * that always ignores SIGCHLD to ensure auto-reaping.
- */
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
- }
- }
-}
-
-/*
- * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
- * (used for preventing user land processes from being created after the user
- * land has been frozen during a system-wide hibernation or suspend operation).
- * Should always be manipulated under umhelper_sem acquired for write.
- */
-static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
-
-/* Number of helpers running */
-static atomic_t running_helpers = ATOMIC_INIT(0);
-
-/*
- * Wait queue head used by usermodehelper_disable() to wait for all running
- * helpers to finish.
- */
-static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
-
-/*
- * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
- * to become 'false'.
- */
-static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
-
-/*
- * Time to wait for running_helpers to become zero before the setting of
- * usermodehelper_disabled in usermodehelper_disable() fails
- */
-#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-
-int usermodehelper_read_trylock(void)
-{
- DEFINE_WAIT(wait);
- int ret = 0;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_INTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- if (usermodehelper_disabled == UMH_DISABLED)
- ret = -EAGAIN;
-
- up_read(&umhelper_sem);
-
- if (ret)
- break;
-
- schedule();
- try_to_freeze();
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return ret;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
-
-long usermodehelper_read_lock_wait(long timeout)
-{
- DEFINE_WAIT(wait);
-
- if (timeout < 0)
- return -EINVAL;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- up_read(&umhelper_sem);
-
- timeout = schedule_timeout(timeout);
- if (!timeout)
- break;
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return timeout;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
-
-void usermodehelper_read_unlock(void)
-{
- up_read(&umhelper_sem);
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
-
-/**
- * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Change the value of usermodehelper_disabled (under umhelper_sem locked for
- * writing) and wakeup tasks waiting for it to change.
- */
-void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
-{
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- wake_up(&usermodehelper_disabled_waitq);
- up_write(&umhelper_sem);
-}
-
-/**
- * __usermodehelper_disable - Prevent new helpers from being started.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
- */
-int __usermodehelper_disable(enum umh_disable_depth depth)
-{
- long retval;
-
- if (!depth)
- return -EINVAL;
-
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- up_write(&umhelper_sem);
-
- /*
- * From now on call_usermodehelper_exec() won't start any new
- * helpers, so it is sufficient if running_helpers turns out to
- * be zero at one point (it may be increased later, but that
- * doesn't matter).
- */
- retval = wait_event_timeout(running_helpers_waitq,
- atomic_read(&running_helpers) == 0,
- RUNNING_HELPERS_TIMEOUT);
- if (retval)
- return 0;
-
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- return -EAGAIN;
-}
-
-static void helper_lock(void)
-{
- atomic_inc(&running_helpers);
- smp_mb__after_atomic();
-}
-
-static void helper_unlock(void)
-{
- if (atomic_dec_and_test(&running_helpers))
- wake_up(&running_helpers_waitq);
-}
-
-/**
- * call_usermodehelper_setup - prepare to call a usermode helper
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * Returns either %NULL on allocation failure, or a subprocess_info
- * structure. This should be passed to call_usermodehelper_exec to
- * exec the process and free the structure.
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
- char **envp, gfp_t gfp_mask,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- struct subprocess_info *sub_info;
- sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
- if (!sub_info)
- goto out;
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-
-#ifdef CONFIG_STATIC_USERMODEHELPER
- sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
-#else
- sub_info->path = path;
-#endif
- sub_info->argv = argv;
- sub_info->envp = envp;
-
- sub_info->cleanup = cleanup;
- sub_info->init = init;
- sub_info->data = data;
- out:
- return sub_info;
-}
-EXPORT_SYMBOL(call_usermodehelper_setup);
-
-/**
- * call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of system workqueues.
- * (ie. it runs with full root capabilities and optimized affinity).
- */
-int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
-{
- DECLARE_COMPLETION_ONSTACK(done);
- int retval = 0;
-
- if (!sub_info->path) {
- call_usermodehelper_freeinfo(sub_info);
- return -EINVAL;
- }
- helper_lock();
- if (usermodehelper_disabled) {
- retval = -EBUSY;
- goto out;
- }
-
- /*
- * If there is no binary for us to call, then just return and get out of
- * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
- * disable all call_usermodehelper() calls.
- */
- if (strlen(sub_info->path) == 0)
- goto out;
-
- /*
- * Set the completion pointer only if there is a waiter.
- * This makes it possible to use umh_complete to free
- * the data structure in case of UMH_NO_WAIT.
- */
- sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
- sub_info->wait = wait;
-
- queue_work(system_unbound_wq, &sub_info->work);
- if (wait == UMH_NO_WAIT) /* task has freed sub_info */
- goto unlock;
-
- if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
- if (!retval)
- goto wait_done;
-
- /* umh_complete() will see NULL and free sub_info */
- if (xchg(&sub_info->complete, NULL))
- goto unlock;
- /* fallthrough, umh_complete() was already called */
- }
-
- wait_for_completion(&done);
-wait_done:
- retval = sub_info->retval;
-out:
- call_usermodehelper_freeinfo(sub_info);
-unlock:
- helper_unlock();
- return retval;
-}
-EXPORT_SYMBOL(call_usermodehelper_exec);
-
-/**
- * call_usermodehelper() - prepare and start a usermode application
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * This function is the equivalent to use call_usermodehelper_setup() and
- * call_usermodehelper_exec().
- */
-int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
-{
- struct subprocess_info *info;
- gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-
- info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
- NULL, NULL, NULL);
- if (info == NULL)
- return -ENOMEM;
-
- return call_usermodehelper_exec(info, wait);
-}
-EXPORT_SYMBOL(call_usermodehelper);
-
-static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
-
- if (write && (!capable(CAP_SETPCAP) ||
- !capable(CAP_SYS_MODULE)))
- return -EPERM;
-
- /*
- * convert from the global kernel_cap_t to the ulong array to print to
- * userspace if this is a read.
- */
- spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
- spin_unlock(&umh_sysctl_lock);
-
- t = *table;
- t.data = &cap_array;
-
- /*
- * actually read or write and array of ulongs from userspace. Remember
- * these are least significant 32 bits first
- */
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
-
- /*
- * Drop everything not in the new_cap (but don't add things)
- */
- spin_lock(&umh_sysctl_lock);
- if (write) {
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
- }
- spin_unlock(&umh_sysctl_lock);
-
- return 0;
-}
-
-struct ctl_table usermodehelper_table[] = {
- {
- .procname = "bset",
- .data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- {
- .procname = "inheritable",
- .data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- { }
-};
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index ac35e648b0e5..f4a74e78d467 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 649dc9d3951a..6f3dba6e4e9e 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_node;
+ struct rb_node **link = &lock->waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- lock->waiters_leftmost = &waiter->tree_entry;
-
rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color(&waiter->tree_entry, &lock->waiters);
+ rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
}
static void
@@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
- if (lock->waiters_leftmost == &waiter->tree_entry)
- lock->waiters_leftmost = rb_next(&waiter->tree_entry);
-
- rb_erase(&waiter->tree_entry, &lock->waiters);
+ rb_erase_cached(&waiter->tree_entry, &lock->waiters);
RB_CLEAR_NODE(&waiter->tree_entry);
}
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_node;
+ struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
-
rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
}
static void
@@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
return;
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
-
- rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
@@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT;
- lock->waiters_leftmost = NULL;
+ lock->waiters = RB_ROOT_CACHED;
if (name && key)
debug_rt_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 8d039b928d61..7453be0485a5 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -45,7 +45,7 @@ struct rt_mutex_waiter {
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
- return !RB_EMPTY_ROOT(&lock->waiters);
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
static inline struct rt_mutex_waiter *
@@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *w;
- w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
- tree_entry);
+ w = rb_entry(lock->waiters.rb_leftmost,
+ struct rt_mutex_waiter, tree_entry);
BUG_ON(w->lock != lock);
return w;
@@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
static inline int task_has_pi_waiters(struct task_struct *p)
{
- return !RB_EMPTY_ROOT(&p->pi_waiters);
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ return rb_entry(p->pi_waiters.rb_leftmost,
+ struct rt_mutex_waiter, pi_tree_entry);
}
#else
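
The rtmutex waiter trees above, and the deadline and CFS runqueue trees further down, are converted from an open-coded struct rb_root plus separate *_leftmost pointer to struct rb_root_cached, which keeps the leftmost cache inside the rbtree itself. The insertion/removal idiom looks roughly like this, using a hypothetical struct item ordered by a key field (not taken from the patch):

	#include <linux/rbtree.h>

	struct item {
		u64 key;
		struct rb_node node;
	};

	static struct rb_root_cached tree = RB_ROOT_CACHED;

	static void item_insert(struct item *it)
	{
		struct rb_node **link = &tree.rb_root.rb_node;
		struct rb_node *parent = NULL;
		bool leftmost = true;

		while (*link) {
			struct item *entry = rb_entry(*link, struct item, node);

			parent = *link;
			if (it->key < entry->key) {
				link = &parent->rb_left;
			} else {
				link = &parent->rb_right;
				leftmost = false;	/* went right at least once */
			}
		}

		rb_link_node(&it->node, parent, link);
		rb_insert_color_cached(&it->node, &tree, leftmost);
	}

	static void item_remove(struct item *it)
	{
		rb_erase_cached(&it->node, &tree);	/* updates the cached leftmost */
	}

	static struct item *item_first(void)
	{
		struct rb_node *left = rb_first_cached(&tree);	/* O(1) cached lookup */

		return left ? rb_entry(left, struct item, node) : NULL;
	}
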
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 066e73c2fcc9..6bcbfbf1a8fd 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,13 +11,14 @@
* General Public License for more details.
*/
#include <linux/radix-tree.h>
-#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
@@ -219,6 +220,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff)
for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
pgoff += 1UL << order, order = order_at((res), pgoff))
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct page *page = device_private_entry_to_page(entry);
+
+ /*
+ * The page_fault() callback must migrate page back to system memory
+ * so that CPU can access it. This might fail for various reasons
+ * (device issue, device was unsafely unplugged, ...). When such
+ * error conditions happen, the callback must return VM_FAULT_SIGBUS.
+ *
+ * Note that because memory cgroup charges are accounted to the device
+ * memory, this should never fail because of memory restrictions (but
+ * allocation of regular system page might still fail because we are
+ * out of memory).
+ *
+ * There is a more in-depth description of what that callback can and
+ * cannot do, in include/linux/memremap.h
+ */
+ return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+}
+EXPORT_SYMBOL(device_private_entry_fault);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
static void pgmap_radix_release(struct resource *res)
{
unsigned long pgoff, order;
@@ -356,6 +385,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
+ pgmap->type = MEMORY_DEVICE_HOST;
+ pgmap->page_fault = NULL;
+ pgmap->page_free = NULL;
+ pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
@@ -466,3 +499,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
#endif /* CONFIG_ZONE_DEVICE */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If refcount is 1 then page is freed and refcount is stable as nobody
+ * holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ page->mapping = NULL;
+ mem_cgroup_uncharge(page);
+
+ page->pgmap->page_free(page, page->pgmap->data);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_or_public_page);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
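
put_zone_device_private_or_public_page() above relies on ZONE_DEVICE pages keeping one extra reference while owned by the driver: when the count drops to 1 the page is handed back through the pgmap->page_free() callback instead of the page allocator. A driver-side sketch of wiring up that callback; my_page_free(), my_setup() and my_data are hypothetical, and only the dev_pagemap fields visible in this patch are touched:

	#include <linux/memremap.h>
	#include <linux/mm.h>

	static void my_page_free(struct page *page, void *data)
	{
		/*
		 * Called when the last reference other than the driver's own
		 * is gone; return the page to the device's private allocator.
		 */
	}

	static void my_setup(struct dev_pagemap *pgmap, void *my_data)
	{
		pgmap->page_free = my_page_free;
		pgmap->data = my_data;
		/* pgmap->type and pgmap->page_fault are assumed to be set elsewhere */
	}
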
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 84fe96641b2e..1250e4bd4b85 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void)
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
- pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
rcu_fanout_leaf, nr_cpu_ids);
/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 55bde94b9572..e012b9be777e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
- pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9e38df7649f4..0191ec7667c3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->rb_leftmost == &dl_se->rb_node;
+ return dl_rq->root.rb_leftmost == &dl_se->rb_node;
}
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
@@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b)
void init_dl_rq(struct dl_rq *dl_rq)
{
- dl_rq->rb_root = RB_ROOT;
+ dl_rq->root = RB_ROOT_CACHED;
#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
@@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq)
dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
- dl_rq->pushable_dl_tasks_root = RB_ROOT;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
@@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct task_struct *entry;
- int leftmost = 1;
+ bool leftmost = true;
BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
@@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
link = &parent->rb_left;
else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost) {
- dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ if (leftmost)
dl_rq->earliest_dl.next = p->dl.deadline;
- }
rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_insert_color_cached(&p->pushable_dl_tasks,
+ &dl_rq->pushable_dl_tasks_root, leftmost);
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
struct rb_node *next_node;
next_node = rb_next(&p->pushable_dl_tasks);
- dl_rq->pushable_dl_tasks_leftmost = next_node;
if (next_node) {
dl_rq->earliest_dl.next = rb_entry(next_node,
struct task_struct, pushable_dl_tasks)->dl.deadline;
}
}
- rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
}
static inline int has_pushable_dl_tasks(struct rq *rq)
{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
static int push_dl_task(struct rq *rq);
@@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
- struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
@@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node **link = &dl_rq->root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_dl_entity *entry;
int leftmost = 1;
@@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
}
}
- if (leftmost)
- dl_rq->rb_leftmost = &dl_se->rb_node;
-
rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
if (RB_EMPTY_NODE(&dl_se->rb_node))
return;
- if (dl_rq->rb_leftmost == &dl_se->rb_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&dl_se->rb_node);
- dl_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
@@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
struct dl_rq *dl_rq)
{
- struct rb_node *left = dl_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
@@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
if (!has_pushable_dl_tasks(rq))
@@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
struct task_struct, pushable_dl_tasks);
BUG_ON(rq->cpu != task_cpu(p));
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..8e536d963652 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -530,7 +530,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_lock_irqsave(&rq->lock, flags);
- if (cfs_rq->rb_leftmost)
+ if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8bc0a883d190..a5d83ed8dd82 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
if (!curr)
vruntime = se->vruntime;
@@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- int leftmost = 1;
+ bool leftmost = true;
/*
* Find the right place in the rbtree:
@@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
@@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
@@ -9312,7 +9299,7 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6ed7962dc896..746ac78ff492 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -426,8 +426,7 @@ struct cfs_rq {
u64 min_vruntime_copy;
#endif
- struct rb_root tasks_timeline;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
@@ -550,8 +549,7 @@ struct rt_rq {
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached root;
unsigned long dl_nr_running;
@@ -575,8 +573,7 @@ struct dl_rq {
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
- struct rb_root pushable_dl_tasks_root;
- struct rb_node *pushable_dl_tasks_leftmost;
+ struct rb_root_cached pushable_dl_tasks_root;
#else
struct dl_bw dl_bw;
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6f7b43982f73..5d0062cc10cb 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str)
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
return 0;
}
return 1;
diff --git a/kernel/smp.c b/kernel/smp.c
index 81cfca9b4cc3..c94dd85c8d41 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -550,7 +550,7 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
/* Setup number of possible processor ids */
-int nr_cpu_ids __read_mostly = NR_CPUS;
+unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ea4fb315719..2cafb49aa65e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
-#endif
+#endif /* CONFIG_NTP_PPS */
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d56123cdcc89..b8f1f54731af 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
- max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+ max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);
if (!register_trace_event(&graph_trace_entry_event)) {
pr_warn("Warning: could not register graph trace events\n");
diff --git a/kernel/umh.c b/kernel/umh.c
new file mode 100644
index 000000000000..6ff9905250ff
--- /dev/null
+++ b/kernel/umh.c
@@ -0,0 +1,568 @@
+/*
+ * umh - the kernel usermode helper
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/module.h>
+
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
+
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int call_usermodehelper_exec_async(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ struct cred *new;
+ int retval;
+
+ spin_lock_irq(&current->sighand->siglock);
+ flush_signal_handlers(current, 1);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto out;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info, new);
+ if (retval) {
+ abort_creds(new);
+ goto out;
+ }
+ }
+
+ commit_creds(new);
+
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+ * if UMH_WAIT_PROC.
+ */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
+ if (!retval)
+ return 0;
+ do_exit(0);
+}
+
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
+{
+ pid_t pid;
+
+ /* If SIGCLD is ignored sys_wait4 won't populate the status. */
+ kernel_sigaction(SIGCHLD, SIG_DFL);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ int ret = -ECHILD;
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But call_usermodehelper_exec_sync() always runs as kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
+ }
+
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
+ umh_complete(sub_info);
+}
+
+/*
+ * We need to create the usermodehelper kernel thread from a task that is affine
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit a widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow blocking on
+ * UMH_WAIT_PROC requests without blocking pending requests (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
+ *
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
+{
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
+
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
+ /*
+ * Use CLONE_PARENT to reparent it to kthreadd; we do not
+ * want to pollute current->children, and we need a parent
+ * that always ignores SIGCHLD to ensure auto-reaping.
+ */
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
+ }
+}
+
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
+ */
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_disable() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_disable() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+
+int usermodehelper_read_trylock(void)
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
+
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
+
+void usermodehelper_read_unlock(void)
+{
+ up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
+
+/**
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
+ */
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
+ */
+int __usermodehelper_disable(enum umh_disable_depth depth)
+{
+ long retval;
+
+ if (!depth)
+ return -EINVAL;
+
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ up_write(&umhelper_sem);
+
+ /*
+ * From now on call_usermodehelper_exec() won't start any new
+ * helpers, so it is sufficient if running_helpers turns out to
+ * be zero at one point (it may be increased later, but that
+ * doesn't matter).
+ */
+ retval = wait_event_timeout(running_helpers_waitq,
+ atomic_read(&running_helpers) == 0,
+ RUNNING_HELPERS_TIMEOUT);
+ if (retval)
+ return 0;
+
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ return -EAGAIN;
+}
+
+static void helper_lock(void)
+{
+ atomic_inc(&running_helpers);
+ smp_mb__after_atomic();
+}
+
+static void helper_unlock(void)
+{
+ if (atomic_dec_and_test(&running_helpers))
+ wake_up(&running_helpers_waitq);
+}
+
+/**
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure. This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is called just before the subprocess_info is about to
+ * be freed. This can be used for freeing the argv and envp. The
+ * function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+
+#ifdef CONFIG_STATIC_USERMODEHELPER
+ sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
+#else
+ sub_info->path = path;
+#endif
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocess
+ * @wait: wait for the application to finish and return status.
+ * with UMH_NO_WAIT, don't wait at all; you get no useful error back
+ * if the program couldn't be exec'ed, which makes it safe to call
+ * from interrupt context.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
+ */
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ int retval = 0;
+
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
+ helper_lock();
+ if (usermodehelper_disabled) {
+ retval = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * If there is no binary for us to call, then just return and get out of
+ * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
+ * disable all call_usermodehelper() calls.
+ */
+ if (strlen(sub_info->path) == 0)
+ goto out;
+
+ /*
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
+ */
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
+ sub_info->wait = wait;
+
+ queue_work(system_unbound_wq, &sub_info->work);
+ if (wait == UMH_NO_WAIT) /* task has freed sub_info */
+ goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
+ wait_for_completion(&done);
+wait_done:
+ retval = sub_info->retval;
+out:
+ call_usermodehelper_freeinfo(sub_info);
+unlock:
+ helper_unlock();
+ return retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_exec);
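A minimal sketch of a caller pairing the two functions above (the helper path, arguments and function name are made-up placeholders, not part of this patch):

	#include <linux/kmod.h>
	#include <linux/gfp.h>

	static int run_example_helper(void)
	{
		struct subprocess_info *info;
		char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
		char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

		info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
						 NULL, NULL, NULL);
		if (!info)
			return -ENOMEM;

		/* UMH_WAIT_EXEC: wait until the helper has been exec'ed (or failed to) */
		return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
	}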
+
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * with UMH_NO_WAIT, don't wait at all; you get no useful error back
+ * if the program couldn't be exec'ed, which makes it safe to call
+ * from interrupt context.
+ *
+ * This function is equivalent to calling call_usermodehelper_setup() followed
+ * by call_usermodehelper_exec().
+ */
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
+ if (info == NULL)
+ return -ENOMEM;
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * Actually read or write an array of ulongs from userspace. Remember that
+ * these are stored least significant 32 bits first.
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
diff --git a/lib/Kconfig b/lib/Kconfig
index 6762529ad9e4..40b114a11d7c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -575,4 +575,7 @@ config PARMAN
config PRIME_NUMBERS
tristate
+config STRING_SELFTEST
+ bool "Test string functions"
+
endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7396f5044397..b19c491cbc4e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1930,6 +1930,17 @@ config TEST_KMOD
If unsure, say N.
+config TEST_DEBUG_VIRTUAL
+ tristate "Test CONFIG_DEBUG_VIRTUAL feature"
+ depends on DEBUG_VIRTUAL
+ help
+ Test the kernel's ability to detect incorrect calls to
+ virt_to_phys() done against the non-linear part of the
+ kernel's virtual address map.
+
+ If unsure, say N.
+
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
diff --git a/lib/Makefile b/lib/Makefile
index 40c18372b301..469ce5e24e4f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
obj-$(CONFIG_TEST_UUID) += test_uuid.o
obj-$(CONFIG_TEST_PARMAN) += test_parman.o
obj-$(CONFIG_TEST_KMOD) += test_kmod.o
+obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 9a532805364b..c82c61b66e16 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -513,7 +513,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
int nmaskbits)
{
unsigned int a, b, old_a, old_b;
- unsigned int group_size, used_size;
+ unsigned int group_size, used_size, off;
int c, old_c, totaldigits, ndigits;
const char __user __force *ubuf = (const char __user __force *)buf;
int at_start, in_range, in_partial_range;
@@ -599,6 +599,8 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
a = old_a;
b = old_b;
old_a = old_b = 0;
+ } else {
+ used_size = group_size = b - a + 1;
}
/* if no digit is after '-', it's wrong*/
if (at_start && in_range)
@@ -608,17 +610,9 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
if (b >= nmaskbits)
return -ERANGE;
while (a <= b) {
- if (in_partial_range) {
- static int pos_in_group = 1;
-
- if (pos_in_group <= used_size)
- set_bit(a, maskp);
-
- if (a == b || ++pos_in_group > group_size)
- pos_in_group = 1;
- } else
- set_bit(a, maskp);
- a++;
+ off = min(b - a + 1, used_size);
+ bitmap_set(maskp, a, off);
+ a += group_size;
}
} while (buflen && c == ',');
return 0;
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 4c0888c4a68d..171c19b6888e 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -244,5 +244,4 @@ char *next_arg(char *args, char **param, char **val)
/* Chew up trailing spaces. */
return skip_spaces(next);
- //return next;
}
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 4731a0895760..8b1a1bd77539 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -6,6 +6,22 @@
#include <linux/bootmem.h>
/**
+ * cpumask_next - get the next cpu in a cpumask
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set.
+ */
+unsigned int cpumask_next(int n, const struct cpumask *srcp)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1)
+ cpumask_check(n);
+ return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
+}
+EXPORT_SYMBOL(cpumask_next);
+
+/**
* cpumask_next_and - get the next cpu in *src1p & *src2p
* @n: the cpu prior to the place to search (ie. return will be > @n)
* @src1p: the first cpumask pointer
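As an illustrative sketch (not part of the patch), cpumask_next() walks the set bits of a mask when used in a loop like the one below, which is essentially what the for_each_cpu() helpers expand to; 'mask' is assumed to be a valid const struct cpumask *:

	unsigned int cpu;

	for (cpu = cpumask_next(-1, mask); cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, mask))
		pr_info("cpu %u is set\n", cpu);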
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 992457b1284c..81b70ed37209 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/ctype.h>
+#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <asm/unaligned.h>
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(hex_to_bin);
* @src: ascii hexadecimal string
* @count: result length
*
- * Return 0 on success, -1 in case of bad input.
+ * Return 0 on success, -EINVAL in case of bad input.
*/
int hex2bin(u8 *dst, const char *src, size_t count)
{
@@ -51,7 +52,7 @@ int hex2bin(u8 *dst, const char *src, size_t count)
int lo = hex_to_bin(*src++);
if ((hi < 0) || (lo < 0))
- return -1;
+ return -EINVAL;
*dst++ = (hi << 4) | lo;
}
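With the new convention, a caller can propagate the error code directly; a small hypothetical sketch (parse_mac is made up):

	#include <linux/kernel.h>

	static int parse_mac(const char *hex, u8 mac[6])
	{
		/* returns 0 on success, now -EINVAL (was -1) on a bad hex digit */
		return hex2bin(mac, hex, 6);
	}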
diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c
index df495fe81421..0e343fd29570 100644
--- a/lib/interval_tree_test.c
+++ b/lib/interval_tree_test.c
@@ -19,14 +19,14 @@ __param(bool, search_all, false, "Searches will iterate all nodes in the tree");
__param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint");
-static struct rb_root root = RB_ROOT;
+static struct rb_root_cached root = RB_ROOT_CACHED;
static struct interval_tree_node *nodes = NULL;
static u32 *queries = NULL;
static struct rnd_state rnd;
static inline unsigned long
-search(struct rb_root *root, unsigned long start, unsigned long last)
+search(struct rb_root_cached *root, unsigned long start, unsigned long last)
{
struct interval_tree_node *node;
unsigned long results = 0;
diff --git a/lib/oid_registry.c b/lib/oid_registry.c
index 318f382a010d..41b9e50711a7 100644
--- a/lib/oid_registry.c
+++ b/lib/oid_registry.c
@@ -142,9 +142,9 @@ int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize)
}
ret += count = snprintf(buffer, bufsize, ".%lu", num);
buffer += count;
- bufsize -= count;
- if (bufsize == 0)
+ if (bufsize <= count)
return -ENOBUFS;
+ bufsize -= count;
}
return ret;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9717e2a50374..8b1feca1230a 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -463,7 +463,7 @@ radix_tree_node_free(struct radix_tree_node *node)
* To make use of this facility, the radix tree must be initialised without
* __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
*/
-static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
+static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
@@ -2104,7 +2104,8 @@ EXPORT_SYMBOL(radix_tree_tagged);
*/
void idr_preload(gfp_t gfp_mask)
{
- __radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE);
+ if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
+ preempt_disable();
}
EXPORT_SYMBOL(idr_preload);
@@ -2118,13 +2119,13 @@ EXPORT_SYMBOL(idr_preload);
*/
int ida_pre_get(struct ida *ida, gfp_t gfp)
{
- __radix_tree_preload(gfp, IDA_PRELOAD_SIZE);
/*
* The IDA API has no preload_end() equivalent. Instead,
* ida_get_new() can return -EAGAIN, prompting the caller
* to return to the ida_pre_get() step.
*/
- preempt_enable();
+ if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
+ preempt_enable();
if (!this_cpu_read(ida_bitmap)) {
struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 4ba2828a67c0..ba4a9d165f1b 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -95,22 +95,35 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
+ bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+ if (newleft)
+ *leftmost = node;
+
while (true) {
/*
- * Loop invariant: node is red
- *
- * If there is a black parent, we are done.
- * Otherwise, take some corrective action as we don't
- * want a red root or two consecutive red nodes.
+ * Loop invariant: node is red.
*/
- if (!parent) {
+ if (unlikely(!parent)) {
+ /*
+ * The inserted node is root. Either this is the
+ * first node, or we recursed at Case 1 below and
+ * are no longer violating 4).
+ */
rb_set_parent_color(node, NULL, RB_BLACK);
break;
- } else if (rb_is_black(parent))
+ }
+
+ /*
+ * If there is a black parent, we are done.
+ * Otherwise, take some corrective action as,
+ * per 4), we don't want a red root or two
+ * consecutive red nodes.
+ */
+ if (rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
@@ -119,7 +132,7 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
- * Case 1 - color flips
+ * Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
@@ -142,7 +155,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
tmp = parent->rb_right;
if (node == tmp) {
/*
- * Case 2 - left rotate at parent
+ * Case 2 - node's uncle is black and node is
+ * the parent's right child (left rotate at parent).
*
* G G
* / \ / \
@@ -166,7 +180,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
}
/*
- * Case 3 - right rotate at gparent
+ * Case 3 - node's uncle is black and node is
+ * the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
@@ -434,19 +449,38 @@ static const struct rb_augment_callbacks dummy_callbacks = {
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
- __rb_insert(node, root, dummy_rotate);
+ __rb_insert(node, root, false, NULL, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
- rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
+ rebalance = __rb_erase_augmented(node, root,
+ NULL, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
+void rb_insert_color_cached(struct rb_node *node,
+ struct rb_root_cached *root, bool leftmost)
+{
+ __rb_insert(node, &root->rb_root, leftmost,
+ &root->rb_leftmost, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_insert_color_cached);
+
+void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
+{
+ struct rb_node *rebalance;
+ rebalance = __rb_erase_augmented(node, &root->rb_root,
+ &root->rb_leftmost, &dummy_callbacks);
+ if (rebalance)
+ ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_erase_cached);
+
/*
* Augmented rbtree manipulation functions.
*
@@ -455,9 +489,10 @@ EXPORT_SYMBOL(rb_erase);
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+ bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
- __rb_insert(node, root, augment_rotate);
+ __rb_insert(node, root, newleft, leftmost, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
@@ -502,7 +537,7 @@ struct rb_node *rb_next(const struct rb_node *node)
* as we can.
*/
if (node->rb_right) {
- node = node->rb_right;
+ node = node->rb_right;
while (node->rb_left)
node=node->rb_left;
return (struct rb_node *)node;
@@ -534,7 +569,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
* as we can.
*/
if (node->rb_left) {
- node = node->rb_left;
+ node = node->rb_left;
while (node->rb_right)
node=node->rb_right;
return (struct rb_node *)node;
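For orientation, a short hypothetical sketch of the new cached API: the tree is assumed to have been populated with rb_link_node() followed by rb_insert_color_cached(node, &tree, leftmost), the same pattern exercised by insert_cached() in lib/rbtree_test.c further down.

	#include <linux/rbtree.h>

	static struct rb_root_cached tree = RB_ROOT_CACHED;

	static void drop_minimum(void)
	{
		/* O(1): rb_first_cached() just reads tree.rb_leftmost */
		struct rb_node *min = rb_first_cached(&tree);

		if (min)
			rb_erase_cached(min, &tree);	/* keeps rb_leftmost up to date */
	}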
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 8b3c9dc88262..191a238e5a9d 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -1,11 +1,18 @@
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/rbtree_augmented.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <asm/timex.h>
-#define NODES 100
-#define PERF_LOOPS 100000
-#define CHECK_LOOPS 100
+#define __param(type, name, init, msg) \
+ static type name = init; \
+ module_param(name, type, 0444); \
+ MODULE_PARM_DESC(name, msg);
+
+__param(int, nnodes, 100, "Number of nodes in the rb-tree");
+__param(int, perf_loops, 100000, "Number of iterations modifying the rb-tree");
+__param(int, check_loops, 100, "Number of iterations modifying and verifying the rb-tree");
struct test_node {
u32 key;
@@ -16,14 +23,14 @@ struct test_node {
u32 augmented;
};
-static struct rb_root root = RB_ROOT;
-static struct test_node nodes[NODES];
+static struct rb_root_cached root = RB_ROOT_CACHED;
+static struct test_node *nodes = NULL;
static struct rnd_state rnd;
-static void insert(struct test_node *node, struct rb_root *root)
+static void insert(struct test_node *node, struct rb_root_cached *root)
{
- struct rb_node **new = &root->rb_node, *parent = NULL;
+ struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
u32 key = node->key;
while (*new) {
@@ -35,14 +42,40 @@ static void insert(struct test_node *node, struct rb_root *root)
}
rb_link_node(&node->rb, parent, new);
- rb_insert_color(&node->rb, root);
+ rb_insert_color(&node->rb, &root->rb_root);
+}
+
+static void insert_cached(struct test_node *node, struct rb_root_cached *root)
+{
+ struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+ u32 key = node->key;
+ bool leftmost = true;
+
+ while (*new) {
+ parent = *new;
+ if (key < rb_entry(parent, struct test_node, rb)->key)
+ new = &parent->rb_left;
+ else {
+ new = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&node->rb, parent, new);
+ rb_insert_color_cached(&node->rb, root, leftmost);
}
-static inline void erase(struct test_node *node, struct rb_root *root)
+static inline void erase(struct test_node *node, struct rb_root_cached *root)
{
- rb_erase(&node->rb, root);
+ rb_erase(&node->rb, &root->rb_root);
}
+static inline void erase_cached(struct test_node *node, struct rb_root_cached *root)
+{
+ rb_erase_cached(&node->rb, root);
+}
+
+
static inline u32 augment_recompute(struct test_node *node)
{
u32 max = node->val, child_augmented;
@@ -64,9 +97,10 @@ static inline u32 augment_recompute(struct test_node *node)
RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb,
u32, augmented, augment_recompute)
-static void insert_augmented(struct test_node *node, struct rb_root *root)
+static void insert_augmented(struct test_node *node,
+ struct rb_root_cached *root)
{
- struct rb_node **new = &root->rb_node, *rb_parent = NULL;
+ struct rb_node **new = &root->rb_root.rb_node, *rb_parent = NULL;
u32 key = node->key;
u32 val = node->val;
struct test_node *parent;
@@ -84,18 +118,53 @@ static void insert_augmented(struct test_node *node, struct rb_root *root)
node->augmented = val;
rb_link_node(&node->rb, rb_parent, new);
- rb_insert_augmented(&node->rb, root, &augment_callbacks);
+ rb_insert_augmented(&node->rb, &root->rb_root, &augment_callbacks);
+}
+
+static void insert_augmented_cached(struct test_node *node,
+ struct rb_root_cached *root)
+{
+ struct rb_node **new = &root->rb_root.rb_node, *rb_parent = NULL;
+ u32 key = node->key;
+ u32 val = node->val;
+ struct test_node *parent;
+ bool leftmost = true;
+
+ while (*new) {
+ rb_parent = *new;
+ parent = rb_entry(rb_parent, struct test_node, rb);
+ if (parent->augmented < val)
+ parent->augmented = val;
+ if (key < parent->key)
+ new = &parent->rb.rb_left;
+ else {
+ new = &parent->rb.rb_right;
+ leftmost = false;
+ }
+ }
+
+ node->augmented = val;
+ rb_link_node(&node->rb, rb_parent, new);
+ rb_insert_augmented_cached(&node->rb, root,
+ leftmost, &augment_callbacks);
+}
+
+
+static void erase_augmented(struct test_node *node, struct rb_root_cached *root)
+{
+ rb_erase_augmented(&node->rb, &root->rb_root, &augment_callbacks);
}
-static void erase_augmented(struct test_node *node, struct rb_root *root)
+static void erase_augmented_cached(struct test_node *node,
+ struct rb_root_cached *root)
{
- rb_erase_augmented(&node->rb, root, &augment_callbacks);
+ rb_erase_augmented_cached(&node->rb, root, &augment_callbacks);
}
static void init(void)
{
int i;
- for (i = 0; i < NODES; i++) {
+ for (i = 0; i < nnodes; i++) {
nodes[i].key = prandom_u32_state(&rnd);
nodes[i].val = prandom_u32_state(&rnd);
}
@@ -118,7 +187,7 @@ static void check_postorder_foreach(int nr_nodes)
{
struct test_node *cur, *n;
int count = 0;
- rbtree_postorder_for_each_entry_safe(cur, n, &root, rb)
+ rbtree_postorder_for_each_entry_safe(cur, n, &root.rb_root, rb)
count++;
WARN_ON_ONCE(count != nr_nodes);
@@ -128,7 +197,7 @@ static void check_postorder(int nr_nodes)
{
struct rb_node *rb;
int count = 0;
- for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb))
+ for (rb = rb_first_postorder(&root.rb_root); rb; rb = rb_next_postorder(rb))
count++;
WARN_ON_ONCE(count != nr_nodes);
@@ -140,7 +209,7 @@ static void check(int nr_nodes)
int count = 0, blacks = 0;
u32 prev_key = 0;
- for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
+ for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) {
struct test_node *node = rb_entry(rb, struct test_node, rb);
WARN_ON_ONCE(node->key < prev_key);
WARN_ON_ONCE(is_red(rb) &&
@@ -155,7 +224,7 @@ static void check(int nr_nodes)
}
WARN_ON_ONCE(count != nr_nodes);
- WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1);
+ WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root.rb_root))) - 1);
check_postorder(nr_nodes);
check_postorder_foreach(nr_nodes);
@@ -166,7 +235,7 @@ static void check_augmented(int nr_nodes)
struct rb_node *rb;
check(nr_nodes);
- for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
+ for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) {
struct test_node *node = rb_entry(rb, struct test_node, rb);
WARN_ON_ONCE(node->augmented != augment_recompute(node));
}
@@ -176,6 +245,11 @@ static int __init rbtree_test_init(void)
{
int i, j;
cycles_t time1, time2, time;
+ struct rb_node *node;
+
+ nodes = kmalloc(nnodes * sizeof(*nodes), GFP_KERNEL);
+ if (!nodes)
+ return -ENOMEM;
printk(KERN_ALERT "rbtree testing");
@@ -184,27 +258,88 @@ static int __init rbtree_test_init(void)
time1 = get_cycles();
- for (i = 0; i < PERF_LOOPS; i++) {
- for (j = 0; j < NODES; j++)
+ for (i = 0; i < perf_loops; i++) {
+ for (j = 0; j < nnodes; j++)
insert(nodes + j, &root);
- for (j = 0; j < NODES; j++)
+ for (j = 0; j < nnodes; j++)
erase(nodes + j, &root);
}
time2 = get_cycles();
time = time2 - time1;
- time = div_u64(time, PERF_LOOPS);
- printk(" -> %llu cycles\n", (unsigned long long)time);
+ time = div_u64(time, perf_loops);
+ printk(" -> test 1 (latency of nnodes insert+delete): %llu cycles\n",
+ (unsigned long long)time);
+
+ time1 = get_cycles();
+
+ for (i = 0; i < perf_loops; i++) {
+ for (j = 0; j < nnodes; j++)
+ insert_cached(nodes + j, &root);
+ for (j = 0; j < nnodes; j++)
+ erase_cached(nodes + j, &root);
+ }
+
+ time2 = get_cycles();
+ time = time2 - time1;
+
+ time = div_u64(time, perf_loops);
+ printk(" -> test 2 (latency of nnodes cached insert+delete): %llu cycles\n",
+ (unsigned long long)time);
+
+ for (i = 0; i < nnodes; i++)
+ insert(nodes + i, &root);
+
+ time1 = get_cycles();
+
+ for (i = 0; i < perf_loops; i++) {
+ for (node = rb_first(&root.rb_root); node; node = rb_next(node))
+ ;
+ }
+
+ time2 = get_cycles();
+ time = time2 - time1;
+
+ time = div_u64(time, perf_loops);
+ printk(" -> test 3 (latency of inorder traversal): %llu cycles\n",
+ (unsigned long long)time);
+
+ time1 = get_cycles();
+
+ for (i = 0; i < perf_loops; i++)
+ node = rb_first(&root.rb_root);
+
+ time2 = get_cycles();
+ time = time2 - time1;
+
+ time = div_u64(time, perf_loops);
+ printk(" -> test 4 (latency to fetch first node)\n");
+ printk(" non-cached: %llu cycles\n", (unsigned long long)time);
+
+ time1 = get_cycles();
+
+ for (i = 0; i < perf_loops; i++)
+ node = rb_first_cached(&root);
+
+ time2 = get_cycles();
+ time = time2 - time1;
+
+ time = div_u64(time, perf_loops);
+ printk(" cached: %llu cycles\n", (unsigned long long)time);
- for (i = 0; i < CHECK_LOOPS; i++) {
+ for (i = 0; i < nnodes; i++)
+ erase(nodes + i, &root);
+
+ /* run checks */
+ for (i = 0; i < check_loops; i++) {
init();
- for (j = 0; j < NODES; j++) {
+ for (j = 0; j < nnodes; j++) {
check(j);
insert(nodes + j, &root);
}
- for (j = 0; j < NODES; j++) {
- check(NODES - j);
+ for (j = 0; j < nnodes; j++) {
+ check(nnodes - j);
erase(nodes + j, &root);
}
check(0);
@@ -216,32 +351,49 @@ static int __init rbtree_test_init(void)
time1 = get_cycles();
- for (i = 0; i < PERF_LOOPS; i++) {
- for (j = 0; j < NODES; j++)
+ for (i = 0; i < perf_loops; i++) {
+ for (j = 0; j < nnodes; j++)
insert_augmented(nodes + j, &root);
- for (j = 0; j < NODES; j++)
+ for (j = 0; j < nnodes; j++)
erase_augmented(nodes + j, &root);
}
time2 = get_cycles();
time = time2 - time1;
- time = div_u64(time, PERF_LOOPS);
- printk(" -> %llu cycles\n", (unsigned long long)time);
+ time = div_u64(time, perf_loops);
+ printk(" -> test 1 (latency of nnodes insert+delete): %llu cycles\n", (unsigned long long)time);
+
+ time1 = get_cycles();
+
+ for (i = 0; i < perf_loops; i++) {
+ for (j = 0; j < nnodes; j++)
+ insert_augmented_cached(nodes + j, &root);
+ for (j = 0; j < nnodes; j++)
+ erase_augmented_cached(nodes + j, &root);
+ }
+
+ time2 = get_cycles();
+ time = time2 - time1;
+
+ time = div_u64(time, perf_loops);
+ printk(" -> test 2 (latency of nnodes cached insert+delete): %llu cycles\n", (unsigned long long)time);
- for (i = 0; i < CHECK_LOOPS; i++) {
+ for (i = 0; i < check_loops; i++) {
init();
- for (j = 0; j < NODES; j++) {
+ for (j = 0; j < nnodes; j++) {
check_augmented(j);
insert_augmented(nodes + j, &root);
}
- for (j = 0; j < NODES; j++) {
- check_augmented(NODES - j);
+ for (j = 0; j < nnodes; j++) {
+ check_augmented(nnodes - j);
erase_augmented(nodes + j, &root);
}
check_augmented(0);
}
+ kfree(nodes);
+
return -EAGAIN; /* Fail will directly unload the module */
}
diff --git a/lib/string.c b/lib/string.c
index ebbb99c775bd..9921dc202db4 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count)
}
EXPORT_SYMBOL(memzero_explicit);
+#ifndef __HAVE_ARCH_MEMSET16
+/**
+ * memset16() - Fill a memory area with a uint16_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint16_t instead
+ * of a byte. Remember that @count is the number of uint16_ts to
+ * store, not the number of bytes.
+ */
+void *memset16(uint16_t *s, uint16_t v, size_t count)
+{
+ uint16_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+EXPORT_SYMBOL(memset16);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+/**
+ * memset32() - Fill a memory area with a uint32_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint32_t instead
+ * of a byte. Remember that @count is the number of uint32_ts to
+ * store, not the number of bytes.
+ */
+void *memset32(uint32_t *s, uint32_t v, size_t count)
+{
+ uint32_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+EXPORT_SYMBOL(memset32);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+/**
+ * memset64() - Fill a memory area with a uint64_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint64_t instead
+ * of a byte. Remember that @count is the number of uint64_ts to
+ * store, not the number of bytes.
+ */
+void *memset64(uint64_t *s, uint64_t v, size_t count)
+{
+ uint64_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+EXPORT_SYMBOL(memset64);
+#endif
+
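As a hedged usage sketch (the function and its parameters are made up), these helpers suit any buffer of fixed-width values that must be filled with a repeating pattern, e.g. painting one scanline of a 32bpp framebuffer:

	#include <linux/string.h>

	static void fill_scanline(u32 *line, u32 pixel, size_t pixels)
	{
		/* note: @pixels counts u32 values, not bytes */
		memset32(line, pixel, pixels);
	}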
#ifndef __HAVE_ARCH_MEMCPY
/**
* memcpy - Copy one area of memory to another
@@ -985,3 +1051,144 @@ void fortify_panic(const char *name)
BUG();
}
EXPORT_SYMBOL(fortify_panic);
+
+#ifdef CONFIG_STRING_SELFTEST
+#include <linux/slab.h>
+#include <linux/module.h>
+
+static __init int memset16_selftest(void)
+{
+ unsigned i, j, k;
+ u16 v, *p;
+
+ p = kmalloc(256 * 2 * 2, GFP_KERNEL);
+ if (!p)
+ return -1;
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < 256; j++) {
+ memset(p, 0xa1, 256 * 2 * sizeof(v));
+ memset16(p + i, 0xb1b2, j);
+ for (k = 0; k < 512; k++) {
+ v = p[k];
+ if (k < i) {
+ if (v != 0xa1a1)
+ goto fail;
+ } else if (k < i + j) {
+ if (v != 0xb1b2)
+ goto fail;
+ } else {
+ if (v != 0xa1a1)
+ goto fail;
+ }
+ }
+ }
+ }
+
+fail:
+ kfree(p);
+ if (i < 256)
+ return (i << 24) | (j << 16) | k;
+ return 0;
+}
+
+static __init int memset32_selftest(void)
+{
+ unsigned i, j, k;
+ u32 v, *p;
+
+ p = kmalloc(256 * 2 * 4, GFP_KERNEL);
+ if (!p)
+ return -1;
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < 256; j++) {
+ memset(p, 0xa1, 256 * 2 * sizeof(v));
+ memset32(p + i, 0xb1b2b3b4, j);
+ for (k = 0; k < 512; k++) {
+ v = p[k];
+ if (k < i) {
+ if (v != 0xa1a1a1a1)
+ goto fail;
+ } else if (k < i + j) {
+ if (v != 0xb1b2b3b4)
+ goto fail;
+ } else {
+ if (v != 0xa1a1a1a1)
+ goto fail;
+ }
+ }
+ }
+ }
+
+fail:
+ kfree(p);
+ if (i < 256)
+ return (i << 24) | (j << 16) | k;
+ return 0;
+}
+
+static __init int memset64_selftest(void)
+{
+ unsigned i, j, k;
+ u64 v, *p;
+
+ p = kmalloc(256 * 2 * 8, GFP_KERNEL);
+ if (!p)
+ return -1;
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < 256; j++) {
+ memset(p, 0xa1, 256 * 2 * sizeof(v));
+ memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j);
+ for (k = 0; k < 512; k++) {
+ v = p[k];
+ if (k < i) {
+ if (v != 0xa1a1a1a1a1a1a1a1ULL)
+ goto fail;
+ } else if (k < i + j) {
+ if (v != 0xb1b2b3b4b5b6b7b8ULL)
+ goto fail;
+ } else {
+ if (v != 0xa1a1a1a1a1a1a1a1ULL)
+ goto fail;
+ }
+ }
+ }
+ }
+
+fail:
+ kfree(p);
+ if (i < 256)
+ return (i << 24) | (j << 16) | k;
+ return 0;
+}
+
+static __init int string_selftest_init(void)
+{
+ int test, subtest;
+
+ test = 1;
+ subtest = memset16_selftest();
+ if (subtest)
+ goto fail;
+
+ test = 2;
+ subtest = memset32_selftest();
+ if (subtest)
+ goto fail;
+
+ test = 3;
+ subtest = memset64_selftest();
+ if (subtest)
+ goto fail;
+
+ pr_info("String selftests succeeded\n");
+ return 0;
+fail:
+ pr_crit("String selftest failure %d.%08x\n", test, subtest);
+ return 0;
+}
+
+module_init(string_selftest_init);
+#endif /* CONFIG_STRING_SELFTEST */
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 2526a2975c51..599c6713f2a2 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -165,6 +165,96 @@ static void __init test_zero_fill_copy(void)
expect_eq_pbl("128-1023", bmap2, 1024);
}
+#define PARSE_TIME 0x1
+
+struct test_bitmap_parselist {
+ const int errno;
+ const char *in;
+ const unsigned long *expected;
+ const int nbits;
+ const int flags;
+};
+
+static const unsigned long exp[] __initconst = {
+ BITMAP_FROM_U64(1),
+ BITMAP_FROM_U64(2),
+ BITMAP_FROM_U64(0x0000ffff),
+ BITMAP_FROM_U64(0xffff0000),
+ BITMAP_FROM_U64(0x55555555),
+ BITMAP_FROM_U64(0xaaaaaaaa),
+ BITMAP_FROM_U64(0x11111111),
+ BITMAP_FROM_U64(0x22222222),
+ BITMAP_FROM_U64(0xffffffff),
+ BITMAP_FROM_U64(0xfffffffe),
+ BITMAP_FROM_U64(0x3333333311111111),
+ BITMAP_FROM_U64(0xffffffff77777777)
+};
+
+static const unsigned long exp2[] __initconst = {
+ BITMAP_FROM_U64(0x3333333311111111),
+ BITMAP_FROM_U64(0xffffffff77777777)
+};
+
+static const struct test_bitmap_parselist parselist_tests[] __initconst = {
+#define step (sizeof(u64) / sizeof(unsigned long))
+
+ {0, "0", &exp[0], 8, 0},
+ {0, "1", &exp[1 * step], 8, 0},
+ {0, "0-15", &exp[2 * step], 32, 0},
+ {0, "16-31", &exp[3 * step], 32, 0},
+ {0, "0-31:1/2", &exp[4 * step], 32, 0},
+ {0, "1-31:1/2", &exp[5 * step], 32, 0},
+ {0, "0-31:1/4", &exp[6 * step], 32, 0},
+ {0, "1-31:1/4", &exp[7 * step], 32, 0},
+ {0, "0-31:4/4", &exp[8 * step], 32, 0},
+ {0, "1-31:4/4", &exp[9 * step], 32, 0},
+ {0, "0-31:1/4,32-63:2/4", &exp[10 * step], 64, 0},
+ {0, "0-31:3/4,32-63:4/4", &exp[11 * step], 64, 0},
+
+ {0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4", exp2, 128, 0},
+
+ {0, "0-2047:128/256", NULL, 2048, PARSE_TIME},
+
+ {-EINVAL, "-1", NULL, 8, 0},
+ {-EINVAL, "-0", NULL, 8, 0},
+ {-EINVAL, "10-1", NULL, 8, 0},
+ {-EINVAL, "0-31:10/1", NULL, 8, 0},
+};
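For reference, the start-end:used/group syntax exercised above sets the first 'used' bits in every 'group'-sized chunk of the range; a minimal caller sketch (outside this test, function name made up) would be:

	#include <linux/bitmap.h>

	static int parselist_example(void)
	{
		DECLARE_BITMAP(mask, 32);

		/* one bit in every group of four: bits 0, 4, 8, ..., 28 -> 0x11111111 */
		return bitmap_parselist("0-31:1/4", mask, 32);
	}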
+
+static void __init test_bitmap_parselist(void)
+{
+ int i;
+ int err;
+ cycles_t cycles;
+ DECLARE_BITMAP(bmap, 2048);
+
+ for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) {
+#define ptest parselist_tests[i]
+
+ cycles = get_cycles();
+ err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
+ cycles = get_cycles() - cycles;
+
+ if (err != ptest.errno) {
+ pr_err("test %d: input is %s, errno is %d, expected %d\n",
+ i, ptest.in, err, ptest.errno);
+ continue;
+ }
+
+ if (!err && ptest.expected
+ && !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) {
+ pr_err("test %d: input is %s, result is 0x%lx, expected 0x%lx\n",
+ i, ptest.in, bmap[0], *ptest.expected);
+ continue;
+ }
+
+ if (ptest.flags & PARSE_TIME)
+ pr_err("test %d: input is '%s' OK, Time: %llu\n",
+ i, ptest.in,
+ (unsigned long long)cycles);
+ }
+}
+
static void __init test_bitmap_u32_array_conversions(void)
{
DECLARE_BITMAP(bmap1, 1024);
@@ -365,6 +455,7 @@ static int __init test_bitmap_init(void)
{
test_zero_fill_copy();
test_bitmap_u32_array_conversions();
+ test_bitmap_parselist();
test_mem_optimisations();
if (failed_tests == 0)
diff --git a/lib/test_debug_virtual.c b/lib/test_debug_virtual.c
new file mode 100644
index 000000000000..b9cdeecc19dc
--- /dev/null
+++ b/lib/test_debug_virtual.c
@@ -0,0 +1,49 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
+
+#include <asm/page.h>
+#ifdef CONFIG_MIPS
+#include <asm/bootinfo.h>
+#endif
+
+struct foo {
+ unsigned int bar;
+};
+
+struct foo *foo;
+
+static int __init test_debug_virtual_init(void)
+{
+ phys_addr_t pa;
+ void *va;
+
+ va = (void *)VMALLOC_START;
+ pa = virt_to_phys(va);
+
+ pr_info("PA: %pa for VA: 0x%lx\n", &pa, (unsigned long)va);
+
+ foo = kzalloc(sizeof(*foo), GFP_KERNEL);
+ if (!foo)
+ return -ENOMEM;
+
+ pa = virt_to_phys(foo);
+ va = foo;
+ pr_info("PA: %pa for VA: 0x%lx\n", &pa, (unsigned long)va);
+
+ return 0;
+}
+module_init(test_debug_virtual_init);
+
+static void __exit test_debug_virtual_exit(void)
+{
+ kfree(foo);
+}
+module_exit(test_debug_virtual_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Test module for CONFIG_DEBUG_VIRTUAL");
diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index ff9148969b92..fba78d25e825 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -924,7 +924,7 @@ static int test_dev_config_update_uint_range(struct kmod_test_device *test_dev,
if (ret)
return ret;
- if (new < min || new > max || new > UINT_MAX)
+ if (new < min || new > max)
return -EINVAL;
mutex_lock(&test_dev->config_mutex);
@@ -946,7 +946,7 @@ static int test_dev_config_update_int(struct kmod_test_device *test_dev,
if (ret)
return ret;
- if (new > INT_MAX || new < INT_MIN)
+ if (new < INT_MIN || new > INT_MAX)
return -EINVAL;
mutex_lock(&test_dev->config_mutex);
diff --git a/mm/Kconfig b/mm/Kconfig
index 0ded10a22639..9c4bdddd80c2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -262,6 +262,9 @@ config MIGRATION
config ARCH_ENABLE_HUGEPAGE_MIGRATION
bool
+config ARCH_ENABLE_THP_MIGRATION
+ bool
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
@@ -673,7 +676,7 @@ config ARCH_HAS_ZONE_DEVICE
bool
config ZONE_DEVICE
- bool "Device memory (pmem, etc...) hotplug support"
+ bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
@@ -689,6 +692,55 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config ARCH_HAS_HMM
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on ZONE_DEVICE
+ depends on MMU && 64BIT
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+
+config MIGRATE_VMA_HELPER
+ bool
+
+config HMM
+ bool
+ select MIGRATE_VMA_HELPER
+
+config HMM_MIRROR
+ bool "HMM mirror CPU page table into a device page table"
+ depends on ARCH_HAS_HMM
+ select MMU_NOTIFIER
+ select HMM
+ help
+ Select HMM_MIRROR if you want to mirror a range of the CPU page table of a
+ process into a device page table. Here, mirror means "keep synchronized".
+ Prerequisites: the device must provide the ability to write-protect its
+ page tables (at PAGE_SIZE granularity), and must be able to recover from
+ the resulting potential page faults.
+
+config DEVICE_PRIVATE
+ bool "Unaddressable device memory (GPU memory, ...)"
+ depends on ARCH_HAS_HMM
+ select HMM
+
+ help
+ Allows creation of struct pages to represent unaddressable device
+ memory; i.e., memory that is only accessible from the device (or
+ group of devices). You likely also want to select HMM_MIRROR.
+
+config DEVICE_PUBLIC
+ bool "Addressable device memory (like GPU memory)"
+ depends on ARCH_HAS_HMM
+ select HMM
+
+ help
+ Allows creation of struct pages to represent addressable device
+ memory; i.e., memory that is accessible from both the device and
+ the CPU.
+
config FRAME_VECTOR
bool
diff --git a/mm/Makefile b/mm/Makefile
index 411bd24d4a7c..e3ac3aeb533b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
+obj-$(CONFIG_HMM) += hmm.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index b06d9fe23a28..68d28924ba79 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping,
{
struct balloon_dev_info *balloon = balloon_page_device(page);
+ /*
+ * We cannot easily support the no-copy case here, so ignore it, as it is
+ * unlikely to be used with balloon pages. See include/linux/hmm.h for
+ * users of the MIGRATE_SYNC_NO_COPY mode.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a43013112581..702f239cd6db 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -52,7 +52,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
goto out;
}
- if (IS_DAX(inode)) {
+ bdi = inode_to_bdi(mapping->host);
+
+ if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
@@ -75,8 +77,6 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
else
endbyte--; /* inclusive */
- bdi = inode_to_bdi(mapping->host);
-
switch (advice) {
case POSIX_FADV_NORMAL:
f.file->f_ra.ra_pages = bdi->ra_pages;
diff --git a/mm/gup.c b/mm/gup.c
index 33d651deeae2..b2b4d4263768 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
+retry:
+ if (!pmd_present(*pmd)) {
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(*pmd));
+ if (is_pmd_migration_entry(*pmd))
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry;
+ }
if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
@@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
+retry_locked:
ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_present(*pmd))) {
+ spin_unlock(ptl);
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry_locked;
+ }
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags);
@@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pud = pud_offset(p4d, address);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
+ if (!pmd_present(*pmd))
return -EFAULT;
VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, address);
@@ -438,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
+
+ /*
+ * This should never happen (a device public page in the gate
+ * area).
+ */
+ if (is_device_public_page(*page))
+ goto unmap;
}
get_page(*page);
out:
@@ -1534,7 +1559,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ if (!pmd_present(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
diff --git a/mm/hmm.c b/mm/hmm.c
new file mode 100644
index 000000000000..a88a847bccba
--- /dev/null
+++ b/mm/hmm.c
@@ -0,0 +1,1257 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/*
+ * Refer to include/linux/hmm.h for information about heterogeneous memory
+ * management or HMM for short.
+ */
+#include <linux/mm.h>
+#include <linux/hmm.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memremap.h>
+#include <linux/jump_label.h>
+#include <linux/mmu_notifier.h>
+#include <linux/memory_hotplug.h>
+
+#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+/*
+ * For device private memory, see HMM (Documentation/vm/hmm.txt) or hmm.h.
+ */
+DEFINE_STATIC_KEY_FALSE(device_private_key);
+EXPORT_SYMBOL(device_private_key);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @sequence: we track updates to the CPU page table with a sequence number
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ */
+struct hmm {
+ struct mm_struct *mm;
+ spinlock_t lock;
+ atomic_t sequence;
+ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+};
+
+/*
+ * hmm_register - register HMM against an mm (HMM internal)
+ *
+ * @mm: mm struct to attach to
+ *
+ * This is not intended to be used directly by device drivers. It allocates an
+ * HMM struct if mm does not have one, and initializes it.
+ */
+static struct hmm *hmm_register(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
+ bool cleanup = false;
+
+ /*
+ * The hmm struct can only be freed once the mm_struct goes away,
+ * hence we should always have pre-allocated a new hmm struct
+ * above.
+ */
+ if (hmm)
+ return hmm;
+
+ hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+ if (!hmm)
+ return NULL;
+ INIT_LIST_HEAD(&hmm->mirrors);
+ init_rwsem(&hmm->mirrors_sem);
+ atomic_set(&hmm->sequence, 0);
+ hmm->mmu_notifier.ops = NULL;
+ INIT_LIST_HEAD(&hmm->ranges);
+ spin_lock_init(&hmm->lock);
+ hmm->mm = mm;
+
+ /*
+ * We should only get here if we hold the mmap_sem in write mode, i.e., on
+ * registration of the first mirror through hmm_mirror_register().
+ */
+ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+ if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+ kfree(hmm);
+ return NULL;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (!mm->hmm)
+ mm->hmm = hmm;
+ else
+ cleanup = true;
+ spin_unlock(&mm->page_table_lock);
+
+ if (cleanup) {
+ mmu_notifier_unregister(&hmm->mmu_notifier, mm);
+ kfree(hmm);
+ }
+
+ return mm->hmm;
+}
+
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ kfree(mm->hmm);
+}
+
+static void hmm_invalidate_range(struct hmm *hmm,
+ enum hmm_update_type action,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm_mirror *mirror;
+ struct hmm_range *range;
+
+ spin_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ unsigned long addr, idx, npages;
+
+ if (end < range->start || start >= range->end)
+ continue;
+
+ range->valid = false;
+ addr = max(start, range->start);
+ idx = (addr - range->start) >> PAGE_SHIFT;
+ npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
+ memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
+ }
+ spin_unlock(&hmm->lock);
+
+ down_read(&hmm->mirrors_sem);
+ list_for_each_entry(mirror, &hmm->mirrors, list)
+ mirror->ops->sync_cpu_device_pagetables(mirror, action,
+ start, end);
+ up_read(&hmm->mirrors_sem);
+}
+
+static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm *hmm = mm->hmm;
+
+ VM_BUG_ON(!hmm);
+
+ atomic_inc(&hmm->sequence);
+}
+
+static void hmm_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm *hmm = mm->hmm;
+
+ VM_BUG_ON(!hmm);
+
+ hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+}
+
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+ .invalidate_range_start = hmm_invalidate_range_start,
+ .invalidate_range_end = hmm_invalidate_range_end,
+};
+
+/*
+ * hmm_mirror_register() - register a mirror against an mm
+ *
+ * @mirror: new mirror struct to register
+ * @mm: mm to register against
+ *
+ * To start mirroring a process address space, the device driver must register
+ * an HMM mirror struct.
+ *
+ * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
+ */
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
+{
+ /* Sanity check */
+ if (!mm || !mirror || !mirror->ops)
+ return -EINVAL;
+
+ mirror->hmm = hmm_register(mm);
+ if (!mirror->hmm)
+ return -ENOMEM;
+
+ down_write(&mirror->hmm->mirrors_sem);
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL(hmm_mirror_register);
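A rough driver-side sketch (all 'example_*' names are hypothetical): the registered ops are invoked from hmm_invalidate_range() above, and the caller must hold mmap_sem in write mode as the comment requires:

	#include <linux/hmm.h>

	static void example_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
						       enum hmm_update_type update,
						       unsigned long start,
						       unsigned long end)
	{
		/* invalidate the device page table for [start, end) here */
	}

	static const struct hmm_mirror_ops example_mirror_ops = {
		.sync_cpu_device_pagetables = example_sync_cpu_device_pagetables,
	};

	static struct hmm_mirror example_mirror = {
		.ops = &example_mirror_ops,
	};

	static int example_start_mirroring(struct mm_struct *mm)
	{
		/* caller holds mm->mmap_sem in write mode */
		return hmm_mirror_register(&example_mirror, mm);
	}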
+
+/*
+ * hmm_mirror_unregister() - unregister a mirror
+ *
+ * @mirror: mirror struct to unregister
+ *
+ * Stop mirroring a process address space, and cleanup.
+ */
+void hmm_mirror_unregister(struct hmm_mirror *mirror)
+{
+ struct hmm *hmm = mirror->hmm;
+
+ down_write(&hmm->mirrors_sem);
+ list_del(&mirror->list);
+ up_write(&hmm->mirrors_sem);
+}
+EXPORT_SYMBOL(hmm_mirror_unregister);
+
+struct hmm_vma_walk {
+ struct hmm_range *range;
+ unsigned long last;
+ bool fault;
+ bool block;
+ bool write;
+};
+
+static int hmm_vma_do_fault(struct mm_walk *walk,
+ unsigned long addr,
+ hmm_pfn_t *pfn)
+{
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int r;
+
+ flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
+ flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+ r = handle_mm_fault(vma, addr, flags);
+ if (r & VM_FAULT_RETRY)
+ return -EBUSY;
+ if (r & VM_FAULT_ERROR) {
+ *pfn = HMM_PFN_ERROR;
+ return -EFAULT;
+ }
+
+ return -EAGAIN;
+}
+
+static void hmm_pfns_special(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = HMM_PFN_SPECIAL;
+}
+
+static int hmm_pfns_bad(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_range *range = walk->private;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++)
+ pfns[i] = HMM_PFN_ERROR;
+
+ return 0;
+}
+
+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = 0;
+}
+
+static int hmm_vma_walk_hole(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = HMM_PFN_EMPTY;
+ if (hmm_vma_walk->fault) {
+ int ret;
+
+ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+ }
+
+ return hmm_vma_walk->fault ? -EAGAIN : 0;
+}
+
+static int hmm_vma_walk_clear(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = 0;
+ if (hmm_vma_walk->fault) {
+ int ret;
+
+ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+ }
+
+ return hmm_vma_walk->fault ? -EAGAIN : 0;
+}
+
+static int hmm_vma_walk_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long addr = start, i;
+ bool write_fault;
+ hmm_pfn_t flag;
+ pte_t *ptep;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
+ write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;
+
+again:
+ if (pmd_none(*pmdp))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
+ return hmm_pfns_bad(start, end, walk);
+
+ if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
+ unsigned long pfn;
+ pmd_t pmd;
+
+ /*
+ * No need to take the pmd_lock here; even if some other thread
+ * is splitting the huge pmd, we will get that event through the
+ * mmu_notifier callback.
+ *
+ * So just read the pmd value and check again that it is a
+ * transparent huge or device mapping entry, then compute the
+ * corresponding pfn values.
+ */
+ pmd = pmd_read_atomic(pmdp);
+ barrier();
+ if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ goto again;
+ if (pmd_protnone(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
+ if (write_fault && !pmd_write(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
+ pfn = pmd_pfn(pmd) + pte_index(addr);
+ flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
+ return 0;
+ }
+
+ if (pmd_bad(*pmdp))
+ return hmm_pfns_bad(start, end, walk);
+
+ ptep = pte_offset_map(pmdp, addr);
+ for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+ pte_t pte = *ptep;
+
+ pfns[i] = 0;
+
+ if (pte_none(pte)) {
+ pfns[i] = HMM_PFN_EMPTY;
+ if (hmm_vma_walk->fault)
+ goto fault;
+ continue;
+ }
+
+ if (!pte_present(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (!non_swap_entry(entry)) {
+ if (hmm_vma_walk->fault)
+ goto fault;
+ continue;
+ }
+
+ /*
+ * This is a special swap entry, ignore migration, use
+ * device and report anything else as error.
+ */
+ if (is_device_private_entry(entry)) {
+ pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
+ if (is_write_device_private_entry(entry)) {
+ pfns[i] |= HMM_PFN_WRITE;
+ } else if (write_fault)
+ goto fault;
+ pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
+ pfns[i] |= flag;
+ } else if (is_migration_entry(entry)) {
+ if (hmm_vma_walk->fault) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(vma->vm_mm,
+ pmdp, addr);
+ return -EAGAIN;
+ }
+ continue;
+ } else {
+ /* Report error for everything else */
+ pfns[i] = HMM_PFN_ERROR;
+ }
+ continue;
+ }
+
+ if (write_fault && !pte_write(pte))
+ goto fault;
+
+ pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
+ pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+ continue;
+
+fault:
+ pte_unmap(ptep);
+ /* Fault all pages in range */
+ return hmm_vma_walk_clear(start, end, walk);
+ }
+ pte_unmap(ptep - 1);
+
+ return 0;
+}
+
+/*
+ * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
+ * @vma: virtual memory area containing the virtual address range
+ * @range: used to track snapshot validity
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of hmm_pfn_t, provided by the caller and filled in by the function
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
+ *
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by range struct. See hmm_vma_range_done() for further
+ * information.
+ *
+ * The range struct is initialized here. It tracks the CPU page table, but only
+ * if the function returns success (0), in which case the caller must then call
+ * hmm_vma_range_done() to stop CPU page table update tracking on this range.
+ *
+ * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
+ * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns)
+{
+ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(pfns, start, end);
+ return -EINVAL;
+ }
+
+ /* Sanity check, this really should not happen ! */
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm)
+ return -ENOMEM;
+ /* Caller must have registered a mirror, via hmm_mirror_register() ! */
+ if (!hmm->mmu_notifier.ops)
+ return -EINVAL;
+
+ /* Initialize range to track CPU page table update */
+ range->start = start;
+ range->pfns = pfns;
+ range->end = end;
+ spin_lock(&hmm->lock);
+ range->valid = true;
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ walk_page_range(start, end, &mm_walk);
+ return 0;
+}
+EXPORT_SYMBOL(hmm_vma_get_pfns);
+
+/*
+ * hmm_vma_range_done() - stop tracking change to CPU page table over a range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: range being tracked
+ * Returns: false if range data has been invalidated, true otherwise
+ *
+ * Range struct is used to track updates to the CPU page table after a call to
+ * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
+ * using the data, or wants to lock updates to the data it got from those
+ * functions, it must call the hmm_vma_range_done() function, which will then
+ * stop tracking CPU page table updates.
+ *
+ * Note that the device driver must still implement general CPU page table update
+ * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
+ * the mmu_notifier API directly.
+ *
+ * CPU page table update tracking done through hmm_range is only temporary and
+ * to be used while trying to duplicate CPU page table contents for a range of
+ * virtual addresses.
+ *
+ * There are two ways to use this :
+ * again:
+ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * trans = device_build_page_table_update_transaction(pfns);
+ * device_page_table_lock();
+ * if (!hmm_vma_range_done(vma, range)) {
+ * device_page_table_unlock();
+ * goto again;
+ * }
+ * device_commit_transaction(trans);
+ * device_page_table_unlock();
+ *
+ * Or:
+ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * device_page_table_lock();
+ * hmm_vma_range_done(vma, range);
+ * device_update_page_table(pfns);
+ * device_page_table_unlock();
+ */
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
+{
+ unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
+ struct hmm *hmm;
+
+ if (range->end <= range->start) {
+ BUG();
+ return false;
+ }
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm) {
+ memset(range->pfns, 0, sizeof(*range->pfns) * npages);
+ return false;
+ }
+
+ spin_lock(&hmm->lock);
+ list_del_rcu(&range->list);
+ spin_unlock(&hmm->lock);
+
+ return range->valid;
+}
+EXPORT_SYMBOL(hmm_vma_range_done);
+
+/*
+ * hmm_vma_fault() - try to fault some address in a virtual address range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: used to track pfns array content validity
+ * @start: fault range virtual start address (inclusive)
+ * @end: fault range virtual end address (exclusive)
+ * @pfns: array of hmm_pfn_t, only entries with the fault flag set will be faulted
+ * @write: is it a write fault
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been dropped)
+ *
+ * This is similar to a regular CPU page fault except that it will not trigger
+ * any memory migration if the memory being faulted is not accessible by CPUs.
+ *
+ * On error, for one virtual address in the range, the function will set the
+ * hmm_pfn_t error flag for the corresponding pfn entry.
+ *
+ * Expected use pattern:
+ * retry:
+ * down_read(&mm->mmap_sem);
+ * // Find vma and address device wants to fault, initialize hmm_pfn_t
+ * // array accordingly
+ * ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
+ * switch (ret) {
+ * case -EAGAIN:
+ * hmm_vma_range_done(vma, range);
+ * // You might want to rate limit or yield to play nicely, you may
+ * // also commit any valid pfn in the array assuming that you are
+ * // getting true from hmm_vma_range_done()
+ * goto retry;
+ * case 0:
+ * break;
+ * default:
+ * // Handle error !
+ * up_read(&mm->mmap_sem)
+ * return;
+ * }
+ * // Take device driver lock that serialize device page table update
+ * driver_lock_device_page_table_update();
+ * hmm_vma_range_done(vma, range);
+ * // Commit pfns we got from hmm_vma_fault()
+ * driver_unlock_device_page_table_update();
+ * up_read(&mm->mmap_sem)
+ *
+ * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
+ * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
+ *
+ * YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block)
+{
+ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+ int ret;
+
+ /* Sanity check, this really should not happen ! */
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm) {
+ hmm_pfns_clear(pfns, start, end);
+ return -ENOMEM;
+ }
+ /* Caller must have registered a mirror using hmm_mirror_register() */
+ if (!hmm->mmu_notifier.ops)
+ return -EINVAL;
+
+ /* Initialize range to track CPU page table update */
+ range->start = start;
+ range->pfns = pfns;
+ range->end = end;
+ spin_lock(&hmm->lock);
+ range->valid = true;
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(pfns, start, end);
+ return 0;
+ }
+
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.write = write;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ hmm_vma_walk.last = range->start;
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+ } while (ret == -EAGAIN);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
+ hmm_vma_range_done(vma, range);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(hmm_vma_fault);
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return NULL;
+ lock_page(page);
+ return page;
+}
+EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
+
+
+static void hmm_devmem_ref_release(struct percpu_ref *ref)
+{
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ complete(&devmem->completion);
+}
+
+static void hmm_devmem_ref_exit(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ percpu_ref_exit(ref);
+ devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
+}
+
+static void hmm_devmem_ref_kill(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ percpu_ref_kill(ref);
+ wait_for_completion(&devmem->completion);
+ devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
+}
+
+static int hmm_devmem_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct hmm_devmem *devmem = page->pgmap->data;
+
+ return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
+}
+
+static void hmm_devmem_free(struct page *page, void *data)
+{
+ struct hmm_devmem *devmem = data;
+
+ devmem->ops->free(devmem, page);
+}
+
+static DEFINE_MUTEX(hmm_devmem_lock);
+static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
+
+static void hmm_devmem_radix_release(struct resource *resource)
+{
+ resource_size_t key, align_start, align_size, align_end;
+
+ align_start = resource->start & ~(PA_SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
+ align_end = align_start + align_size - 1;
+
+ mutex_lock(&hmm_devmem_lock);
+ for (key = resource->start;
+ key <= resource->end;
+ key += PA_SECTION_SIZE)
+ radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&hmm_devmem_lock);
+}
+
+static void hmm_devmem_release(struct device *dev, void *data)
+{
+ struct hmm_devmem *devmem = data;
+ struct resource *resource = devmem->resource;
+ unsigned long start_pfn, npages;
+ struct zone *zone;
+ struct page *page;
+
+ if (percpu_ref_tryget_live(&devmem->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(&devmem->ref);
+ }
+
+ /* pages are dead and unused, undo the arch mapping */
+ start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
+ npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
+
+ page = pfn_to_page(start_pfn);
+ zone = page_zone(page);
+
+ mem_hotplug_begin();
+ if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+ __remove_pages(zone, start_pfn, npages);
+ else
+ arch_remove_memory(start_pfn << PAGE_SHIFT,
+ npages << PAGE_SHIFT);
+ mem_hotplug_done();
+
+ hmm_devmem_radix_release(resource);
+}
+
+static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
+}
+
+static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
+{
+ resource_size_t key, align_start, align_size, align_end;
+ struct device *device = devmem->device;
+ int ret, nid, is_ram;
+ unsigned long pfn;
+
+ align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
+ align_size = ALIGN(devmem->resource->start +
+ resource_size(devmem->resource),
+ PA_SECTION_SIZE) - align_start;
+
+ is_ram = region_intersects(align_start, align_size,
+ IORESOURCE_SYSTEM_RAM,
+ IORES_DESC_NONE);
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+ __func__, devmem->resource);
+ return -ENXIO;
+ }
+ if (is_ram == REGION_INTERSECTS)
+ return -ENXIO;
+
+ if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+ devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+ else
+ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
+ devmem->pagemap.res = devmem->resource;
+ devmem->pagemap.page_fault = hmm_devmem_fault;
+ devmem->pagemap.page_free = hmm_devmem_free;
+ devmem->pagemap.dev = devmem->device;
+ devmem->pagemap.ref = &devmem->ref;
+ devmem->pagemap.data = devmem;
+
+ mutex_lock(&hmm_devmem_lock);
+ align_end = align_start + align_size - 1;
+ for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
+ struct hmm_devmem *dup;
+
+ rcu_read_lock();
+ dup = hmm_devmem_find(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(device, "%s: collides with mapping for %s\n",
+ __func__, dev_name(dup->device));
+ mutex_unlock(&hmm_devmem_lock);
+ ret = -EBUSY;
+ goto error;
+ }
+ ret = radix_tree_insert(&hmm_devmem_radix,
+ key >> PA_SECTION_SHIFT,
+ devmem);
+ if (ret) {
+ dev_err(device, "%s: failed: %d\n", __func__, ret);
+ mutex_unlock(&hmm_devmem_lock);
+ goto error_radix;
+ }
+ }
+ mutex_unlock(&hmm_devmem_lock);
+
+ nid = dev_to_node(device);
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ mem_hotplug_begin();
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. Moreover,
+ * the device memory is not accessible, thus we do not want to create
+ * a linear mapping for it the way arch_add_memory() would.
+ *
+ * For device public memory, which is accessible by the CPU, we do
+ * want the linear mapping and thus use arch_add_memory().
+ */
+ if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+ ret = arch_add_memory(nid, align_start, align_size, false);
+ else
+ ret = add_pages(nid, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, false);
+ if (ret) {
+ mem_hotplug_done();
+ goto error_add_memory;
+ }
+ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT);
+ mem_hotplug_done();
+
+ for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ page->pgmap = &devmem->pagemap;
+ }
+ return 0;
+
+error_add_memory:
+ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+error_radix:
+ hmm_devmem_radix_release(devmem->resource);
+error:
+ return ret;
+}
+
+static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
+{
+ struct hmm_devmem *devmem = data;
+
+ return devmem->resource == match_data;
+}
+
+static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
+{
+ devres_release(devmem->device, &hmm_devmem_release,
+ &hmm_devmem_match, devmem->resource);
+}
+
+/*
+ * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
+ *
+ * @ops: memory event device driver callback (see struct hmm_devmem_ops)
+ * @device: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * Returns: pointer to new hmm_devmem struct on success, ERR_PTR otherwise
+ *
+ * This function first finds an empty range of physical addresses big enough to
+ * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
+ * in turn allocates struct pages. It does not do anything beyond that; all
+ * events affecting the memory will go through the various callbacks provided
+ * by hmm_devmem_ops struct.
+ *
+ * The device driver should call this function during device initialization and
+ * is then responsible for the memory management. HMM only provides helpers.
+ */
+struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size)
+{
+ struct hmm_devmem *devmem;
+ resource_size_t addr;
+ int ret;
+
+ static_branch_enable(&device_private_key);
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = NULL;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+ size = ALIGN(size, PA_SECTION_SIZE);
+ addr = min((unsigned long)iomem_resource.end,
+ (1UL << MAX_PHYSMEM_BITS) - 1);
+ addr = addr - size + 1UL;
+
+ /*
+ * FIXME add a new helper to quickly walk resource tree and find free
+ * range
+ *
+ * FIXME what about ioport_resource resource ?
+ */
+ for (; addr > size && addr >= iomem_resource.start; addr -= size) {
+ ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
+ if (ret != REGION_DISJOINT)
+ continue;
+
+ devmem->resource = devm_request_mem_region(device, addr, size,
+ dev_name(device));
+ if (!devmem->resource) {
+ ret = -ENOMEM;
+ goto error_no_resource;
+ }
+ break;
+ }
+ if (!devmem->resource) {
+ ret = -ERANGE;
+ goto error_no_resource;
+ }
+
+ devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_pages;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_pages:
+ devm_release_mem_region(device, devmem->resource->start,
+ resource_size(devmem->resource));
+error_no_resource:
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add);
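+/*
+ * Illustrative sketch (not part of this patch): a driver is expected to
+ * provide the hmm_devmem_ops callbacks and call hmm_devmem_add() at device
+ * initialization time. The names dummy_devmem_fault() and dummy_devmem_free()
+ * below are hypothetical:
+ *
+ *   static const struct hmm_devmem_ops dummy_devmem_ops = {
+ *       .fault = dummy_devmem_fault,
+ *       .free  = dummy_devmem_free,
+ *   };
+ *
+ *   devmem = hmm_devmem_add(&dummy_devmem_ops, device, size);
+ *   if (IS_ERR(devmem))
+ *       return PTR_ERR(devmem);
+ *   device pages then cover pfns [devmem->pfn_first, devmem->pfn_last)
+ */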
+
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res)
+{
+ struct hmm_devmem *devmem;
+ int ret;
+
+ if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+ return ERR_PTR(-EINVAL);
+
+ static_branch_enable(&device_private_key);
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = res;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_devm_add_action;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add_resource);
+
+/*
+ * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
+ *
+ * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
+ *
+ * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
+ * of the device driver. It will free struct page and remove the resource that
+ * reserved the physical address range for this device memory.
+ */
+void hmm_devmem_remove(struct hmm_devmem *devmem)
+{
+ resource_size_t start, size;
+ struct device *device;
+ bool cdm = false;
+
+ if (!devmem)
+ return;
+
+ device = devmem->device;
+ start = devmem->resource->start;
+ size = resource_size(devmem->resource);
+
+ cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+ hmm_devmem_pages_remove(devmem);
+
+ if (!cdm)
+ devm_release_mem_region(device, start, size);
+}
+EXPORT_SYMBOL(hmm_devmem_remove);
+
+/*
+ * A device driver that wants to handle the memory of multiple devices through
+ * a single fake device can use hmm_device to do so. This is purely a helper;
+ * it is not required in order to use any other HMM functionality.
+ */
+#define HMM_DEVICE_MAX 256
+
+static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
+static DEFINE_SPINLOCK(hmm_device_lock);
+static struct class *hmm_device_class;
+static dev_t hmm_device_devt;
+
+static void hmm_device_release(struct device *device)
+{
+ struct hmm_device *hmm_device;
+
+ hmm_device = container_of(device, struct hmm_device, device);
+ spin_lock(&hmm_device_lock);
+ clear_bit(hmm_device->minor, hmm_device_mask);
+ spin_unlock(&hmm_device_lock);
+
+ kfree(hmm_device);
+}
+
+struct hmm_device *hmm_device_new(void *drvdata)
+{
+ struct hmm_device *hmm_device;
+
+ hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
+ if (!hmm_device)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&hmm_device_lock);
+ hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
+ if (hmm_device->minor >= HMM_DEVICE_MAX) {
+ spin_unlock(&hmm_device_lock);
+ kfree(hmm_device);
+ return ERR_PTR(-EBUSY);
+ }
+ set_bit(hmm_device->minor, hmm_device_mask);
+ spin_unlock(&hmm_device_lock);
+
+ dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
+ hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
+ hmm_device->minor);
+ hmm_device->device.release = hmm_device_release;
+ dev_set_drvdata(&hmm_device->device, drvdata);
+ hmm_device->device.class = hmm_device_class;
+ device_initialize(&hmm_device->device);
+
+ return hmm_device;
+}
+EXPORT_SYMBOL(hmm_device_new);
+
+void hmm_device_put(struct hmm_device *hmm_device)
+{
+ put_device(&hmm_device->device);
+}
+EXPORT_SYMBOL(hmm_device_put);
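+/*
+ * Illustrative sketch (not part of this patch): a driver that wants one fake
+ * device for all of its device memory could do the following, with
+ * driver_devmem_ops being a hypothetical hmm_devmem_ops instance:
+ *
+ *   hmm_device = hmm_device_new(driver_private_data);
+ *   if (IS_ERR(hmm_device))
+ *       return PTR_ERR(hmm_device);
+ *   devmem = hmm_devmem_add(&driver_devmem_ops, &hmm_device->device, size);
+ *   ...
+ *   hmm_device_put(hmm_device);
+ */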
+
+static int __init hmm_init(void)
+{
+ int ret;
+
+ ret = alloc_chrdev_region(&hmm_device_devt, 0,
+ HMM_DEVICE_MAX,
+ "hmm_device");
+ if (ret)
+ return ret;
+
+ hmm_device_class = class_create(THIS_MODULE, "hmm_device");
+ if (IS_ERR(hmm_device_class)) {
+ unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
+ return PTR_ERR(hmm_device_class);
+ }
+ return 0;
+}
+
+device_initcall(hmm_init);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0b51e70e0a8b..269b5df58543 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -928,6 +928,25 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (unlikely(is_swap_pmd(pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ if (is_write_migration_entry(entry)) {
+ make_migration_entry_read(&entry);
+ pmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ ret = 0;
+ goto out_unlock;
+ }
+#endif
+
if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable);
goto out_unlock;
@@ -1599,6 +1618,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(orig_pmd))
goto out;
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto out;
+ }
+
page = pmd_page(orig_pmd);
/*
* If other processes are mapping this page, we couldn't discard
@@ -1684,10 +1709,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
- struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page, true);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ struct page *page = NULL;
+ int flush_needed = 1;
+
+ if (pmd_present(orig_pmd)) {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page, true);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ } else if (thp_migration_supported()) {
+ swp_entry_t entry;
+
+ VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
+ entry = pmd_to_swp_entry(orig_pmd);
+ page = pfn_to_page(swp_offset(entry));
+ flush_needed = 0;
+ } else
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
if (PageAnon(page)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1696,8 +1735,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
+
spin_unlock(ptl);
- tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+ if (flush_needed)
+ tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
}
return 1;
}
@@ -1717,6 +1758,17 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
}
#endif
+static pmd_t move_soft_dirty_pmd(pmd_t pmd)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ if (unlikely(is_pmd_migration_entry(pmd)))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ else if (pmd_present(pmd))
+ pmd = pmd_mksoft_dirty(pmd);
+#endif
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1759,7 +1811,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
- set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+ pmd = move_soft_dirty_pmd(pmd);
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
if (force_flush)
@@ -1794,6 +1847,27 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (is_swap_pmd(*pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+ if (is_write_migration_entry(entry)) {
+ pmd_t newpmd;
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ make_migration_entry_read(&entry);
+ newpmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*pmd))
+ newpmd = pmd_swp_mksoft_dirty(newpmd);
+ set_pmd_at(mm, addr, pmd, newpmd);
+ }
+ goto unlock;
+ }
+#endif
+
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -1859,7 +1933,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
+ pmd_devmap(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -1977,14 +2052,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t _pmd;
- bool young, write, dirty, soft_dirty;
+ bool young, write, dirty, soft_dirty, pmd_migration = false;
unsigned long addr;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+ && !pmd_devmap(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2009,7 +2085,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- page = pmd_page(*pmd);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ pmd_migration = is_pmd_migration_entry(*pmd);
+ if (pmd_migration) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(*pmd);
+ page = pfn_to_page(swp_offset(entry));
+ } else
+#endif
+ page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
@@ -2028,7 +2113,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* transferred to avoid any possibility of altering
* permissions across VMAs.
*/
- if (freeze) {
+ if (freeze || pmd_migration) {
swp_entry_t swp_entry;
swp_entry = make_migration_entry(page + i, write);
entry = swp_entry_to_pte(swp_entry);
@@ -2127,7 +2212,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(*pmd);
if (PageMlocked(page))
clear_page_mlock(page);
- } else if (!pmd_devmap(*pmd))
+ } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
@@ -2210,7 +2295,7 @@ static void freeze_page(struct page *page)
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
- ttu_flags |= TTU_MIGRATION;
+ ttu_flags |= TTU_SPLIT_FREEZE;
unmap_success = try_to_unmap(page, ttu_flags);
VM_BUG_ON_PAGE(!unmap_success, page);
@@ -2745,3 +2830,66 @@ static int __init split_huge_pages_debugfs(void)
}
late_initcall(split_huge_pages_debugfs);
#endif
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = pvmw->address;
+ pmd_t pmdval;
+ swp_entry_t entry;
+ pmd_t pmdswp;
+
+ if (!(pvmw->pmd && !pvmw->pte))
+ return;
+
+ mmu_notifier_invalidate_range_start(mm, address,
+ address + HPAGE_PMD_SIZE);
+
+ flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmdval = *pvmw->pmd;
+ pmdp_invalidate(vma, address, pvmw->pmd);
+ if (pmd_dirty(pmdval))
+ set_page_dirty(page);
+ entry = make_migration_entry(page, pmd_write(pmdval));
+ pmdswp = swp_entry_to_pmd(entry);
+ if (pmd_soft_dirty(pmdval))
+ pmdswp = pmd_swp_mksoft_dirty(pmdswp);
+ set_pmd_at(mm, address, pvmw->pmd, pmdswp);
+ page_remove_rmap(page, true);
+ put_page(page);
+
+ mmu_notifier_invalidate_range_end(mm, address,
+ address + HPAGE_PMD_SIZE);
+}
+
+void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = pvmw->address;
+ unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ pmd_t pmde;
+ swp_entry_t entry;
+
+ if (!(pvmw->pmd && !pvmw->pte))
+ return;
+
+ entry = pmd_to_swp_entry(*pvmw->pmd);
+ get_page(new);
+ pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+ if (pmd_swp_soft_dirty(*pvmw->pmd))
+ pmde = pmd_mksoft_dirty(pmde);
+ if (is_write_migration_entry(entry))
+ pmde = maybe_pmd_mkwrite(pmde, vma);
+
+ flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
+ page_add_anon_rmap(new, vma, mmun_start, true);
+ set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(new);
+ update_mmu_cache_pmd(vma, address, pvmw->pmd);
+}
+#endif
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index f2c2492681bf..b47664358796 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -28,7 +28,7 @@ INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *prev,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
struct rb_node **link;
struct vm_area_struct *parent;
@@ -55,7 +55,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
node->shared.rb_subtree_last = last;
rb_link_node(&node->shared.rb, &parent->shared.rb, link);
- rb_insert_augmented(&node->shared.rb, root,
+ rb_insert_augmented(&node->shared.rb, &root->rb_root,
&vma_interval_tree_augment);
}
@@ -74,7 +74,7 @@ INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
static inline, __anon_vma_interval_tree)
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
node->cached_vma_start = avc_start_pgoff(node);
@@ -84,13 +84,13 @@ void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
}
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
__anon_vma_interval_tree_remove(node, root);
}
struct anon_vma_chain *
-anon_vma_interval_tree_iter_first(struct rb_root *root,
+anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long first, unsigned long last)
{
return __anon_vma_interval_tree_iter_first(root, first, last);
diff --git a/mm/madvise.c b/mm/madvise.c
index eea1c733286f..21261ff0466f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (!page)
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6532b219b222..15af3da5af02 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -119,6 +119,7 @@ static const char *const mem_cgroup_lru_names[] = {
struct mem_cgroup_tree_per_node {
struct rb_root rb_root;
+ struct rb_node *rb_rightmost;
spinlock_t lock;
};
@@ -386,6 +387,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_node *mz_node;
+ bool rightmost = true;
if (mz->on_tree)
return;
@@ -397,8 +399,11 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node);
- if (mz->usage_in_excess < mz_node->usage_in_excess)
+ if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
+ rightmost = false;
+ }
+
/*
* We can't avoid mem cgroups that are over their soft
* limit by the same amount
@@ -406,6 +411,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
else if (mz->usage_in_excess >= mz_node->usage_in_excess)
p = &(*p)->rb_right;
}
+
+ if (rightmost)
+ mctz->rb_rightmost = &mz->tree_node;
+
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
@@ -416,6 +425,10 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
{
if (!mz->on_tree)
return;
+
+ if (&mz->tree_node == mctz->rb_rightmost)
+ mctz->rb_rightmost = rb_prev(&mz->tree_node);
+
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
@@ -496,16 +509,15 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
- struct rb_node *rightmost = NULL;
struct mem_cgroup_per_node *mz;
retry:
mz = NULL;
- rightmost = rb_last(&mctz->rb_root);
- if (!rightmost)
+ if (!mctz->rb_rightmost)
goto done; /* Nothing to reclaim from */
- mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
+ mz = rb_entry(mctz->rb_rightmost,
+ struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
@@ -1792,6 +1804,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
}
stock->nr_pages += nr_pages;
+ if (stock->nr_pages > CHARGE_BATCH)
+ drain_stock(stock);
+
local_irq_restore(flags);
}
@@ -4414,12 +4429,13 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+ MC_TARGET_DEVICE,
};
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = vm_normal_page(vma, addr, ptent);
+ struct page *page = _vm_normal_page(vma, addr, ptent, true);
if (!page || !page_mapped(page))
return NULL;
@@ -4436,7 +4452,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
return page;
}
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
pte_t ptent, swp_entry_t *entry)
{
@@ -4445,6 +4461,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
+
+ /*
+ * Handle MEMORY_DEVICE_PRIVATE pages, which are ZONE_DEVICE pages
+ * belonging to a device; because they are not accessible by the CPU
+ * they are stored as special swap entries in the CPU page table.
+ */
+ if (is_device_private_entry(ent)) {
+ page = device_private_entry_to_page(ent);
+ /*
+ * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
+ * a refcount of 1 when free (unlike a normal page)
+ */
+ if (!page_ref_add_unless(page, 1, 1))
+ return NULL;
+ return page;
+ }
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -4605,6 +4638,13 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
+ * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * For now such a page is charged like a regular page would be, as for all
+ * intents and purposes it is just special memory taking the place of a
+ * regular page.
+ *
+ * See Documentation/vm/hmm.txt and include/linux/hmm.h
*
* Called with pte lock held.
*/
@@ -4633,6 +4673,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+ if (is_device_private_page(page) ||
+ is_device_public_page(page))
+ ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
@@ -4664,6 +4707,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
struct page *page = NULL;
enum mc_target_type ret = MC_TARGET_NONE;
+ if (unlikely(is_swap_pmd(pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(pmd));
+ return ret;
+ }
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
@@ -4695,6 +4743,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
+ /*
+ * Note there cannot be MC_TARGET_DEVICE for now, as we do not
+ * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
+ * MEMORY_DEVICE_PRIVATE, but this might change.
+ */
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4910,6 +4963,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
putback_lru_page(page);
}
put_page(page);
+ } else if (target_type == MC_TARGET_DEVICE) {
+ page = target.page;
+ if (!mem_cgroup_move_account(page, true,
+ mc.from, mc.to)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ put_page(page);
}
spin_unlock(ptl);
return 0;
@@ -4921,12 +4982,16 @@ retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
+ bool device = false;
swp_entry_t ent;
if (!mc.precharge)
break;
switch (get_mctgt_type(vma, addr, ptent, &target)) {
+ case MC_TARGET_DEVICE:
+ device = true;
+ /* fall through */
case MC_TARGET_PAGE:
page = target.page;
/*
@@ -4937,7 +5002,7 @@ retry:
*/
if (PageTransCompound(page))
goto put;
- if (isolate_lru_page(page))
+ if (!device && isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
@@ -4945,7 +5010,8 @@ retry:
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
- putback_lru_page(page);
+ if (!device)
+ putback_lru_page(page);
put: /* get_mctgt_type() gets the page */
put_page(page);
break;
@@ -5535,48 +5601,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
}
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
- unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_kmem, unsigned long nr_huge,
- unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+ struct mem_cgroup *memcg;
+ unsigned long pgpgout;
+ unsigned long nr_anon;
+ unsigned long nr_file;
+ unsigned long nr_kmem;
+ unsigned long nr_huge;
+ unsigned long nr_shmem;
+ struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+ memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+ unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
- if (!mem_cgroup_is_root(memcg)) {
- page_counter_uncharge(&memcg->memory, nr_pages);
+ if (!mem_cgroup_is_root(ug->memcg)) {
+ page_counter_uncharge(&ug->memcg->memory, nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
- page_counter_uncharge(&memcg->kmem, nr_kmem);
- memcg_oom_recover(memcg);
+ page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+ page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
- __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
- __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
- __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
- __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
- __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
- memcg_check_events(memcg, dummy_page);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+ __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+ __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+ __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+ memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_pages);
+ if (!mem_cgroup_is_root(ug->memcg))
+ css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+ if (!page->mem_cgroup)
+ return;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * page->mem_cgroup at this point, we have fully
+ * exclusive access to the page.
+ */
+
+ if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg) {
+ uncharge_batch(ug);
+ uncharge_gather_clear(ug);
+ }
+ ug->memcg = page->mem_cgroup;
+ }
+
+ if (!PageKmemcg(page)) {
+ unsigned int nr_pages = 1;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ ug->nr_huge += nr_pages;
+ }
+ if (PageAnon(page))
+ ug->nr_anon += nr_pages;
+ else {
+ ug->nr_file += nr_pages;
+ if (PageSwapBacked(page))
+ ug->nr_shmem += nr_pages;
+ }
+ ug->pgpgout++;
+ } else {
+ ug->nr_kmem += 1 << compound_order(page);
+ __ClearPageKmemcg(page);
+ }
+
+ ug->dummy_page = page;
+ page->mem_cgroup = NULL;
}
static void uncharge_list(struct list_head *page_list)
{
- struct mem_cgroup *memcg = NULL;
- unsigned long nr_shmem = 0;
- unsigned long nr_anon = 0;
- unsigned long nr_file = 0;
- unsigned long nr_huge = 0;
- unsigned long nr_kmem = 0;
- unsigned long pgpgout = 0;
+ struct uncharge_gather ug;
struct list_head *next;
- struct page *page;
+
+ uncharge_gather_clear(&ug);
/*
* Note that the list can be a single page->lru; hence the
@@ -5584,57 +5704,16 @@ static void uncharge_list(struct list_head *page_list)
*/
next = page_list->next;
do {
+ struct page *page;
+
page = list_entry(next, struct page, lru);
next = page->lru.next;
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
- if (!page->mem_cgroup)
- continue;
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
- * exclusive access to the page.
- */
-
- if (memcg != page->mem_cgroup) {
- if (memcg) {
- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_kmem, nr_huge, nr_shmem, page);
- pgpgout = nr_anon = nr_file = nr_kmem = 0;
- nr_huge = nr_shmem = 0;
- }
- memcg = page->mem_cgroup;
- }
-
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
-
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- nr_huge += nr_pages;
- }
- if (PageAnon(page))
- nr_anon += nr_pages;
- else {
- nr_file += nr_pages;
- if (PageSwapBacked(page))
- nr_shmem += nr_pages;
- }
- pgpgout++;
- } else {
- nr_kmem += 1 << compound_order(page);
- __ClearPageKmemcg(page);
- }
-
- page->mem_cgroup = NULL;
+ uncharge_page(page, &ug);
} while (next != page_list);
- if (memcg)
- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_kmem, nr_huge, nr_shmem, page);
+ if (ug.memcg)
+ uncharge_batch(&ug);
}
/**
@@ -5646,6 +5725,8 @@ static void uncharge_list(struct list_head *page_list)
*/
void mem_cgroup_uncharge(struct page *page)
{
+ struct uncharge_gather ug;
+
if (mem_cgroup_disabled())
return;
@@ -5653,8 +5734,9 @@ void mem_cgroup_uncharge(struct page *page)
if (!page->mem_cgroup)
return;
- INIT_LIST_HEAD(&page->lru);
- uncharge_list(&page->lru);
+ uncharge_gather_clear(&ug);
+ uncharge_page(page, &ug);
+ uncharge_batch(&ug);
}
/**
@@ -5819,8 +5901,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
- page_counter_uncharge(&memcg->memory, nr_pages);
- css_put_many(&memcg->css, nr_pages);
+ refill_stock(memcg, nr_pages);
}
static int __init cgroup_memory(char *s)
@@ -5876,6 +5957,7 @@ static int __init mem_cgroup_init(void)
node_online(node) ? node : NUMA_NO_NODE);
rtpn->rb_root = RB_ROOT;
+ rtpn->rb_rightmost = NULL;
spin_lock_init(&rtpn->lock);
soft_limit_tree.rb_tree_per_node[node] = rtpn;
}
diff --git a/mm/memory.c b/mm/memory.c
index 13ee83b43878..ec4e15494901 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
#else
# define HAVE_PTE_SPECIAL 0
#endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
@@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- if (!is_zero_pfn(pfn))
- print_bad_pte(vma, addr, pte, NULL);
+ if (is_zero_pfn(pfn))
+ return NULL;
+
+ /*
+ * Device public pages are special pages (they are ZONE_DEVICE
+ * pages but different from persistent memory). They behave
+ * almost like normal pages. The difference is that they are
+ * not on the lru and thus should never be involved with
+ * anything that involves lru manipulation (mlock, numa
+ * balancing, ...).
+ *
+ * This is why we still want to return NULL for such pages from
+ * vm_normal_page() so that we do not have to special case all
+ * call sites of vm_normal_page().
+ */
+ if (likely(pfn < highest_memmap_pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (is_device_public_page(page)) {
+ if (with_public_device)
+ return page;
+ return NULL;
+ }
+ }
+ print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
+
+ /*
+ * Update rss count even for unaddressable pages, as
+ * they should be treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
}
goto out_set_pte;
}
@@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
+ } else if (pte_devmap(pte)) {
+ page = pte_page(pte);
+
+ /*
+ * Cache coherent device memory behaves like regular pages and
+ * not like persistent memory pages. For more information see
+ * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
+ */
+ if (is_device_public_page(page)) {
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
+ }
}
out_set_pte:
@@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
+ || pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
err = copy_huge_pmd(dst_mm, src_mm,
@@ -1236,7 +1303,7 @@ again:
if (pte_present(ptent)) {
struct page *page;
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -1273,6 +1340,29 @@ again:
}
continue;
}
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ struct page *page = device_private_entry_to_page(entry);
+
+ if (unlikely(details && details->check_mapping)) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping !=
+ page_rmapping(page))
+ continue;
+ }
+
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ put_page(page);
+ continue;
+ }
+
/* If details->check_mapping, we leave swap entries. */
if (unlikely(details))
continue;
@@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
@@ -2671,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
-static inline void unmap_mapping_range_tree(struct rb_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
@@ -2735,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
- if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
@@ -2775,6 +2865,14 @@ int do_swap_page(struct vm_fault *vmf)
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_private_entry(entry)) {
+ /*
+ * For unaddressable device memory we call the pgmap
+ * fault handler callback. The callback must migrate
+ * the page back to some CPU-accessible page.
+ */
+ ret = device_private_entry_fault(vma, vmf->address, entry,
+ vmf->flags, vmf->pmd);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
@@ -3863,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
@@ -3885,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
/* NUMA case for anonymous PUDs would go here */
@@ -3911,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pmd_t orig_pmd = *vmf.pmd;
barrier();
+ if (unlikely(is_swap_pmd(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ if (is_pmd_migration_entry(orig_pmd))
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((vmf.flags & FAULT_FLAG_WRITE) &&
- !pmd_write(orig_pmd)) {
+ if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -3949,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
@@ -3956,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
- if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
- flags & FAULT_FLAG_INSTRUCTION,
- flags & FAULT_FLAG_REMOTE))
- return VM_FAULT_SIGSEGV;
-
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 73bf17df6899..e882cb6da994 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -99,7 +99,7 @@ void mem_hotplug_done(void)
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
- struct resource *res;
+ struct resource *res, *conflict;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (!res)
return ERR_PTR(-ENOMEM);
@@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->start = start;
res->end = start + size - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res) < 0) {
+ conflict = request_resource_conflict(&iomem_resource, res);
+ if (conflict) {
+ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_debug("Device unaddressable memory block "
+ "memory hotplug at %#010llx !\n",
+ (unsigned long long)start);
+ }
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
return ERR_PTR(-EEXIST);
@@ -1380,7 +1386,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (isolate_huge_page(page, &source))
move_pages -= 1 << compound_order(head);
continue;
- }
+ } else if (thp_migration_supported() && PageTransHuge(page))
+ pfn = page_to_pfn(compound_head(page))
+ + hpage_nr_pages(page) - 1;
if (!get_page_unless_zero(page))
continue;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 618ab125228b..006ba625c0b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,6 +97,7 @@
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
+#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>
@@ -412,6 +413,64 @@ struct queue_pages {
};
/*
+ * Check if the page's nid is in qp->nmask.
+ *
+ * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
+ * not in qp->nmask instead.
+ */
+static inline bool queue_pages_required(struct page *page,
+ struct queue_pages *qp)
+{
+ int nid = page_to_nid(page);
+ unsigned long flags = qp->flags;
+
+ return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
+}
+
+static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ int ret = 0;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags;
+
+ if (unlikely(is_pmd_migration_entry(*pmd))) {
+ ret = 1;
+ goto unlock;
+ }
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
+ goto out;
+ }
+ if (!thp_migration_supported()) {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
+ if (!queue_pages_required(page, qp)) {
+ ret = 1;
+ goto unlock;
+ }
+
+ ret = 1;
+ flags = qp->flags;
+ /* go to thp migration */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, qp->pagelist, flags);
+unlock:
+ spin_unlock(ptl);
+out:
+ return ret;
+}
+
+/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*/
@@ -422,30 +481,15 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid, ret;
+ int ret;
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge(*pmd)) {
- ptl = pmd_lock(walk->mm, pmd);
- if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- __split_huge_pmd(vma, pmd, addr, false, NULL);
- } else {
- get_page(page);
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return 0;
- }
- } else {
- spin_unlock(ptl);
- }
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
+ if (ret)
+ return 0;
}
if (pmd_trans_unstable(pmd))
@@ -464,10 +508,9 @@ retry:
*/
if (PageReserved(page))
continue;
- nid = page_to_nid(page);
- if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
+ if (!queue_pages_required(page, qp))
continue;
- if (PageTransCompound(page)) {
+ if (PageTransCompound(page) && !thp_migration_supported()) {
get_page(page);
pte_unmap_unlock(pte, ptl);
lock_page(page);
@@ -497,7 +540,6 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
struct page *page;
spinlock_t *ptl;
pte_t entry;
@@ -507,8 +549,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
- nid = page_to_nid(page);
- if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
+ if (!queue_pages_required(page, qp))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
@@ -881,19 +922,21 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
#ifdef CONFIG_MIGRATION
/*
- * page migration
+ * page migration; THP tail pages may be passed in.
*/
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
+ struct page *head = compound_head(page);
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
- if (!isolate_lru_page(page)) {
- list_add_tail(&page->lru, pagelist);
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
+ if (!isolate_lru_page(head)) {
+ list_add_tail(&head->lru, pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
}
}
}
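
The switch from inc_node_page_state() to mod_node_page_state() matters because the pagelist may now hold whole THPs: the isolated-page counters have to move by the number of base pages, not by one. A rough worked example, assuming x86-64 with 2MB transparent huge pages:

	/* Worked example (assumed x86-64, 2MB THP):
	 *   hpage_nr_pages(head) == 512 for a THP head, == 1 for a normal page,
	 *   so isolating one THP bumps NR_ISOLATED_ANON (or _FILE) by 512, and
	 *   the matching mod_node_page_state() in mm/migrate.c subtracts 512
	 *   again when the page is put back.
	 */
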
@@ -903,7 +946,17 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
- else
+ else if (thp_migration_supported() && PageTransHuge(page)) {
+ struct page *thp;
+
+ thp = alloc_pages_node(node,
+ (GFP_TRANSHUGE | __GFP_THISNODE),
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ } else
return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE, 0);
}
@@ -1069,6 +1122,15 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
if (PageHuge(page)) {
BUG_ON(!vma);
return alloc_huge_page_noerr(vma, address, 1);
+ } else if (thp_migration_supported() && PageTransHuge(page)) {
+ struct page *thp;
+
+ thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
@@ -1683,8 +1745,7 @@ unsigned int mempolicy_slab_node(void)
* node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
-static unsigned offset_il_node(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long n)
+static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
@@ -1717,7 +1778,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
BUG_ON(shift < PAGE_SHIFT);
off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
- return offset_il_node(pol, vma, off);
+ return offset_il_node(pol, off);
} else
return interleave_nodes(pol);
}
@@ -2172,20 +2233,15 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int polnid = -1;
int ret = -1;
- BUG_ON(!vma);
-
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
- BUG_ON(addr >= vma->vm_end);
- BUG_ON(addr < vma->vm_start);
-
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
- polnid = offset_il_node(pol, vma, pgoff);
+ polnid = offset_il_node(pol, pgoff);
break;
case MPOL_PREFERRED:
diff --git a/mm/migrate.c b/mm/migrate.c
index e84eeb4e4356..6954c1435833 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,9 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -185,8 +188,8 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -216,6 +219,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
new = page - pvmw.page->index +
linear_page_index(vma, pvmw.address);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+ remove_migration_pmd(&pvmw, new);
+ continue;
+ }
+#endif
+
get_page(new);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(*pvmw.pte))
@@ -228,7 +240,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- flush_dcache_page(new);
+ if (unlikely(is_zone_device_page(new))) {
+ if (is_device_private_page(new)) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ } else if (is_device_public_page(new)) {
+ pte = pte_mkdevmap(pte);
+ flush_dcache_page(new);
+ }
+ } else
+ flush_dcache_page(new);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -330,6 +352,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
__migration_entry_wait(mm, pte, ptl);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
+{
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!is_pmd_migration_entry(*pmd))
+ goto unlock;
+ page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+ if (!get_page_unless_zero(page))
+ goto unlock;
+ spin_unlock(ptl);
+ wait_on_page_locked(page);
+ put_page(page);
+ return;
+unlock:
+ spin_unlock(ptl);
+}
+#endif
+
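
pmd_migration_entry_wait() mirrors the existing pte-level helper: a thread that faults on a PMD migration entry is expected to block here until the THP migration finishes and then retry the fault. A minimal sketch of the assumed caller (this hunk does not show the fault path; the example_* name is hypothetical):

	static inline void example_wait_on_thp_migration(struct mm_struct *mm,
							 pmd_t *pmd, pmd_t orig_pmd)
	{
		/* Sketch only: block until the PMD migration entry is removed. */
		if (is_pmd_migration_entry(orig_pmd))
			pmd_migration_entry_wait(mm, pmd);
		/* caller returns and retries the fault afterwards */
	}
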
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
@@ -398,6 +441,13 @@ int migrate_page_move_mapping(struct address_space *mapping,
int expected_count = 1 + extra_count;
void **pslot;
+ /*
+ * Device public or private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ expected_count += is_device_public_page(page);
+
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
@@ -604,15 +654,10 @@ static void copy_huge_page(struct page *dst, struct page *src)
/*
* Copy the page to its new location
*/
-void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_states(struct page *newpage, struct page *page)
{
int cpupid;
- if (PageHuge(page) || PageTransHuge(page))
- copy_huge_page(newpage, page);
- else
- copy_highpage(newpage, page);
-
if (PageError(page))
SetPageError(newpage);
if (PageReferenced(page))
@@ -666,6 +711,17 @@ void migrate_page_copy(struct page *newpage, struct page *page)
mem_cgroup_migrate(page, newpage);
}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ if (PageHuge(page) || PageTransHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
+
+ migrate_page_states(newpage, page);
+}
EXPORT_SYMBOL(migrate_page_copy);
/************************************************************
@@ -691,7 +747,10 @@ int migrate_page(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
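
Splitting migrate_page_copy() into migrate_page_states() plus the data copy is what makes MIGRATE_SYNC_NO_COPY usable: a driver whose hardware copies the data itself (e.g. over DMA) only needs the struct page state transferred. A minimal sketch of such a ->migratepage callback, assuming a hypothetical driver whose device has already copied the payload (example_migratepage is illustrative, not part of this patch):

	static int example_migratepage(struct address_space *mapping,
				       struct page *newpage, struct page *page,
				       enum migrate_mode mode)
	{
		int rc;

		rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
		if (rc != MIGRATEPAGE_SUCCESS)
			return rc;

		if (mode == MIGRATE_SYNC_NO_COPY)
			migrate_page_states(newpage, page);	/* device DMA did the copy */
		else
			migrate_page_copy(newpage, page);

		return MIGRATEPAGE_SUCCESS;
	}
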
@@ -741,12 +800,15 @@ int buffer_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
+ put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -805,8 +867,13 @@ static int fallback_migrate_page(struct address_space *mapping,
{
if (PageDirty(page)) {
/* Only writeback pages in full synchronous migration */
- if (mode != MIGRATE_SYNC)
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
return -EBUSY;
+ }
return writeout(mapping, page);
}
@@ -943,7 +1010,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
*/
- if (mode != MIGRATE_SYNC) {
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
rc = -EBUSY;
goto out_unlock;
}
@@ -1088,7 +1159,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page))) {
+ if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
lock_page(page);
rc = split_huge_page(page);
unlock_page(page);
@@ -1116,8 +1187,8 @@ out:
* as __PageMovable
*/
if (likely(!__PageMovable(page)))
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_cache(page), -hpage_nr_pages(page));
}
/*
@@ -1213,8 +1284,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOMEM;
if (!trylock_page(hpage)) {
- if (!force || mode != MIGRATE_SYNC)
+ if (!force)
+ goto out;
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
goto out;
+ }
lock_page(hpage);
}
@@ -1391,7 +1469,17 @@ static struct page *new_page_node(struct page *p, unsigned long private,
if (PageHuge(p))
return alloc_huge_page_node(page_hstate(compound_head(p)),
pm->node);
- else
+ else if (thp_migration_supported() && PageTransHuge(p)) {
+ struct page *thp;
+
+ thp = alloc_pages_node(pm->node,
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ } else
return __alloc_pages_node(pm->node,
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
@@ -1418,6 +1506,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
struct vm_area_struct *vma;
struct page *page;
+ struct page *head;
+ unsigned int follflags;
err = -EFAULT;
vma = find_vma(mm, pp->addr);
@@ -1425,8 +1515,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, pp->addr,
- FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
+ follflags = FOLL_GET | FOLL_DUMP;
+ if (!thp_migration_supported())
+ follflags |= FOLL_SPLIT;
+ page = follow_page(vma, pp->addr, follflags);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1436,7 +1528,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!page)
goto set_status;
- pp->page = page;
err = page_to_nid(page);
if (err == pp->node)
@@ -1451,16 +1542,22 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
if (PageHuge(page)) {
- if (PageHead(page))
+ if (PageHead(page)) {
isolate_huge_page(page, &pagelist);
+ err = 0;
+ pp->page = page;
+ }
goto put_and_set;
}
- err = isolate_lru_page(page);
+ pp->page = compound_head(page);
+ head = compound_head(page);
+ err = isolate_lru_page(head);
if (!err) {
- list_add_tail(&page->lru, &pagelist);
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ list_add_tail(&head->lru, &pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head),
+ hpage_nr_pages(head));
}
put_and_set:
/*
@@ -2029,3 +2126,859 @@ out_unlock:
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */
+
+#if defined(CONFIG_MIGRATE_VMA_HELPER)
+struct migrate_vma {
+ struct vm_area_struct *vma;
+ unsigned long *dst;
+ unsigned long *src;
+ unsigned long cpages;
+ unsigned long npages;
+ unsigned long start;
+ unsigned long end;
+};
+
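
struct migrate_vma is the helper's internal bookkeeping: src and dst are caller-supplied arrays of encoded pfns, npages tracks how many entries have been filled, and cpages how many pages were actually collected for migration. A rough, illustrative sketch of how a device driver is expected to drive this machinery, assuming the migrate_vma() entry point and migrate_vma_ops callbacks added elsewhere in this series (all example_* names are hypothetical):

	/* Sketch only: hypothetical driver callbacks. */
	static void example_alloc_and_copy(struct vm_area_struct *vma,
					   const unsigned long *src, unsigned long *dst,
					   unsigned long start, unsigned long end,
					   void *private)
	{
		/* For each src[i] with MIGRATE_PFN_MIGRATE set, allocate a
		 * destination page, copy the data, and fill dst[i]. */
	}

	static void example_finalize_and_map(struct vm_area_struct *vma,
					     const unsigned long *src, const unsigned long *dst,
					     unsigned long start, unsigned long end,
					     void *private)
	{
		/* Point device page tables at the pages that actually migrated. */
	}

	static const struct migrate_vma_ops example_ops = {
		.alloc_and_copy		= example_alloc_and_copy,
		.finalize_and_map	= example_finalize_and_map,
	};

	static int example_migrate_range(struct vm_area_struct *vma,
					 unsigned long start, unsigned long end)
	{
		unsigned long src[64], dst[64];		/* brevity: at most 64 pages */

		if (((end - start) >> PAGE_SHIFT) > 64)
			return -EINVAL;
		return migrate_vma(&example_ops, vma, start, end, src, dst, NULL);
	}
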
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+ pfn = pte_pfn(pte);
+
+ if (pte_none(pte)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ mpfn = pfn = 0;
+
+ /*
+ * Only care about unaddressable device page special
+ * page table entries. Other special swap entries are not
+ * migratable, and we ignore regular swapped pages.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = device_private_entry_to_page(entry);
+ mpfn = migrate_pfn(page_to_pfn(page))|
+ MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+ page = _vm_normal_page(migrate->vma, addr, pte, true);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = pfn = 0;
+ goto next;
+ }
+ pfn = page_to_pfn(page);
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non-device pages (device pages are not on the lru and thus
+ * cannot be dropped from it).
+ */
+ get_page(page);
+ migrate->cpages++;
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ mpfn |= MIGRATE_PFN_LOCKED;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* Setup special migration page table entry */
+ entry = make_migration_entry(page, pte_write(pte));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return 0;
+}
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ struct mm_walk mm_walk;
+
+ mm_walk.pmd_entry = migrate_vma_collect_pmd;
+ mm_walk.pte_entry = NULL;
+ mm_walk.pte_hole = migrate_vma_collect_hole;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.vma = migrate->vma;
+ mm_walk.mm = migrate->vma->vm_mm;
+ mm_walk.private = migrate;
+
+ mmu_notifier_invalidate_range_start(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+ walk_page_range(migrate->start, migrate->end, &mm_walk);
+ mmu_notifier_invalidate_range_end(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * migrate_page_move_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1;
+
+ /*
+ * FIXME support THP (transparent huge page), it is bit more complex to
+ * check them than regular pages, because they can be mapped with a pmd
+ * or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Pages from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page)) {
+ /*
+ * Private pages can never be pinned as they have no valid pte and
+ * GUP will fail for those. Yet if there is a pending migration,
+ * a thread might try to wait on the pte migration entry and
+ * will bump the page reference count. Sadly there is no way to
+ * differentiate a regular pin from a migration wait. Hence, to
+ * avoid two racing threads trying to migrate back to the CPU and
+ * entering an infinite loop (one stopping migration because the
+ * other is waiting on the pte migration entry), we always return
+ * true here.
+ *
+ * FIXME proper solution is to rework migration_entry_wait() so
+ * it does not need to take a reference on page.
+ */
+ if (is_device_private_page(page))
+ return true;
+
+ /*
+ * Only allow device public pages to be migrated and account for
+ * the extra reference count implied by ZONE_DEVICE pages.
+ */
+ if (!is_device_public_page(page))
+ return false;
+ extra++;
+ }
+
+ /* For file-backed pages */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+}
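
For a concrete feel of the pin test above, consider a regular anonymous page mapped in exactly one process that the caller has just isolated from the lru:

	/*
	 * Worked example (ordinary anonymous page, mapped once, isolated by caller):
	 *   page_count()    == 2   (1 from the mapping + 1 held by the caller)
	 *   extra           == 1   (the caller's reference)
	 *   page_mapcount() == 1
	 *   (2 - 1) > 1 is false  =>  the page is not pinned, migration may proceed.
	 * Any additional, unexpected reference (e.g. a concurrent GUP pin) makes
	 * the test (3 - 1) > 1 true and the page is skipped.
	 */
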
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; (i < npages) && migrate->cpages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ bool remap = true;
+
+ if (!page)
+ continue;
+
+ if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
+ /*
+ * Because we are migrating several pages there can be
+ * a deadlock between two concurrent migrations where
+ * each is waiting on the other's page lock.
+ *
+ * Make migrate_vma() a best-effort thing and back off
+ * for any page we cannot lock right away.
+ */
+ if (!trylock_page(page)) {
+ migrate->src[i] = 0;
+ migrate->cpages--;
+ put_page(page);
+ continue;
+ }
+ remap = false;
+ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+ put_page(page);
+ }
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ if (!migrate_vma_check_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (!is_zone_device_page(page))
+ putback_lru_page(page);
+ else
+ put_page(page);
+ }
+ }
+ }
+
+ for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_pte(page, migrate->vma, addr, page);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ put_page(page);
+ restore--;
+ }
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (page_mapped(page)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page))
+ goto restore;
+ }
+
+ if (migrate_vma_check_page(page))
+ continue;
+
+restore:
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ }
+
+ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_ptes(page, page, false);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ restore--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+ }
+}
+
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ unsigned long *dst)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have down_read(mmap_sem).
+ */
+ if (pte_alloc(mm, pmdp, addr))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ entry = swp_entry_to_pte(swp_entry);
+ } else if (is_device_public_page(page)) {
+ entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkdevmap(entry);
+ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ if (!is_zero_pfn(pfn)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+ flush = true;
+ } else if (!pte_none(*ptep)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_active_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+/*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+static void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr, i, mmu_start;
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ continue;
+ }
+ if (!notified) {
+ mmu_start = addr;
+ notified = true;
+ mmu_notifier_invalidate_range_start(mm,
+ mmu_start,
+ migrate->end);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+